Commit 85bb4612 by Maude Le Jeune

### import first slideshow attempt

parent e058aec1
general.tex 0 → 100644
 \documentclass[hyperref={colorlinks=true}]{beamer}
% Beamer slideshow presenting the Pipelet software.
% Outline: Context / How it works / Getting started / Going further.

% ---- Packages -------------------------------------------------------------
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage[utf8]{inputenc}
\usepackage{multicol}
\usepackage{ulem}
\usepackage{color}
\usepackage{xspace}
\usepackage{listings}
\usepackage{wasysym}
\useoutertheme{infolines}
\usepackage{hangcaption}

% ---- Macros and title data ------------------------------------------------
% Project name in small bold capitals; \xspace restores the following space.
\newcommand{\pipelet}{\textbf{\small PIPELET}\xspace}

\title[Pipelet]{The \pipelet software}
\author[Betoule, Le Jeune]{Marc \textsc{Betoule}, Maude \textsc{Le Jeune}}
\institute[CNRS]{}
\date[2010/09/04]{September 4th, 2010}

% NOTE(review): this definition was left without a body (the body line was
% commented out), which makes \newcommand swallow whatever token follows it.
% The macro is not used anywhere in this file, so the whole definition is
% kept here as a comment only. Re-enabling it requires \makeatletter and a
% definition of \@unnumberedcaption.
% \newcommand{\unnumberedcaption}%
%   {\@dblarg{\@unnumberedcaption\@captype}}

\begin{document}

\begin{frame}{\pipelet}
  \tableofcontents
\end{frame}

% ===========================================================================
\section{Context}
% ===========================================================================

\begin{frame}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Context and needs}
  Usually in scientific data processing:
  \begin{itemize}
    \item Big data sets
    \item Complex processing (multiple interdependent steps)
    \item Optimal parameters unknown
  \end{itemize}
  % \centering is a declaration: the paragraph must end inside the group
  % (hence \par) for the centering to take effect.
  {\centering
    $\rightarrow$ Computational \textbf{and development} cost a lot.\par}
  \begin{figure}
    \includegraphics[width=0.50\textwidth]{img/pipelet_scheme_small2.pdf}
  \end{figure}
  The \pipelet software answers the 3 above items:
  \begin{itemize}
    \item Computational cost limited to its lower limit
    \item Guarantee traceability
    \item Offer comparison facilities
  \end{itemize}
\end{frame}

\begin{frame}{The \pipelet software}
  The main idea behind \pipelet is to:
  \begin{itemize}
    \item Cut the whole processing into \textbf{segments} (script files)
    \item Save intermediate products on disk
    \item Use a unique identifier wrt code, parameters and I/Os.
  \end{itemize}
  \begin{figure}
    \includegraphics[width=0.50\textwidth]{img/pipelet_scheme_small3.pdf}
  \end{figure}
  \pipelet is written in Python:
  \begin{itemize}
    \item High level language offering lots of functionalities
    \item Known as a glue language ideal for interfacing heterogeneous codes
    \item Ease debugging and interactivity
  \end{itemize}
\end{frame}

% ===========================================================================
\section{How it works}
% ===========================================================================

\begin{frame}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{The \pipelet big scheme}
  \begin{figure}
    \includegraphics[width=0.90\textwidth]{img/pipelet_scheme.pdf}
  \end{figure}
\end{frame}

\subsection{Building a pipeline}

% Fragile frames (those using \verb / verbatim) need \end{frame} alone on
% its own line, starting in the first column.
\begin{frame}[fragile]{Building a pipeline}
\begin{verbatim}
P = Pipeline(pipedot, codedir='./', prefix='/data/...')
\end{verbatim}
  \begin{figure}
    \includegraphics[width=0.5\textwidth]{img/pipelet_scheme_small.pdf}
  \end{figure}
  \begin{itemize}
    \item \verb!pipedot! is the string description of the pipeline
% Verbatim content is kept in the first column so that no leading spaces
% are typeset.
\begin{verbatim}
pipedot = """
1->2->4
3->4
"""
\end{verbatim}
    \item \verb!codedir! is the path of the processing code files (.py)
    \item \verb!prefix! is the path of the processed data repository
  \end{itemize}
\end{frame}

\subsection{Writing segment scripts}

\begin{frame}[fragile]{Writing segment scripts}
  \begin{itemize}
    \item A segment is a python script (\verb!.py! file)
    \item It benefits from an improved namespace to:
      \begin{itemize}
        \item control the pipe parallelization scheme;
          \begin{figure}
            \includegraphics[width=0.98\textwidth]{img/seg_scheme.pdf}
          \end{figure}
        \item save and load I/O's and provide filenames;
        \item save and load parameters;
        \item execute or include subprocess
      \end{itemize}
  \end{itemize}
\end{frame}

\subsection{Running a pipeline}

\begin{frame}[fragile]{Running a pipeline}
  The pipe engine converts each pair of (processing code, data to
  process) into a \textcolor{blue}{task list}.
  \begin{figure}
    \includegraphics[width=0.80\textwidth]{img/task_scheme.pdf}
  \end{figure}
  One can empty the \textcolor{blue}{task list} in different modes:
  \begin{itemize}
    \item the interactive mode (or debugging mode)
    \item the process/thread mode (for smp machine)
    \item the batch mode (for cluster)
  \end{itemize}
\end{frame}

\subsection{Browsing a pipeline}

\begin{frame}[fragile]{Browsing a pipeline: \href{http://localhost:8080}{http://localhost:8080}}
  \begin{figure}
    \includegraphics[width=0.70\textwidth]{img/snapshot.png}
  \end{figure}
  From the web interface one can: \\
  \vspace{0.5cm}
  \begin{tabular}{ll}
    $\bullet$ Filter/delete pipe instances & from the pipeline page\\
    $\bullet$ Highlight dependencies & from the segment page\\
    $\bullet$ Read code & from the segment page\\
    $\bullet$ Read log files & from the log page\\
    $\bullet$ Download/visualize/delete product files & from the product page\\
  \end{tabular}
\end{frame}

% ===========================================================================
\section{Getting started}
% ===========================================================================

\begin{frame}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}[fragile]{Getting \pipelet}
  $\rhd$ Download from \url{http://gitorious.org/pipelet}
  \begin{itemize}
    \item Git repository\par
      % Group + \par so that \centering actually applies to this line.
      {\centering\verb!git clone git@gitorious.org:pipelet/pipelet.git!\par}
    \item Open wiki including documentation
  \end{itemize}
  \vspace{0.5cm}
  $\rhd$ Features and bugs are tracked from the IN2P3 forge.
\end{frame}

% ===========================================================================
\section{Going further}
% ===========================================================================

\begin{frame}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{The \pipelet actors}
  \begin{figure}
    \includegraphics[width=1\textwidth]{img/pipelet_actors.pdf}
  \end{figure}
\end{frame}

\begin{frame}[fragile]{The pipeline object}
  The \pipelet scheme is summarized by its segment's relations:
  \begin{itemize}
    \item a tree view (dot scheme)
    \item a flat view
      \begin{figure}
        \includegraphics[width=0.5\textwidth]{img/pipeline.pdf}
      \end{figure}
  \end{itemize}
  Segment's code files are found from:
  \begin{itemize}
    \item a local repository
    \item \textcolor{red}{a git, CVS repository}
  \end{itemize}
  For each segment, a unique hash key is computed from:
  \begin{itemize}
    \item the segment code script (\verb!seg_name_code.py!)
    \item the hooks scripts (\verb!seg_name_hookname.py!)
  \end{itemize}
  \textsl{removing blank lines and comments.}
\end{frame}

\begin{frame}[fragile]{The task object}
  A task is the association of a \textcolor{blue}{segment} with its
  \textcolor{blue}{input} product, its execution \textcolor{blue}{status}
  and its \textcolor{blue}{output} product(s). \\
  \vspace{0.5cm}
  The task attributes:
  \begin{columns}
    \begin{column}[l]{0.65\textwidth}
      \begin{itemize}
        \item segment name (string)
        \item task input (list)
        \item task output (list)
        \item task identifier (integer)
        \item status string \verb! queued, running, done, failed!
        \item date string \verb! queued_on, begun_on, ended_on!
        \item task parents (list of identifiers)
      \end{itemize}
    \end{column}
    \begin{column}[r]{0.35\textwidth}
      \begin{figure}
        \includegraphics[width=1\textwidth]{img/task.pdf}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]{The scheduler object}
  \begin{columns}
    \begin{column}[l]{0.65\textwidth}
      The \verb!scheduler.push_next_seg()! function:
      \begin{itemize}
        \item add tasks to the task list
        \item all segment's tasks are pushed at the same time
        \item using the flat view
        \item if no task has ever been done
      \end{itemize}
    \end{column}
    \begin{column}[r]{0.35\textwidth}
      \begin{figure}
        \includegraphics[width=1\textwidth]{img/scheduler.pdf}
      \end{figure}
    \end{column}
  \end{columns}
  \vspace{0.5cm}
  The task inputs are built from:
  \begin{itemize}
    \item the segment \verb!seg_input! and \verb!seg_output! variables
    \item the segment \verb!#multiplex! directives
      \verb!#multiplex cross_prod group_by '0' !\\
      \verb!#multiplex cross_prod group_by 'p1' !\\
      \verb!#multiplex cross_prod group_by 'p1[0]' !\\
      \verb!#multiplex union !
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{The worker object}
  \begin{figure}
    \includegraphics[width=0.5\textwidth]{img/worker.pdf}
  \end{figure}
  The \verb!worker.execute_task(task)! function:
  \begin{enumerate}
    \item load environment namespace
    \item put task inputs to namespace
    \item \verb!try! to execute segment script using namespace
    \item set new task status (\verb!done or failed!)
    \item if \verb!done! get task output from namespace
  \end{enumerate}
\end{frame}

\begin{frame}[fragile]{The tracker object}
  \begin{figure}
    \includegraphics[width=0.75\textwidth]{img/tracker.pdf}
  \end{figure}
  The \verb!tracker.segment_registration()! function:
  \begin{itemize}
    \item insert new entry in sql segments and segment relations tables
  \end{itemize}
  The \verb!tracker.add_queued(task)! function:
  \begin{itemize}
    \item insert new entry in sql tasks and task relations tables
  \end{itemize}
  The \verb!tracker.update_status(task)! function:
  \begin{itemize}
    \item update task entry in sql tasks table using
      \textcolor{blue}{asynchronous} request
  \end{itemize}
\end{frame}

\end{document}
img/pipeline.dia 0 → 100644