# -*- coding: utf-8 -*- """report_dataframe module """ import json import numpy as np import pandas as pd import time from datetime import date from gluon import current from gluon.storage import Storage DATE_MIN = date(1990, 01, 01) DATE_MAX = date(2050, 12, 31) HISTORY_QUERY_FIELDS = [ "id_events", "id_domains", "id_fundings", "id_objects", "id_object_categories", "id_people", "id_people_code", "id_projects", "id_teams", "object_categories_category", "people_categories_category", "year_end", "year_start"] def active_period(**kwargs): """Determine the period of activity for a person / object when the domain and/or project and/or team and or category is fixed. For example, for a person belonging to a domain and team for which people category / project evolve as function of time, the start and the end of the period of activity are shown by the `^`:: ....... ........ .......... ^ ^ Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): Returns tuple: start (date): is fixed to DATE_MIN when not defined stop (date): is fixed to DATE_MAX when not defined """ db = current.globalenv['db'] history = db.history # extract database information query = query_history(db, **kwargs) fields = [history.start_date, history.end_date] rows = db.executesql(db(query)._select(*fields)) # fin the start / end point using DataFrame df = (pd.DataFrame(list(rows), columns=["start", "end"]) .assign( start=lambda x: x.start.fillna(DATE_MIN), end=lambda x: x.end.fillna(DATE_MAX)) .sort_values("start")) return (df.start.min(), df.end.max()) def coverage(start_date, end_date, year_start, year_end): """Vectorized function to compute the quantify coverage. Args: start_date (datetime64): start date of the event start_date (datetime64): end date of the event year_start (datetime64): start of the year end_year (datetime64): end of the year Returns: float64 """ coverage = \ np.minimum(end_date, year_end) - np.maximum(start_date, year_start) return coverage / (year_end - year_start) def db2df(db, query, fields=[], columns=None): """Transform a database query into a dataframe. Example:: # database query query = history.id_events == id_event query &= history.id_projects == id_project query &= history.id_fundings == db.fundings.id query &= history.id_people == db.people.id query &= history.id_people_categories == db.people_categories.id query &= history.id_teams == db.teams.id # fields to be extracted from the database fields = [ history.id_domains, history.id_events, history.id_people, history.id_projects, history.id_teams, db.teams.team, db.people_categories.category, db.people_categories.code, db.people.first_name, db.people.last_name, history.percentage, history.start_date, history.end_date, history.data, db.fundings.agency] # get the DataFrame df = db2df(query, fields) print df.info() Args: db (pyDAL.DAL): database connection. query: database query including statement to join foreign table. fields (list): list of pyDAL.Field to be extracted from the query. All fields are extracted when not defined. columns (list): name of the columns in the dataframe. There is one to one correspondence between the fields and columns list. Names of database field are used when columns is not defined. Returns: pandas.DataFrame """ if columns is None and len(fields) > 0: columns = [field.name for field in fields] rows = db.executesql(db(query)._select(*fields)) return pd.DataFrame.from_records(list(rows), columns=columns) def debug_df(df, opt=""): """Helper tool to debug a DataFrame. Args df (pandas.DataFrame): opt (str): debug options: * ``a``: equivalent to icIht * ``c``: show columns * ``I``: show index * ``h``: show head * ``t``: show tail * ``f``: show the whole dataframe """ if "i" in opt or "a" in opt: df.memory_usage() print df.info() if "c" in opt or "a" in opt: print df.columns.sort_values() if "I" in opt or "a" in opt: print df.index if "h" in opt or "a" in opt: print df.head() if "t" in opt or "a" in opt: print df.tail() if "f" in opt or "a" in opt: print df def elapse_time(start_date, end_date, xdate): """Vectorized function to compute the elapse time between the starting date and xdate or end_date: * it is computed between the start_date and the end_date when the end_date is above xdate * otherwise it is computed between the start_date and xdate Args: start_date (pandas.Series): starting date for items end_date (pandas.Series): ending date for items xdate (date) Returns: pandas.Series: with the timedelta in days """ return end_date.mask(end_date > xdate, xdate) - start_date def expand_per_year(df, year_start, year_end): """Expand the list of active items between year_start and year_end to a list of active items per year. An item appears several time when it is active during several years. Graphically, the item --+...+ 2012-06 2014-02 +--++ becomes 2012 --+...+ 2012-06 2014-02 +--++ 2013 --+...+ 2012-06 2014-02 +--++ 2014 --+...+ 2012-06 2014-02 +--++ Note: The columns `start_date` and `end_date` have to be defined in the DataFrame. Args: year_start (date): year_end (date): Returns: pandas.DataFrame: Columns are: * those of the initial DataFrame * year (int) """ # for each item compute the list of year for which the item is active # the information is kept in a PeriodIndex stored in the column years. # NOTE: # - failed to implemented a faster way using df.apply # li = [] for row in df.itertuples(): li.append([pd.period_range(row.start_date, row.end_date, freq="A")]) df1 = df.merge(pd.DataFrame(li, columns=["years"]), left_index=True, right_index=True) # for each year keep the list of active items # an item appears several time when it is active during several years. df2 = pd.DataFrame() for year in xrange(year_start, year_end+1): dfi = (df1.loc[[str(year) in el for el in df1.years]] .assign(year=year)) df2 = pd.concat((df2, dfi), ignore_index=True) # Keep the memory foot print low by removing the years column df2 = df2.drop("years", axis="columns") return df2 def full_name(first_name, last_name): """Vectorized function to build the full name of a people. Args first (series of string): last (series of string): Returns: Series of string """ return first_name.str.cat(last_name, sep=" ") def get_column_names(fields): """Snippet to get list of column names from a list of database field. Rules are applied to resolve ambiguities when the field name is used in different table, *e.g* code, category or note. Args: fields (list): list of gluon.dal.Field Returns: list """ # columns name for the DataFrame from database fields # resolve ambiguities for code, category and note columns = [] for field in fields: name = field.name if name in ("code", "category", "note"): name = "%s_%s" % (field._table._tablename, name) columns.append(name) return columns def get_items(**kwargs): """Return the DataFrame with active items belonging to the event id_event. The DataFrame contains all information for each item, namely the meta-data, categories, domain, funding, object, people, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. Active items are selected using the arguments ``year_start`` and ``year_end``. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The index of the DataFrame is the history table identifier. The columns of the DataFrame are: * id_domains (int) * id_events (int) * id_fundings (int) * id_objects (int) * id_objects_categories (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * agency (str) * batch_number (str) * birth_date (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * first_name (str) * history_note (str) * last_name (str) * object_categories_category (str) * object_categories_code (str) * objects_note (str) * people_categories_category (str) * people_categories_code (str) * people_note (str) * percentage (int) * project (str) * reference (str) * serial_number (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ db = current.globalenv['db'] history = db.history # database query including foreign table query = query_history(db, **kwargs) query &= history.id_domains == db.domains.id query &= history.id_fundings == db.fundings.id query &= history.id_objects == db.objects.id query &= history.id_object_categories == db.object_categories.id query &= history.id_people == db.people.id query &= history.id_people_categories == db.people_categories.id query &= history.id_projects == db.projects.id query &= history.id_teams == db.teams.id # fields to be extracted from the database fields = [ history.id, history.id_domains, history.id_events, history.id_fundings, history.id_objects, history.id_object_categories, history.id_people, history.id_people_categories, history.id_projects, history.id_teams, history.data, history.note, history.percentage, history.start_date, history.end_date, db.domains.domain, db.fundings.agency, db.objects.reference, db.objects.serial_number, db.objects.batch_number, db.objects.note, db.object_categories.code, db.object_categories.category, db.people.first_name, db.people.last_name, db.people.birth_date, db.people.note, db.people_categories.code, db.people_categories.category, db.projects.project, db.teams.team] # columns name for the DataFrame # resolve ambiguities for code, category and note columns = get_column_names(fields) # the DataFrame df = (db2df(db, query, fields, columns) .set_index("id") .pipe(normalize_history_data) .assign( start_date=lambda x: x.start_date.fillna(DATE_MIN), end_date=lambda x: x.end_date.fillna(DATE_MAX))) df["duration"] = (elapse_time(df.start_date, df.end_date, date.today()) .apply(lambda x: x.days)) return df def get_items_per_year(**kwargs): """Return the DataFrame with active items for each year, belonging to the event id_event. An item appears several time when it is active during several years. The DataFrame contains all information for each item, namely the meta-data, categories, domain, funding, object, people, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. The range of years are selected using the arguments ``year_start`` and ``year_end``. The scan is between ``DATE_MIN`` and/or ``DATE_MAX`` when ``year_start`` and/or ``DATE_MIN`` is/are not defined. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The columns of the DataFrame are: * id (int): history table identifier for the item * id_domains (int) * id_events (int) * id_fundings (int) * id_objects (int) * id_objects_categories (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * agency (str) * batch_number (str) * birth_date (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * first_name (str) * history_note (str) * last_name (str) * object_categories_category (str) * object_categories_code (str) * objects_note (str) * people_categories_category (str) * people_categories_code (str) * people_note (str) * percentage (int) * project (str) * reference (str) * serial_number (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ return to_items_per_year(get_items, **kwargs) def get_items_small(**kwargs): """Return the DataFrame with active items belonging to the event id_event. The DataFrame contains the minimal set of information for each item, namely the meta-data as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. Active items are selected using the arguments ``year_start`` and ``year_end``. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The index of the DataFrame is the history table identifier. The columns of the DataFrame are: * id_domains (int) * id_events (int) * id_fundings (int) * id_objects (int) * id_objects_categories (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * start_date (date): undefined dates are forced to ``DATE_MIN`` * one column for each key of the history.data dictionary """ db = current.globalenv['db'] history = db.history # database query query = query_history(db, **kwargs) # fields to be extracted from the database fields = [ history.id, history.id_domains, history.id_events, history.id_fundings, history.id_objects, history.id_object_categories, history.id_people, history.id_people_categories, history.id_projects, history.id_teams, history.start_date, history.end_date, history.data] # columns name for the DataFrame columns = [field.name for field in fields] # the DataFrame df = (db2df(db, query, fields, columns) .set_index("id") .pipe(normalize_history_data) .assign( start_date=lambda x: x.start_date.fillna(DATE_MIN), end_date=lambda x: x.end_date.fillna(DATE_MAX))) df["duration"] = (elapse_time(df.start_date, df.end_date, date.today()) .apply(lambda x: x.days)) return df def get_items_small_per_year(**kwargs): """Return the DataFrame with active items for each year, belonging to the event id_event. An item appears several time when it is active during several years. The DataFrame contains the minimal set of information for each item, namely the meta-data, the year as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. The range of years are selected using the arguments ``year_start`` and ``year_end``. The scan is between ``DATE_MIN`` and/or ``DATE_MAX`` when ``year_start`` and/or ``DATE_MIN`` is/are not defined. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The columns of the DataFrame are: * id (int): history table identifier for the item * id_domains (int) * id_events (int) * id_objects (int) * id_objects_categories (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * start_date (date): undefined dates are forced to ``DATE_MIN`` * year (int) * one column for each key of the history.data dictionary """ return to_items_per_year(get_items_small, **kwargs) def get_objectlike_items(**kwargs): """Return the DataFrame with active items belonging to the event id_event when items are related to object. The DataFrame contains a reduced set of information for each item, namely the meta-data, category, domain, funding, object, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. Active items are selected using the arguments ``year_start`` and ``year_end``. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The index of the DataFrame is the history table identifier. The columns of the DataFrame are: * id_domains (int) * id_events (int) * id_fundings (int) * id_objects (int) * id_objects_categories (int) * id_projects (int) * id_teams (int) * agency (str) * batch_number (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * history_note (str) * object_categories_category (str) * object_categories_code (str) * objects_note (str) * project (str) * reference (str) * serial_number (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ db = current.globalenv['db'] history = db.history # database query including foreign table query = query_history(db, **kwargs) query &= history.id_domains == db.domains.id query &= history.id_fundings == db.fundings.id query &= history.id_objects == db.objects.id query &= history.id_object_categories == db.object_categories.id query &= history.id_projects == db.projects.id query &= history.id_teams == db.teams.id # fields to be extracted from the database fields = [ history.id, history.id_domains, history.id_events, history.id_fundings, history.id_objects, history.id_object_categories, history.id_projects, history.id_teams, history.data, history.note, history.start_date, history.end_date, db.domains.domain, db.fundings.agency, db.objects.reference, db.objects.serial_number, db.objects.batch_number, db.objects.note, db.projects.project, db.teams.team] # columns name for the DataFrame # resolve ambiguities for code, category and note columns = get_column_names(fields) # the DataFrame df = (db2df(db, query, fields, columns) .set_index("id") .pipe(normalize_history_data) .assign( start_date=lambda x: x.start_date.fillna(DATE_MIN), end_date=lambda x: x.end_date.fillna(DATE_MAX))) df["duration"] = (elapse_time(df.start_date, df.end_date, date.today()) .apply(lambda x: x.days)) return df def get_objectlike_items_per_year(**kwargs): """Return the DataFrame with active items for each year, belonging to the event id_event when the event is related to object. An item appears several time when it is active during several years. The DataFrame contains a reduced set of information for each item, namely the meta-data, category, domain, funding, object, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. The range of years are selected using the arguments ``year_start`` and ``year_end``. The scan is between ``DATE_MIN`` and/or ``DATE_MAX`` when ``year_start`` and/or ``DATE_MIN`` is/are not defined. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_objects (int): id_object_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The columns of the DataFrame are: * id (int): history table identifier for the item * id_domains (int) * id_events (int) * id_fundings (int) * id_objects (int) * id_objects_categories (int) * id_projects (int) * id_teams (int) * agency (str) * batch_number (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * history_note (str) * object_categories_category (str) * object_categories_code (str) * objects_note (str) * project (str) * reference (str) * serial_number (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ return to_items_per_year(get_objectlike_items, **kwargs) def get_peoplelike_items(**kwargs): """Return the dataFrame with active items belonging to the event id_event when the event is related to people. The DataFrame contains a reduced set of information for each item, namely the meta-data, category, domain, funding, people, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. Active items are selected using the arguments ``year_start`` and ``year_end``. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The index of the DataFrame is the history table identifier. The columns of the DataFrame are: * id_domains (int) * id_fundings (int) * id_events (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * agency (str) * birth_date (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * first_name (str) * full_name(str) * history_note (str) * last_name (str) * people_categories_category (str) * people_categories_code (str) * people_note (str) * percentage (int) * project (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ db = current.globalenv['db'] history = db.history # database query including foreign table query = query_history(db, **kwargs) query &= history.id_domains == db.domains.id query &= history.id_fundings == db.fundings.id query &= history.id_people == db.people.id query &= history.id_people_categories == db.people_categories.id query &= history.id_projects == db.projects.id query &= history.id_teams == db.teams.id # fields to be extracted from the database fields = [ history.id, history.id_domains, history.id_events, history.id_fundings, history.id_people, history.id_people_categories, history.id_projects, history.id_teams, history.data, history.note, history.percentage, history.start_date, history.end_date, db.domains.domain, db.fundings.agency, db.people.first_name, db.people.last_name, db.people.birth_date, db.people.note, db.people_categories.code, db.people_categories.category, db.projects.project, db.teams.team] # columns name for the DataFrame # resolve ambiguities for code, category and note columns = get_column_names(fields) # the DataFrame df = (db2df(db, query, fields, columns) .set_index("id") .pipe(normalize_history_data) .assign( full_name=lambda x: full_name(x.first_name, x.last_name), start_date=lambda x: x.start_date.fillna(DATE_MIN), end_date=lambda x: x.end_date.fillna(DATE_MAX))) df["duration"] = (elapse_time(df.start_date, df.end_date, date.today()) .apply(lambda x: x.days)) return df def get_people_per_year(**kwargs): """Return the DataFrame with active items for each year, belonging to a people like event. An item appears several time when it is active during several years. The DataFrame contains a reduced set of information for each item, namely the meta-data, category, domain, funding, people, project, team information, the ``history.data`` and a set of computed values. The ``history.data`` is a dictionary which is "json normalised", one column per key. The DataFrame contains computed columns: * coverage * fte * full_name * is_end * is_start * period_end * period_start The selection of items is performed by querying the database. The query is built using the keyword arguments. The range of years are selected using the arguments ``year_start`` and ``year_end``. The scan is between ``DATE_MIN`` and/or ``DATE_MAX`` when ``year_start`` and/or ``DATE_MIN`` is/are not defined. Item belonging to the same year, domain, team project and people are group in order to compute properly coverage and fte values. Keyword Args: id_events (int): identifier of the people like event [required] id_domains (int): id_fundings (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The columns of the DataFrame are: * id (int): history table identifier for the item * id_domains (int) * id_events (int) * id_fundings (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * agency (str) * birth_date (str) * coverage (float) * domain (str) * end_date (date): undefined dates are forced to ``DATE_MAX`` * first_name (str) * full_name (str) * fte (float) * history_note (str) * is_end (bool) * is_start (bool) * last_name (str) * people_categories_category (str) * people_categories_code (str) * people_note (str) * percentage * period_start (date): * period_end (date): * period_duraton (float): in days * project (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ # compute the period of activity when the person is attached to the # same domain, project and team # compute the beginning and ending of each year # it is temporarily columns which are used in coverage computation df = (to_items_per_year(get_peoplelike_items, **kwargs) .drop("duration", axis="columns") .assign( period=lambda x: x.apply(lambda y: active_period(id_events=y.id_events, id_people=y.id_people, id_domains=y.id_domains, id_teams=y.id_teams, id_projects=y.id_projects), axis="columns")) .assign( period_start=lambda x: x.period.apply(lambda y: y[0]), period_end=lambda x: x.period.apply(lambda y: y[1]), year_start=lambda x: x.year.apply(lambda y: date(y, 01, 01)), year_end=lambda x: x.year.apply(lambda y: date(y, 12, 31)))) # # compute coverage, fte, full_name, is_start and is_end # df = (df .assign( coverage=lambda x: coverage(x.start_date, x.end_date, x.year_start, x.year_end), full_name=lambda x: full_name(x.first_name, x.last_name), is_start=lambda x: is_start(x.year_start, x.year_end, x.period_start), is_end=lambda x: is_end(x.year_start, x.year_end, x.period_end)) .assign(fte=lambda x: x.coverage * x.percentage * 0.01)) # # aggregate item split during the year and sum the fte # cols = ["year", "id_domains", "id_projects", "id_teams", "id_people"] group = df.groupby(cols) dfa = group[["fte"]].aggregate(np.sum).reset_index() dfb = df.drop("fte", axis="columns") df = (pd.merge(dfa, dfb, on=cols, how="left") .drop_duplicates(cols)) # # Clean and keep memory footprint low # df = (df.drop(["period", "year_start", "year_end"], axis="columns") .assign( agency=lambda x: pd.Categorical(x.agency), people_categories_category=lambda x: pd.Categorical(x.people_categories_category), people_categories_code=lambda x: pd.Categorical(x.people_categories_code), team=lambda x: pd.Categorical(x.team))) df["period_duration"] = \ (elapse_time(df.period_start, df.period_end, date.today()) .apply(lambda x: x.days)) return df def get_peoplelike_items_per_year(**kwargs): """Return the DataFrame with active items for each year, belonging to the event id_event when the event is related to people. An item appears several time when it is active during several years. The DataFrame contains a reduced set of information for each item, namely the meta-data, category, domain, funding, people, project, team information as well as the history.data. The latter is a dictionary which is "json normalised", one column per key. The selection of the item is performed by querying the database. The query is built using the keyword arguments. The range of years are selected using the arguments ``year_start`` and ``year_end``. The scan is between ``DATE_MIN`` and/or ``DATE_MAX`` when ``year_start`` and/or ``DATE_MIN`` is/are not defined. Keyword Args: id_events (int): identifier of the event [required] id_domains (int): id_fundings (int): id_people (int): id_people_categories (int): id_projects (int): id_teams (int): year_end (date): year_start (date): Returns: pandas.DataFrame: The columns of the DataFrame are: * id (int): history table identifier for the item * id_domains (int) * id_events (int) * id_fundings (int) * id_people (int) * id_people_categories (int) * id_projects (int) * id_teams (int) * agency (str) * birth_date (str) * domain (str) * duration (float): in days * end_date (date): undefined dates are forced to ``DATE_MAX`` * first_name (str) * full_name (str) * history_note (str) * last_name (str) * people_categories_category (str) * people_categories_code (str) * people_note (str) * percentage (int) * project (str) * start_date (date): undefined dates are forced to ``DATE_MIN`` * team (str) * one column for each key of the history.data dictionary """ return to_items_per_year(get_peoplelike_items, **kwargs) def is_end(year_start, year_end, period_end): """Vectorized function to determine if the active period end in the year. Args year_start (Series of date) year_end (Series of date) period_end (series of date) Returns Series of boolean """ return (year_start <= period_end) & (period_end <= year_end) def is_start(year_start, year_end, period_start): """Vectorized function to determine if the active period start in the year. Args year_start (Series of date) year_end (Series of date) period_start (series of date) Returns Series of boolean """ return (year_start <= period_start) & (period_start <= year_end) def normalize_history_data(df): """Normalise the JSON field ``history.data``. The database field ``history.data`` contains a JSON string serialising a dictionary. Each key is transformed into a DataFrame columns which are add to the initial DataFrame. The column data is dropped. Args: df (pandas.DataFrame): the DataFrame has to contains the columns data. Returns: pandas.DataFrame """ # 1) convert json string into json object for each row # 2) convert the data serie into a json string preserving index (split) # Fix the issue with "name":"data" in pandas version 0.18 sdata = (df.data .fillna("{}") .apply(lambda x: json.loads(x)) .to_json(orient="split") .replace('"name":"data",', "")) # 3) convert the json string into a dataframe, with a column for each key # 4) fix possible wrong boolean value # 5) merge dataframe using index # 6) drop the obsolete columns data df = (pd.read_json(sdata, orient="split") .replace("false", False) .replace("true", True) .merge(df, left_index=True, right_index=True, how="right") .drop("data", axis="columns")) return df def query_history(db, **kwargs): """Build the query for the history table from the keyword arguments. The operator AND is applied between the conditions. Args: db (gluon.dal.DAL): database connection. Keyword Args: id_events (int): id_domains (int): id_fundings (int): id_objects (int): id_object_code (int): select object per category.code id_people (int): id_people_code (int): select people per category.code (quality) id_projects (int): id_teams (int): object_categories_category (str): select object per category people_categories_category (str): select people per category year_end (date): year_start (date): Returns: gluon.DAL.Query """ history = db.history query = None di = {k: kwargs[k] for k in HISTORY_QUERY_FIELDS if k in kwargs} for k, v in di.iteritems(): if (v in ('', None)) or (k == "data"): continue if k == "year_end": year_end = date(int(di["year_end"]), 12, 31) qi = history.start_date <= year_end elif k == "year_start": year_start = date(int(di["year_start"]), 01, 01) qi = (history.end_date == None) | (history.end_date >= year_start) elif k == "id_object_code": qi = history.id_object_categories == int(v) elif k == "object_categories_category": object_cats = db.object_categories cat_ids = db(object_cats.category == v)._select(object_cats.id) qi = history.id_object_categories.belongs(cat_ids) elif k == "id_people_code": qi = history.id_people_categories == int(v) elif k == "people_categories_category": people_cats = db.people_categories cat_ids = db(people_cats.category == v)._select(people_cats.id) qi = history.id_people_categories.belongs(cat_ids) else: qi = history[k] == v query = (qi if query is None else (query) & (qi)) return query class Timer(object): """Utility to time a part of the code. From https://www.huyng.com/posts/python-performance-analysis Example:: with Timer("build data frame for RH") as t: df = get_human_resources("LHCb", 2014, 2018) """ def __init__(self, msg): self.msg = msg def __enter__(self): self.start = time.time() return self def __exit__(self, *args): self.end = time.time() self.secs = self.end - self.start self.msecs = self.secs * 1000 # millisecs print '\n%s\n\telapsed time: %f ms' % (self.msg, self.msecs) def to_extjs_gridcolumns(df, meta=False): """Introspect the DataFrame in order to derive configuration for ``Ext.grid.column.Column`` which are used by an ``Ext.grid.Panel``. Note: * The first columns shows the row number. * The type of the grid column is derived from the DataFrame.dtype as well as from the name of the DataFrame column. * The default xtype ``gridcolumn`` is used for string and for dtype == object which are not resolved. Later, the user can tune the column type. Args: df (pandas.DataFrame): meta (bool): add column with metadata identifier when true Returns: list: configuration are stored in a dictionary. """ li = [dict(xtype="rownumberer")] cfgcol = Storage(dataIndex="", flex=1, hidden=False, text="", xtype="gridcolumn") for colname, coltype in df.dtypes.sort_index().iteritems(): cfg = Storage(cfgcol) cfg.dataIndex = cfg.text = colname if colname.startswith("id"): if not meta: continue cfg.flex = 0.5 cfg.format = "0" cfg.hidden = True cfg.xtype = "numbercolumn" elif coltype in (np.int8, np.int16, np.int32, np.int64): cfg.format = "0" cfg.xtype = "numbercolumn" elif coltype in (np.float16, np.float32, np.float64): cfg.formatter = "round(2)" cfg.summaryFormatter = "round(2)" cfg.summaryType = "sum" cfg.xtype = "numbercolumn" elif coltype == np.bool_ or colname.startswith("is"): cfg.falseText = "False" cfg.trueText = "True" cfg.xtype = "booleancolumn" elif coltype == np.datetime64 or colname in ("date", "end_date", "period_end", "period_start", "start_date"): cfg.format = "Y-m-d" cfg.xtype = "datecolumn" li.append(cfg) return li def to_items_per_year(func, **kwargs): """Snippet transforming a of active items between ``year_start`` and ``year_end`` to a list of active items per year. The year range is defined by the keyword ``year_start`` and ``year_end``. The underlying logic deal with all cases. Args: func (reference): reference to a function generating the dataframe for active items. **kwargs (dict): key, value pair to select item, the event and the year range. They are use by the function func. Returns: pandas.DataFrame """ is_start = \ "year_start" in kwargs and kwargs["year_start"] not in (None, '') is_end = "year_end" in kwargs and kwargs["year_end"] not in (None, '') if not is_start and not is_end: kwargs["year_start"] = DATE_MIN.year kwargs["year_end"] = DATE_MAX.year elif is_start and not is_end: kwargs["year_end"] = kwargs["year_start"] elif is_end and not is_start: kwargs["year_start"] = kwargs["year_end"] ystart = int(kwargs["year_start"]) yend = int(kwargs["year_end"]) df = (func(**kwargs) .reset_index() .pipe(expand_per_year, ystart, yend)) return df