Commit 307c0484 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update report_dataframes to add get_items_small, get_items_per_year_small, expand_per_year

parent fb6fb65f
......@@ -18,9 +18,12 @@ from model_report import Report
from model_selector import Selector
from report_dataframes import (db2df,
get_event_minimal,
debug_df,
get_items_small,
get_items_per_year_small,
normalize_history_data,
query_history)
query_history,
Timer)
from report_objects import (do_title,
get_value,
......
......@@ -4,6 +4,7 @@
"""
import json
import pandas as pd
import time
from datetime import date
......@@ -11,8 +12,8 @@ from gluon import current
from plugin_dbui import get_id
# Boundary dates substituted for undefined start_date / end_date values.
# The month and day are written without a leading zero: ``01`` is a legacy
# octal literal in Python 2 and a syntax error in Python 3.
YEAR_MIN = date(1900, 1, 1)
YEAR_MAX = date(9999, 12, 31)

DATE_MIN = date(1990, 1, 1)
DATE_MAX = date(2050, 12, 31)
def db2df(db, query, fields=[], columns=None):
......@@ -73,27 +74,56 @@ def db2df(db, query, fields=[], columns=None):
return pd.DataFrame.from_records(list(rows), columns=columns)
def get_event_minimal(**kwargs):
"""DataFrame with a minimal set of information for an event.
The selection of the event is performed via the keyword arguments.
def debug_df(df, opt=""):
    """Helper tool to debug a DataFrame by printing selected views of it.

    Nothing is printed when ``opt`` is empty.

    Args:
        df (pandas.DataFrame): the DataFrame to inspect.
        opt (str): debug options, any combination of:

            * ``a``: equivalent to ``icIht``
            * ``i``: show the memory usage and the ``info()`` summary
            * ``c``: show columns
            * ``I``: show index
            * ``h``: show head
            * ``t``: show tail
            * ``f``: show the whole dataframe

    """
    show_all = "a" in opt

    if show_all or "i" in opt:
        print(df.memory_usage())
        # info() writes its report to stdout itself and returns None,
        # so it must not be wrapped in a print.
        df.info()

    if show_all or "c" in opt:
        print(df.columns)

    if show_all or "I" in opt:
        print(df.index)

    if show_all or "h" in opt:
        print(df.head())

    if show_all or "t" in opt:
        print(df.tail())

    # The full dump is only shown on explicit request: ``a`` is documented
    # as ``icIht`` and head + tail already give the overview.
    if "f" in opt:
        print(df)
def get_items_small(**kwargs):
"""Return the DataFrame with active items belonging to the event id_event.
The DataFrame contains the minimal set of information for each item,
namely the meta-data as well as the history.data. The latter is a
dictionary which is "json normalised", one column per key.
The selection of the item is performed by querying the database.
The query is built using the keyword arguments.
Active items are selected using the arguments ``year_start``
and ``year_end``.
Keyword Args:
id_events (int): identifier of the event
id_events (int): identifier of the event [required]
id_domains (int):
id_fundings (int):
id_objects (int):
......@@ -106,7 +136,21 @@ def get_event_minimal(**kwargs):
year_start (date):
Returns:
pandas.DataFrame
pandas.DataFrame:
The index of the DataFrame is the history table identifier.
The columns of the DataFrame are:
* one column for each key of the history.data dictionary
* end_date (date): undefined dates are forced to ``DATE_MAX``
* id_domains (int)
* id_events (int)
* id_objects (int)
* id_objects_categories (int)
* id_people (int)
* id_people_categories (int)
* id_projects (int)
* id_teams (int)
* start_date (date): undefined dates are forced to ``DATE_MIN``
"""
db = current.globalenv['db']
......@@ -139,8 +183,129 @@ def get_event_minimal(**kwargs):
.set_index("id")
.pipe(normalize_history_data)
.assign(
start_date=lambda x: x.start_date.fillna(YEAR_MIN),
end_date=lambda x: x.end_date.fillna(YEAR_MAX)))
start_date=lambda x: x.start_date.fillna(DATE_MIN),
end_date=lambda x: x.end_date.fillna(DATE_MAX)))
return df
def expand_per_year(df, year_start, year_end):
    """Expand the list of active items between year_start and year_end to
    a list of active items per year. An item appears several times when it
    is active during several years.

    Graphically, the item

        --+...+ 2012-06 2014-02 +--++

    becomes

        2012 --+...+ 2012-06 2014-02 +--++
        2013 --+...+ 2012-06 2014-02 +--++
        2014 --+...+ 2012-06 2014-02 +--++

    An item is active for a given year when that year lies between the year
    of its ``start_date`` and the year of its ``end_date``, both included.

    Note:
        The columns `start_date` and `end_date` have to be defined in the
        DataFrame.

    Args:
        df (pandas.DataFrame): the items, one row per item. Its index can
            be anything; rows are selected by position.
        year_start (int): first year of the scan, included.
        year_end (int): last year of the scan, included.

    Returns:
        pandas.DataFrame:
            Rows are ordered by year, then by their order in ``df``, and the
            index is renumbered from zero. Columns are:

            * those of the initial DataFrame
            * year (int)

            The frame is empty when ``year_end`` is smaller than
            ``year_start``.

    """
    # First and last active year of each item, as plain arrays so that the
    # selection below is positional and does not depend on the index of df.
    # float64 keeps the comparison well defined for an empty frame and
    # turns an undefined year (NaN) into "never active".
    first_years = pd.Series([el.year for el in df.start_date],
                            dtype="float64").values
    last_years = pd.Series([el.year for el in df.end_date],
                           dtype="float64").values

    # for each year keep the list of active items
    frames = []
    for year in range(year_start, year_end + 1):
        is_active = (first_years <= year) & (year <= last_years)
        frames.append(df.loc[is_active].assign(year=year))

    # nothing to scan: return the expected columns without any row
    if not frames:
        return df.iloc[:0].assign(year=pd.Series(dtype="int64"))

    # a single concatenation avoids re-copying the result for every year
    return pd.concat(frames, ignore_index=True)
def get_items_per_year_small(**kwargs):
    """Return the DataFrame with the active items for each year,
    belonging to the event id_events. An item appears several times when
    it is active during several years.

    The DataFrame contains the minimal set of information for each item,
    namely the meta-data, the year as well as the history.data.
    The latter is a dictionary which is "json normalised", one column per key.

    The items are obtained from ``get_items_small``, which receives all the
    keyword arguments, and are then expanded per year with
    ``expand_per_year``.

    The range of years is selected using the arguments ``year_start``
    and ``year_end``. The scan starts at ``DATE_MIN.year`` when
    ``year_start`` is not defined and ends at ``DATE_MAX.year`` when
    ``year_end`` is not defined.

    Keyword Args:
        id_events (int): identifier of the event [required]
        id_domains (int):
        id_fundings (int):
        id_objects (int):
        id_object_categories (int):
        id_people (int):
        id_people_categories (int):
        id_projects (int):
        id_teams (int):
        year_end: last year of the scan, converted with ``int()``
        year_start: first year of the scan, converted with ``int()``

    Returns:
        pandas.DataFrame:
            The columns of the DataFrame are:

            * one column for each key of the history.data dictionary
            * end_date (date): undefined dates are forced to ``DATE_MAX``
            * id (int): history table identifier for the item
            * id_domains (int)
            * id_events (int)
            * id_objects (int)
            * id_objects_categories (int)
            * id_people (int)
            * id_people_categories (int)
            * id_projects (int)
            * id_teams (int)
            * start_date (date): undefined dates are forced to ``DATE_MIN``
            * year (int)

    """
    # the defaults are stored in kwargs so that get_items_small sees them too
    kwargs.setdefault("year_start", DATE_MIN.year)
    kwargs.setdefault("year_end", DATE_MAX.year)

    first_year = int(kwargs["year_start"])
    last_year = int(kwargs["year_end"])

    # move the history identifier from the index to the ``id`` column
    # before the rows are duplicated per year
    items = get_items_small(**kwargs).reset_index()

    return expand_per_year(items, first_year, last_year)
......@@ -227,3 +392,27 @@ def query_history(db, **kwargs):
query = (qi if query is None else (query) & (qi))
return query
class Timer(object):
    """Utility to time a part of the code.

    Used as a context manager, it prints the message and the elapsed
    wall-clock time in milliseconds when the block is left.

    From https://www.huyng.com/posts/python-performance-analysis

    Example::

        with Timer("build data frame for RH") as t:
            df = get_human_resources("LHCb", 2014, 2018)

    Attributes:
        msg (str): label printed together with the elapsed time.
        start (float): ``time.time()`` when the block is entered.
        end (float): ``time.time()`` when the block is left.
        secs (float): elapsed time in seconds.
        msecs (float): elapsed time in milliseconds.

    The timing attributes are ``None`` until the corresponding event
    happened.

    """
    def __init__(self, msg):
        self.msg = msg

        # defined up front so that reading them before or during the
        # timed block does not raise AttributeError
        self.start = None
        self.end = None
        self.secs = None
        self.msecs = None

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs

        # single-argument call form: valid with Python 2 and Python 3
        print('\n%s\n\telapsed time: %f ms' % (self.msg, self.msecs))
# -*- coding: utf-8 -*-
""" Test script for DataFrames use to build reports.
"""
import pandas as pd
from plugin_event import (debug_df,
get_items_small,
get_items_per_year_small,
Timer)
# pandas options: console output encoded in utf-8, 150 characters wide
pd.set_option("display.encoding", "utf-8")
pd.set_option("display.width", 150)

# build the model and selector
# NOTE(review): ``event``, ``db`` and ``T`` are not imported by this script;
# presumably they are provided by the environment running it -- confirm.
event.Event.register_source("items small", event.get_items_small)
event.Core.define_tables(db, T)
event.Report.define_tables(db, T)

#
# get_items_small (disabled, kept as an example)
#
# with Timer("get_items_small") as t:
#     df = get_items_small(id_events=7)
# print df.memory_usage()
# print df.info()
# print df.columns
# print df.index
# print df.head()
# print df

#
# get_items_per_year_small
#   id_event = 6 (People)
#   id_event = 7 (CHANGELOG)
#
# Time the construction of the per-year DataFrame for event 6,
# scanning the years 2012 to 2017.
with Timer("get_items_per_year_small") as t:
    df = get_items_per_year_small(id_events=6,
                                  year_start=2012,
                                  year_end=2017)

# Show the info summary ("i") and the head ("h") of the result.
# NOTE(review): the indentation was lost in this view; this call is assumed
# to sit outside the timed block -- confirm against the original file.
debug_df(df, opt="ih")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment