graph_tools.py 16.1 KB
Newer Older
LE GAC Renaud's avatar
LE GAC Renaud committed
1 2 3
"""A collection of tools to build graphs in controllers.

"""
4 5
import matplotlib as mpl
import pandas as pd
6 7 8


from gluon import current
9 10
from io import StringIO
from .model_selector import MONTH_SUBMIT, YEAR_PUBLISH
LE GAC Renaud's avatar
LE GAC Renaud committed
11
from pandas import DataFrame, DatetimeIndex, to_datetime
12 13


14 15 16
DATE_PUB = "Publication date"
DATE_SUB = "Submission date"

17
FROM_TO = "from %s to %s"
18 19 20 21 22 23 24

LABELY = "Number of publications"
LABELY_CUM_MONTH = "Cumulative sum of publications / month"
LABELY_CUM_YEAR = "Cumulative sum of publications / year"
LABELY_MONTH = "Number of publications / month"
LABELY_YEAR = "Number of publications / year"

25 26 27
TITLE = "Publications"
TITLE_CUMSUM = "Cumulative sum of publications"

28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
PUBLICATIONS_QUERY_FIELDS = [
    "Graph_selectorAuthor",
    "Graph_selectorId_authors_roles",
    "Graph_selectorId_categories",
    "Graph_selectorId_fundings",
    "Graph_selectorId_projects",
    "Graph_selectorId_teams",
    "Graph_selectorYear_end",
    "Graph_selectorYear_start"]


def db2df(db, query, fields=[], columns=None):
    """Transform a database query into a dataframe.

    Example::

        # database query
        query = history.id_events == id_event
        query &= history.id_projects == id_project
        query &= history.id_fundings == db.fundings.id
        query &= history.id_people == db.people.id
        query &= history.id_people_categories == db.people_categories.id
        query &= history.id_teams == db.teams.id

        # fields to be extracted from the database
        fields = [
            history.id_domains,
            history.id_events,
            history.id_people,
            history.id_projects,
            history.id_teams,
            db.teams.team,
            db.people_categories.category,
            db.people_categories.code,
            db.people.first_name,
            db.people.last_name,
            history.percentage,
            history.start_date,
            history.end_date,
            history.data,
            db.fundings.agency]

        # get the DataFrame
        df = db2df(query, fields)
        print df.info()

    Args:
        db (pyDAL.DAL): database connection.

        query: database query including statement to join foreign table.

        fields (list): list of pyDAL.Field to be extracted from the query.
            All fields are extracted when not defined.

        columns (list): name of the columns in the dataframe. There is one
            to one correspondence between the fields and columns list.
            Names of database field are used when columns is not defined.

    Returns:
        pandas.DataFrame

    """
    if columns is None and len(fields) > 0:
        columns = [field.name for field in fields]
92

93 94 95 96 97
    rows = db.executesql(db(query)._select(*fields))
    return pd.DataFrame.from_records(list(rows), columns=columns)


def emptychart(db, selector, target=None):
LE GAC Renaud's avatar
LE GAC Renaud committed
98 99 100 101 102
    """Empty graph as a function of month.

    Args:
        db (gluon.DAL): database connection
        selector (plugin_dbui.Selector): selector with user critera
103 104
        target (matplotlib.AxesSubplot):
            an existing axes useful when working with subplot.
LE GAC Renaud's avatar
LE GAC Renaud committed
105 106

    Returns:
107
        matplotlib.AxesSubplot: axes subplot
LE GAC Renaud's avatar
LE GAC Renaud committed
108 109

    """
110 111 112 113
    T = current.T

    year_start = selector.Graph_selectorYear_start
    year_end = selector.Graph_selectorYear_end
LE GAC Renaud's avatar
LE GAC Renaud committed
114 115

    if year_start and not year_end:
116
        dates = to_datetime(["%s-01-01" % year_start, "%s-12-31" % year_start])
LE GAC Renaud's avatar
LE GAC Renaud committed
117

118
    elif year_start and year_end:
119
        dates = to_datetime(["%s-01-01" % year_start, "%s-12-31" % year_end])
LE GAC Renaud's avatar
LE GAC Renaud committed
120

121
    else:
122
        year = current.request.now.year
123
        dates = to_datetime(["%i-01-01" % year, "%i-12-31" % year])
124

125
    df = DataFrame([0]*2, index=dates)
LE GAC Renaud's avatar
LE GAC Renaud committed
126

127
    ax = df.plot(legend=False, grid=True, x_compat=True, ax=target)
128

129 130 131 132
    # x and y ticks
    ax.minorticks_on()
    ax.tick_params(which="major", length=8)
    ax.tick_params(which="minor", length=4)
133

134 135
    # x and y labels
    xlabel = T(DATE_SUB).decode("utf-8")
LE GAC Renaud's avatar
LE GAC Renaud committed
136
    ax.set_xlabel(xlabel, x=1, horizontalalignment="right")
137

138 139
    ylabel = T(LABELY).decode("utf-8")
    ax.set_ylabel(ylabel, y=1, horizontalalignment="right")
140

141 142
    # main title of the graph
    set_title(ax, db, selector)
143

144
    return ax
145 146


147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
def mplstyle():
    """Helper function to define matplotlib style for plots.

    """
    mpl.rcParams["font.size"] = 8

    mpl.rcParams["grid.alpha"] = 0.5
    mpl.rcParams["grid.linestyle"] = "dotted"

    mpl.rcParams["xtick.direction"] = "in"
    mpl.rcParams["xtick.top"] = True
    mpl.rcParams["xtick.bottom"] = True
    mpl.rcParams["xtick.major.size"] = 8
    mpl.rcParams["xtick.minor.size"] = 4
    mpl.rcParams["xtick.labelsize"] = 8

    mpl.rcParams["ytick.direction"] = "in"
    mpl.rcParams["ytick.left"] = True
    mpl.rcParams["ytick.right"] = True
    mpl.rcParams["ytick.major.size"] = 8
    mpl.rcParams["ytick.minor.size"] = 4
    mpl.rcParams["ytick.labelsize"] = 8


171
def linechart(db, selector, target=None, title=None, xlabel=None, ylabel=None):
172
    """Build a line chart showing the number of publications as a function
173
    of the time.
174 175

    Args:
176 177 178 179 180 181 182
        db (pydal.DAL): database connection
        selector (Storage): value returned by the graph selector.
        target (matplotlib.AxesSubplot):
            an existing axes useful when working with subplot.
        title (bool): define from the user criteria when None.
        xlabel (bool): define automatically when None.
        ylabel (bool): define automatically when None.
183 184 185 186 187

    Returns:
        matplotlib.AxesSubplot: axes subplot

    """
188 189 190 191 192 193 194 195
    T = current.T
    publications = db.publications

    # ........................................................................
    #
    # variables depending on the time axis
    #
    time_axis = selector.Graph_selectorTime
196 197 198 199 200 201

    is_submit_month = time_axis == T(MONTH_SUBMIT)
    is_publish_year = time_axis == T(YEAR_PUBLISH)

    to_time = (submitted2YM if is_submit_month else submitted2Y)
    to_time = (published2Y if is_publish_year else to_time)
202 203 204 205 206

    # ........................................................................
    #
    # instantiate the DataFrame
    #
207
    query = query_publications(db, is_publish_year, **selector)
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236

    fields = [
        publications.year,
        publications.submitted]

    columns = ["published", "submitted"]

    df = (db2df(db, query, fields, columns)
          .assign(
              axis=1,
              time=to_time)
          .drop(["published", "submitted"], axis="columns")
          .groupby(["time", "axis"])
          .size()
          .unstack(level=1)
          .fillna(0))

    if df.empty:
        return emptychart(db, selector)

    # post processing
    is_cumsum = selector.Graph_selectorCumulative == "true"
    if is_cumsum:
        df = df.cumsum()

    # ........................................................................
    #
    # instantiate the graph
    #
237
    if is_submit_month:
238
        df.index = DatetimeIndex(df.index)
239

240
    if df.size == 1:
241
        kwargs = dict(color="lavender", kind="bar", edgecolor="blue", rot=0)
LE GAC Renaud's avatar
LE GAC Renaud committed
242

243
    else:
244
        kwargs = dict(colormap="Pastel1", kind="line", rot=35, x_compat=True)
245

246
    ax = df.plot(grid=True,
247 248 249 250
                 legend=False,
                 ylim=(0, None),
                 ax=target,
                 **kwargs)
251

252 253 254 255
    # x and y ticks
    ax.minorticks_on()
    ax.tick_params(which="major", length=8)
    ax.tick_params(which="minor", length=4)
256

257 258
    # x label
    if xlabel is None:
259 260
        xlabel = T(DATE_PUB if is_publish_year else DATE_SUB).decode("utf-8")

261
    ax.set_xlabel(xlabel, x=1, horizontalalignment="right")
262

263 264
    # y label
    if ylabel is None:
265
        if is_cumsum and is_submit_month:
266
            ylabel = LABELY_CUM_MONTH
267

268 269
        elif is_cumsum:
            ylabel = LABELY_CUM_YEAR
270

271
        elif is_submit_month:
272
            ylabel = LABELY_MONTH
273
        else:
274
            ylabel = LABELY_YEAR
275

276 277
    ylabel = T(ylabel).decode("utf-8")
    ax.set_ylabel(ylabel, y=1, horizontalalignment="right")
278

279 280 281
    # main title of the graph
    if title is None:
        set_title(ax, db, selector)
282

283 284
    else:
        ax.set_title(title)
285 286 287 288

    return ax


289
def query_publications(db, is_publish, **kwargs):
290 291 292 293
    """Construct the query to select publications according to user criteria,
    team, project, year, etc. The operator AND is applied between conditions.

    The keyword dictionary is the request.vars coming from the graph_selector.
294 295

    Args:
296
        db (gluon.dal.DAL): database connection.
297
        is_publish (bool): select using either published year or submitted one.
298 299 300 301 302 303 304 305 306 307 308 309

    Keyword Args:
        Graph_selectorAuthor (str):
        Graph_selectorId_authors_roles (str):
        Graph_selectorId_categories (str):
        Graph_selectorId_fundings (str):
        Graph_selectorId_projects (str):
        Graph_selectorId_teams (str):
        Graph_selectorYear_end (str):
        Graph_selectorYear_start (str):

        * other keywords are ignored
310 311

    Returns:
312
        gluon.DAL.Query
313 314

    """
315
    di = {k: kwargs[k] for k in PUBLICATIONS_QUERY_FIELDS if k in kwargs}
316

317 318 319 320 321
    publications = db.publications
    query = None

    year_end = di["Graph_selectorYear_end"]
    year_start = di["Graph_selectorYear_start"]
322

323 324 325
    if is_publish:
        if year_start and not year_end:
            query = publications.year == int(year_start)
326

327 328 329 330
        elif year_start and year_end:
            q_start = publications.year >= int(year_start)
            q_end = publications.year <= int(year_end)
            query = ((q_start) & (q_end))
331

332 333 334 335 336 337 338 339 340 341 342 343 344 345
        elif year_end and not year_start:
            query = publications.year <= int(year_end)

    else:
        if year_start and not year_end:
            query = publications.submitted[0:4] == year_start

        elif year_start and year_end:
            q_start = publications.submitted[0:4] >= year_start
            q_end = publications.submitted[0:4] <= year_end
            query = ((q_start) & (q_end))

        elif year_end and not year_start:
            query = publications.submitted[0:4] <= year_end
346

347
    for k, v in di.items():
348

349 350
        if k in ("Graph_selectorYear_start", "Graph_selectorYear_end"):
            continue
351

352
        if v in ('', None):
353 354
            continue

355 356
        if k == "Graph_selectorAuthor":
            qi = publications.authors_institute.contains(v)
357

358 359
        elif k == "Graph_selectorId_authors_roles":
            qi = publications.id_authors_role == int(v)
360

361 362
        elif k == "Graph_selectorId_categories":
            qi = publications.id_categories == int(v)
363

364 365
        elif k == "Graph_selectorId_projects":
            qi = publications.id_projects == int(v)
366

367 368
        elif k == "Graph_selectorId_teams":
            qi = publications.id_teams == int(v)
369

370 371
        else:
            continue
372

373
        query = (qi if query is None else (query) & (qi))
374

375
    query = (publications.id > 0 if query is None else query)
376
    return query
377

378

379 380
def savefig(fig, fmt):
    """Save the figure as a string.
381

382 383 384
    Args:
        fig (matplotlib.figure): the figure.
        fmt (str): possible values are pdf, png and svg.
385

386 387
    Return:
        str:
388

389 390 391 392 393
    """
    fi = StringIO()
    fig.savefig(fi, format=fmt)
    data = fi.getvalue()
    fi.close()
394

395 396
    fig.clear()
    mpl.pyplot.close(fig)
397

398
    return data
399 400


401 402
def set_title(ax, db, selector):
    """Set the title for the graph.
403 404

    Args:
405
        ax (matplotlib.AxesSubplot): axes subplot
406 407 408 409 410 411
        db (gluon.DAL): database connection
        selector (plugin_dbui.Selector): selector with user critera

    """
    T = current.T

412 413 414 415 416 417
    id_project = selector.Graph_selectorId_projects
    id_team = selector.Graph_selectorId_teams

    team_project = ""
    if id_team and not id_project:
        team_project = db.teams[int(id_team)].team
418

419 420 421 422 423 424 425 426 427
    elif not id_team and id_project:
        team_project = db.projects[int(id_project)].project

    elif id_team and id_project:
        team = db.teams[int(id_team)].team
        project = db.projects[int(id_project)].project
        team_project = "%s/%s" % (team, project)

    if selector.Graph_selectorCumulative == "true":
428
        title = "%s %s" % (T(TITLE_CUMSUM), team_project)
429
    else:
430
        title = "%s %s" % (T(TITLE), team_project)
431

432 433 434
    year_start = selector.Graph_selectorYear_start
    year_end = selector.Graph_selectorYear_end

435
    period = ""
436 437
    if year_start and year_end and year_start != year_end:
        period = T(FROM_TO) % (year_start, year_end)
438

439
    elif year_start:
440
        period = year_start
441

442 443 444 445
    title = "%s %s" % (title, period)
    ax.set_title(title.decode("utf-8"))


446 447 448
def stackchart(db, selector, target=None):
    """Build a stacked chart showing the number of publications as a function
    of the time.
449 450

    Args:
451 452 453 454
        db (pydal.DAL): database connection
        selector (Storage): value returned by the graph selector.
        target (matplotlib.AxesSubplot):
            an existing axes useful when working with subplot.
455

456 457
    Returns:
        matplotlib.AxesSubplot: axes subplot
458 459

    """
460
    T = current.T
461

462 463 464 465 466 467 468 469
    graph = db.graphs[int(selector.Graph_selectorId_graphs)]
    publications = db.publications

    # ........................................................................
    #
    # variables depending on the time axis
    #
    time_axis = selector.Graph_selectorTime
470 471 472 473 474 475

    is_submit_month = time_axis == T(MONTH_SUBMIT)
    is_publish_year = time_axis == T(YEAR_PUBLISH)

    to_time = (submitted2YM if is_submit_month else submitted2Y)
    to_time = (published2Y if is_publish_year else to_time)
476 477 478 479 480

    # ........................................................................
    #
    # instantiate the DataFrame
    #
481
    query = query_publications(db, is_publish_year, **selector)
482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513

    # link to the axis table
    axis = graph.stack_axis
    granularity = graph.stack_granularity

    table = db[axis]
    field = table[granularity]

    query &= publications["id_%s" % axis] == table["id"]

    fields = [
        publications.year,
        publications.submitted,
        field]

    columns = ["published", "submitted", "axis"]

    df = (db2df(db, query, fields, columns)
          .assign(time=to_time)
          .drop(["published", "submitted"], axis="columns")
          .groupby(["time", "axis"])
          .size()
          .unstack(level=1)
          .fillna(0))

    if df.empty:
        return emptychart(db, selector, target)

    # post processing
    is_cumsum = selector.Graph_selectorCumulative == "true"
    if is_cumsum:
        df = df.cumsum()
514

515 516 517 518 519
    # ........................................................................
    #
    # instantiate the graph
    #
    kwargs = {}
520
    if is_submit_month:
521 522 523 524
        df.index = DatetimeIndex(df.index)
        kwargs = dict(kind="area", x_compat=True)

    else:
525 526
        rot = (0 if df.index.size == 1 else 15)
        kwargs = dict(kind="bar", linewidth=0, rot=rot, width=1.)
527 528 529 530 531 532 533 534 535 536 537 538 539 540

    ax = df.plot(colormap="Pastel1",
                 grid=True,
                 stacked=True,
                 ylim=(0, None),
                 ax=target,
                 **kwargs)

    # x and y ticks
    ax.minorticks_on()
    ax.tick_params(which="major", length=8)
    ax.tick_params(which="minor", length=4)

    # x and y labels
541
    xlabel = T(DATE_PUB if is_publish_year else DATE_SUB).decode("utf-8")
542 543 544
    ax.set_xlabel(xlabel, x=1, horizontalalignment="right")

    # y label
545
    if is_cumsum and is_submit_month:
546 547 548 549 550
        ylabel = LABELY_CUM_MONTH

    elif is_cumsum:
        ylabel = LABELY_CUM_YEAR

551
    elif is_submit_month:
552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
        ylabel = LABELY_MONTH
    else:
        ylabel = LABELY_YEAR

    ylabel = T(ylabel).decode("utf-8")
    ax.set_ylabel(ylabel, y=1, horizontalalignment="right")

    # legend title is the name of the axis
    axis = (axis if axis == "categories" else graph.stack_granularity)

    legend = ax.get_legend()
    legend.set_title(T(axis).decode("utf-8"))

    # main title of the graph
    set_title(ax, db, selector)

    return ax


def submitted2YM(df):
    """
    Args:
        df (pandas.DataFrame): contains the column ``submitted``.

    Returns:
        str: YYYY-MM

    """
    return df.submitted.str[0:7]


def submitted2Y(df):
    """
    Args:
        df (pandas.DataFrame): contains the column ``submitted``.

    Returns:
        str: YYYY

    """
    return df.submitted.str[0:4]
593 594 595 596 597 598 599 600 601 602 603 604


def published2Y(df):
    """
    Args:
        df (pandas.DataFrame): contains the column ``published``.

    Returns:
        str: YYYY

    """
    return df.published.astype(str)