Commit aad8e305 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add the script deploy.py and data harvesters.csv to prepare the deployment on a large scale.

parent 871c3954
"collaboration","controller","host","collection","comment"
"a1","articles","inspirehep.net","find cn a1 and tc p and not tc c","mami accelerator"
"auger","articles","inspirehep.net","find cn auger and tc p and not tc c","auger or pierre auger give the same result"
"aleph","articles","cds.cern.ch","ALEPH Papers",
"alice","articles","cds.cern.ch","ALICE Papers",
"ams","articles","inspirehep.net","find cn ams and tc p and not tc c",
"antares","articles","inspirehep.net","find cn antares and tc p and not tc c",
"athena","articles","inspirehep.net","find athena and not cn athena and tc p and not tc c","warning: détecteur X embarqué ESA not CERN coll."
"atlas","articles","cds.cern.ch","ATLAS Papers",
"babar","articles","inspirehep.net","find cn babar and tc p and not tc c",
"belle","articles","inspirehep.net","find cn belle and tc p and not tc c",
"belle-ii","articles","inspirehep.net","find cn belle-ii and tc p and not tc c",
"borexino","articles","inspirehep.net","find cn borexino and tc p and not tc c",
"boss","articles","inspirehep.net","find cn boss and tc p and not tc c",
"calice","articles","inspirehep.net","find cn calice and tc p and not tc c",
"celeste","articles","inspirehep.net","find cn celeste and tc p and not tc c",
"ckmfitter","articles","inspirehep.net","find cn ckmfitter and tc p and not tc c",
"clas","articles","inspirehep.net","find cn clas and tc p and not tc c",
"cms","articles","cds.cern.ch","CMS Papers",
"codalema","articles","inspirehep.net","find cn codalema and tc p and not tc c",
"cta","articles","inspirehep.net","find cn cta and tc p and not tc c",
"d0","articles","inspirehep.net","find cn d0 and tc p and not tc c",
"darkside","articles","inspirehep.net","find cn darkside and tc p and not tc c",
"delphi","articles","cds.cern.ch","DELPHI Papers",
"desi","articles","inspirehep.net","find cn desi and tc p and not tc c",
"double chooz","articles","inspirehep.net","find cn double chooz and tc p and not tc c",
"dune","articles","inspirehep.net","find cn dune and tc p and not tc c",
"eli-np","articles","inspirehep.net","find cn eli-np and tc p and not tc c",
"elisa","articles","inspirehep.net","find 693__e:elisa and tc p and not tc c","an instrument not a coll."
"euclid","articles","inspirehep.net","find cn euclid and tc p and not tc c",
"fazia","articles","inspirehep.net","find cn fazia and tc p and not tc c",
"fermi-lat","articles","inspirehep.net","find cn fermi-lat and tc p and not tc c",
"gaspard","articles","inspirehep.net","find cn gaspard and tc p and not tc c",
"graal","articles","inspirehep.net","find cn graal and tc p and not tc c",
"hades","articles","inspirehep.net","find cn hades and tc p and not tc c",
"h1","articles","inspirehep.net","find cn h1 and tc p and not tc c",
"hess","articles","inspirehep.net","find (cn hess or cn h e s s) and tc p and not tc c",
"ilc","articles","inspirehep.net","find cn ilc and tc p and not tc c",
"integral","articles","inspirehep.net","find cn integral and tc p and not tc c",
"jem-euso","articles","inspirehep.net","find cn jem-euso and tc p and not tc c",
"juno","articles","inspirehep.net","find cn juno and tc p and not tc c",
"km3net","articles","inspirehep.net","find cn km3net and tc p and not tc c",
"l3","articles","cds.cern.ch","L3 Papers",
"lhcb","articles","cds.cern.ch","LHCb Papers, LHCb Detector Performance Papers",
"ligo","articles","inspirehep.net","find cn ligo and tc p and not tc c",
"Lisa-pathfinder","articles","inspirehep.net","find 693__e:lisa-pathfinder and tc p and not tc c","an instrument not a coll."
"lsst","articles","inspirehep.net","find cn lsst and tc p and not tc c",
"na50","articles","inspirehep.net","find cn na50 and tc p and not tc c",
"nemo-3","articles","inspirehep.net","find cn nemo-3 and tc p and not tc c",
"opal","articles","cds.cern.ch","OPAL Papers",
"orca","articles","inspirehep.net","find orca and tc p and not tc c","no cn since orca is not an int. coll."
"panda","articles","inspirehep.net","find cn panda and tc p and not tc c",
"phenix","articles","inspirehep.net","find cn phenix and tc p and not tc c",
"planck","articles","inspirehep.net","find cn planck and tc p and not tc c",
"snls","articles","inspirehep.net","find cn snls and tc p and not tc c",
"solid","articles","inspirehep.net","find cn solid and tc p and not tc c","warning: find articles for SoLid and SoLID !"
"sox","articles","inspirehep.net","find sox and tc p and not tc c","no cn since sox is not an int. coll."
"supernemo","articles","inspirehep.net","find cn supernemo and tc p and not tc c",
"thomx","articles","inspirehep.net","find thomx and tc p and not tc c","no cn since thomx is not int. coll."
"virgo","articles","inspirehep.net","find cn virgo and tc p and not tc c",
"xenon","articles","inspirehep.net","find cn xenon and tc p and not tc c",
,,,,
"a1","proceedings","inspirehep.net","find cn a1 and tc c",
"auger","proceedings","inspirehep.net","find cn auger and tc c","auger or pierre auger give the same result"
"aleph","proceedings","inspirehep.net","find cn aleph and tc c",
"alice","proceedings","inspirehep.net","find cn alice and tc c",
"ams","proceedings","inspirehep.net","find cn ams and tc c",
"antares","proceedings","inspirehep.net","find cn antares and tc c",
"athena","proceedings","inspirehep.net","find athena and not cn athena and tc c","warning: détecteur X embarqué ESA not CERN coll."
"atlas","proceedings","inspirehep.net","find cn atlas and tc c",
"babar","proceedings","inspirehep.net","find cn babar and tc c",
"belle","proceedings","inspirehep.net","find cn belle and tc c",
"belle-ii","proceedings","inspirehep.net","find cn belle-ii and tc c",
"borexino","proceedings","inspirehep.net","find cn borexino and tc c",
"boss","proceedings","inspirehep.net","find cn boss and tc c",
"calice","proceedings","inspirehep.net","find cn calice and tc c",
"celeste","proceedings","inspirehep.net","find cn celeste and tc c",
"ckmfitter","proceedings","inspirehep.net","find cn ckmfitter and tc c",
"clas","proceedings","inspirehep.net","find cn clas and tc c",
"cms","proceedings","inspirehep.net","find cn cms and tc c",
"codalema","proceedings","inspirehep.net","find cn codalema and tc c",
"cta","proceedings","inspirehep.net","find cn cta and tc c",
"d0","proceedings","inspirehep.net","find cn d0 and tc c",
"darkside","proceedings","inspirehep.net","find darkside and tc c","no cn since proceeding are also signed by people"
"delphi","proceedings","cds.cern.ch","DELPHI Conference Proceedings",
"desi","proceedings","inspirehep.net","find cn desi and tc c",
"double chooz","proceedings","inspirehep.net","find cn double chooz and tc c",
"dune","proceedings","inspirehep.net","find cn dune and tc c",
"eli-np","proceedings","inspirehep.net","find cn eli-np and tc c",
"elisa","proceedings","inspirehep.net","find 693__e:elisa and tc c","an instrument not a coll."
"euclid","proceedings","inspirehep.net","find cn euclid and tc c",
"fazia","proceedings","inspirehep.net","find cn fazia and tc c",
"fermi-lat","proceedings","inspirehep.net","find cn fermi-lat and tc c",
"gaspard","proceedings","inspirehep.net","find cn gaspard and tc c",
"graal","proceedings","inspirehep.net","find cn graal and tc c",
"hades","proceedings","inspirehep.net","find cn hades and tc c",
"h1","proceedings","inspirehep.net","find cn h1 and tc c",
"hess","proceedings","inspirehep.net","find (cn hess or cn h e s s) and tc c",
"ilc","proceedings","inspirehep.net","find cn ilc and tc c",
"integral","proceedings","inspirehep.net","find cn integral and tc c",
"jem-euso","proceedings","inspirehep.net","find cn jem-euso and tc c",
"juno","proceedings","inspirehep.net","find cn juno and tc c",
"km3net","proceedings","inspirehep.net","find cn km3net and tc c",
"l3","proceedings","cds.cern.ch","L3 Conference Proceedings",
"lhcb","proceedings","cds.cern.ch","LHCb Conference Proceedings",
"ligo","proceedings","inspirehep.net","find cn ligo and tc c",
"Lisa-pathfinder","proceedings","inspirehep.net","find 693__e:lisa-pathfinder and tc c",
"lsst","proceedings","inspirehep.net","find cn lsst and tc c",
"na50","proceedings","inspirehep.net","find cn na50 and tc c",
"nemo-3","proceedings","inspirehep.net","find cn nemo-3 and tc c",
"opal","proceedings","cds.cern.ch","OPAL Conference Proceedings",
"orca","proceedings","inspirehep.net","find orca and tc c","no cn since orca is not an int. coll."
"panda","proceedings","inspirehep.net","find cn panda and tc c",
"phenix","proceedings","inspirehep.net","find cn phenix and tc c",
"planck","proceedings","inspirehep.net","find cn planck and tc c",
"snls","proceedings","inspirehep.net","find cn snls and tc c",
"solid","proceedings","inspirehep.net","find cn solid and tc c","warning: find articles for SoLid and SoLID !"
"sox","proceedings","inspirehep.net","find sox and tc c","no cn since sox is not an int. coll."
"supernemo","proceedings","inspirehep.net","find cn supernemo and tc c",
"thomx","proceedings","inspirehep.net","find thomx and tc c","no cn since thomx is not int. coll."
"virgo","proceedings","inspirehep.net","find cn virgo and tc c",
"xenon","proceedings","inspirehep.net","find cn xenon and tc c",
,,,,
"antares","talks","cds.cern.ch","ANTARES Talks",
"atlas","talks","cds.cern.ch","ATLAS Conference Slides",
"lhcb","talks","cds.cern.ch","LHCb Talks",
,,,,
"aleph","theses","cds.cern.ch","ALEPH Theses",
"alice","theses","cds.cern.ch","ALICE Theses",
"atlas","theses","cds.cern.ch","ATLAS Theses",
"cms","theses","cds.cern.ch","CMS Theses",
"delphi","theses","cds.cern.ch","DELPHI Theses",
"l3","theses","cds.cern.ch","L3 Theses",
"lhcb","theses","cds.cern.ch","LHCb Theses",
"opal","theses","cds.cern.ch","OPAL Theses",
,,,,
"alice","notes","cds.cern.ch","ALICE Public Notes",
"atlas","notes","cds.cern.ch","ATLAS Notes, ATLAS Conference Notes, ATLAS Scientific Notes",
"cms","notes","cds.cern.ch","CMS Notes, CMS Physics Analysis Summaries, CMS Detector Performance Summaries",
"delphi","notes","cds.cern.ch","DELPHI Notes",
"l3","notes","cds.cern.ch","L3 Notes",
"lhcb","notes","cds.cern.ch","LHCb Notes, LHCb Conference Contributions",
"opal","notes","cds.cern.ch","OPAL Notes",
,,,,
"alice","reports","cds.cern.ch","ALICE Reports",
"atlas","reports","cds.cern.ch","ATLAS Reports",
"cms","reports","cds.cern.ch","CMS Reports",
"l3","reports","cds.cern.ch","L3 Reports",
"lhcb","reports","cds.cern.ch","LHCb Reports",
"opal","reports","cds.cern.ch","OPAL Reports",
# -*- coding: utf-8 -*-
""" NAME
        deploy my_institute.csv

    SYNOPSIS
        Parametrise the database for a given institute.

    DESCRIPTION
        The parameters for the institute are defined in a CSV file
        located in scripts/data/my_institute.csv.

        The definitions of the main harvesters are located in
        scripts/data/harvesters.csv.

        The CSV file defining the institute contains the following columns:
            - domain
            - team
            - project
            - project definition
            - comments

        The project definition column is split in four:
            - collaboration
            - instrument, JLAB experiment, etc.
            - physicist
            - HAL hidden

        NOTE(review): institute() assigns exactly seven column names
        (domain, team, project and the four project definition columns),
        so a separate "comments" column may not be accepted -- to be
        confirmed.

        The CSV file defining the harvesters contains five columns:
            - collaboration (identifier)
            - controller
            - host
            - collection
            - comment

        First, the script creates the teams, the projects and their
        relations. Second, it creates all the harvesters for each project
        using either the harvester definitions of its collaboration or a
        query built from its instrument or from its physicists.

    OPTIONS
        -h, --help
            Display the help and exit.

    EXAMPLE
        > cd ...limbra/scripts
        > run -S test_limbra script deploy.py data/my_institute.csv

    AUTHOR
        R. Le Gac -- Jul 2017

"""
import argparse
import os
import pandas as pd
import sys
from invenio_tools import load_record
from plugin_dbui import get_id
# Harvester controller -> code of the publication category it feeds.
CONTROLLER_TO_CAT = dict(
    articles="ACL",
    notes="AP",
    preprints="PRE",
    proceedings="ACTI",
    reports="AP",
    talks="COM",
    theses="PHD",
)
def create_affiliation(opt):
    """Create the affiliation keys of the institute in the database.

    The user is asked for the inspirehep identifier of the institute.
    The corresponding record is loaded and the values of the subfields
    ``u``, ``t`` and ``b`` of its field ``110`` are stored as affiliation
    keys, unless they already exist. The database changes are committed
    only when the user confirms.

    Args:
        opt (argparse.Namespace): options (not used by this function)

    """
    rep = raw_input("\tDo you want to create affiliation keys [y/N]: ")
    if rep != "y":
        return

    rep = raw_input("\tEnter the inspirehep identifier for your institute: ")
    if not rep.isdigit():
        print("\t\tERROR: invalid answer!")
        # stop here: int(rep) below raises ValueError on such an answer
        return

    # ........................................................................
    #
    # get and process inspirehep record
    #
    record = load_record("inspirehep.net", int(rep))

    # extract the values defining the affiliation from the subfields
    # u, t and b of the field 110 (b is used by some notes in Atlas).
    # each value is stored as key_u together with an empty key_v
    field = record[u"110"]
    values = [field[k] for k in ("u", "t", "b") if k in field]

    # load the keys which do not exist yet
    is_key_add = False
    for value in values:
        key = dict(key_u=value, key_v="")
        if get_id(db.affiliation_keys, **key) is not None:
            continue

        print("\t\tAdding the affiliation keys '%s'" % value)
        # assigning to the record 0 inserts a new row
        db.affiliation_keys[0] = key
        is_key_add = True

    if not is_key_add:
        print("\t\tAffiliation keys already exist!")

    # ........................................................................
    #
    # commit
    #
    rep = raw_input("\n\tDo you want to commit database changes [y/N]: ")
    if rep == "y":
        db.commit()
def create_harvesters(opt, dfi, dfh):
    """Create the harvesters in the database.

    For each project of the institute, the harvesters are derived from
    the first defined item among:

        1. the collaboration: one harvester per row of ``dfh`` having
           that collaboration identifier;
        2. the instrument: an article and a proceeding harvester querying
           ``693__e:<instrument>`` on inspirehep.net;
        3. the people: an article and a proceeding harvester querying the
           authors on inspirehep.net, restricted to ``cc Hal Hidden``
           when ``hal_hidden`` is set.

    A project is skipped when its team or the project itself is not
    defined in the database. The database changes are committed only
    when the user confirms.

    Args:
        opt (argparse.Namespace):
            user options (not used by this function)

        dfi (pandas.DataFrame):
            institute parameter. Columns are:

                - domain
                - team
                - project
                - collaboration
                - instrument
                - people
                - hal_hidden

        dfh (pandas.DataFrame):
            harvester's definition. Columns are:

                - collaboration (identifier)
                - controller
                - host
                - collection

    Raises:
        SystemExit: when a category of CONTROLLER_TO_CAT is not defined
            in the database.

    """
    rep = raw_input("\tDo you want to create harvesters [y/N]: ")
    if rep != "y":
        return

    # ........................................................................
    #
    # destroy existing harvester
    #
    recset = db(db.harvesters.id > 0)
    if recset.count() > 0:
        rep = \
            raw_input("\n\tHarvesters already exist. Remove them all [y/N]: ")

        if rep == "y":
            for row in recset.iterselect(db.harvesters.id):
                del db.harvesters[int(row.id)]

    # ........................................................................
    #
    # categories
    #
    # the identifiers are resolved in a local dictionary. The module-level
    # CONTROLLER_TO_CAT keeps its category codes, so that the function
    # can safely be called more than once.
    #
    categories = {}
    for controller, code in CONTROLLER_TO_CAT.items():
        id_category = get_id(db.categories, code=code)

        if id_category is None:
            print("\n\t\tERROR: category '%s' is not defined. exit" % code)
            sys.exit(1)

        categories[controller] = id_category

    # ........................................................................
    #
    # create harvesters for all projects
    #
    inspire = "inspirehep.net"

    for row in dfi.itertuples(index=False):

        team, project = row.team, row.project

        id_team = get_id(db.teams, team=team)
        if id_team is None:
            print("\n\t\tTeam '%s' is not defined. skip it!" % team)
            continue

        id_project = get_id(db.projects, project=project)
        if id_project is None:
            print("\n\t\tProject '%s' is not defined. skip it!" % project)
            continue

        # list of (controller, host, collection) for the project
        if pd.notnull(row.collaboration):
            #
            # harvester defined via collaboration
            #
            selected = dfh[dfh.collaboration == row.collaboration]
            definitions = [(el.controller, el.host, el.collection)
                           for el in selected.itertuples(index=False)]

        elif pd.notnull(row.instrument):
            #
            # harvester defined via experiment / instrument
            #
            definitions = [
                ("articles", inspire,
                 "693__e:%s and tc p and not tc c" % row.instrument),
                ("proceedings", inspire,
                 "693__e:%s and tc c" % row.instrument)]

        elif pd.notnull(row.people):
            #
            # harvester defined via people
            #
            authors = " ".join(
                ["a %s" % el.strip() for el in row.people.split(",")])

            # only the leading keyword is extended, so that a name
            # containing "find" is left untouched
            find = "find"
            if pd.notnull(row.hal_hidden):
                find = "find cc Hal Hidden"

            definitions = [
                ("articles", inspire,
                 "%s (%s) and tc p and not tc c " % (find, authors)),
                ("proceedings", inspire,
                 "%s (%s) and tc c " % (find, authors))]

        else:
            definitions = []

        # the category is always the one of the controller, therefore
        # the proceeding queries (tc c) go through the proceedings
        # controller as they do in the harvesters CSV file
        for controller, host, collection in definitions:
            insert_harvester(id_teams=id_team,
                             id_projects=id_project,
                             controller=controller,
                             host=host,
                             collections=collection,
                             id_categories=categories[controller])

    # ........................................................................
    #
    # commit
    #
    rep = raw_input("\n\tDo you want to commit database changes [y/N]: ")
    if rep == "y":
        db.commit()
def create_teams_projects(opt, dfi):
    """Create the teams, the projects and their relations in the database.

    Each row of ``dfi`` gives a team and a project. Whatever is missing
    among the team, the project and the team / project relation is
    inserted. The database changes are committed only when the user
    confirms.

    Args:
        opt (argparse.Namespace): options
        dfi (pandas.DataFrame): institute parameter

    """
    if raw_input("\tDo you want to create teams/projects [y/N]: ") != "y":
        return

    # ........................................................................
    #
    # teams / projects
    #
    for entry in dfi.itertuples(index=False):

        team = entry.team
        project = entry.project

        # team
        id_team = get_id(db.teams, team=team)
        if id_team is not None:
            print("\n\t\tteam '%s' already exist!" % team)
        else:
            print("\n\t\tCreate team '%s'" % team)
            id_team = db.teams.insert(team=team, domain=entry.domain)

        # project
        id_project = get_id(db.projects, project=project)
        if id_project is not None:
            print("\t\tproject '%s' already exist!" % project)
        else:
            print("\t\tCreate project '%s'" % project)
            id_project = \
                db.projects.insert(project=project, agencies="CNRS/IN2P3")

        # relation team / project
        link = dict(id_teams=id_team, id_projects=id_project)
        if get_id(db.organisation, **link) is not None:
            print("\t\tRelation %s/%s already exist!" % (team, project))
        else:
            print("Create the relation %s/%s" % (team, project))
            db.organisation.insert(**link)

    # ........................................................................
    #
    # commit
    #
    if raw_input("\n\tDo you want to commit database changes [y/N]: ") == "y":
        db.commit()
def harvesters(opt):
    """Load the definitions of the harvesters from their CSV file.

    The file is ``opt.path_harvesters``, taken relative to the scripts
    directory of the running application. Rows which are entirely empty
    are removed as well as the ``comment`` column.

    Args:
        opt (argparse.Namespace): options

    Return:
        pandas.DataFrame
            columns are:

                - collaboration (identifier)
                - controller
                - host
                - collection

    """
    # same output as the statement form: label, one space, value
    print("%s %s" % ("\tCollect definitions of harvesters",
                     opt.path_harvesters))

    # absolute path in the docker container
    parts = (os.getcwd(),
             "applications",
             request.application,
             "scripts",
             opt.path_harvesters)

    table = pd.read_csv(os.path.join(*parts))
    table = table.dropna(how="all", axis="index")

    return table.drop("comment", axis="columns")
def insert_harvester(**data):
    """Insert a harvester in the database unless it already exists.

    The team, project, controller, host and collection of the harvester
    are printed in both cases.

    Keyword arguments:
        - id_teams
        - id_projects
        - controller
        - host
        - collections
        - id_categories

    """
    def report(label, value):
        # same output as the statement form: label, one space, value
        print("%s %s" % (label, value))

    if get_id(db.harvesters, **data) is not None:
        print("\n\t\tHarvester exist!")
    else:
        print("\n\t\tCreate harvester")
        db.harvesters.insert(**data)

    report("\t\t\t team:", db.teams[data["id_teams"]].team)
    report("\t\t\t project:", db.projects[data["id_projects"]].project)
    report("\t\t\t controller:", data["controller"])
    report("\t\t\t host:", data["host"])
    report("\t\t\t collection:", data["collections"])
def institute(opt):
    """Load the parameters of the institute from its CSV file.

    The file is ``opt.institute``, taken relative to the scripts
    directory of the running application. Its columns are renamed.

    Args:
        opt (argparse.Namespace): option

    Return:
        pandas.DataFrame:
            columns are:

                - domain
                - team
                - project
                - collaboration
                - instrument
                - people
                - hal_hidden

    """
    # same output as the statement form: label, one space, value
    print("%s %s" % ("\tCollect parameters for the institute", opt.institute))

    # absolute path in the docker container
    parts = (os.getcwd(),
             "applications",
             request.application,
             "scripts",
             opt.institute)

    frame = pd.read_csv(os.path.join(*parts))

    # rename columns
    names = ("domain",
             "team",
             "project",
             "collaboration",
             "instrument",
             "people",
             "hal_hidden")
    frame.columns = list(names)

    return frame
if __name__ == "__main__":
print "\n%s\nStart deploy" % ("."*80,)
# pandas options and matplotlib style
pd.set_option("display.encoding", "utf-8")
pd.set_option("display.width", 150)
# parse options
PARSER = argparse.ArgumentParser()
PARSER.add_argument(
"-a", "--affiliation",
action="store_true",
help="create the affiliation keys [%(default)s].")
PARSER.add_argument(
"--all",
action="store_true",
help="create everything [%(default)s]. "
"Equivalent to --affiliation --teams --harvesters.")
PARSER.add_argument(
"-H", "--harvesters",
action="store_true",
help="create harvesters [%(default)s]. "
"Teams and projects have to be created beforehand.")
PARSER.add_argument(
"--path-harvesters",
default="data/harvesters.csv",
help="path to the CSV file defining harvesters [%(default)s]. "
"The path is relative to the scripts directory.",
metavar="<path>")
PARSER.add_argument(
"institute",
help="path to the CSV file defining the institute. "
"The path is relative to the scripts directory.",
metavar="<path>")