Commit d1084fb3 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '24-inspirehep-institute-id' into 'master'

24 inspirehep institute

* Parameters of the institute are searched in the `inspirehep` database.
* The preference `reg_institute` has been removed
* The preference `inspirehep_institute_id` has been added
* The function `fix_amu` has been removed and replaced by a more generic approach: the method `CheckAndFix.format_universities`. The latter depends on `inspirehep_institute_id`
* Close #24

See merge request !23
parents adc62693 77341193
......@@ -34,7 +34,7 @@ def free_run():
All harvester parameters are defined via the selector.
"""
if not current.app.reg_institute:
if not current.app.inspirehep_institute_id:
return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)
table = virtdb.free_harvester_selector
......@@ -93,7 +93,7 @@ def edit_insert():
no checks are run. The user is editing the record to fix problems.
"""
if not current.app.reg_institute:
if not current.app.inspirehep_institute_id:
return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)
fields = ('controller',
......@@ -236,7 +236,7 @@ def insert_marcxml():
"""Insert a MarcXML record in the database.
"""
if not current.app.reg_institute:
if not current.app.inspirehep_institute_id:
return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)
try:
......@@ -284,7 +284,7 @@ def run():
Search arguments are defined via the harvester selector.
"""
if not current.app.reg_institute:
if not current.app.inspirehep_institute_id:
return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)
try:
......@@ -329,7 +329,7 @@ def run_all():
"""Run all harvesters in one go.
"""
if not current.app.reg_institute:
if not current.app.inspirehep_institute_id:
return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)
collection_logs = []
......
CPPM,`902989 <http://inspirehep.net/record/902989>`_
IPNL,`902974 <http://inspirehep.net/record/902974>`_
LAL,`903100 <http://inspirehep.net/record/903100>`_
LAPP,`903421 <http://inspirehep.net/record/903421>`_
LPC Caen,`902703 <http://inspirehep.net/record/902703>`_
......@@ -57,25 +57,19 @@ du laboratoire, il faut définir *l'identifiant* du laboratoire :
les paramètres de configuration de l'application :
.. index::
pair: préférences; reg_institute
pair: préférences; inspirehep_institute_id
::
Application > préférences
* Editer la propriété ``reg_institute`` et la remplir avec le sigle
du laboratoire.
* Editer la propriété ``inspirehep_institute_id`` et la remplir avec le
numéro de la fiche décrivant le laboratoire dans l'entrepôt
`inspirehep.net`_.
.. note::
Déterminer le bon identifiant demande un peu de doigté.
Une bonne approche est de regarder l'affection des
auteurs dans les entrepôts `cds.cern.ch`_ et `inspirehep.net`_, notamment
le champ ``700 u`` quand les enregistrements sont au format `MARC`_
(plus de détails dans :ref:`arcanes`)
.. note::
le sigle peut être remplacé par une `expression régulière`_
qui permet de choisir entre différentes alternatives.
* Les identifiants des laboratoires de l'IN2P3 sont donnés
dans l'annexe :ref:`institutes`.
Définir un moissonneur
^^^^^^^^^^^^^^^^^^^^^^
......
......@@ -23,6 +23,14 @@ Guide utilisateur
workflow
license
Annexe
======
.. toctree::
:maxdepth: 2
institute_identifier
Index
=====
......
.. include:: hyperlinks.txt
.. _institutes:
Les identifiants des laboratoires dans inspirehep
-------------------------------------------------
L'identifiant du laboratoire est le numéro de la fiche qui décrit le
laboratoire dans l'entrepôt `inspirehep.net`_. Les fiches sont consultables via
l'hyperlien http://inspirehep.net/collection/Institutions
Pour les laboratoires de l'IN2P3, les identifiants sont les suivants:
.. table:: Les identifiants des laboratoires de l'IN2P3 dans
l'entrepôt `inspirehep.net`_.
.. csv-table::
:file: csv/institutes.csv
:widths: 50 20
......@@ -297,6 +297,7 @@
'insert new %s': 'insert new %s',
'install': 'installé',
'Institute': 'Institut',
'Institute identifier in inspirehep.net.': 'Identifiant du laboratoire dans inspirehep.net.',
'Institute number associated to CPPM authors': "Numéro de l'Institut associé aux auteurs du CPPM",
'Invalid': 'Non conforme',
"Invalid database table '%s'": "Invalid database table '%s'",
......@@ -485,6 +486,7 @@
'Reject no %s authors': "Rejeté pas d'auteur(s) du %s",
'Reject no author(s)': "Rejeté pas d'auteur(s)",
'Reject no authors': "Rejeté pas d'auteurs",
'Reject no authors of my institute': "Rejeté pas d'auteurs de mon laboratoire",
'Reject no conference information': "Rejeté pas d'information sur la conférence",
'Reject no CPPM authors': "Rejeté pas d'auteurs du CPPM",
'Reject no OAI identifier': "Rejeté pas d'identifiant OAI",
......
......@@ -22,4 +22,9 @@ app = Storage()
for row in db(db.preferences).select():
app[row.property] = row.value
# add local variable
app["reg_institute"] = ""
app["institute"] = None
current.app = app
......@@ -32,12 +32,12 @@ cfgPreferences = dict(dbtable='preferences',
'authorize_harvester_scan': False,
'authorize_user_login': False,
'harvester_start_year': year,
'reg_institute': None},
'inspirehep_institute_id': 0},
sourceConfig={
'authorize_harvester_scan': {'type': 'boolean'},
'authorize_user_login': {'type': 'boolean'},
'harvester_start_year': {'type': 'number'},
'reg_institute': {'type': 'string'}},
'inspirehep_institute_id': {'type': 'number'}},
width=250,
xtype='xpreferences')
......
......@@ -21,7 +21,6 @@ if db(db.preferences).count() != 3:
"It starts with the given value and "
"ends with the current year."))
if not db(db.preferences.property == "reg_institute").select():
db.preferences.insert(property="reg_institute",
definition=T("Regular expression defining the "
"name of our institute."))
if not db(db.preferences.property == "inspirehep_institute_id").select():
db.preferences.insert(property="inspirehep_institute_id",
definition=T("Institute identifier in inspirehep.net."))
......@@ -5,7 +5,6 @@ and to push them in the database.
"""
from base import (DRY_RUN,
family_name_fr,
fix_amu,
format_author_fr,
ToolException)
......
......@@ -25,29 +25,6 @@ def family_name_fr(full_name):
return full_name[full_name.find(' ') + 1:]
def fix_amu(record):
    """Rename the universities of a thesis which match our institute.

    Every university name matching the regular expression
    ``current.app.reg_institute`` is replaced, in the list returned by
    ``record.these_universities()``, by a name selected from the year
    found in ``record.these_defense()``: one name before 2012, another
    one from 2012 onwards.

    @type record: L{Record}

    @rtype: unicode
    @return: the university names separated by comma.

    """
    names = record.these_universities()
    pattern = current.app.reg_institute

    for pos, name in enumerate(names):
        if not re.search(pattern, name):
            continue

        # the first group of four digits in the defense date is the year
        found = re.search(r"(\d\d\d\d)", record.these_defense())
        defense_year = int(found.group(1))

        names[pos] = (u"Université de la Méditerrannée Aix-Marseille II"
                      if defense_year < 2012
                      else u"Aix Marseille Université")

    return ', '.join(names)
def format_author_fr(name):
"""Format the author name according to French typographic rules,
I{i.e.} C{J.-P. Doe}.
......
......@@ -6,7 +6,7 @@ import re
import traceback
from base import family_name_fr, fix_amu, MSG_CRASH, MSG_LOAD
from base import family_name_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from publicationstool import PublicationsTool
from plugin_dbui import get_id, UNDEF_ID
......@@ -39,7 +39,7 @@ class Thesis(PublicationsTool):
id_category = get_id(db.categories, code='PHD')
oai_url = record.oai_url()
title = record.title()
universities = fix_amu(record)
universities = ', '.join(record.these_universities())
# extract the year from the defense date
# this approach seems the most reliable
......@@ -99,6 +99,7 @@ class Thesis(PublicationsTool):
self.check.oai(record)
self.check.submitted(record)
self.check.year(record)
self.check.format_universities(record)
except CheckException as e:
self.logs[-1].reject(e, record.year())
......
......@@ -3,32 +3,19 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from base import OAI_URL, REG_OAI, REG_YEAR
from base import (is_institute,
OAI_URL,
REG_OAI,
REG_YEAR)
from exception import (CdsException,
CheckException,
InstituteException,
Marc12Exception,
RecordException,
XmlException)
from checkandfix import CheckAndFix
from checkandfix import CheckAndFix, load_record
from institute import Institute
from inveniostore import InvenioStore
from marc12 import Marc12
from record import Record
def load_record(host, record_id):
    """Fetch a single record from an invenio store and decode it.

    Args:
        host (str): host of the store. Either cds.cern.ch or
            inspirehep.net.
        record_id (integer): the identifier of the record in the store.

    Returns:
        Record: the first record decoded from the XML sent by the store.

    """
    xml = InvenioStore(host).get_record(record_id)
    decoded = Marc12()(xml)
    return decoded[0]
......@@ -8,3 +8,32 @@ OAI_URL = "http://%s/record/%s"
REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})")
def is_institute(record):
    """Tell whether a decoded MARC record describes an institute.

    A record describes an institute when one of its ``980`` fields has
    the sub-field ``a`` equal to ``INSTITUTION``, for example::

        u'980': [{'b': [u'CK90', u'HEP200', u'PDGLIST']},
                 {'a': u'INSTITUTION'},
                 {'a': u'CORE'}]

    Args:
        record (Record): the decoded MARC record (dictionary like).

    Returns:
        bool: True when the MARC record describes an institute.

    """
    if u"980" not in record:
        return False

    fields = record[u"980"]

    # the 980 entry is either one field (dict) or several fields (list)
    if isinstance(fields, dict):
        fields = [fields]
    elif not isinstance(fields, list):
        return False

    for field in fields:
        # ignore malformed entries instead of crashing on them
        if not isinstance(field, dict):
            continue

        # a sub-field value can be a list (see 'b' in the example above)
        values = field.get("a")
        if not isinstance(values, list):
            values = [values]

        if u"INSTITUTION" in values:
            return True

    return False
......@@ -6,9 +6,9 @@ import re
import regex
from base import OAI_URL, REG_OAI, REG_YEAR
from gluon import current
from exception import CheckException
from filters import CLEAN_REVIEW
from gluon import current
from inveniostore import InvenioStore
from marc12 import Marc12
from plugin_dbui import get_id
......@@ -47,7 +47,7 @@ MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no %s authors"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference"
MSG_NO_YEAR = "Reject no publication year"
......@@ -74,6 +74,26 @@ REG_CONF_DATES_2 = re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
UNIVERSITY = "University"
def load_record(host, record_id):
    """Helper function loading and decoding one record of an invenio store.

    Args:
        host (str): host of the store. Either cds.cern.ch or
            inspirehep.net.
        record_id (integer): the record identifier.

    Returns:
        Record: the first element of the list built by the Marc12 decoder
            from the XML returned by the store.

    """
    marcxml = InvenioStore(host).get_record(record_id)
    decoder = Marc12()
    return decoder(marcxml)[0]
class CheckAndFix(object):
"""Tool to check and repair the content of the Marc12 record:
......@@ -464,6 +484,57 @@ class CheckAndFix(object):
record["773"][i]["p"] = editor
record["773"][i]["v"] = volume
def format_universities(self, record):
    """Format the university names of a PhD record (MARC field 502 b).

    The policy depends on the preference ``inspirehep_institute_id``:

    - CPPM (identifier 902989): each name containing ``Marseille`` is
      replaced by one of two fixed names, selected by the year found in
      the defense date (before 2012 / from 2012 onwards).
    - any other institute: ``U.`` is replaced by the translation of
      ``University``.

    The record is modified in place. Nothing is done when the record is
    not a thesis, when it has no ``502 b`` sub-field, or, for CPPM, when
    no year can be found in the defense date.

    Args:
        record (Record): the record to be fixed.

    """
    # protection
    if not record.is_thesis():
        return

    # nothing to format without the university sub-field
    if u'502' not in record or "b" not in record[u'502']:
        return

    # NOTE(review): the preference is compared to an integer; confirm
    # that the value loaded from the preferences table is not a string.
    if current.app.inspirehep_institute_id == 902989:

        # CPPM: fix the name of Aix-Marseille university
        found = REG_YEAR.search(record.these_defense())

        # no defense year: leave the names untouched rather than failing
        # with an AttributeError which callers do not handle
        if found is None:
            return

        # NOTE(review): the spelling of the first name is kept as it is;
        # confirm before changing it since it ends up in stored records.
        if int(found.group(1)) < 2012:
            university = u"Université de la Méditerrannée Aix-Marseille II"
        else:
            university = u"Aix Marseille Université"

        def fix(name):
            return university if "Marseille" in name else name

    else:

        # Other: replace U. by University
        university = current.T(UNIVERSITY).decode("utf8")

        def fix(name):
            return name.replace('U.', university)

    # the sub-field is either a single name or a list of names
    names = record[u'502']['b']

    if isinstance(names, unicode):
        record[u'502']['b'] = fix(names)

    elif isinstance(names, list):
        for i, name in enumerate(names):
            names[i] = fix(name)
def my_authors(self, record, reference=[], cmpFct=None):
"""Check that authors of my institutes signed the record.
Fill the meta data record.my_authors.
......@@ -482,10 +553,23 @@ class CheckAndFix(object):
"""
# alias
institute = current.app.reg_institute
app = current.app
reg_institute = app.reg_institute
# regular expression for the institute is not defined
# find it using the institute definition in inspirehep
# store the regular expression in current.app for a later use
if not reg_institute:
institute_id = app.inspirehep_institute_id
institute = load_record("inspirehep.net", institute_id)
reg_institute = institute.rex()
app.institute = institute
app.reg_institute = reg_institute
# find authors of my institute signing the record
s = record.find_authors_by_institute(institute, cmpFct)
s = record.find_authors_by_institute(reg_institute, cmpFct)
# nothing found try with the rescue list
if not s and reference:
......@@ -502,7 +586,7 @@ class CheckAndFix(object):
record.my_authors = s
return
raise CheckException(MSG_NO_MY_AUTHOR % institute.encode("utf8"))
raise CheckException(MSG_NO_MY_AUTHOR)
def oai(self, record):
"""Check that the OAI field is defined and well formed.
......
......@@ -16,6 +16,7 @@ class ExceptionUTF8(Exception):
class CdsException(ExceptionUTF8): pass
class CheckException(ExceptionUTF8): pass
class InstituteException(ExceptionUTF8): pass
class Marc12Exception(ExceptionUTF8): pass
class RecordException(ExceptionUTF8): pass
class XmlException(ExceptionUTF8): pass
# -*- coding: utf-8 -*-
""" invenio_tools.institute
"""
from base import is_institute
from exception import InstituteException
from record import Record
MSG_INVALID_ARG = "Invalid argument record"
MSG_INVALID_HOST = "Invalid record host"
MSG_INVALID_RECORD = "Invalid record, it is not describing an institute"
class Institute(dict):
    """MARC record representing an institute. More information on MARC
    standard at U{http://www.loc.gov/marc/bibliographic/}).

    The relation between methods and MARC field is the following::

                              | INSPIREHEP  |
        ----------------------+-------------+
        institute identifier  | 110 u       |
        future institute id   | 110 t       |
        name                  | 110 b       |
        type of record        | 980 a       |
        ----------------------+-------------+

    """
    def __init__(self, record):
        """Constructor from a decoded MARC12 record

        Args:
            record (Record): the decoded record describing the institute.

        Raises:
            InstituteException: when the argument is not a Record,
                when the record does not describe an institute or
                when its host is not inspirehep.net.

        """
        if not isinstance(record, Record):
            raise InstituteException(MSG_INVALID_ARG)

        if not is_institute(record):
            raise InstituteException(MSG_INVALID_RECORD)

        # only records of the inspirehep.net store are accepted
        if record.host() != 'inspirehep.net':
            raise InstituteException(MSG_INVALID_HOST)

        dict.__init__(self, record)

    def future_id(self):
        """
        Returns:
            unicode: the future inspirehep id (sub-field 110 t).

        """
        return self[u"110"]["t"]

    def id(self):
        """
        Returns:
            unicode: the inspirehep id (sub-field 110 u).

        """
        return self[u"110"]["u"]

    def name(self):
        """
        Returns:
            unicode: the name of the institute (sub-field 110 b).

        """
        return self[u"110"]["b"]

    def rex(self):
        """
        Returns:
            unicode: the regular expression to search author by institute
                in cds.cern.ch or inspirehep.net store. It is the
                alternation of the sub-fields 110 u and 110 t.

        """
        # NOTE(review): both values are inserted verbatim, characters
        # special in a regular expression are not escaped -- confirm
        # that this is the intent.
        return r"%s|%s" % (self[u"110"]["u"], self[u"110"]["t"])
......@@ -5,7 +5,9 @@
import re
from base import is_institute
from exception import Marc12Exception
from institute import Institute
from record import Record
from xml.dom.minidom import parseString
......@@ -242,7 +244,8 @@ class Marc12(object):
return True
def __call__(self, xml, filter=None, func=None):
"""Transform the the XML string into a list of L{Record}.
"""Transform the the XML string into a list of L{Record}
or L{Institute}
@type xml: unicode
@param xml: the XML string has the following structure::
......@@ -275,7 +278,7 @@ class Marc12(object):
The argument of the function is a Record.
It can be used to polish the record content.
@rtype: list of L{Record}
@rtype: list of L{Record} or L{Institute}
@raise Marc12Exception: not well formed XML.
......@@ -292,6 +295,9 @@ class Marc12(object):
record = self._decode_record(node)
self._clean_record(record)
if is_institute(record):
record = Institute(record)
if filter and not filter(record):
continue
......
......@@ -535,7 +535,7 @@ class Record(dict):
return False
def is_thesis(self):
"""C{True} when the record corresponf to a thesis.
"""C{True} when the record corresponds to a thesis.
@rtype: bool
@return:
......
# -*- coding: utf-8 -*-
""" NAME
fix-institute-id
SYNOPSIS
fix the institute identifier
DESCRIPTION
Up to version 0.8.13, the institute identifier was stored
in the preference reg_institute.
Later versions use the institute identifier defined
in the inspirehep.net store.
This script removes the old property and creates the new one.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./run fix-institute-id
AUTHOR
R. Le Gac -- Sep 2015
"""
if __name__ == "__main__":