# -*- coding: utf-8 -*- """a collection of tools to search of publications in invenio store and to push them in the database. @author: R. Le Gac """ import difflib import datetime import json import re import traceback from gluon import current from gluon.storage import Storage from invenio_tools import (OAI_URL, CheckAndFix, CheckException, InvenioStore, Marc12) from plugin_dbui import (UNDEF_ID, UNKNOWN, get_create_id, get_id) DRY_RUN = current.T("dry run") # explain message MSG_CRASH = "Crash: %s" MSG_DELETE_TALK = current.T("Delete the associated talk", lazy=False) MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False) MSG_IN_DB = current.T("Already in the database", lazy=False) MSG_LOAD = current.T("Load in the database", lazy=False) MSG_MATCH = current.T("Reject the talk match a proceeding", lazy=False) MSG_NO_CAT = current.T('Select a "category" !!!', lazy=False) MSG_NO_CONF = current.T("Reject no conference information", lazy=False) MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False) MSG_NO_HARVESTER = current.T('Harvester parameters not defined in the database.', lazy=False) MSG_NO_PROJECT = current.T('Select a "project" !!!', lazy=False) MSG_NO_TEAM = current.T('Select a "team" !!!', lazy=False) MSG_NO_THESIS = current.T("Reject not a thesis record", lazy=False) MSG_PREPRINT_IS_PAPER = current.T("Reject preprint is a published paper", lazy=False) MSG_PREPRINT_IS_CONFERENCE = current.T("Reject preprint is a conference", lazy=False) MSG_PREPRINT_IS_THESIS = current.T("Reject preprint is a thesis", lazy=False) MSG_PREPRINT_NO_NUMBER = current.T("Reject no preprint number", lazy=False) MSG_REPORT_NO_NUMBER = current.T("Reject no report number", lazy=False) MSG_TRANSFORM_PREPRINT = current.T("Transform the preprint into an article", lazy=False) MSG_TRANSFORM_TALK = current.T("Transform the talk into a proceeding", lazy=False) REG_YEAR = re.compile("(\d{4})") class ToolException(Exception): pass def family_name_fr(x): """Extract the family name when the full name is encoded as C{J. Doe}. @type x: unicode @rtype: unicode """ return x[x.find(' ')+1:] def fix_amu(record): """Fix the name of the C{Aix Marseille University} @type record: L{Record} @rtype: unicode @return: the university names separated by comma. """ li = record.these_universities() for i in range(len(li)): if re.search(current.app.reg_institute, li[i]): year = re.search(r"(\d\d\d\d)", record.these_defense()).group(1) if int(year) < 2012: li[i] = u"Université de la Méditerrannée Aix-Marseille II" else: li[i] = u"Aix Marseille Université" return ', '.join(li) def format_author_fr(name): """Format the author name according to French typographic rules, I{i.e.} C{J.-P. Doe}. The name stays unchanged when the formatting failed. @type name: unicode @param name: @rtype: unicode """ # protection if name == '' or name == None: return name # name are encoded Family, L # Family, P L # Family, M -H # Family Name, J # Family-Name, J # Family, F Name # Family, First # To avoid to deal with unicode character # look for non empty string \S m = re.match('(.+), (\S+)( |\-)*(\S+)*', name) # reformat the name as L. Family # or keep it as it is if m: if m.group(3) and m.group(4): t = (m.group(2)[0], m.group(3)[0], m.group(4)[0], m.group(1)) r = '%s.%s%s. %s' % t else: r = '%s. %s' % (m.group(2)[0], m.group(1)) else: r = name # avoid author name in upper case (R. LE FOO --> R. Le Foo) r = r.title() return r def get_harvester_tool(controller): """Get the harvester tool associated to the controller or None if . @note: valid names for the controller are: - articles - notes - preprints - proceedings - reports - talks - theses @type controller: unicode @param controller: name of the controller @rtype: class reference or None @return: None when the controller corresponds to nothing. """ if controller == 'articles': Tool = Articles elif controller == 'notes': Tool = Notes elif controller == 'preprints': Tool = Preprints elif controller == 'proceedings': Tool = Proceedings elif controller == 'reports': Tool = Reports elif controller == 'talks': Tool = Talks elif controller == 'theses': Tool = Thesis else: Tool = None return Tool def learn_my_authors(db, authors=None, id_project=None, id_team=None, year=None): """Train the rescue list of the authors of my institute, stored in the database, using the list C{authors} provided in argument. @note: all keyword arguments have to be defined. @type db: gluon.dal.DAL @param db: @type authors: list @param authors: authors names @type id_project: int @param id_project: project identifier @type id_team: int @param id_team: team idnetifier @type year: int @param year: """ # get the list of authors store in the database row = db.my_authors(id_projects=id_project, id_teams=id_team, year=year) # no entry in the database if not row: db.my_authors[0] = dict(authors=authors, id_projects=id_project, id_teams=id_team, year=year) return database_authors = row.authors.split(', ') # compare with the input list # and extract authors which are not in the db new = set(authors.split(', ')) ref = set(database_authors) diff = new.difference(ref) # update the database if diff: # NOTE1: be careful with the string encoding # NOTE2: handle the case J. Foo and J. M. Foo are the same person li = [] for el in diff: if isinstance(el, unicode): el = el.encode('utf8') fn = el[el.rfind('. ')+2:] # extract family name if fn not in row.authors: li.append(el) database_authors.extend(li) database_authors.sort(key=family_name_fr) db.my_authors[row.id] = dict(authors=', '.join(database_authors)) class Msg(Storage): """Message and action taken for a publication. - The publication is found by an harvester tool, in a store. - The action refers to the database. Fours action are defined: - C{idle} - C{load} - C{modify} - C{reject} The class contains the attributes: - C{action}: action taken - C{collection}: the harvester collection - C{harvester}: the harvester encoded as a JSON string - C{record_id}; the store identifier of the record - C{title}: title of the publication - C{txt}: text of the message - C{url}: url of the record - C{year}: year of the publication """ def __init__(self, collection=None, harvester=None, record_id=None, title=None): """ @type collection: str @param collection: the collection containing the record @type harvester: gluon.dal.Row @param harvester: the current harvester used to retrieve the record. @type record_id: int @param record_id: the store identifier of the record @type title: str @param title: the title associated to the record """ self.action = None self.collection = collection self.harvester = json.dumps(harvester.as_dict()) self.record_id = record_id self.title = title self.txt = None self.url = OAI_URL % (harvester.host, record_id) self.year = None def idle(self, txt, year=None): """Set the action as idle and the message as C{txt}. @type txt: unicode @param txt: message @type year: unicode @param year: year of the publication """ self.action = 'idle' self._set(txt, year) def load(self, txt, year=None): """Set the action as C{load} and the message as C{txt}. @type txt: unicode @param txt: message @type year: unicode @param year: year of the publication """ self.action = 'load' self._set(txt, year) def modify(self, txt, year=None): """Set the action as C{modify} and the message as C{txt}. @type txt: unicode @param txt: message @type year: unicode @param year: year of the publication """ self.action = 'modify' self._set(txt, year) def reject(self, txt, year=None): """Set the action as C{reject} set the message as C{txt}. @type txt: unicode @param txt: message @type year: unicode @param year: year of the publication """ self.action = 'reject' self._set(txt, year) def _set(self, txt, year): if isinstance(txt, unicode): txt = txt.encode("utf-8") elif not isinstance(txt, str): txt = str(txt) self.txt = txt if year: if isinstance(year, list): self.year = ', '.join(year) else: self.year = year class MsgCollection(Storage): """Message for a collection. The class contains five public attributes: - C{error}: error when scanning the collection - C{found}: number of publication found in the harvester repository - C{url}: URL used to scan the harvester repository, returning a list ids. - C{title}: title of the collection """ def __init__(self, error="", found=0, title="", url=""): Storage.__init__(self) self.error = error self.found = found self.title = title self.url = url def url_hb(self): """ @rtype: str @return: the URL return a list of record in readable format. """ return self.url.replace("of=id", "of=hb") class PublicationsTool(object): """Base class to search and process publications. - Decode the parameter of a selector defining user criteria. - Search for publications in the store, according to user criteria or process and XML string. - Load records in the database. The parameters of the search as well as the parameters of the harvester are defined by the current request. """ def __init__(self, db, selector, debug=False): """ @type db: gluon.dal.DAL @param db: @type selector: plugin_dbui.Selector @param selector: the selector defining the parameters to search and to process the publications. @type debug: bool @param debug: activate the debug mode """ self.collection_logs = [] self.db = db self.dbg = debug self.harvester = None self.logs = [] self.check = CheckAndFix() self.marc12 = Marc12() self.selector = selector # private cache for my_author rescue list self.__par = None self.__reference = None def _search_parameters(self, collection): """Build the keywords to steer the URL search in invenio store. The main parameter is the collection and the date range defined in the selector. @type collection: unicode @param collection: statement defining the collection in the store, I{i.e.} C{"find cn d0 and tc p and not tc c"} or C{"LHCb Papers"}. The syntax depends on the invenio store. @rtype: dict @return: the key are a sub-set of those defined in L{invenio_tools.InvenioStore.get_ids}. """ selector = self.selector # INSPIREHEP store if collection.startswith('find'): query = collection if selector.year_start and not selector.year_end: query += " and date %s" % self.selector.year_start elif not selector.year_start and selector.year_end: query += " and date %s" % selector.year_end elif selector.year_start and selector.year_end: query += " and date > %s and date < %s " \ % (selector.year_start-1, selector.year_end+1) di = dict(p=query, # query à la spires rg=1000, # maximum number of records returned sf='year', # sort by date so='d') # descending order # CERN INVENIO store else: if selector.year_start and not selector.year_end: rex = selector.year_start elif not selector.year_start and selector.year_end: rex = self.y2 elif selector.year_start and selector.year_end: li = [] for year in range(selector.year_start, selector.year_end+1): li.append(str(year)) rex = '|'.join(li) di = dict(cc=collection, # collection f1='year', # search on year m1='r', # use regular expression p1=rex, # regular expression defining year sf='year', # sort by date so='d') # descending order return di def _my_author_list(self, record): """Extract the rescue list for my authors in the database. @type record: L{Record} @param record: @rtype: list @return: empty when not defined """ year = record.year() # try to recover year when not defined if not year: # published article, proceeding if "773" in record and "y" in record["773"]: year = record["773"]["y"] # start date of a conference elif "111" in record and "x" in record["111"]: year = record["111"]["x"] # end date of a conference elif "111" in record and "z" in record["111"]: year = record["111"]["z"] # submitted date elif "269" in record and "c" in record["269"]: year = record["269"]["c"] else: return [] # # NOTE: # keep in mind that the CheckAndfix mechanism is not yet run # therefore year can be a list due to erratum, ... # if isinstance(year, list): year.sort() year = year[0] # the value can have several format 1992, 1992-12-31, .... m = REG_YEAR.search(year) if m: year = m.group(1) else: return [] # caching t = (year, self.selector.id_projects, self.selector.id_teams) if t == self.__par: return self.__reference # extract the list from the database row = self.db.my_authors(year=year, id_projects=self.selector.id_projects, id_teams=self.selector.id_teams) if row: self.__reference = row['authors'].split(', ') else: self.__reference = [] return self.__reference def check_by_origin(self, oai_url=None, year=None): """Check that a record already exist using the origin field. - Actions are logged. @type oai_url: unicode @param oai_url: typical value is "http://cds.cern.ch/record/123456" @type year: unicode @param year: @note: this method can be customised in inherited class to perform dedicated action. @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing record by origin" id = get_id(self.db.publications, origin=oai_url) if not id: return (None, 0) self.logs[-1].idle(MSG_IN_DB, year) return (id, 0) def check_by_fields(self, **kwargs): """Check that a record already exist using the fields defined in the keyword arguments. - Fix the field origin when a match is found. - Actions are logged. @keyword oai_url: typical value is "http://cds.cern.ch/record/123456" @keyword year: @note: this method can be customised in inherited class to perform dedicated action. @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing record by fields" db = self.db # origin can't be used for the search oai_url = kwargs["oai_url"] del kwargs["oai_url"] # look for an existing record id = get_id(db.publications, **kwargs) if not id: return (None, 0) # fix origin field ok = db.publications[id].origin and db.publications[id].origin == oai_url if not ok: if self.selector.mode != DRY_RUN: db.publications[id] = dict(origin=oai_url) self.logs[-1].modify(MSG_FIX_ORIGIN, kwargs["year"]) return (id, 1) self.logs[-1].idle(MSG_IN_DB, kwargs["year"]) return (id, 0) def check_collaboration(self, value): """Check that the collaboration exit in the database, create it if not. @type value: str or None @param value: the name of the collaboration. @rtype: int @return: the id of the collaboration, UNDEF_ID when not defined """ if not value: return UNDEF_ID return get_create_id(self.db.collaborations, collaboration=value) def check_publisher(self, value): """Check that publisher exit in the database, create it if not. @type value: str or None @param value: the abbreviation of the publisher name. @rtype: int @return: the id of the publisher, UNDEF_ID when not defined """ if not value: return UNDEF_ID return get_create_id(self.db.publishers, abbreviation=value) def select_record(self, record): """C{True} when the C{record} is selected. This method check and format the author field. @note: The checks depend on the type of publications and have to be implemented in inherited class. @type record: L{Record} @param record: @rtype: bool """ if self.dbg: print "select record and check / format authors" try: self.check.temporary_record(record) self.check.authors(record) self.check.format_authors(record, format_author_fr) self.check.collaboration(record) except BaseException as e: self.logs[-1].reject(e, record.year()) return False return True def load_db(self, record): """Load the record in the database. @note: This method depend on the type of publications. It has to be implemented for each inherited class. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ return 0 def process_url(self): """Retrieve the MARC XML string and launch its decoding. @raise Exception: depending on what happen, can be StoreException, Marc12ZException, ... """ if self.dbg: print "process URL search" store = InvenioStore(self.harvester.host) # list of collections collections = self.harvester.collections collections = re.sub(' *, *', ',', collections).split(',') # alias controller = self.harvester.controller project = self.db.projects[self.harvester.id_projects].project # extract the list of publications from the store for each collection # the search is perform on a range of creation date # if not defined all element are return # # The method use here minimise the memory usage # on the server as well as on the client side for collection in collections: # log collection information # A collection is identified as "Project Controller collection" title = "%s / %s / %s" % (project, controller, collection) self.collection_logs.append(MsgCollection(title=title)) # search record in the harvester repository kwargs = self._search_parameters(collection) try: ids = store.get_ids(**kwargs) except Exception as error: self.collection_logs[-1].url = store.last_search_url() self.collection_logs[-1].error = error continue self.collection_logs[-1].url = store.last_search_url() self.collection_logs[-1].found = len(ids) if not ids: continue if self.dbg: print '%i records found in %s' % (len(ids), collection) for id in ids: if self.dbg: print "\nprocessing record", id try: xml = store.get_record(id) self.process_xml(xml) except BaseException as e: url = OAI_URL % (self.harvester.host, id) self.logs.append(Msg(harvester=self.harvester, collection=title, record_id=id, title = url)) self.logs[-1].reject(e) def process_xml(self, xml): """Decode the MARC XML string and load records in the database. @type xml: unicode @param xml: MARC XML string """ if self.dbg: print "process xml record" # NOTE: BaseException and inherited class # are catched by the previous stage li = self.marc12(xml) # process individual record for record in li: if self.dbg: print "record decoded" # start the log for the record self.logs.append(Msg(harvester=self.harvester, collection=self.collection_logs[-1].title, record_id=record.id(), title=record.title())) # additional selection stage # at this step the validity of the record is checked # and non-conformities are repaired if not self.select_record(record): continue if self.dbg: print "start loading in the database" # laod record in the database i = self.load_db(record) if self.dbg: print self.logs[-1].action.upper(), self.logs[-1].txt def report(self): """Build the processing report. @rtype: dict @return: - C{collection_logs} (list) one L{MsgCollection}) for each collection - C{controller} (str) - C{logs} (list) one L{Msg} for each publication - C{selector} (Selector) """ return dict(collection_logs=self.collection_logs, controller=self.harvester.controller, logs=self.logs, selector=self.selector) def __call__(self): """Search publication in the invenio store according to criteria and load them in the database. @raise Exception: the type of exception depends on what happen: - L{ToolException} when projet, team or category identifier are not defined. - C{StoreException} when somethings goes wrong interrogating the store. - C{Marc12Exception} when somethings goes wrong decoding the XML string return by the store - C{CheckException} if the L{Record} is not valid - C{Exception} if the python code crash """ selector = self.selector if self.dbg: print "start processing", self.__class__.__name__ print "decode request" # protection team, project and/or category have to be defined if not selector.id_projects: raise ToolException(MSG_NO_PROJECT) if not selector.id_teams: raise ToolException(MSG_NO_TEAM) if selector.xml and not selector.id_categories: raise ToolException(MSG_NO_CAT) if self.dbg: print "get harvest parameters" # process an XML request if selector.xml: self.harvester = Storage(controller=selector.controller, id_categories=selector.id_categories, id_projects=selector.id_projects, id_teams=selector.id_teams) self.collection_logs.append(MsgCollection(found=1)) self.process_xml(selector.xml) return # retrieve the harvester parameter in the database # if not yet defined (free run) if not self.harvester: row = selector.select(self.db.harvesters).first() if not row: raise ToolException(MSG_NO_HARVESTER) self.harvester = row.harvesters # retrieve records in the store and load them in the database self.process_url() class Articles(PublicationsTool): """Publications tool for articles. """ def __init__(self, *args, **kwargs): PublicationsTool.__init__(self, *args, **kwargs) # the preprint categories self.id_preprint = get_id(self.db.categories, code="PRE") def check_by_origin(self, id_publisher=None, my_authors=None, oai_url=None, pages=None, publication_url=None, title=None, volume=None, year=None): """Check that a record already exist using the origin field. - Transform a preprint into article. - Actions are logged. @keyword id_publisher: @keyword oai_url: @keyword pages: @keyword publication_url: @keyword title: @keyword volume: @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing article by origin" db = self.db id = get_id(db.publications, origin=oai_url) if not id: return (None, 0) # not a preprint ? if db.publications[id].id_categories != self.id_preprint: self.logs[-1].idle(MSG_IN_DB, year) return (id, 0) # transform a preprint into an article self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year) if self.selector.mode != DRY_RUN: db.publications[id] = dict(authors_institute=my_authors, id_categories=self.harvester.id_categories, id_publishers=id_publisher, id_status=UNDEF_ID, pages=pages, publication_url=publication_url, title=title, volume=volume, year=year) return (id, 1) def check_by_fields(self, id_publisher=None, my_authors=None, oai_url=None, pages=None, publication_url=None, preprint_number=None, title=None, volume=None, year=None): """Check that a record already exist using the fields: id_projects, id_publishers, id_teams, pages, volume and year. - Fix the field origin when a match is found. - Transform a preprint into article. - Actions are logged. @keyword id_publisher: @keyword oai_url: @keyword pages: @keyword publication_url: @keyword preprint_number: @keyword title: @keyword volume: @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing article by fields" db = self.db # check against published articles id = get_id(db.publications, id_projects=self.harvester.id_projects, id_publishers=id_publisher, id_teams=self.harvester.id_teams, pages=pages, volume=volume, year=year) # fix orign field if id and not db.publications[id].origin: if self.selector.mode != DRY_RUN: db.publications[id] = dict(origin=oai_url) self.logs[-1].modify(MSG_FIX_ORIGIN, year) return (id, 1) if id: self.logs[-1].idle(MSG_IN_DB, year) return (id, 0) # check against published preprint # a preprint can be identified by its category which is PRE (15) id = get_id(db.publications, id_categories=self.id_preprint, id_projects=self.harvester.id_projects, id_teams=self.harvester.id_teams, preprint=preprint_number) if not id: return (None, 0) # transform an existing preprint into article # institute authors can be missing in the preprint # change also the status self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year) if self.selector.mode != DRY_RUN: db.publications[id] = dict(authors_institute=my_authors, id_categories=self.harvester.id_categories, id_publishers=id_publisher, id_status=UNDEF_ID, pages=pages, publication_url=publication_url, title=title, volume=volume, year=year) return (id, 1) def load_db(self, record): """Load an article in the database. The method assume that erratum are removed. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias editor = record.paper_editor() first_author = record.first_author() oai_url = record.oai_url() pages = record.paper_pages() preprint_number = record.preprint_number() publication_url=record.paper_url() submitted = record.submitted()[0] title = record.title() volume = record.paper_volume() year = record.paper_year() # check the publisher id_publisher = self.check_publisher(editor) # check the collaboration id_collaboration = self.check_collaboration(record.collaboration()) # check against already published articles or preprint # A preprint is transform itno an article. # # NOTE: The check is performed by origin then by fields. # The latter is useful to cover the case where the record # is entered by hand or by another haverster. # id, status = self.check_by_origin(id_publisher=id_publisher, my_authors=record.my_authors, oai_url=oai_url, pages=pages, publication_url=publication_url, title=title, volume=volume, year=year) if id: return status id, status = self.check_by_fields(id_publisher=id_publisher, my_authors=record.my_authors, oai_url=oai_url, pages=pages, publication_url=publication_url, preprint_number=preprint_number, title=title, volume=volume, year=year) if id: return status # eventually insert a new articles in the database # try to improve the rescue list for CPPM authors if self.selector.mode != DRY_RUN: db.publications.insert(authors=record.authors(), authors_institute=record.my_authors, first_author=first_author, id_categories=self.harvester.id_categories, id_collaborations=id_collaboration, id_projects=self.harvester.id_projects, id_publishers=id_publisher, id_status=UNDEF_ID, id_teams=self.harvester.id_teams, origin=oai_url, pages=pages, preprint=preprint_number, publication_url=publication_url, submitted=submitted, title=title, volume=volume, year=year) learn_my_authors(db, authors=record.my_authors, id_project=self.harvester.id_projects, id_team=self.harvester.id_teams, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the C{record} is published. @type record: L{Record} @param record: @rtype: bool """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select article record" if not record.is_published(): self.logs[-1].reject(MSG_NO_EDITOR, record.year()) return False try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.clean_erratum(record) self.check.submitted(record) self.check.year(record) self.check.paper_reference(record) self.check.format_editor(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Notes(PublicationsTool): """Publications tool for notes. """ def load_db(self, record): """Load a public note in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias first_author = record.first_author() oai_url = record.oai_url() title = record.title() year = record.year() # check against already published notes id, status = self.check_by_origin(oai_url=oai_url, year=year) if id: return status id, status = self.check_by_fields(first_author=first_author, id_categories=self.harvester.id_categories, id_projects=self.harvester.id_projects, id_teams=self.harvester.id_teams, oai_url=oai_url, title=title, year=year) if id: return status # eventually insert a new report if self.selector.mode != DRY_RUN: db.publications.insert(authors=record.authors(), authors_institute=record.my_authors, first_author=first_author, id_categories=self.harvester.id_categories, id_projects=self.harvester.id_projects, id_status=UNDEF_ID, id_teams=self.harvester.id_teams, origin=oai_url, publication_url=record.paper_url(), report_numbers=record.report_number(), submitted=record.submitted()[0], title=title, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the note is valid. @type record: L{Record} @param record: @rtype: bool """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select note record" try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.submitted(record) self.check.year(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Preprints(PublicationsTool): """Publications tool for preprints. """ def load_db(self, record): """Load a preprint in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias first_author = record.first_author() oai_url = record.oai_url() preprint = record.preprint_number() title = record.title() submitted = record.submitted()[0] year = record.year() # check the collaboration id_collaboration = self.check_collaboration(record.collaboration()) # check against preprint or article already published id, status = self.check_by_origin(oai_url=oai_url, year=year) if id: return status id, status = self.check_by_fields(first_author=first_author, id_projects=self.harvester.id_projects, id_teams=self.harvester.id_teams, oai_url=oai_url, preprint=preprint, submitted=submitted, title=title, year=year) if id: return status # eventually insert a new preprint if self.selector.mode != DRY_RUN: db.publications.insert(authors=record.authors(), authors_institute=record.my_authors, first_author=first_author, id_categories=self.harvester.id_categories, id_collaborations=id_collaboration, id_projects=self.harvester.id_projects, id_status=UNDEF_ID, id_teams=self.harvester.id_teams, origin=oai_url, preprint=preprint, publication_url=record.paper_url(), submitted=submitted, title=title, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the preprint is valid. @type record: L{Record} @param record: """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select preprint record" if record.is_published(): self.logs[-1].reject(MSG_PREPRINT_IS_PAPER, record.year()) return False if record.is_conference_data(): self.logs[-1].reject(MSG_PREPRINT_IS_CONFERENCE, record.year()) return False if record.is_thesis(): self.logs[-1].reject(MSG_PREPRINT_IS_THESIS, record.year()) return False if not record.preprint_number(): self.logs[-1].reject(MSG_PREPRINT_NO_NUMBER, record.year()) return False try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.submitted(record) self.check.year(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Proceedings(PublicationsTool): """Publications tool for conference proceedings. """ def check_by_origin(self, authors=None, host=None, id_publisher=None, oai_url=None, preprint=None, pages=None, publication_url=None, reference_talk=None, report_numbers=None, submitted=None, volume=None, title=None, year=None): """Check that a record already exist using the origin field. - Delete the talk when both talk and the proceeding exist. - Transform a talk in a proceeding. - Actions are logged. @keyword authors: @keyword host: @keyword id_publisher: @keyword oai_url: @keyword preprint: @keyword pages: @keyword publications_url: @keyword reference_talk: the store id of the associated talk. @keyword report_numbers: @keyword submitted: @keyword volume: @keyword title: @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing proceeding by origin" db = self.db origin_proc = oai_url origin_talk = OAI_URL % (host, reference_talk) id_proc = get_id(db.publications, origin=origin_proc) id_talk = get_id(db.publications, origin=origin_talk) # the proceeding exit but not the talk if id_proc and not id_talk: self.logs[-1].idle(MSG_IN_DB, year) return (id_proc, 0) # no id for talk and proceeding elif not (id_talk or id_proc): return (None, 0) # the talk and the proceeding exist, delete the talk elif id_talk and id_proc: if self.selector.mode != DRY_RUN: del db.publications[id_talk] self.logs[-1].idle(MSG_DELETE_TALK, year) return (id_talk, 1) # a talk is found without associated proceeding # transform the talk into a proceeding if self.selector.mode != DRY_RUN: db.publications[id_talk] = dict(authors=authors, id_categories=self.harvester.id_categories, id_publishers=id_publisher, origin=oai_url, preprint=preprint, pages=pages, publication_url=publication_url, report_numbers=report_numbers, submitted=submitted, volume=volume, title=title, year=year) self.logs[-1].modify(MSG_TRANSFORM_TALK, year) return (id_talk, 1) def check_by_fields(self, authors=None, conference_title=None, first_author=None, id_publisher=None, oai_url=None, preprint=None, pages=None, publication_url=None, report_numbers=None, submitted=None, volume=None, title=None, year=None): """Check that a record already exist using the fields: authors, conference_dates, conference_title, id_categories, id_projects, id_publishers, id_teams, title and year. - Fix the field origin when a match is found. - Transform a preprint into article. - Actions are logged. @note: the field conference_dates is not used since its encoding is not reliable. It might be varied when the user change the store. @keyword authors: @keyword conference_title: @keyword first_author: @keyword id_publisher: @keyword oai_url: @keyword preprint: @keyword pages: @keyword publication_url: @keyword report_numbers: @keyword submlitted: @keyword volume: @keyword title: @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing proceeding by fields" db = self.db di = dict(authors=authors, conference_title=conference_title, id_categories=self.harvester.id_categories, id_projects=self.harvester.id_projects, id_publishers=id_publisher, id_teams=self.harvester.id_teams, oai_url=oai_url, title=title, year=year) id, status = PublicationsTool.check_by_fields(self, **di) if id: return (id, status) # update an already published talk # A talk defines authors, conference parameters, first author, title, # a category and a year. Latter on this talk might be transform # into a proceeding. it update the authors, the category, # the publication URL and the year. # Often, the title of the talk and the title of the proceeding # are similar but not equal. It is why the algorithm is rather complex query = (db.publications.conference_title==conference_title)&\ (db.publications.first_author==first_author)&\ (db.publications.id_projects==self.harvester.id_projects)&\ (db.publications.id_teams==self.harvester.id_teams)&\ (db.publications.year==year) set = db(query).select(db.publications.id, db.publications.title) for row in set: s = difflib.SequenceMatcher(None, title, row.title) if s.ratio() > self.harvester.ratio: if self.selector.mode != DRY_RUN: di = dict(authors=authors, id_categories=self.harvester.id_categories, id_publishers=id_publisher, origin=oai_url, preprint=preprint, pages=pages, publication_url=publication_url, report_numbers=report_numbers, submitted=submitted, volume=volume, title=title, year=year) db.publications[row.id] = di self.logs[-1].modify(MSG_TRANSFORM_TALK, year) return (row.id, 1) return (None, 0) def load_db(self, record): """Load a conference proceeding in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias authors = record.authors() editor = record.paper_editor() host = record.host() oai_url = record.oai_url() pages = record.paper_pages() preprint = record.preprint_number() report_numbers = record.report_number() submitted = record.submitted()[0] title = record.title() url= record.paper_url() volume = record.paper_volume() year = record.paper_year() # protection against proceeding not published in a journal if not year: year = record.year() # check the publisher id_publisher = self.check_publisher(editor) # check against already published proceeding using the field origin ref_talk = record.reference_conference_talk() id, status = self.check_by_origin(authors=authors, host=host, id_publisher=id_publisher, oai_url=oai_url, preprint=preprint, pages=pages, publication_url=url, reference_talk=ref_talk, report_numbers=report_numbers, submitted=submitted, volume=volume, title=title, year=year) if id: return status # alias for the conference information conference_dates = record.conference_dates() conference_title = record.conference_title() country = record.conference_country() first_author = record.first_author() # check the collaboration id_collaboration = self.check_collaboration(record.collaboration()) # check the publisher id_publisher = self.check_publisher(editor) # check against an already published proceeding id, status = self.check_by_fields(authors=authors, conference_title=conference_title, first_author=first_author, id_publisher=id_publisher, oai_url=oai_url, preprint=preprint, pages=pages, publication_url=url, report_numbers=report_numbers, submitted=submitted, volume=volume, title=title, year=year) if id: return status # eventually insert a new proceeding if self.selector.mode != DRY_RUN: di = dict(authors=authors, authors_institute=record.my_authors, conference_dates=conference_dates, conference_speaker=first_author, conference_title=conference_title, conference_town=record.conference_town(), conference_url=record.conference_url(), first_author=first_author, id_categories=self.harvester.id_categories, id_collaborations=id_collaboration, id_countries=id_countries, id_projects=self.harvester.id_projects, id_publishers=id_publisher, id_status=UNDEF_ID, id_teams=self.harvester.id_teams, origin=oai_url, pages=pages, preprint=preprint, publication_url=url, report_numbers=report_numbers, submitted=submitted, title=title, volume=volume, year=year) db.publications[0] = di self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the C{record} is contains conference data. @type record: L{Record} @param record: @rtype: bool """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select and check proceeding record" try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.conference(record) self.check.clean_erratum(record) self.check.submitted(record) self.check.year(record) self.check.paper_reference(record) self.check.format_editor(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Reports(PublicationsTool): """Publications tool for reports to committee. """ def load_db(self, record): """Load a report in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias authors = record.authors() first_author = record.first_author() id_status = UNDEF_ID oai_url = record.oai_url() title = record.title() year = record.year() # allow undefined institute authors try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) authors_institute = record.my_authors except CheckException: authors_institute = UNKNOWN id_status = get_id(db.status, code=UNKNOWN) # check the collaboration id_collaboration = self.check_collaboration(record.collaboration()) # check against already published reports id, status = self.check_by_origin(oai_url=oai_url, year=year) if id: return status id, status = self.check_by_fields(id_categories=self.harvester.id_categories, id_projects=self.harvester.id_projects, id_teams=self.harvester.id_teams, oai_url=oai_url, title=title, year=year) if id: return status # eventually insert a new report if self.selector.mode != DRY_RUN: db.publications.insert(authors=authors, authors_institute=authors_institute, first_author=first_author, id_categories=self.harvester.id_categories, id_collaborations=id_collaboration, id_projects=self.harvester.id_projects, id_status=id_status, id_teams=self.harvester.id_teams, origin=oai_url, preprint=record.preprint_number(), publication_url=record.paper_url(), report_numbers=record.report_number(), submitted=record.submitted()[0], title=title, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the report is valid. @type record: L{Record} @param record: """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select report record" if not record.report_number(): self.logs[-1].reject(MSG_REPORT_NO_NUMBER, record.year()) return False try: self.check.oai(record) self.check.submitted(record) self.check.year(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Talks(PublicationsTool): """Publications tool for conference talks. """ def check_by_origin(self, host=None, oai_url=None, reference_proceeding=None, year=None): """Check that a record already exist using the origin field. - Delete a talk when both the talk and the proceeding exist. - Actions are logged. @keyword host: @keyword oai_url: @keyword reference_proceeding: the store id of the associated proceeding. @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing talk by origin" db = self.db origin_proc = OAI_URL % (host, reference_proceeding) origin_talk = oai_url id_proc = get_id(db.publications, origin=origin_proc) id_talk = get_id(db.publications, origin=origin_talk) # the talk is already in the database if id_talk and not id_proc: self.logs[-1].idle(MSG_IN_DB, year) return (id_talk, 0) # a proceeding exist but not the talk. Reject the talk elif id_proc and not id_talk: self.logs[-1].idle(MSG_MATCH, year) return (id_proc, 0) # both proceedng and talks exists, delete the talk elif id_talk and id_proc: if self.selector.mode != DRY_RUN: del db.publications[id_talk] self.logs[-1].idle(MSG_DELETE_TALK, year) return (id_proc, 1) # no talk and no proceeding return (None, 0) def check_by_fields(self, conference_title=None, first_author=None, oai_url=None, title=None, year=None): """Check that a record already exist using the fields: conference_title, first_author, id_projects, id_teams, title and year. - Fix the field origin when a match is found. - Actions are logged. @note: the field conference_dates is not used since its encoding is not reliable. It might be varied when the user change the store. @keyword conference_title: @keyword first_author: @keyword oai_url: @keyword title: @keyword year: @rtype: tuple @return: the tuple (id, status). The id of the record or None. The status is equal to one when the existing record was modified zero otherwise """ if self.dbg: print "check existing talk by origin" db = self.db # check against already published talks / proceedings # A talk defines title, first author, conference parameter # a category and a year. Latter on this talk might be transform # into a proceeding. it update the authors, the category, # the publication URL and the year. # Often, the title of the talk and the title of the proceeding # are similar but not equal. It is why the algorithm is rather complex. query = (db.publications.conference_title==conference_title)&\ (db.publications.first_author==first_author)&\ (db.publications.id_projects==self.harvester.id_projects)&\ (db.publications.id_teams==self.harvester.id_teams)&\ (db.publications.year==year) id = None set = db(query).select(db.publications.id, db.publications.title) for row in set: s = difflib.SequenceMatcher(None, title, row.title) if s.ratio() > self.harvester.ratio: id = row.id break if not id: return (None, 0) # fix the field origin for a talk not a proceeding id_category = self.harvester.id_categories origin_not_defined = id and \ (not db.publications[id].origin) and \ db.publications[id].id_categories == id_category if origin_not_defined: if self.selector.mode != DRY_RUN: db.publications[id] = dict(origin=oai_url) self.logs[-1].modify(MSG_FIX_ORIGIN, year) return (id, 1) # match found self.logs[-1].idle(MSG_MATCH, year) return (row.id, 0) def load_db(self, record): """Load a conference talk in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias host = record.host() oai_url = record.oai_url() year = record.year() # check against already published talk using the origin field ref = record.reference_conference_proceeding() id, status = self.check_by_origin(host=host, oai_url=oai_url, reference_proceeding=ref, year=year) if id: return status # alias for the conference information conference_dates = record.conference_dates() conference_title = record.conference_title() country = record.conference_country() first_author = record.first_author() submitted = record.submitted()[0] title = record.title() # check the collaboration id_collaboration = self.check_collaboration(record.collaboration()) # check against already published talk / proceeding using fields id, status = self.check_by_fields(conference_title=conference_title, first_author=first_author, oai_url=oai_url, title=title, year=year) if id: return status # eventually insert a new talk if self.selector.mode != DRY_RUN: db.publications.insert(authors=record.authors(), authors_institute=record.my_authors, conference_dates=conference_dates, conference_speaker=first_author, conference_title=conference_title, conference_town=record.conference_town(), conference_url=record.conference_url(), first_author=first_author, id_categories=self.harvester.id_categories, id_collaborations=id_collaboration, id_countries=id_countries, id_projects=self.harvester.id_projects, id_status=UNDEF_ID, id_teams=self.harvester.id_teams, origin=oai_url, submitted=submitted, title=title, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when the C{record} is contains conference data. @type record: L{Record} @param record: @rtype: bool """ if not PublicationsTool.select_record(self, record): return False if self.dbg: print "select talk record" try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.conference(record) self.check.submitted(record) self.check.year(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False return True class Thesis(PublicationsTool): """Publications tool for thesis. """ def load_db(self, record): """Load a thesis in the database. @type record: L{Record} @param record: @rtype: int @return: one when the record is inserted / updated in the database zero otherwise. """ db = self.db # alias defense_date = record.these_defense() first_author = record.first_author() id_category = get_id(db.categories, code='PHD') oai_url = record.oai_url() title = record.title() universities = fix_amu(record) # extract the year from the defense date # this approach seems the most reliable year = re.search(r"(\d\d\d\d)", defense_date).group(1) # check against already published thesis id, status = self.check_by_origin(oai_url=oai_url, year=year) if id: return status id, status = self.check_by_fields(first_author=first_author, defense=defense_date, id_projects=self.harvester.id_projects, id_teams=self.harvester.id_teams, oai_url=oai_url, title=title, year=year) if id: return status # eventually insert a new thesis if self.selector.mode != DRY_RUN: db.publications.insert(authors=first_author, authors_institute=first_author, defense=defense_date, directors=record.these_directors(), first_author=first_author, id_categories=id_category, id_teams=self.harvester.id_teams, id_projects=self.harvester.id_projects, id_status=UNDEF_ID, origin=oai_url, publication_url=record.paper_url(), submitted=record.submitted()[0], title=title, universities=universities, year=year) self.logs[-1].load(MSG_LOAD, year) return 1 def select_record(self, record): """C{True} when thesis is signed by a CPPM author. @type record: L{Record} @param record: """ if not PublicationsTool.select_record(self, record): return False try: self.check.my_authors(record, reference=self._my_author_list(record), cmpFct=family_name_fr) self.check.oai(record) self.check.submitted(record) self.check.year(record) except CheckException as e: self.logs[-1].reject(e, record.year()) return False except BaseException as e: self.logs[-1].reject(MSG_CRASH % e, record.year()) print traceback.format_exc() return False if self.dbg: print "select thesis record" if record.is_thesis(): return True self.logs[-1].reject(MSG_NO_THESIS, record.year()) return False