Commit 641c0d9c authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add the script fix-publication-url.py.

parent fde32336
# -*- coding: utf-8 -*-
""" NAME
fix-publication-url
SYNOPSIS
fix the publications field publication_url
DESCRIPTION
The field publication_url is the URL of the pdf file.
This definition has been reinforced in track_publications 0.8.8.
The script checks this field and tries to fix it.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-publication-url
AUTHOR
R. Le Gac -- Dec 2014
"""
def get_record(host, record_id):
    """Retrieve the record, record_id, from the store.

    Args:
        host (str): host of the invenio store, passed as is to
            InvenioStore (e.g. the host part of a ".../record/<id>" URL).
        record_id (str or int): identifier of the record in that store.

    Returns:
        The first element of the sequence produced by the Marc12 decoder
        for the reply of the store, or None when a Marc12Exception is
        raised while fetching or decoding the record.

    """
    decode = Marc12()
    store = InvenioStore(host)

    try:
        # Use the function argument: the lookup must not depend on a
        # variable that happens to exist in the scope of the caller.
        xml = store.get_record(record_id)
        record = decode(xml)[0]

    except Marc12Exception:
        record = None

    return record
def update(row, val):
    """Replace the publication_url of the publication row by val.

    The change (record id, current URL, new URL) is echoed on the standard
    output, then the update is issued on the publications table for the
    record having the same id as row. Nothing is committed here.

    Args:
        row: publication record, providing the attributes id and
            publication_url.
        val (str): the new value for the field publication_url.

    """
    message = u" - %s, %s → '%s'" % (row.id, row.publication_url, val)
    print(message)

    same_id = db.publications.id == row.id
    db(same_id).update(publication_url=val)
if __name__ == "__main__":

    # Script body: scan the publications whose publication_url does not
    # contain 'pdf', try to derive the URL of the pdf file from well known
    # URL layouts or from the record in the store, and ask for a
    # confirmation before committing the changes.
    #
    # The names db and INHIBIT_PUBLICATION_UPDATE_ON_OK are not defined in
    # this script; they have to be provided by the execution environment.

    import os
    import posixpath
    import re
    import sys

    from argparse import ArgumentParser, FileType
    from invenio_tools import (CheckAndFix,
                               CheckException,
                               InvenioStore,
                               Marc12,
                               Marc12Exception)

    # URL patterns: raw strings, literal dots escaped
    REG_ARXIV = re.compile(
        r"http://[a-z.]*arxiv\.org/abs/(?:arXiv:)?\d+\.\d+")
    REG_INDICO = re.compile(r"https?://indico")
    REG_IOP = re.compile(
        r"http://iopscience\.iop\.org/(\d+-\d+)/(\d+)/(\d+)/([A-Z]?\d+)/?")
    REG_ORIGIN = re.compile(r"https?://([a-z.]+)/record/(\d+)")
    REG_TEL = re.compile(
        r"(http://tel\.archives-ouvertes\.fr/tel-\d+)(?:/fr/)?")

    # command line options
    parser = ArgumentParser()
    args = parser.parse_args()

    # unlock the publications update when the status is OK
    db.publications._before_update.remove(INHIBIT_PUBLICATION_UPDATE_ON_OK)

    # scan the publications table
    # It is assumed that the field publication_url is OK when it contains
    # the keyword pdf
    query = ~db.publications.publication_url.contains('pdf')

    for row in db(query).select():

        # strip empty space (a missing value is handled as an empty string)
        # The stripped value is stored in the row since update() prints it.
        row.publication_url = (row.publication_url or '').strip()

        # http://indico.....
        if REG_INDICO.match(row.publication_url):
            continue

        # http://tel.archives-ouvertes.fr/tel-00742181
        # The pdf is http://tel.archives-ouvertes.fr/tel-00742181/document
        m = REG_TEL.match(row.publication_url)
        if m:
            if row.publication_url.endswith('document'):
                continue

            # URLs always use '/': posixpath is platform independent
            # while os.path.join depends on the operating system.
            val = posixpath.join(m.group(1), 'document')
            update(row, val)
            continue

        # http://iopscience.iop.org/1126-6708/2009/04/124/
        # the document is
        # http://iopscience.iop.org/1126-6708/2009/04/124/pdf/1126-6708_2009_04_124.pdf
        m = REG_IOP.match(row.publication_url)
        if m:
            fn = "pdf/%s_%s_%s_%s.pdf" % m.groups()
            val = posixpath.join(row.publication_url, fn)
            update(row, val)
            continue

        # retrieve the information from the store
        # (a missing origin is handled as an empty string)
        m = REG_ORIGIN.match(row.origin or '')
        if m:
            host, store_id = m.groups()
            record = get_record(host, store_id)
            if record:
                val = record.paper_url()
                if val:
                    update(row, val)
                    continue

        # http://arxiv.org/abs/1311.3870
        m = REG_ARXIV.match(row.publication_url)
        if m:
            # only the path segment /abs/ is turned into /pdf/
            val = row.publication_url.replace('/abs/', '/pdf/', 1)
            val = val.replace('lanl.', '')
            val = val.replace('fr.', '')
            val = val.replace('arXiv:', '')
            update(row, val)
            continue

        # http(s)://cds.cern.ch/record/123456
        m = REG_ORIGIN.match(row.publication_url)
        if m:
            host, store_id = m.groups()
            record = get_record(host, store_id)
            if record:
                val = record.paper_url()
                if val:
                    update(row, val)
                    continue

        # ??? (ignore empty publication_url)
        if row.publication_url:
            print(u" - %s, %s → %s" % (row.id, row.publication_url, "???"))

    # commit (only on an explicit yes, whatever its case)
    rep = raw_input("Commit change in the database? [y/N]:")
    if rep.strip().lower() == 'y':
        print("The database is modified.")
        db.commit()

    # close
    sys.exit(0)
......@@ -20,12 +20,12 @@ HEAD
- Remove obsolete controller toolbox. It is now replaced by standalone
scripts: export-to-csv, import-from-csv, fix-conference-dates,
fix-collaboration, fix-conference-url,fix-country, fix-defense,
fix-report-number, fix-submitted.
fix-publication-url, fix-report-number, fix-submitted.
- The list of countries is almost frozen: it uses the default list coming
from a geographical database (www.geonames.org).
Harvesters cannot add countries anymore.
- Re-enforce rule for fields: collaborations, defense, conference_dates
and submitted.
- Re-enforce rule for fields: collaborations, defense, conference_dates,
submitted and publication_url.
0.8.7.2 (Sep 2014)
- Migrate to plugin_dbui 0.6.1.7.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment