Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
44907556
Commit
44907556
authored
Sep 23, 2015
by
LE GAC Renaud
Browse files
Improve the logic of the automaton and modify the article class accordingly.
parent
8982c977
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
271 additions
and
251 deletions
+271
-251
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+49
-45
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+5
-77
modules/invenio_tools/checkandfix.py
modules/invenio_tools/checkandfix.py
+211
-124
modules/invenio_tools/record.py
modules/invenio_tools/record.py
+1
-1
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+5
-4
No files found.
modules/harvest_tools/articles.py
View file @
44907556
...
...
@@ -6,7 +6,12 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
from
base
import
(
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
)
from
invenio_tools
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
...
...
@@ -50,17 +55,15 @@ class Articles(Automaton):
self
.
logs
[
-
1
].
reject
(
MSG_NO_EDITOR
,
record
.
year
())
return
False
self
.
check
.
my_authors
(
record
,
reference
=
self
.
_my_author_list
(
record
),
cmpFct
=
family_name_fr
)
self
.
check
.
oai
(
record
)
self
.
check
.
paper_reference
(
record
)
self
.
check
.
submitted
(
record
)
self
.
check
.
year
(
record
)
self
.
check
.
paper_reference
(
record
)
self
.
check
.
format_editor
(
record
)
self
.
check
.
format_authors
(
record
,
format_author_fr
)
self
.
check
.
get_my_authors
(
record
,
family_name_fr
)
self
.
check
.
fix_oai
(
record
)
except
CheckException
as
e
:
self
.
logs
[
-
1
].
reject
(
e
,
record
.
year
())
...
...
@@ -73,15 +76,15 @@ class Articles(Automaton):
return
True
def
get_by_origin
(
self
,
id_publisher
=
None
,
my_authors
=
None
,
oai_url
=
None
,
pages
=
None
,
publication_url
=
None
,
title
=
None
,
volume
=
None
,
year
=
None
):
def
get_
record_
by_origin
(
self
,
id_publisher
=
None
,
my_authors
=
None
,
oai_url
=
None
,
pages
=
None
,
publication_url
=
None
,
title
=
None
,
volume
=
None
,
year
=
None
):
"""Get an existing record using the origin field.
- Transform a preprint into article.
...
...
@@ -130,16 +133,16 @@ class Articles(Automaton):
return
(
rec_id
,
1
)
def
get_by_fields
(
self
,
id_publisher
=
None
,
my_authors
=
None
,
oai_url
=
None
,
pages
=
None
,
publication_url
=
None
,
preprint_number
=
None
,
title
=
None
,
volume
=
None
,
year
=
None
):
def
get_
record_
by_fields
(
self
,
id_publisher
=
None
,
my_authors
=
None
,
oai_url
=
None
,
pages
=
None
,
publication_url
=
None
,
preprint_number
=
None
,
title
=
None
,
volume
=
None
,
year
=
None
):
"""Get a record matching the fields: id_projects,
id_publishers, id_teams, pages, volume and year.
...
...
@@ -234,6 +237,7 @@ class Articles(Automaton):
# alias
editor
=
record
.
paper_editor
()
first_author
=
record
.
first_author
()
my_authors
=
record
.
my_authors
oai_url
=
record
.
oai_url
()
pages
=
record
.
paper_pages
()
preprint_number
=
record
.
preprint_number
()
...
...
@@ -254,26 +258,26 @@ class Articles(Automaton):
# The latter is useful to cover the case where the record
# is entered by hand or by another harvester.
#
rec_id
,
status
=
self
.
get_by_origin
(
id_publisher
=
id_publisher
,
my_authors
=
record
.
my_authors
,
oai_url
=
oai_url
,
pages
=
pages
,
publication_url
=
publication_url
,
title
=
title
,
volume
=
volume
,
year
=
year
)
rec_id
,
status
=
self
.
get_
record_
by_origin
(
id_publisher
=
id_publisher
,
my_authors
=
my_authors
,
oai_url
=
oai_url
,
pages
=
pages
,
publication_url
=
publication_url
,
title
=
title
,
volume
=
volume
,
year
=
year
)
if
rec_id
:
return
status
rec_id
,
status
=
self
.
get_by_fields
(
id_publisher
=
id_publisher
,
my_authors
=
record
.
my_authors
,
oai_url
=
oai_url
,
pages
=
pages
,
publication_url
=
publication_url
,
preprint_number
=
preprint_number
,
title
=
title
,
volume
=
volume
,
year
=
year
)
rec_id
,
status
=
self
.
get_
record_
by_fields
(
id_publisher
=
id_publisher
,
my_authors
=
my_authors
,
oai_url
=
oai_url
,
pages
=
pages
,
publication_url
=
publication_url
,
preprint_number
=
preprint_number
,
title
=
title
,
volume
=
volume
,
year
=
year
)
if
rec_id
:
return
status
...
...
@@ -282,7 +286,7 @@ class Articles(Automaton):
if
not
self
.
dry_run
:
db
.
publications
.
insert
(
authors
=
record
.
authors
(),
authors_institute
=
record
.
my_authors
,
authors_institute
=
my_authors
,
first_author
=
first_author
,
id_categories
=
self
.
id_category
,
id_collaborations
=
id_collaboration
,
...
...
modules/harvest_tools/automaton.py
View file @
44907556
...
...
@@ -6,14 +6,13 @@ import re
import
traceback
from
base
import
format_author_fr
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
ToolException
from
base
import
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
ToolException
from
gluon.storage
import
Storage
from
invenio_tools
import
(
CheckAndFix
,
InvenioStore
,
Marc12
,
OAI_URL
,
REG_OAI
,
REG_YEAR
)
REG_OAI
)
from
msg
import
Msg
from
msgcollection
import
MsgCollection
from
plugin_dbui
import
get_create_id
,
get_id
,
UNDEF_ID
...
...
@@ -115,10 +114,6 @@ class Automaton(object):
controller
=
self
.
controller
,
id_categories
=
self
.
id_category
)
# private cache for my_author rescue list
self
.
__par
=
None
self
.
__reference
=
None
def
_is_record_in_db
(
self
,
rec_id
,
title
):
"""Return C{True} if the record is already in the database.
The search is based on the origin field.
...
...
@@ -221,74 +216,7 @@ class Automaton(object):
so
=
'd'
)
# descending order
return
dic
def
_my_author_list
(
self
,
record
):
"""Extract the rescue list for my authors in the database.
@type record: L{Record}
@param record:
@rtype: list
@return: empty when not defined
"""
year
=
record
.
year
()
# try to recover year when not defined
if
not
year
:
# published article, proceeding
if
"773"
in
record
and
"y"
in
record
[
"773"
]:
year
=
record
[
"773"
][
"y"
]
# start date of a conference
elif
"111"
in
record
and
"x"
in
record
[
"111"
]:
year
=
record
[
"111"
][
"x"
]
# end date of a conference
elif
"111"
in
record
and
"z"
in
record
[
"111"
]:
year
=
record
[
"111"
][
"z"
]
# submitted date
elif
"269"
in
record
and
"c"
in
record
[
"269"
]:
year
=
record
[
"269"
][
"c"
]
else
:
return
[]
#
# NOTE:
# keep in mind that the CheckAndfix mechanism is not yet run
# therefore year can be a list due to erratum, ...
#
if
isinstance
(
year
,
list
):
year
.
sort
()
year
=
year
[
0
]
# the value can have several format 1992, 1992-12-31, ....
m
=
REG_YEAR
.
search
(
year
)
if
m
:
year
=
m
.
group
(
1
)
else
:
return
[]
# caching
t
=
(
year
,
self
.
id_project
,
self
.
id_team
)
if
t
==
self
.
__par
:
return
self
.
__reference
# extract the list from the database
row
=
self
.
db
.
my_authors
(
year
=
year
,
id_projects
=
self
.
id_project
,
id_teams
=
self
.
id_team
)
if
row
:
self
.
__reference
=
row
[
'authors'
].
split
(
', '
)
else
:
self
.
__reference
=
[]
return
self
.
__reference
def
check_record
(
self
,
record
):
def
check_record
(
self
,
record
,
cmpFct
=
None
):
"""Check the content of the record in order to fix non conformities.
Return False when a non conformities has been found and can not be
corrected.
...
...
@@ -314,7 +242,7 @@ class Automaton(object):
try
:
self
.
check
.
temporary_record
(
record
)
self
.
check
.
authors
(
record
)
self
.
check
.
format_authors
(
record
,
format_author_fr
)
self
.
check
.
my_affiliation
(
record
,
self
.
id_project
,
self
.
id_team
)
self
.
check
.
collaboration
(
record
)
except
Exception
as
e
:
...
...
@@ -323,7 +251,7 @@ class Automaton(object):
return
True
def
get_by_fields
(
self
,
**
kwargs
):
def
get_
record_
by_fields
(
self
,
**
kwargs
):
"""Get database record matching fields defined in the keyword arguments.
- Fix the field origin when a match is found.
...
...
modules/invenio_tools/checkandfix.py
View file @
44907556
...
...
@@ -100,16 +100,119 @@ def load_record(host, record_id):
class
CheckAndFix
(
object
):
"""Tool to check and repair the content of the Marc12 record:
"""A collection of tools to check and repair the content
of the Marc12 record.
- Check the validity of the record.
- Conference information are added for talk and proceeding.
- Fix as far as possible inconsistencies and non-conformity.
"""
def
__init__
(
self
):
They can be call separately or in one go.
Most of the method raise the CheckException when something went wrong.
self
.
db
=
current
.
globalenv
[
'db'
]
self
.
reg_institute
=
self
.
_get_reg_institute
()
# private cache for my_author rescue list
self
.
__par
=
None
self
.
__reference
=
None
# private cache for my authors list
self
.
__my_authors
=
{}
def
_get_reg_institute
(
self
):
"""
@rtype: unicode
@return: the regular expression defining my institute
"""
# alias
app
=
current
.
app
reg_institute
=
app
.
reg_institute
# regular expression for the institute is not defined
# find it using the institute definition in inspirehep
# store the regular expression in current.app for a later use
if
not
reg_institute
:
institute_id
=
app
.
inspirehep_institute_id
institute
=
load_record
(
"inspirehep.net"
,
institute_id
)
reg_institute
=
institute
.
rex
()
app
.
institute
=
institute
app
.
reg_institute
=
reg_institute
return
reg_institute
def
_get_author_rescue_list
(
self
,
record
,
id_project
,
id_team
):
"""Get the rescue list for my authors.
@type record: L{Record}
@param record:
@type id_project: int
@param id_project: Identifier of the project in the database
@type id_team: int
@param id_team: Identifier of the team in the database
@rtype: list
@return: empty when not defined
"""
year
=
record
.
year
()
# try to recover year when not defined
if
not
year
:
# published article, proceeding
if
"773"
in
record
and
"y"
in
record
[
"773"
]:
year
=
record
[
"773"
][
"y"
]
# start date of a conference
elif
"111"
in
record
and
"x"
in
record
[
"111"
]:
year
=
record
[
"111"
][
"x"
]
# end date of a conference
elif
"111"
in
record
and
"z"
in
record
[
"111"
]:
year
=
record
[
"111"
][
"z"
]
# submitted date
elif
"269"
in
record
and
"c"
in
record
[
"269"
]:
year
=
record
[
"269"
][
"c"
]
else
:
return
[]
#
# NOTE
# keep in mind that the CheckAndfix mechanism is not yet run
# therefore year can be a list due to erratum, ...
#
if
isinstance
(
year
,
list
):
year
.
sort
()
year
=
year
[
0
]
# the value can have several format 1992, 1992-12-31, ....
m
=
REG_YEAR
.
search
(
year
)
if
m
:
year
=
m
.
group
(
1
)
else
:
return
[]
# caching
t
=
(
year
,
self
.
id_project
,
self
.
id_team
)
if
t
==
self
.
__par
:
return
self
.
__reference
# extract the list from the database
row
=
self
.
db
.
my_authors
(
year
=
year
,
id_projects
=
self
.
id_project
,
id_teams
=
self
.
id_team
)
if
row
:
self
.
__reference
=
row
[
'authors'
].
split
(
', '
)
else
:
self
.
__reference
=
[]
return
self
.
__reference
"""
def
_recover_submitted
(
self
,
record
):
"""Recover submitted date using conference, preprint or thesis
information.
...
...
@@ -202,7 +305,7 @@ class CheckAndFix(object):
raise
CheckException
(
MSG_NO_REF
)
def
authors
(
self
,
record
):
"""Check th
e consistency between
author fields.
"""Check th
at
author fields
are defined
.
@type record: L{Record}
@param record:
...
...
@@ -266,7 +369,7 @@ class CheckAndFix(object):
return
# check country information (all valid countries have been enter once)
db
=
current
.
globalenv
[
'db'
]
db
=
self
.
db
id
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
if
not
id
:
raise
CheckException
(
MSG_NO_COUNTRY
)
...
...
@@ -292,6 +395,46 @@ class CheckAndFix(object):
else
:
raise
CheckException
(
MSG_WELL_FORMED_CONF_DATES
)
def
fix_oai
(
self
,
record
):
"""The id in the OAI field might be different from the record id.
In INVENIO there is a mechanism to redirect to the correct one
The fix depend on the content of the database
@type record: L{Record}
@param record:
"""
value
=
record
.
oai
()
match
=
REG_OAI
.
match
(
value
)
myid
=
record
.
id
()
# The id in the OAI field might be different from the record id.
# In INVENIO there is a mechanism to redirect to the correct one
#
# The fix depend on the content of the database
if
match
.
group
(
2
)
!=
myid
:
db
=
self
.
db
# The record OAI is already used in the database. Do nothing
oai_url
=
OAI_URL
%
(
match
.
group
(
1
),
match
.
group
(
2
))
if
get_id
(
db
.
publications
,
origin
=
oai_url
):
return
# The OAI based on the record id is already used in the database.
# Modify the record OAI
oai_url
=
OAI_URL
%
(
match
.
group
(
1
),
myid
)
if
get_id
(
db
.
publications
,
origin
=
oai_url
):
# the location of the oai information depends on the store
# cds: (248, a), inspirehep: (909C0, o)
if
u
"0248"
in
record
:
field
,
subfield
=
u
"0248"
,
"a"
elif
u
"909CO"
in
record
:
field
,
subfield
=
u
"909CO"
,
"o"
record
[
field
][
subfield
]
=
OAI_INVENIO
%
(
match
.
group
(
1
),
myid
)
def
format_authors
(
self
,
record
,
func
):
"""Format the author names using the function func.
...
...
@@ -450,6 +593,40 @@ class CheckAndFix(object):
value
=
value
.
replace
(
'U.'
,
university
)
record
[
u
'502'
][
'b'
][
i
]
=
value
def
get_my_authors
(
self
,
record
,
cmpFct
=
None
):
"""Get authors of my institutes signing the record.
The information is append to the L{Record} via the attribute C{my_authors}.
@type record: L{Record}
@param record:
@type cmpFct: reference to a function
@param cmpFct: Extract the family name from the full name.
It is used to sort my author list according to the author family name.
@rtype: unicode
@return: the list of authors separated by comma
@raise CheckException: when the list is empty
"""
# might have been computed when affiliation is checked
rec_id
=
record
.
id
()
if
rec_id
in
self
.
__my_authors
:
li
=
self
.
__my_authors
[
rec_id
]
li
.
sort
(
key
=
cmpFct
)
value
=
u
', '
.
join
(
li
)
# find authors of my institute signing the record
else
:
reg_institute
=
self
.
reg_institute
value
=
record
.
find_authors_by_affiliation
(
reg_institute
,
cmpFct
)
if
not
value
:
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
record
.
my_author
=
value
def
is_conference
(
self
,
record
):
"""Check that the record described a conference talk / proceeding.
...
...
@@ -474,92 +651,44 @@ class CheckAndFix(object):
if
not
isinstance
(
record
,
RecordThesis
):
raise
CheckException
(
MSG_NO_THESIS
)
def
my_authors
(
self
,
record
,
reference
=
[],
cmpFct
=
None
):
"""Check that authors of my institutes signed the record.
Fill the meta data record.my_authors.
def
my_affiliation
(
self
,
record
,
id_project
,
id_team
):
"""Check that authors of my institute are signatories.
Launch a recovery procedure when affiliations are not defined.
It is based on the author rescue list stored in the database.
@type record: L{Record}
@param record:
@type
reference: lis
t
@param
reference: list of author names belong
in
g
t
o my institut
e
@type
id_project: in
t
@param
id_project: Identifier of the project
in t
he databas
e
@type cmpFct: reference to a function
@param cmpFct: Extract the family name from the full name.
It is used to sort my author list according to the author family name.
@type id_team: int
@param id_team: Identifier of the team in the database
@raise CheckException:
"""
# alias
app
=
current
.
app
reg_institute
=
app
.
reg_institute
# regular expression for the institute is not defined
# find it using the institute definition in inspirehep
# store the regular expression in current.app for a later use
if
not
reg_institute
:
institute_id
=
app
.
inspirehep_institute_id
institute
=
load_record
(
"inspirehep.net"
,
institute_id
)
reg_institute
=
institute
.
rex
()
app
.
institute
=
institute
app
.
reg_institute
=
reg_institute
# find authors of my institute signing the record
s
=
record
.
find_authors_by_affiliation
(
reg_institute
,
cmpFct
)
# nothing found try with the rescue list
if
not
s
and
reference
:
s1
=
set
(
record
.
authors_as_list
())
s2
=
set
(
reference
)
li
=
list
(
s1
.
intersection
(
s2
))
li
.
sort
(
key
=
cmpFct
)
value
=
record
.
find_affiliation
(
self
.
reg_institute
)
if
value
:
return
value
s
=
u
', '
.
join
(
li
)
# affiliation is not defined
# try to recover using the authors rescue list
rescue_list
=
self
.
_get_author_rescue_list
(
record
,
id_project
,
id_team
)
if
not
rescue_list
:
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
if
s
:
record
.
my_authors
=
s
return
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
def
oai
(
self
,
record
):
"""Check that the OAI field is defined and well formed.
In some tricky case the OAI can evolve with time. Therefore, the record
has several values. In that case, the method selects the one matching the
OAI use in the database.
# compute the intersection between the authors and the rescue list
set_1
=
set
(
record
.
authors_as_list
())
set_2
=
set
(
rescue_list
)
@type record: L{Record}
@param record:
li
=
list
(
set_1
.
intersection
(
set_2
))
if
not
li
:
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
@raise CheckException:
"""
value
=
record
.
oai
()
match
=
REG_OAI
.
match
(
value
)
myid
=
record
.
id
()
# The id in the OAI field might be different from the record id.
# In INVENIO there is a mechanism to redirect to the correct one
#
# The fix depend on the content of the database
if
match
.
group
(
2
)
!=
myid
:
db
=
current
.
globalenv
[
'db'
]
# The record OAI is already used in the database. Do nothing
oai_url
=
OAI_URL
%
(
match
.
group
(
1
),
match
.
group
(
2
))
if
get_id
(
db
.
publications
,
origin
=
oai_url
):
return
# The OAI based on the record id is already used in the database.
# Modify the record OAI
oai_url
=
OAI_URL
%
(
match
.
group
(
1
),
myid
)
if
get_id
(
db
.
publications
,
origin
=
oai_url
):
record
[
field
][
subfield
]
=
OAI_INVENIO
%
(
match
.
group
(
1
),
myid
)
# cache the result for a latter use
self
.
__my_authors
[
record
.
id
()]
=
li
def
paper_reference
(
self
,
record
):
"""Check that editor, page, volume and paper year are defined
...
...
@@ -735,45 +864,3 @@ class CheckAndFix(object):