Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
1e863a07
Commit
1e863a07
authored
Sep 10, 2015
by
LE GAC Renaud
Browse files
break harvest_tools/__init__.py in small pieces.
parent
4020bbb7
Changes
13
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
2097 additions
and
1967 deletions
+2097
-1967
modules/harvest_tools/__init__.py
modules/harvest_tools/__init__.py
+17
-1967
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+312
-0
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+168
-0
modules/harvest_tools/factory.py
modules/harvest_tools/factory.py
+115
-0
modules/harvest_tools/msg.py
modules/harvest_tools/msg.py
+134
-0
modules/harvest_tools/msgcollection.py
modules/harvest_tools/msgcollection.py
+32
-0
modules/harvest_tools/notes.py
modules/harvest_tools/notes.py
+106
-0
modules/harvest_tools/preprints.py
modules/harvest_tools/preprints.py
+136
-0
modules/harvest_tools/proceedings.py
modules/harvest_tools/proceedings.py
+157
-0
modules/harvest_tools/publicationstool.py
modules/harvest_tools/publicationstool.py
+555
-0
modules/harvest_tools/reports.py
modules/harvest_tools/reports.py
+125
-0
modules/harvest_tools/talks.py
modules/harvest_tools/talks.py
+120
-0
modules/harvest_tools/thesis.py
modules/harvest_tools/thesis.py
+120
-0
No files found.
modules/harvest_tools/__init__.py
View file @
1e863a07
This diff is collapsed.
Click to expand it.
modules/harvest_tools/articles.py
0 → 100644
View file @
1e863a07
# -*- coding: utf-8 -*-
""" harvest_tools.articles
"""
import
traceback
from
base
import
family_name_fr
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
from
gluon
import
current
from
invenio_tools
import
CheckException
from
publicationstool
import
PublicationsTool
from
plugin_dbui
import
get_id
,
UNDEF_ID
# Log messages specific to the articles harvester.
# Both go through the translator with lazy=False.
MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False)

MSG_TRANSFORM_PREPRINT = current.T(
    "Transform the preprint into an article",
    lazy=False)
class Articles(PublicationsTool):
    """Publications tool for articles.

    A record is first looked up in the database (by origin, then by
    fields). An existing preprint is transformed into an article,
    otherwise a new article is inserted.

    """
    def __init__(self, *args, **kwargs):
        """Build the tool and cache the identifier of the preprint category.

        All arguments are forwarded to L{PublicationsTool.__init__}.

        """
        PublicationsTool.__init__(self, *args, **kwargs)

        # identifier of the preprint category (code "PRE")
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def _transform_preprint(self,
                            rec_id,
                            id_publisher=None,
                            my_authors=None,
                            pages=None,
                            publication_url=None,
                            title=None,
                            volume=None,
                            year=None):
        """Transform the preprint C{rec_id} into an article.

        The action is always logged. The database is only modified
        when the tool is not running in dry run mode.

        @type rec_id: int
        @param rec_id: identifier of the preprint in the publications table

        @keyword id_publisher:
        @keyword my_authors:
        @keyword pages:
        @keyword publication_url:
        @keyword title:
        @keyword volume:
        @keyword year:

        @rtype: tuple
        @return: the tuple C{(rec_id, 1)}

        """
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            # the status is reset since the publication changes category
            self.db.publications[rec_id] = dict(
                authors_institute=my_authors,
                id_categories=self.id_category,
                id_publishers=id_publisher,
                id_status=UNDEF_ID,
                pages=pages,
                publication_url=publication_url,
                title=title,
                volume=volume,
                year=year)

        return (rec_id, 1)

    def check_by_origin(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the origin field.

            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher:
        @keyword my_authors:
        @keyword oai_url:
        @keyword pages:
        @keyword publication_url:
        @keyword title:
        @keyword volume:
        @keyword year:

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing article by origin")

        db = self.db
        rec_id = get_id(db.publications, origin=oai_url)

        # nothing with that origin
        if not rec_id:
            return (None, 0)

        # the record exists and it is not a preprint: nothing to do
        if db.publications[rec_id].id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # the record exists as a preprint: transform it into an article
        return self._transform_preprint(rec_id,
                                        id_publisher=id_publisher,
                                        my_authors=my_authors,
                                        pages=pages,
                                        publication_url=publication_url,
                                        title=title,
                                        volume=volume,
                                        year=year)

    def check_by_fields(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        preprint_number=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the fields: id_projects,
        id_publishers, id_teams, pages, volume and year.

            - Fix the field origin when a match is found.
            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher:
        @keyword my_authors:
        @keyword oai_url:
        @keyword pages:
        @keyword publication_url:
        @keyword preprint_number:
        @keyword title:
        @keyword volume:
        @keyword year:

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing article by fields")

        db = self.db

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=self.id_project,
                        id_publishers=id_publisher,
                        id_teams=self.id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            # fix the origin field when it is not defined
            if not db.publications[rec_id].origin:
                if not self.dry_run:
                    db.publications[rec_id] = dict(origin=oai_url)

                self.logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category (code PRE)
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=self.id_project,
                        id_teams=self.id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        return self._transform_preprint(rec_id,
                                        id_publisher=id_publisher,
                                        my_authors=my_authors,
                                        pages=pages,
                                        publication_url=publication_url,
                                        title=title,
                                        volume=volume,
                                        year=year)

    def load_db(self, record):
        """Load an article in the database.
        The method assume that erratum are removed.

        @type record: L{Record}
        @param record:

        @rtype: int
        @return: one when the record is inserted / updated in the database
        zero otherwise.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # check the publisher
        id_publisher = self.check_publisher(editor)

        # check the collaboration
        id_collaboration = self.check_collaboration(record.collaboration())

        # check against already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        rec_id, status = self.check_by_origin(
            id_publisher=id_publisher,
            my_authors=record.my_authors,
            oai_url=oai_url,
            pages=pages,
            publication_url=publication_url,
            title=title,
            volume=volume,
            year=year)

        if rec_id:
            return status

        rec_id, status = self.check_by_fields(
            id_publisher=id_publisher,
            my_authors=record.my_authors,
            oai_url=oai_url,
            pages=pages,
            publication_url=publication_url,
            preprint_number=preprint_number,
            title=title,
            volume=volume,
            year=year)

        if rec_id:
            return status

        # eventually insert a new articles in the database
        # try to improve the rescue list for CPPM authors
        if not self.dry_run:
            # learn_my_authors is defined in the sibling module base but
            # is missing from the module-level imports of this file:
            # import it here to avoid a NameError on a real run.
            from base import learn_my_authors

            db.publications.insert(authors=record.authors(),
                                   authors_institute=record.my_authors,
                                   first_author=first_author,
                                   id_categories=self.id_category,
                                   id_collaborations=id_collaboration,
                                   id_projects=self.id_project,
                                   id_publishers=id_publisher,
                                   id_status=UNDEF_ID,
                                   id_teams=self.id_team,
                                   origin=oai_url,
                                   pages=pages,
                                   preprint=preprint_number,
                                   publication_url=publication_url,
                                   submitted=submitted,
                                   title=title,
                                   volume=volume,
                                   year=year)

            learn_my_authors(db,
                             authors=record.my_authors,
                             id_project=self.id_project,
                             id_team=self.id_team,
                             year=year)

        self.logs[-1].load(MSG_LOAD, year)
        return 1

    def select_record(self, record):
        """C{True} when the C{record} is published.

        The record is rejected, and the reason logged, when it is not
        published, when one of the checks raises a L{CheckException}
        or when a check crashes.

        @type record: L{Record}
        @param record:

        @rtype: bool

        """
        if not PublicationsTool.select_record(self, record):
            return False

        if self.dbg:
            print("select article record")

        try:
            self.check.clean_erratum(record)

            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record.year())
                return False

            self.check.my_authors(record,
                                  reference=self._my_author_list(record),
                                  cmpFct=family_name_fr)

            self.check.oai(record)
            self.check.submitted(record)
            self.check.year(record)
            self.check.paper_reference(record)
            self.check.format_editor(record)

        except CheckException as e:
            self.logs[-1].reject(e, record.year())
            return False

        # Exception rather than BaseException: a KeyboardInterrupt or a
        # SystemExit has to stop the harvester, not to reject the record.
        except Exception as e:
            self.logs[-1].reject(MSG_CRASH % e, record.year())
            print(traceback.format_exc())
            return False

        return True
modules/harvest_tools/base.py
0 → 100644
View file @
1e863a07
# -*- coding: utf-8 -*-
""" harvest_tools.base
"""
import
re
from
gluon
import
current
# Messages shared by the harvester tools.
# MSG_CRASH is a template: it is filled with the "%" operator.
MSG_CRASH = "Crash: %s"

DRY_RUN = current.T("dry run")

MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)

MSG_IN_DB = current.T("Already in the database", lazy=False)

MSG_LOAD = current.T("Load in the database", lazy=False)
def family_name_fr(full_name):
    """Extract the family name when the full name is encoded as C{J. Doe}.

    The result is everything located after the first space of
    C{full_name}. The full name is returned unchanged when it does
    not contain any space.

    @type full_name: unicode
    @param full_name: the name of the author, I{e.g.} C{J. Doe}

    @rtype: unicode
    @return: the family name, I{e.g.} C{Doe}

    """
    # split once, on the first space, and keep the last piece
    return full_name.split(' ', 1)[-1]
def fix_amu(record):
    """Fix the name of the C{Aix Marseille University}

    Each university of the thesis matching the regular expression
    C{current.app.reg_institute} is replaced by a fixed name which
    depends on the year found in the defense date (before 2012 or not).

    @type record: L{Record}
    @param record: has to provide the methods C{these_universities}
    and C{these_defense}

    @rtype: unicode
    @return: the university names separated by comma.

    """
    names = record.these_universities()

    for position, university in enumerate(names):

        # leave untouched the universities which are not my institute
        if not re.search(current.app.reg_institute, university):
            continue

        # the defense date has to contain a four digits year
        found = re.search(r"(\d\d\d\d)", record.these_defense())
        defense_year = int(found.group(1))

        if defense_year < 2012:
            names[position] = \
                u"Université de la Méditerrannée Aix-Marseille II"
        else:
            names[position] = u"Aix Marseille Université"

    return ', '.join(names)
def format_author_fr(name):
    """Format the author name according to French typographic rules,
    I{i.e.} C{J.-P. Doe}.

    The input is expected to be encoded as C{Family, First}. The
    recognised forms are::

        Family, L
        Family, P L
        Family, M -H
        Family Name, J
        Family-Name, J
        Family, F Name
        Family, First
        Family, J-P (any number of hyphenated given names)

    The name stays unchanged, apart from the capitalisation, when the
    formatting failed. In all cases the result is passed through
    C{title()} to avoid names in upper case (R. LE FOO --> R. Le Foo).

    @type name: unicode
    @param name: the author name; C{None} and the empty string are
    returned as they are.

    @rtype: unicode

    """
    # protection
    if name is None or name == '':
        return name

    # To avoid to deal with unicode character
    # look for non empty string \S
    match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)

    if match is None:
        # unknown encoding, keep the name as it is
        result = name

    else:
        family, first, separator, second = match.groups()

        if separator and second:
            # two given names separated by a space or by an hyphen
            result = '%s.%s%s. %s' % (first[0],
                                      separator[0],
                                      second[0],
                                      family)

        elif "-" in first:
            # hyphenated given names: J-P, A-B-C, ...
            # empty pieces (dangling hyphen) are ignored instead of
            # raising an IndexError, and any number of pieces is handled
            # instead of raising a TypeError for more than two.
            initials = [piece[0] for piece in first.split("-") if piece]
            if initials:
                result = "%s. %s" % (".-".join(initials), family)
            else:
                result = name

        else:
            result = '%s. %s' % (first[0], family)

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()
def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute,
    stored in the database, using the list C{authors} provided in argument.

    A new entry is created in the table C{my_authors} when none exists
    for the given project, team and year. Otherwise the authors which
    are not yet in the stored list are added to it and the list is
    sorted using L{family_name_fr}.

    @note: all keyword arguments have to be defined.

    @type db: gluon.dal.DAL
    @param db:

    @type authors: unicode
    @param authors: authors names separated by C{', '}

    @type id_project: int
    @param id_project: project identifier

    @type id_team: int
    @param id_team: team identifier

    @type year: int
    @param year:

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = row.authors.split(', ')

    # compare with the input list
    # and extract authors which are not in the db
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    # update the database
    # NOTE1: be careful with the string encoding
    # NOTE2: handle the case J. Foo and J. M. Foo are the same person
    elems = []
    for elem in diff:
        if isinstance(elem, unicode):
            elem = elem.encode('utf8')

        # extract the family name, i.e. what follows the last ". ".
        # rpartition returns the whole name when the separator is
        # missing, whereas "rfind + 2" dropped its first character.
        family_name = elem.rpartition('. ')[2]

        if family_name not in row.authors:
            elems.append(elem)

    database_authors.extend(elems)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))
class ToolException(Exception):
    """Exception class of the harvest_tools package.

    It adds nothing to the standard C{Exception}.

    """
modules/harvest_tools/factory.py
0 → 100644
View file @
1e863a07
# -*- coding: utf-8 -*-
""" harvest_tools.factory
"""
from
articles
import
Articles
from
notes
import
Notes
from
preprints
import
Preprints
from
proceedings
import
Proceedings
from
reports
import
Reports
from
talks
import
Talks
from
thesis
import
Thesis
def build_harvester_tool(db,
                         id_team,
                         id_project,
                         controller,
                         id_category,
                         year_start=None,
                         year_end=None,
                         dry_run=True,
                         debug=False):
    """Harvester tool factory.

    Instantiate the harvester tool associated to the C{controller}.
    All the arguments are passed, in the same order, to the
    constructor of the tool.

    @type db: gluon.dal.DAL
    @param db:

    @type id_team: int
    @param id_team: Identifier of the team in the db

    @type id_project: int
    @param id_project: Identifier of the project in the db

    @type controller: unicode
    @param controller: Type of publication (i.e. 'articles',
    'proceedings', ...)

    @type id_category: int
    @param id_category: Identifier of the category of publication
    (i.e. ACL, ACTI, ...)

    @type year_start: int
    @keyword year_start: Start year of search (i.e. '2014')

    @type year_end: int
    @keyword year_end: End year of search (i.e. '2015')

    @type dry_run: boolean
    @keyword dry_run: True if no record is to be written to the db

    @type debug: bool
    @keyword debug: activate the debug mode

    @return: the harvester tool or C{None} when no tool exists for
    the specified controller.

    """
    factory = get_harvester_tool(controller)

    if factory is not None:
        return factory(db,
                       id_team,
                       id_project,
                       controller,
                       id_category,
                       year_start,
                       year_end,
                       dry_run,
                       debug)

    return None
def get_harvester_tool(controller):
    """Get the class of the harvester tool associated to the controller.

    @note: valid names for the controller are:
        - articles
        - notes
        - preprints
        - proceedings
        - reports
        - talks
        - theses

    @type controller: unicode
    @param controller: name of the controller

    @rtype: class reference or None
    @return: None when the controller corresponds to nothing.

    """
    # controller name --> harvester tool class
    known_tools = (('articles', Articles),
                   ('notes', Notes),
                   ('preprints', Preprints),
                   ('proceedings', Proceedings),
                   ('reports', Reports),
                   ('talks', Talks),
                   ('theses', Thesis))

    for name, tool_class in known_tools:
        if controller == name:
            return tool_class

    return None
modules/harvest_tools/msg.py
0 → 100644
View file @
1e863a07
# -*- coding: utf-8 -*-
""" harvest_tools.msg
"""
import
json
from
gluon.storage
import
Storage
from
invenio_tools
import
OAI_URL
class
Msg
(
Storage
):
"""Message and action taken for a publication.
- The publication is found by an harvester tool, in a store.
- The action refers to the database.
Fours action are defined:
- C{idle}
- C{load}