Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
945dcead
Commit
945dcead
authored
Jun 30, 2017
by
LE GAC Renaud
Browse files
Modify the logic to deal with synonyms.
parent
28b2ec83
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
111 additions
and
85 deletions
+111
-85
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+4
-2
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+0
-61
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+2
-1
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+72
-12
modules/harvest_tools/preprints.py
modules/harvest_tools/preprints.py
+5
-2
modules/harvest_tools/proceedings.py
modules/harvest_tools/proceedings.py
+8
-4
modules/harvest_tools/reports.py
modules/harvest_tools/reports.py
+2
-1
modules/harvest_tools/talks.py
modules/harvest_tools/talks.py
+12
-2
modules/test_tools.py
modules/test_tools.py
+6
-0
No files found.
modules/harvest_tools/articles.py
View file @
945dcead
...
...
@@ -318,8 +318,10 @@ class Articles(Automaton):
year
=
record
.
paper_year
()
# get the collaboration / publisher identifiers
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_publisher
=
self
.
search_publisher
(
editor
)
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
id_publisher
=
get_id
(
db
.
publishers
,
abbreviation
=
editor
)
# get already published articles or preprint
# A preprint is transformed into an article.
...
...
modules/harvest_tools/automaton.py
View file @
945dcead
...
...
@@ -672,64 +672,3 @@ class Automaton(object):
return
dict
(
collection_logs
=
self
.
collection_logs
,
controller
=
self
.
controller
,
logs
=
self
.
logs
)
def
search_collaboration
(
self
,
value
):
"""Get the database collaboration identifier using synonyms.
Args:
value (unicode):
the name of the collaboration.
Returns:
int:
* the id of the collaboration record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when the
collaboration is not defined.
"""
return
search_synonym
(
self
.
db
.
collaborations
,
"collaboration"
,
value
)
def
search_country
(
self
,
value
):
"""Get the database country identifier using synonyms.
Args:
value (unicode):
the name of the country.
Returns:
int:
* the id of the country record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the country is not defined.
"""
return
search_synonym
(
self
.
db
.
countries
,
"country"
,
value
)
def
search_publisher
(
self
,
value
):
"""Get the database publisher identifier using synonyms.
Args:
value (unicode):
the abbreviation of the publisher.
Returns:
int:
* the id of the publisher record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the publisher is not defined.
"""
return
search_synonym
(
self
.
db
.
publishers
,
"abbreviation"
,
value
)
modules/harvest_tools/base.py
View file @
945dcead
...
...
@@ -127,7 +127,8 @@ def search_synonym(table, fieldname, value, create=False):
Raises:
ToolException:
more than one synonym is found.
* no synonym found and not allowed to create a new one.
* more than one synonym is found.
"""
if
not
value
:
...
...
modules/harvest_tools/checkandfix.py
View file @
945dcead
...
...
@@ -20,7 +20,7 @@ from invenio_tools import (MSG_NO_CONF,
from
invenio_tools.recordpubli
import
PAPER_REFERENCE_KEYS
from
itertools
import
imap
from
plugin_dbui
import
CLEAN_SPACES
,
get_id
from
plugin_dbui
import
CLEAN_SPACES
,
get_id
,
UNDEF_ID
DECODE_ARXIV
=
re
.
compile
(
r
"arXiv:(\d{2})(\d{2})\."
)
...
...
@@ -53,6 +53,9 @@ MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF
=
"Reject incomplete paper reference. Check "
MSG_TEMPORARY_RECORD
=
"Temporary record"
MSG_UNKNOWN_COLLABORATION
=
"Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY
=
"Reject country is unknown."
MSG_UNKNOWN_PUBLISHER
=
"Reject publisher is unknown."
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
...
...
@@ -342,8 +345,7 @@ class CheckAndFix(object):
raise
CheckException
(
MSG_NO_AUTHOR
)
def
collaboration
(
self
,
record
):
"""Check the collaboration.
Have a look at the synonyms when the collaboration is not well formed.
"""Check synonyms for collaboration, replacing them with the proper value.
Args:
record (RecordPubli):
...
...
@@ -351,8 +353,9 @@ class CheckAndFix(object):
Raises:
CheckException:
when the collaboration value is neither defined
nor entered as a synonym.
* the collaboration is unknown
(neither collaboration nor synonym)
* more than one synonym found.
"""
if
self
.
dbg
:
...
...
@@ -363,12 +366,41 @@ class CheckAndFix(object):
return
try
:
search_synonym
(
self
.
db
.
collaborations
,
"collaboration"
,
val
)
db
=
self
.
db
dbid
=
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_COLLABORATION
)
collaboration
=
db
.
collaborations
[
dbid
].
collaboration
if
collaboration
!=
val
:
# one collaboration
if
isinstance
(
record
[
u
"corporate_name"
],
dict
):
record
[
u
"corporate_name"
][
u
"collaboration"
]
=
collaboration
# several collaboration
# replace the list of dictionary by a single one
else
:
record
[
u
"corporate_name"
]
=
\
{
u
"collaboration"
:
collaboration
}
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
def
country
(
self
,
record
):
"""Check synonyms for conference country, replacing them with the proper value.
Args:
record (RecordPubli):
record describing a publication.
Raises:
CheckException:
* the country is unknown (neither country nor synonym)
* more than one synonym found.
"""
"""Check conference country.
Have a look at the synonyms when the country does not exist.
...
...
@@ -390,7 +422,28 @@ class CheckAndFix(object):
val
=
record
.
conference_country
()
try
:
search_synonym
(
self
.
db
.
countries
,
"country"
,
val
)
db
=
self
.
db
dbid
=
search_synonym
(
db
.
countries
,
"country"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_COUNTRY
)
country
=
db
.
countries
[
dbid
].
country
if
country
!=
val
:
obj
=
record
[
u
"meeting_name"
]
if
isinstance
(
obj
,
dict
):
location
=
obj
[
u
"location"
].
replace
(
val
,
country
)
record
[
u
"meeting_name"
][
u
"location"
]
=
location
else
:
for
di
in
obj
:
if
u
"location"
in
di
:
di
[
u
"location"
]
=
\
di
[
u
"location"
].
replace
(
val
,
country
)
record
[
u
"meeting_name"
]
=
obj
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
...
...
@@ -822,8 +875,7 @@ class CheckAndFix(object):
raise
ToolException
(
MSG_NO_REF
+
"[year]"
)
def
publisher
(
self
,
record
):
"""Check publisher.
Have a look at the synonyms when the publisher does not exist.
"""Check synonyms for publisher, replacing them with the abbreviation value.
Args:
record (RecordPubli):
...
...
@@ -831,7 +883,8 @@ class CheckAndFix(object):
Raises:
CheckException:
the publisher is not defined nor entered as a synonym.
* the publisher is unknown (neither abbreviation nor synonym)
* more than one synonym found.
"""
if
self
.
dbg
:
...
...
@@ -841,11 +894,18 @@ class CheckAndFix(object):
if
len
(
val
)
==
0
:
return
# convert ToolException to CheckException
try
:
db
=
self
.
db
search_synonym
(
db
.
publishers
,
"abbreviation"
,
val
)
dbid
=
search_synonym
(
db
.
publishers
,
"abbreviation"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_PUBLISHER
)
abbreviation
=
db
.
publishers
[
dbid
].
abbreviation
if
abbreviation
!=
val
:
record
[
u
"publication_info"
].
loc
[
0
,
"title"
]
=
abbreviation
# convert ToolException to CheckException
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
...
...
modules/harvest_tools/preprints.py
View file @
945dcead
...
...
@@ -8,7 +8,7 @@ from .automaton import Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
invenio_tools
import
RecordConf
,
RecordThesis
from
plugin_dbui
import
UNDEF_ID
from
plugin_dbui
import
get_id
,
UNDEF_ID
MSG_PREPRINT_IS_PAPER
=
"Reject preprint is a published paper"
...
...
@@ -86,6 +86,8 @@ class Preprints(Automaton):
zero otherwise.
"""
db
=
self
.
db
# alias
first_author
=
record
.
first_author
()
oai_url
=
record
.
oai_url
()
...
...
@@ -95,7 +97,8 @@ class Preprints(Automaton):
year
=
submitted
[
0
:
4
]
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get existing preprint or article
fields
=
dict
(
first_author
=
first_author
,
...
...
modules/harvest_tools/proceedings.py
View file @
945dcead
...
...
@@ -7,7 +7,7 @@ import traceback
from
.automaton
import
Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
from
plugin_dbui
import
get_id
,
UNDEF_ID
class
Proceedings
(
Automaton
):
...
...
@@ -71,6 +71,8 @@ class Proceedings(Automaton):
zero otherwise.
"""
db
=
self
.
db
# alias
oai_url
=
record
.
oai_url
()
year
=
record
.
paper_year
()
...
...
@@ -94,11 +96,13 @@ class Proceedings(Automaton):
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
first_author
=
record
.
first_author
()
id_country
=
self
.
search_
country
(
record
.
conference_country
())
id_country
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
# get the collaboration/publisher identifiers
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_publisher
=
self
.
search_publisher
(
editor
)
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
id_publisher
=
get_id
(
db
.
publishers
,
abbreviation
=
editor
)
# get an already published proceeding
fields
=
dict
(
authors
=
authors
,
...
...
modules/harvest_tools/reports.py
View file @
945dcead
...
...
@@ -90,7 +90,8 @@ class Reports(Automaton):
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get an already published report
fields
=
dict
(
id_categories
=
self
.
id_category
,
...
...
modules/harvest_tools/talks.py
View file @
945dcead
...
...
@@ -4,10 +4,17 @@
import
traceback
<<<<<<<
HEAD
from
.automaton
import
Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
=======
from
automaton
import
Automaton
from
base
import
MSG_CRASH
,
MSG_LOAD
from
checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
>>>>>>>
Modify
the
logic
to
deal
with
synonyms
.
class
Talks
(
Automaton
):
...
...
@@ -67,6 +74,8 @@ class Talks(Automaton):
zero otherwise.
"""
db
=
self
.
db
# alias
oai_url
=
record
.
oai_url
()
year
=
record
.
conference_year
()
...
...
@@ -75,12 +84,13 @@ class Talks(Automaton):
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
first_author
=
record
.
first_author
()
id_country
=
self
.
search_
country
(
record
.
conference_country
())
id_country
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
submitted
=
record
.
submitted
()
title
=
record
.
title
()
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get an already published talk
fields
=
dict
(
conference_title
=
conference_title
,
...
...
modules/test_tools.py
View file @
945dcead
...
...
@@ -28,6 +28,9 @@ from harvest_tools.checkandfix import (
MSG_NO_OAI
,
MSG_NO_REF
,
MSG_TEMPORARY_RECORD
,
MSG_UNKNOWN_COLLABORATION
,
MSG_UNKNOWN_COUNTRY
,
MSG_UNKNOWN_PUBLISHER
,
MSG_WELL_FORMED_DATE
)
from
harvest_tools.preprints
import
(
...
...
@@ -82,6 +85,9 @@ def messages():
T
(
MSG_PREPRINT_NO_NUMBER
),
T
(
MSG_REPORT_NO_NUMBER
),
T
(
MSG_TEMPORARY_RECORD
),
T
(
MSG_UNKNOWN_COLLABORATION
),
T
(
MSG_UNKNOWN_COUNTRY
),
T
(
MSG_UNKNOWN_PUBLISHER
),
T
(
MSG_WELL_FORMED_COLLABORATION
),
T
(
MSG_WELL_FORMED_DATE
)}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment