Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
945dcead
Commit
945dcead
authored
Jun 30, 2017
by
LE GAC Renaud
Browse files
Modify the logic to deal with synonyms.
parent
28b2ec83
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
111 additions
and
85 deletions
+111
-85
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+4
-2
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+0
-61
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+2
-1
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+72
-12
modules/harvest_tools/preprints.py
modules/harvest_tools/preprints.py
+5
-2
modules/harvest_tools/proceedings.py
modules/harvest_tools/proceedings.py
+8
-4
modules/harvest_tools/reports.py
modules/harvest_tools/reports.py
+2
-1
modules/harvest_tools/talks.py
modules/harvest_tools/talks.py
+12
-2
modules/test_tools.py
modules/test_tools.py
+6
-0
No files found.
modules/harvest_tools/articles.py
View file @
945dcead
...
@@ -318,8 +318,10 @@ class Articles(Automaton):
...
@@ -318,8 +318,10 @@ class Articles(Automaton):
year
=
record
.
paper_year
()
year
=
record
.
paper_year
()
# get the collaboration / publisher identifiers
# get the collaboration / publisher identifiers
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
id_publisher
=
self
.
search_publisher
(
editor
)
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
id_publisher
=
get_id
(
db
.
publishers
,
abbreviation
=
editor
)
# get already published articles or preprint
# get already published articles or preprint
# A preprint is transformed into an article.
# A preprint is transformed into an article.
...
...
modules/harvest_tools/automaton.py
View file @
945dcead
...
@@ -672,64 +672,3 @@ class Automaton(object):
...
@@ -672,64 +672,3 @@ class Automaton(object):
return
dict
(
collection_logs
=
self
.
collection_logs
,
return
dict
(
collection_logs
=
self
.
collection_logs
,
controller
=
self
.
controller
,
controller
=
self
.
controller
,
logs
=
self
.
logs
)
logs
=
self
.
logs
)
def
search_collaboration
(
self
,
value
):
"""Get the database collaboration identifier using synonyms.
Args:
value (unicode):
the name of the collaboration.
Returns:
int:
* the id of the collaboration record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when the
collaboration is not defined.
"""
return
search_synonym
(
self
.
db
.
collaborations
,
"collaboration"
,
value
)
def
search_country
(
self
,
value
):
"""Get the database country identifier using synonyms.
Args:
value (unicode):
the name of the country.
Returns:
int:
* the id of the country record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the country is not defined.
"""
return
search_synonym
(
self
.
db
.
countries
,
"country"
,
value
)
def
search_publisher
(
self
,
value
):
"""Get the database publisher identifier using synonyms.
Args:
value (unicode):
the abbreviation of the publisher.
Returns:
int:
* the id of the publisher record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the publisher is not defined.
"""
return
search_synonym
(
self
.
db
.
publishers
,
"abbreviation"
,
value
)
modules/harvest_tools/base.py
View file @
945dcead
...
@@ -127,7 +127,8 @@ def search_synonym(table, fieldname, value, create=False):
...
@@ -127,7 +127,8 @@ def search_synonym(table, fieldname, value, create=False):
Raises:
Raises:
ToolException:
ToolException:
more than one synonym is found.
* no synonym found and not allowed to create a new one.
* more than one synonym is found.
"""
"""
if
not
value
:
if
not
value
:
...
...
modules/harvest_tools/checkandfix.py
View file @
945dcead
...
@@ -20,7 +20,7 @@ from invenio_tools import (MSG_NO_CONF,
...
@@ -20,7 +20,7 @@ from invenio_tools import (MSG_NO_CONF,
from
invenio_tools.recordpubli
import
PAPER_REFERENCE_KEYS
from
invenio_tools.recordpubli
import
PAPER_REFERENCE_KEYS
from
itertools
import
imap
from
itertools
import
imap
from
plugin_dbui
import
CLEAN_SPACES
,
get_id
from
plugin_dbui
import
CLEAN_SPACES
,
get_id
,
UNDEF_ID
DECODE_ARXIV
=
re
.
compile
(
r
"arXiv:(\d{2})(\d{2})\."
)
DECODE_ARXIV
=
re
.
compile
(
r
"arXiv:(\d{2})(\d{2})\."
)
...
@@ -53,6 +53,9 @@ MSG_NO_OAI = "Reject no OAI identifier"
...
@@ -53,6 +53,9 @@ MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF
=
"Reject incomplete paper reference. Check "
MSG_NO_REF
=
"Reject incomplete paper reference. Check "
MSG_TEMPORARY_RECORD
=
"Temporary record"
MSG_TEMPORARY_RECORD
=
"Temporary record"
MSG_UNKNOWN_COLLABORATION
=
"Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY
=
"Reject country is unknown."
MSG_UNKNOWN_PUBLISHER
=
"Reject publisher is unknown."
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
...
@@ -342,8 +345,7 @@ class CheckAndFix(object):
...
@@ -342,8 +345,7 @@ class CheckAndFix(object):
raise
CheckException
(
MSG_NO_AUTHOR
)
raise
CheckException
(
MSG_NO_AUTHOR
)
def
collaboration
(
self
,
record
):
def
collaboration
(
self
,
record
):
"""Check the collaboration.
"""Check synonyms for collaboration, replacing them by the proper value.
Have a look to the synonyms when the collaboration is not well formed.
Args:
Args:
record (RecordPubli):
record (RecordPubli):
...
@@ -351,8 +353,9 @@ class CheckAndFix(object):
...
@@ -351,8 +353,9 @@ class CheckAndFix(object):
Raises:
Raises:
CheckException:
CheckException:
when the collaboration value is defined
* the collaboration is unknown
nor entered as a synonym.
(neither collaboration nor synonym)
* more than one synonym found.
"""
"""
if
self
.
dbg
:
if
self
.
dbg
:
...
@@ -363,12 +366,41 @@ class CheckAndFix(object):
...
@@ -363,12 +366,41 @@ class CheckAndFix(object):
return
return
try
:
try
:
search_synonym
(
self
.
db
.
collaborations
,
"collaboration"
,
val
)
db
=
self
.
db
dbid
=
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_COLLABORATION
)
collaboration
=
db
.
collaborations
[
dbid
].
collaboration
if
collaboration
!=
val
:
# one collaboration
if
isinstance
(
record
[
u
"corporate_name"
],
dict
):
record
[
u
"corporate_name"
][
u
"collaboration"
]
=
collaboration
# several collaboration
# replace the list of dictionary by a single one
else
:
record
[
u
"corporate_name"
]
=
\
{
u
"collaboration"
:
collaboration
}
except
ToolException
as
e
:
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
raise
CheckException
(
*
e
.
args
)
def
country
(
self
,
record
):
def
country
(
self
,
record
):
"""Check synonyms for conference country, replacing them by the proper value.
Args:
record (RecordPubli):
record describing a publication.
Raises:
CheckException:
* the country is unknown (neither country nor synonym)
* more than one synonym found.
"""
"""Check conference country.
"""Check conference country.
Have a look to the synonyms when the country does not exist.
Have a look to the synonyms when the country does not exist.
...
@@ -390,7 +422,28 @@ class CheckAndFix(object):
...
@@ -390,7 +422,28 @@ class CheckAndFix(object):
val
=
record
.
conference_country
()
val
=
record
.
conference_country
()
try
:
try
:
search_synonym
(
self
.
db
.
countries
,
"country"
,
val
)
db
=
self
.
db
dbid
=
search_synonym
(
db
.
countries
,
"country"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_COUNTRY
)
country
=
db
.
countries
[
dbid
].
country
if
country
!=
val
:
obj
=
record
[
u
"meeting_name"
]
if
isinstance
(
obj
,
dict
):
location
=
obj
[
u
"location"
].
replace
(
val
,
country
)
record
[
u
"meeting_name"
][
u
"location"
]
=
location
else
:
for
di
in
obj
:
if
u
"location"
in
di
:
di
[
u
"location"
]
=
\
di
[
u
"location"
].
replace
(
val
,
country
)
record
[
u
"meeting_name"
]
=
obj
except
ToolException
as
e
:
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
raise
CheckException
(
*
e
.
args
)
...
@@ -822,8 +875,7 @@ class CheckAndFix(object):
...
@@ -822,8 +875,7 @@ class CheckAndFix(object):
raise
ToolException
(
MSG_NO_REF
+
"[year]"
)
raise
ToolException
(
MSG_NO_REF
+
"[year]"
)
def
publisher
(
self
,
record
):
def
publisher
(
self
,
record
):
"""Check publisher.
"""Check synonyms for publisher by replacing by the abbreviation value.
Have a look to the synonyms when the publisher does not exist.
Args:
Args:
record (RecordPubli):
record (RecordPubli):
...
@@ -831,7 +883,8 @@ class CheckAndFix(object):
...
@@ -831,7 +883,8 @@ class CheckAndFix(object):
Raises:
Raises:
CheckException:
CheckException:
the publisher is not defined nor entered as a synonym.
* the publisher is unknown (neither abbreviation nor synonym)
* more than one synonym found.
"""
"""
if
self
.
dbg
:
if
self
.
dbg
:
...
@@ -841,11 +894,18 @@ class CheckAndFix(object):
...
@@ -841,11 +894,18 @@ class CheckAndFix(object):
if
len
(
val
)
==
0
:
if
len
(
val
)
==
0
:
return
return
# convert ToolException to CheckException
try
:
try
:
db
=
self
.
db
db
=
self
.
db
search_synonym
(
db
.
publishers
,
"abbreviation"
,
val
)
dbid
=
search_synonym
(
db
.
publishers
,
"abbreviation"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
ToolException
(
MSG_UNKNOWN_PUBLISHER
)
abbreviation
=
db
.
publishers
[
dbid
].
abbreviation
if
abbreviation
!=
val
:
record
[
u
"publication_info"
].
loc
[
0
,
"title"
]
=
abbreviation
# convert ToolException to CheckException
except
ToolException
as
e
:
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
raise
CheckException
(
*
e
.
args
)
...
...
modules/harvest_tools/preprints.py
View file @
945dcead
...
@@ -8,7 +8,7 @@ from .automaton import Automaton
...
@@ -8,7 +8,7 @@ from .automaton import Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
.checkandfix
import
CheckException
from
invenio_tools
import
RecordConf
,
RecordThesis
from
invenio_tools
import
RecordConf
,
RecordThesis
from
plugin_dbui
import
UNDEF_ID
from
plugin_dbui
import
get_id
,
UNDEF_ID
MSG_PREPRINT_IS_PAPER
=
"Reject preprint is a published paper"
MSG_PREPRINT_IS_PAPER
=
"Reject preprint is a published paper"
...
@@ -86,6 +86,8 @@ class Preprints(Automaton):
...
@@ -86,6 +86,8 @@ class Preprints(Automaton):
zero otherwise.
zero otherwise.
"""
"""
db
=
self
.
db
# alias
# alias
first_author
=
record
.
first_author
()
first_author
=
record
.
first_author
()
oai_url
=
record
.
oai_url
()
oai_url
=
record
.
oai_url
()
...
@@ -95,7 +97,8 @@ class Preprints(Automaton):
...
@@ -95,7 +97,8 @@ class Preprints(Automaton):
year
=
submitted
[
0
:
4
]
year
=
submitted
[
0
:
4
]
# get the collaboration identifier
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get existing preprint or article
# get existing preprint or article
fields
=
dict
(
first_author
=
first_author
,
fields
=
dict
(
first_author
=
first_author
,
...
...
modules/harvest_tools/proceedings.py
View file @
945dcead
...
@@ -7,7 +7,7 @@ import traceback
...
@@ -7,7 +7,7 @@ import traceback
from
.automaton
import
Automaton
from
.automaton
import
Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
.checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
from
plugin_dbui
import
get_id
,
UNDEF_ID
class
Proceedings
(
Automaton
):
class
Proceedings
(
Automaton
):
...
@@ -71,6 +71,8 @@ class Proceedings(Automaton):
...
@@ -71,6 +71,8 @@ class Proceedings(Automaton):
zero otherwise.
zero otherwise.
"""
"""
db
=
self
.
db
# alias
# alias
oai_url
=
record
.
oai_url
()
oai_url
=
record
.
oai_url
()
year
=
record
.
paper_year
()
year
=
record
.
paper_year
()
...
@@ -94,11 +96,13 @@ class Proceedings(Automaton):
...
@@ -94,11 +96,13 @@ class Proceedings(Automaton):
conference_dates
=
record
.
conference_dates
()
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
conference_title
=
record
.
conference_title
()
first_author
=
record
.
first_author
()
first_author
=
record
.
first_author
()
id_country
=
self
.
search_
country
(
record
.
conference_country
())
id_country
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
# get the collaboration/publisher identifiers
# get the collaboration/publisher identifiers
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
id_publisher
=
self
.
search_publisher
(
editor
)
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
id_publisher
=
get_id
(
db
.
publishers
,
abbreviation
=
editor
)
# get an already published proceeding
# get an already published proceeding
fields
=
dict
(
authors
=
authors
,
fields
=
dict
(
authors
=
authors
,
...
...
modules/harvest_tools/reports.py
View file @
945dcead
...
@@ -90,7 +90,8 @@ class Reports(Automaton):
...
@@ -90,7 +90,8 @@ class Reports(Automaton):
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
# get the collaboration identifier
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get an already published reports
# get an already published reports
fields
=
dict
(
id_categories
=
self
.
id_category
,
fields
=
dict
(
id_categories
=
self
.
id_category
,
...
...
modules/harvest_tools/talks.py
View file @
945dcead
...
@@ -4,10 +4,17 @@
...
@@ -4,10 +4,17 @@
import
traceback
import
traceback
<<<<<<<
HEAD
from
.automaton
import
Automaton
from
.automaton
import
Automaton
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.base
import
MSG_CRASH
,
MSG_LOAD
from
.checkandfix
import
CheckException
from
.checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
from
plugin_dbui
import
UNDEF_ID
=======
from
automaton
import
Automaton
from
base
import
MSG_CRASH
,
MSG_LOAD
from
checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
>>>>>>>
Modify
the
logic
to
deal
with
synonyms
.
class
Talks
(
Automaton
):
class
Talks
(
Automaton
):
...
@@ -67,6 +74,8 @@ class Talks(Automaton):
...
@@ -67,6 +74,8 @@ class Talks(Automaton):
zero otherwise.
zero otherwise.
"""
"""
db
=
self
.
db
# alias
# alias
oai_url
=
record
.
oai_url
()
oai_url
=
record
.
oai_url
()
year
=
record
.
conference_year
()
year
=
record
.
conference_year
()
...
@@ -75,12 +84,13 @@ class Talks(Automaton):
...
@@ -75,12 +84,13 @@ class Talks(Automaton):
conference_dates
=
record
.
conference_dates
()
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
conference_title
=
record
.
conference_title
()
first_author
=
record
.
first_author
()
first_author
=
record
.
first_author
()
id_country
=
self
.
search_
country
(
record
.
conference_country
())
id_country
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
submitted
=
record
.
submitted
()
submitted
=
record
.
submitted
()
title
=
record
.
title
()
title
=
record
.
title
()
# get the collaboration identifier
# get the collaboration identifier
id_collaboration
=
self
.
search_collaboration
(
record
.
collaboration
())
id_collaboration
=
\
get_id
(
db
.
collaborations
,
collaboration
=
record
.
collaboration
())
# get an already published talk
# get an already published talk
fields
=
dict
(
conference_title
=
conference_title
,
fields
=
dict
(
conference_title
=
conference_title
,
...
...
modules/test_tools.py
View file @
945dcead
...
@@ -28,6 +28,9 @@ from harvest_tools.checkandfix import (
...
@@ -28,6 +28,9 @@ from harvest_tools.checkandfix import (
MSG_NO_OAI
,
MSG_NO_OAI
,
MSG_NO_REF
,
MSG_NO_REF
,
MSG_TEMPORARY_RECORD
,
MSG_TEMPORARY_RECORD
,
MSG_UNKNOWN_COLLABORATION
,
MSG_UNKNOWN_COUNTRY
,
MSG_UNKNOWN_PUBLISHER
,
MSG_WELL_FORMED_DATE
)
MSG_WELL_FORMED_DATE
)
from
harvest_tools.preprints
import
(
from
harvest_tools.preprints
import
(
...
@@ -82,6 +85,9 @@ def messages():
...
@@ -82,6 +85,9 @@ def messages():
T
(
MSG_PREPRINT_NO_NUMBER
),
T
(
MSG_PREPRINT_NO_NUMBER
),
T
(
MSG_REPORT_NO_NUMBER
),
T
(
MSG_REPORT_NO_NUMBER
),
T
(
MSG_TEMPORARY_RECORD
),
T
(
MSG_TEMPORARY_RECORD
),
T
(
MSG_UNKNOWN_COLLABORATION
),
T
(
MSG_UNKNOWN_COUNTRY
),
T
(
MSG_UNKNOWN_PUBLISHER
),
T
(
MSG_WELL_FORMED_COLLABORATION
),
T
(
MSG_WELL_FORMED_COLLABORATION
),
T
(
MSG_WELL_FORMED_DATE
)}
T
(
MSG_WELL_FORMED_DATE
)}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment