[GRASS-SVN] r73083 - sandbox/wenzeslaus/g.citation
svn_grass at osgeo.org
svn_grass at osgeo.org
Sun Aug 12 20:55:05 PDT 2018
Author: wenzeslaus
Date: 2018-08-12 20:55:05 -0700 (Sun, 12 Aug 2018)
New Revision: 73083
Modified:
sandbox/wenzeslaus/g.citation/g.citation.py
Log:
g.citation: parse email (cleans up some names)
Modified: sandbox/wenzeslaus/g.citation/g.citation.py
===================================================================
--- sandbox/wenzeslaus/g.citation/g.citation.py 2018-08-13 03:52:02 UTC (rev 73082)
+++ sandbox/wenzeslaus/g.citation/g.citation.py 2018-08-13 03:55:05 UTC (rev 73083)
@@ -201,11 +201,47 @@
# TODO: raise or fatal? should be in library or module?
raise RuntimeError("The text does not contain date entry")
+
+def get_email(text):
+ """Get email from text
+
+ Returns tuple (email, text_without_email)
+ Returns (None, text) if not found.
+ Any whitespace is stripped from the text.
+
+ >>> print(get_email("<E. Jorge Tizado (ej.tizado@unileon.es)")[0])
+ ej.tizado@unileon.es
+ >>> print(get_email("<E. Jorge Tizado (ej.tizado unileon es)")[0])
+ ej.tizado@unileon.es
+ >>> email, text = get_email("Andrea Aime (aaime libero it)")
+ >>> print(text)
+ Andrea Aime
+ >>> print(email)
+ aaime@libero.it
+ """
+ email = None
+ # ORCID as text
+ email_re = re.compile(r"\(([^@]+@[^@]+\.[^@]+)\)", re.IGNORECASE)
+ match = re.search(email_re, text)
+ if match:
+ email = match.group(1)
+ else:
+ for domain in ['com', 'es', 'it']:
+ email_re = re.compile(r"\(([^ ]+) ([^ ]+) ({})\)".format(domain), re.IGNORECASE)
+ match = re.search(email_re, text)
+ if match:
+ email = "{name}@{service}.{domain}".format(name=match.group(1), service=match.group(2), domain=match.group(3))
+ break
+ text = re.sub(email_re, "", text).strip()
+ return (email, text)
+
+
def get_orcid(text):
"""Get ORCID from text
Returns tuple (orcid, text_without_orcid)
Returns (None, text) if not found.
+ Any whitespace is stripped from the text.
>>> # URL style
>>> print(get_orcid("http://orcid.org/0000-0000-0000-0000")[0])
@@ -232,15 +268,17 @@
match = re.search(orcid_re, text)
if match:
orcid = match.group(1)
- text = re.sub(orcid_re, "", text)
+ text = re.sub(orcid_re, "", text).strip()
return (orcid, text)
+
def get_authors_from_documentation(text):
r"""Extract authors and associated info from documentation
>>> text = '<h2><a name="author">AUTHOR</a></h2>\nPaul Kelly\n<p><i>Last changed:'
- >>> pprint(get_authors_from_documentation(text))
- [{'feature': None, 'institute': None, 'name': 'Paul Kelly', 'orcid': None}]
+ >>> authors = get_authors_from_documentation(text)
+ >>> print(authors[0]['name'])
+ Paul Kelly
"""
# Some section names are singular, some plural.
# Additional tags can appear in the heading compiled documentation.
@@ -273,9 +311,12 @@
institute = None
feature = None
+ email, text = get_email(text)
orcid, text = get_orcid(text)
ai = line.split(",", 1)
name = clean_line_item(ai[0])
+ if not email:
+ email, name = get_email(name)
if len(ai) == 2:
institute = clean_line_item(ai[1])
if " by " in name:
@@ -423,6 +464,8 @@
given = names[0]
family = " ".join([names[1], names[2]])
else:
+ # TODO: since this is for legacy code, we could just
+ # hardcode the "known" authors such as Maria Antonia Brovelli
raise NotImplementedError("Not sure if <{n}> is family or middle name in <{t}>".format(n=names[1], t=text))
elif len(names) == 4:
# assuming that if you have suffix, you have a middle name
More information about the grass-commit
mailing list