[GRASS-SVN] r73083 - sandbox/wenzeslaus/g.citation

Sun Aug 12 20:55:05 PDT 2018

Author: wenzeslaus
Date: 2018-08-12 20:55:05 -0700 (Sun, 12 Aug 2018)
New Revision: 73083

Modified:
   sandbox/wenzeslaus/g.citation/g.citation.py
Log:
g.citation: parse email (cleans up some names)

Modified: sandbox/wenzeslaus/g.citation/g.citation.py
===================================================================

--- sandbox/wenzeslaus/g.citation/g.citation.py	2018-08-13 03:52:02 UTC (rev 73082)
+++ sandbox/wenzeslaus/g.citation/g.citation.py	2018-08-13 03:55:05 UTC (rev 73083)
@@ -201,11 +201,47 @@
         # TODO: raise or fatal? should be in library or module?
         raise RuntimeError("The text does not contain date entry")
 
+
+def get_email(text):
+    """Get email from text
+
+    Returns tuple (email, text_without_email)
+    Returns (None, text) if not found.
+    Any whitespace is stripped from the text.
+
+    >>> print(get_email("<E. Jorge Tizado (ej.tizado@unileon.es)")[0])
+    ej.tizado@unileon.es
+    >>> print(get_email("<E. Jorge Tizado   (ej.tizado unileon es)")[0])
+    ej.tizado@unileon.es
+    >>> email, text = get_email("Andrea Aime (aaime libero it)")
+    >>> print(text)
+    Andrea Aime
+    >>> print(email)
+    aaime@libero.it
+    """
+    email = None
+    # ORCID as text
+    email_re = re.compile(r"\(([^@]+@[^@]+\.[^@]+)\)", re.IGNORECASE)
+    match = re.search(email_re, text)
+    if match:
+        email = match.group(1)
+    else:
+        for domain in ['com', 'es', 'it']:
+            email_re = re.compile(r"\(([^ ]+) ([^ ]+) ({})\)".format(domain), re.IGNORECASE)
+            match = re.search(email_re, text)
+            if match:
+                email = "{name}@{service}.{domain}".format(name=match.group(1), service=match.group(2), domain=match.group(3))
+                break
+    text = re.sub(email_re, "", text).strip()
+    return (email, text)
+
+
 def get_orcid(text):
     """Get ORCID from text
 
     Returns tuple (orcid, text_without_orcid)
     Returns (None, text) if not found.
+    Any whitespace is stripped from the text.
 
     >>> # URL style
     >>> print(get_orcid("http://orcid.org/0000-0000-0000-0000")[0])
@@ -232,15 +268,17 @@
         match = re.search(orcid_re, text)
         if match:
             orcid = match.group(1)
-    text = re.sub(orcid_re, "", text)
+    text = re.sub(orcid_re, "", text).strip()
     return (orcid, text)
 
+
 def get_authors_from_documentation(text):
     r"""Extract authors and associated info from documentation
 
     >>> text = '<h2><a name="author">AUTHOR</a></h2>\nPaul Kelly\n<p><i>Last changed:'
-    >>> pprint(get_authors_from_documentation(text))
-    [{'feature': None, 'institute': None, 'name': 'Paul Kelly', 'orcid': None}]
+    >>> authors = get_authors_from_documentation(text)
+    >>> print(authors[0]['name'])
+    Paul Kelly
     """
     # Some section names are singular, some plural.
     # Additional tags can appear in the heading compiled documentation.
@@ -273,9 +311,12 @@
         institute = None
         feature = None
         
+        email, text = get_email(text)
         orcid, text = get_orcid(text)
         ai = line.split(",", 1)
         name = clean_line_item(ai[0])
+        if not email:
+            email, name = get_email(name)
         if len(ai) == 2:
             institute = clean_line_item(ai[1])
         if " by " in name:
@@ -423,6 +464,8 @@
             given = names[0]
             family = " ".join([names[1], names[2]])
         else:
+            # TODO: since this is for legacy code, we could just
+            # hardcode the "known" authors such as Maria Antonia Brovelli
             raise NotImplementedError("Not sure if <{n}> is family or middle name in <{t}>".format(n=names[1], t=text))
     elif len(names) == 4:
         # assuming that if you have suffix, you have a middle name