Refine location parser to handle edge cases and missing data

Defines a function to parse each GISAID location string, adds doctests for the expected behavior on well-formed and malformed (but real) data, and updates the parser logic to handle these cases more robustly. Specific improvements include splitting fields with a regular expression to handle inconsistent delimiters, defaulting every field to the missing value ("?"), and trimming extraneous fields and trailing delimiters.
nextstrain · May 19, 2021 · ad43a19 · ad43a19
1 parent 53d7c21
commit ad43a19
Showing 1 changed file with 55 additions and 4 deletions.
diff --git a/scripts/sanitize_metadata.py b/scripts/sanitize_metadata.py
@@ -20,6 +20,58 @@
     "Virus name",
 )
 
+def parse_location_string(location_string, location_fields):
+    """Parse location string from GISAID into the given separate geographic scales
+    and return a dictionary of parsed values by scale.
+
+    Parameters
+    ----------
+    location_string : str
+        Location with geographic scales separated by "/" (with or without
+        surrounding spaces), ordered from broadest to most specific. Values
+        that are not strings (e.g., None or a float NaN standing in for a
+        missing metadata entry) are treated as entirely missing.
+    location_fields : list
+        Names of the geographic scales to extract, in the same order as they
+        appear in the location string.
+
+    Returns
+    -------
+    dict :
+        dictionary of geographic fields parsed from the given string, with one
+        key per entry in `location_fields`; fields without a value in the
+        string are set to "?"
+
+    >>> location_fields = ["region", "country", "division", "location"]
+    >>> parse_location_string("Asia / Japan", location_fields)
+    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}
+
+    >>> parse_location_string("Europe / Iceland / Reykjavik", location_fields)
+    {'region': 'Europe', 'country': 'Iceland', 'division': 'Reykjavik', 'location': '?'}
+
+    >>> parse_location_string("North America / USA / Washington / King County", location_fields)
+    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}
+
+    Additional location entries beyond what has been specified should be stripped from output.
+
+    >>> parse_location_string("North America / USA / Washington / King County / Extra field", location_fields)
+    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}
+
+    Trailing location delimiters should be stripped from the output.
+
+    >>> parse_location_string("North America / USA / Washington / King County / ", location_fields)
+    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}
+
+    A trailing delimiter before all fields are filled must not produce an empty value.
+
+    >>> parse_location_string("Asia / Japan / ", location_fields)
+    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}
+
+    Handle inconsistently delimited strings.
+
+    >>> parse_location_string("North America/USA/New York/New York", location_fields)
+    {'region': 'North America', 'country': 'USA', 'division': 'New York', 'location': 'New York'}
+    >>> parse_location_string("Europe/ Lithuania", location_fields)
+    {'region': 'Europe', 'country': 'Lithuania', 'division': '?', 'location': '?'}
+
+    Surrounding whitespace should not leak into the parsed values.
+
+    >>> parse_location_string(" Europe / Iceland ", location_fields)
+    {'region': 'Europe', 'country': 'Iceland', 'division': '?', 'location': '?'}
+
+    Empty or missing location strings produce missing values for every field.
+
+    >>> parse_location_string("", location_fields)
+    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}
+    >>> parse_location_string(float("nan"), location_fields)
+    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}
+
+    """
+    # Create a default mapping of location fields to missing values.
+    locations = {field: "?" for field in location_fields}
+
+    # A missing metadata entry is not a string and cannot be split, so every
+    # field stays missing instead of raising a TypeError.
+    if not isinstance(location_string, str):
+        return locations
+
+    # Try to extract values for specific geographic scales. Strip the whole
+    # string first so outer whitespace does not end up in the first or last
+    # value.
+    values = re.split(r"[ ]*/[ ]*", location_string.strip())
+
+    # Update the defaults from the values in the location string. zip() drops
+    # any values beyond the requested fields. Empty values (e.g., from a
+    # trailing delimiter) keep the missing-value default, while the position
+    # of later fields is preserved.
+    for field, value in zip(location_fields, values):
+        value = value.strip()
+        if value:
+            locations[field] = value
+
+    return locations
+
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -48,11 +100,10 @@
         # scales. Replace missing field values with "?".
         locations = pd.DataFrame(
             (
-                str(location).split(" / ", maxsplit=len(LOCATION_FIELDS) - 1)
+                parse_location_string(location, LOCATION_FIELDS)
                 for location in metadata[args.parse_location_field].values
-            ),
-            columns=LOCATION_FIELDS
-        ).fillna("?")
+            )
+        )
 
         # Combine new location columns with original metadata and drop the
         # original location column.