Skip to content

Commit

Permalink
Refine location parser to handle edge cases and missing data
Browse files Browse the repository at this point in the history
Defines a function to parse each GISAID location string, adds doctests
for expected behavior on well-formed and malformed (but real) data, and
updates the logic of the parser to more robustly handle these cases.
Specific improvements include splitting fields with a regular expression
to handle inconsistent delimiters, always defining missing values ("?")
by default, and trimming extraneous fields and trailing delimiters.
  • Loading branch information
huddlej committed May 19, 2021
1 parent 53d7c21 commit ad43a19
Showing 1 changed file with 55 additions and 4 deletions.
59 changes: 55 additions & 4 deletions scripts/sanitize_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,58 @@
"Virus name",
)

def parse_location_string(location_string, location_fields):
    """Parse location string from GISAID into the given separate geographic scales
    and return a dictionary of parsed values by scale.

    Fields are delimited by "/" with any amount of surrounding whitespace.
    Every requested field is always present in the output; fields that are
    absent or empty in the input are reported as missing ("?").

    Parameters
    ----------
    location_string : str
        Delimited location string. Non-string inputs (e.g., ``None`` or a
        float NaN standing in for a missing value) are treated as entirely
        missing.
    location_fields : list
        Names of the geographic scales, in the order they appear in the
        location string.

    Returns
    -------
    dict :
        dictionary of geographic fields parsed from the given string

    >>> location_fields = ["region", "country", "division", "location"]
    >>> parse_location_string("Asia / Japan", location_fields)
    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}

    >>> parse_location_string("Europe / Iceland / Reykjavik", location_fields)
    {'region': 'Europe', 'country': 'Iceland', 'division': 'Reykjavik', 'location': '?'}

    >>> parse_location_string("North America / USA / Washington / King County", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    Additional location entries beyond what has been specified should be stripped from output.

    >>> parse_location_string("North America / USA / Washington / King County / Extra field", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    Trailing location delimiters should be stripped from the output.

    >>> parse_location_string("North America / USA / Washington / King County / ", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    A trailing delimiter on a short string must not produce an empty value.

    >>> parse_location_string("Asia / Japan / ", location_fields)
    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}

    Handle inconsistently delimited strings.

    >>> parse_location_string("North America/USA/New York/New York", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'New York', 'location': 'New York'}

    >>> parse_location_string("Europe/ Lithuania", location_fields)
    {'region': 'Europe', 'country': 'Lithuania', 'division': '?', 'location': '?'}

    Empty or non-string inputs yield missing values for every field.

    >>> parse_location_string("", location_fields)
    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}

    >>> parse_location_string(None, location_fields)
    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}

    """
    # Start from a mapping where every requested field is missing, so the
    # output always has the same keys regardless of the input.
    locations = {field: "?" for field in location_fields}

    # A non-string input cannot be split into fields; report everything as
    # missing instead of raising or emitting a literal like "nan".
    if not isinstance(location_string, str):
        return locations

    # Split on the bare delimiter and strip each piece. This tolerates
    # inconsistent whitespace around "/" as well as at the ends of the string.
    values = (value.strip() for value in location_string.split("/"))

    # zip() drops extra trailing fields; empty pieces (e.g., from a trailing
    # delimiter) keep the default missing value instead of overwriting it.
    for field, value in zip(location_fields, values):
        if value:
            locations[field] = value

    return locations


if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand Down Expand Up @@ -48,11 +100,10 @@
# scales. Replace missing field values with "?".
locations = pd.DataFrame(
(
str(location).split(" / ", maxsplit=len(LOCATION_FIELDS) - 1)
parse_location_string(location, LOCATION_FIELDS)
for location in metadata[args.parse_location_field].values
),
columns=LOCATION_FIELDS
).fillna("?")
)
)

# Combine new location columns with original metadata and drop the
# original location column.
Expand Down

0 comments on commit ad43a19

Please sign in to comment.