def parse_location_string(location_string, location_fields):
    """Parse location string from GISAID into the given separate geographic scales
    and return a dictionary of parsed values by scale.

    The string is split on "/" and the resulting values are assigned, in order,
    to the given location fields. Whitespace around each value is ignored.
    Fields that receive no value (the string is too short, a component is
    empty, or the input is not a string at all) are set to "?". Values beyond
    the number of requested fields are dropped.

    Parameters
    ----------
    location_string : str
        slash-delimited location, e.g. "North America / USA / Washington".
        Any non-string value (such as the None or NaN used for missing
        entries) is treated as an empty location.
    location_fields : list
        names of the fields to fill, in the order their values appear in the
        location string

    Returns
    -------
    dict :
        dictionary of geographic fields parsed from the given string

    >>> location_fields = ["region", "country", "division", "location"]
    >>> parse_location_string("Asia / Japan", location_fields)
    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}

    >>> parse_location_string("Europe / Iceland / Reykjavik", location_fields)
    {'region': 'Europe', 'country': 'Iceland', 'division': 'Reykjavik', 'location': '?'}

    >>> parse_location_string("North America / USA / Washington / King County", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    Additional location entries beyond what has been specified should be stripped from output.

    >>> parse_location_string("North America / USA / Washington / King County / Extra field", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    Trailing location delimiters should be stripped from the output.

    >>> parse_location_string("North America / USA / Washington / King County / ", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'Washington', 'location': 'King County'}

    A trailing delimiter on a shorter string must not produce an empty value.

    >>> parse_location_string("Asia / Japan / ", location_fields)
    {'region': 'Asia', 'country': 'Japan', 'division': '?', 'location': '?'}

    Handle inconsistently delimited strings.

    >>> parse_location_string("North America/USA/New York/New York", location_fields)
    {'region': 'North America', 'country': 'USA', 'division': 'New York', 'location': 'New York'}
    >>> parse_location_string("Europe/ Lithuania", location_fields)
    {'region': 'Europe', 'country': 'Lithuania', 'division': '?', 'location': '?'}

    Empty and missing inputs yield missing values for every field.

    >>> parse_location_string("", location_fields)
    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}
    >>> parse_location_string(None, location_fields)
    {'region': '?', 'country': '?', 'division': '?', 'location': '?'}

    """
    # Try to extract values for specific geographic scales. Splitting on the
    # bare delimiter and stripping each piece tolerates any amount of
    # whitespace around the slashes as well as around the whole string.
    if isinstance(location_string, str):
        values = [value.strip() for value in location_string.split("/")]
    else:
        # Missing entries can arrive as None or NaN rather than as text;
        # there is nothing to split, so every field keeps its default.
        values = []

    # Create a default mapping of location fields to missing values and update
    # these from the values in the location string. Empty values are skipped
    # so they keep the "?" marker, while later fields keep their position.
    locations = {field: "?" for field in location_fields}
    for field, value in zip(location_fields, values):
        if value:
            locations[field] = value

    return locations