diff --git a/src/sssom/cli.py b/src/sssom/cli.py
index 3ba7e03c..0b6d5d12 100644
--- a/src/sssom/cli.py
+++ b/src/sssom/cli.py
@@ -44,7 +44,7 @@
     split_file,
     validate_file,
 )
-from .parsers import PARSING_FUNCTIONS, parse_sssom_table
+from .parsers import PARSING_FUNCTIONS, from_sssom_dataframe, parse_sssom_table
 from .rdf_util import rewire_graph
 from .sparql_util import EndpointConfig, query_mappings
 from .util import (
@@ -58,6 +58,7 @@
     remove_unmatched,
+    rewire_sssom_table,
     sort_df_rows_columns,
     to_mapping_set_dataframe,
 )
 from .writers import WRITER_FUNCTIONS, write_table
 
@@ -545,11 +546,29 @@ def rewire(
     # noqa: DAR101
     """
     msdf = parse_sssom_table(mapping_file)
-    g = Graph()
-    g.parse(input, format=input_format)
-    rewire_graph(g, msdf, precedence=precedence)
-    rdfstr = g.serialize(format=output_format)
-    print(rdfstr, file=output)
+
+    if input_format == "sssom-tsv" or input.endswith("sssom.tsv"):
+        # `input` is the table to rewire; `msdf` (from `mapping_file`) provides the mappings.
+        msdf_input = parse_sssom_table(input)
+        df_rewired = rewire_sssom_table(msdf_input.df, msdf.df)
+
+        # NOTE(review): metadata and prefix map are taken from the mapping file's table,
+        # not from the rewired input table - confirm this is intended.
+        # updating the metadata of the rewired df so you can recognise it was rewired?
+        metadata = msdf.metadata
+        metadata["mapping_set_id"] = metadata["mapping_set_id"] + "rewired.sssom.tsv"
+
+        # This maybe has to be revisited as the rewiring can change the SSSOM mapping
+        prefix_map = msdf.prefix_map
+
+        msdf_rewired = from_sssom_dataframe(df_rewired, prefix_map=prefix_map, meta=metadata)
+        write_table(msdf_rewired, output)
+    else:
+        g = Graph()
+        g.parse(input, format=input_format)
+        rewire_graph(g, msdf, precedence=precedence)
+        outstring = g.serialize(format=output_format)
+        print(outstring, file=output)
 
 
 @main.command()
diff --git a/src/sssom/util.py b/src/sssom/util.py
index fc7249b0..c07c5309 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -109,6 +109,23 @@ def clean_context(self) -> None:
         """Clean up the context."""
         self.converter = curies.chain([_get_built_in_prefix_map(), self.converter])
 
+    def _standardize_curie_or_iri(self, curie_or_iri: str) -> str:
+        """Standardize a CURIE or IRI, returning the original if not possible."""
+        if is_iri(curie_or_iri):
+            return self.converter.standardize_uri(curie_or_iri) or curie_or_iri
+        if is_curie(curie_or_iri):
+            return self.converter.standardize_curie(curie_or_iri) or curie_or_iri
+        return curie_or_iri
+
+    def standardize(self) -> None:
+        """Standardize all entity reference columns of this MSDF in place."""
+        for column, values in _get_sssom_schema_object().dict["slots"].items():
+            if values["range"] != "EntityReference":
+                continue
+            if column not in self.df.columns:
+                continue
+            self.df[column] = self.df[column].map(self._standardize_curie_or_iri)
+
     def merge(self, *msdfs: "MappingSetDataFrame", inplace: bool = True) -> "MappingSetDataFrame":
         """Merge two MappingSetDataframes.
 
@@ -372,6 +389,22 @@ def get_row_based_on_hierarchy(df: pd.DataFrame):
     return hierarchical_df
 
 
+def rewire_sssom_table(df_rewire: pd.DataFrame, df_mapping: pd.DataFrame) -> pd.DataFrame:
+    """Rewire the entities of one SSSOM table using another mapping table.
+
+    :param df_rewire: Data frame of the SSSOM table whose entities are to be rewired
+    :param df_mapping: Data frame of the mappings that drive the rewiring
+    :raises NotImplementedError: Always, until the rewiring itself is implemented
+    """
+    # TODO implementation plan:
+    # 1. Standardise subject and object id columns using
+    #    https://curies.readthedocs.io/en/latest/api/curies.Converter.html#curies.Converter.pd_standardize_curie
+    # 2. Perform the rewiring
+    # 3. Store some metadata in the "other" field?
+    # 4. Return back out
+    raise NotImplementedError("Rewiring of SSSOM tables is not implemented yet")
+
+
 def assign_default_confidence(
     df: pd.DataFrame,
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -1116,22 +1149,9 @@ def reconcile_prefix_and_data(
     converter = msdf.converter
     converter = curies.remap_curie_prefixes(converter, prefix_reconciliation["prefix_synonyms"])
     converter = curies.rewire(converter, prefix_reconciliation["prefix_expansion_reconciliation"])
-
-    # TODO make this standardization code directly part of msdf after
-    # switching to native converter
-    def _upgrade(curie_or_iri: str) -> str:
-        if not is_iri(curie_or_iri) and is_curie(curie_or_iri):
-            return converter.standardize_curie(curie_or_iri) or curie_or_iri
-        return curie_or_iri
-
-    for column, values in _get_sssom_schema_object().dict["slots"].items():
-        if values["range"] != "EntityReference":
-            continue
-        if column not in msdf.df.columns:
-            continue
-        msdf.df[column] = msdf.df[column].map(_upgrade)
-
     msdf.converter = converter
+    # The reconciled converter must be set first: standardize() reads msdf.converter.
+    msdf.standardize()
     return msdf
 
 