Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Property Graph example #144

Merged
merged 2 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ sheets2linkml personinfo.tsv -o personinfo.yaml

We recommend using [COGS](https://linkml.io/schemasheets/howto/google-sheets/) to synchronize your google sheets with local files using a git-like mechanism

## Examples

- [Person Info Schema](https://docs.google.com/spreadsheets/d/1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ/edit#gid=55566104)
- [Movies Property Graph Schema](https://docs.google.com/spreadsheets/d/1oMrzA41tg_nisdWInnqKJrcvv30dOXuwAhznJYYPSB8/edit?gid=1499822522#gid=1499822522)

## Finding out more

* [Schema Sheets Manual](https://linkml.io/schemasheets)
Expand Down
2 changes: 1 addition & 1 deletion docs/howto/google-sheets.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ E.g.
sheets2linkml --gsheet-id 1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ personinfo types prefixes -o personinfo.yaml
```

__Note__: due to a bug with google sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet has floats/decimals in them. It's not clear if google will ever fix this. If you need decimals, then you should either manually download the sheet to TSV, or use COGS.
__Note__: due to a bug with google sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet has floats/decimals/booleans in them. It's not clear if google will ever fix this. If you need decimals, then you should either manually download the sheet to TSV, or use COGS.

## COGS

Expand Down
2,114 changes: 1,103 additions & 1,011 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions schemasheets/schema_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import click
from linkml_runtime.linkml_model import Element, SlotDefinition, SubsetDefinition, ClassDefinition, EnumDefinition, \
PermissibleValue, \
TypeDefinition, Example, Annotation, Prefix
TypeDefinition, Example, Annotation, Prefix, SchemaDefinition
from linkml_runtime.utils.formatutils import underscore
from linkml_runtime.utils.schemaview import SchemaView

from schemasheets.conf.configschema import ColumnSettings
from schemasheets.schemamaker import SchemaMaker
from schemasheets.schemasheet_datamodel import TableConfig, T_CLASS, T_SLOT, SchemaSheet, T_ENUM, T_PV, T_TYPE, \
T_SUBSET, T_PREFIX
T_SUBSET, T_PREFIX, T_SCHEMA

ROW = Dict[str, Any]

Expand Down Expand Up @@ -108,7 +108,7 @@ class SchemaExporter:
Exports a schema to Schema Sheets TSV format
"""
schemamaker: SchemaMaker = field(default_factory=lambda: SchemaMaker())
delimiter = '\t'
delimiter: str = field(default_factory=lambda: '\t')
rows: List[ROW] = field(default_factory=lambda: [])

def export(self, schemaview: SchemaView, to_file: Union[str, Path], specification: str = None,
Expand Down Expand Up @@ -229,6 +229,11 @@ def export_element(self, element: Element, parent: Optional[Element], schemaview
pk_col = col_name
else:
continue
elif t == T_SCHEMA:
if isinstance(element, SchemaDefinition):
pk_col = col_name
else:
continue
else:
raise AssertionError(f"Unexpected type: {t}")
if not pk_col:
Expand Down
143 changes: 119 additions & 24 deletions schemasheets/schemamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import csv
import logging
from pathlib import Path
from urllib.request import urlopen
from copy import copy

Expand All @@ -12,11 +13,13 @@
from dataclasses import dataclass
from typing import List, Union, Any, Dict, Tuple, Generator, TextIO

from linkml_runtime.dumpers import yaml_dumper, json_dumper
from linkml_runtime.linkml_model import Annotation, Example
from linkml_runtime.linkml_model.meta import SchemaDefinition, ClassDefinition, Prefix, \
SlotDefinition, EnumDefinition, PermissibleValue, SubsetDefinition, TypeDefinition, Element, Setting
from linkml_runtime.utils.schema_as_dict import schema_as_dict
from linkml_runtime.utils.schemaview import SchemaView, re
from linkml_runtime.utils.yamlutils import YAMLRoot

from schemasheets.schemasheet_datamodel import ColumnConfig, TableConfig, get_configmodel, get_metamodel, COL_NAME, \
DESCRIPTOR, \
Expand All @@ -26,6 +29,67 @@
from schemasheets.utils.prefixtool import guess_prefix_expansion


def ensure_path_tokens(path: Union[str, List[str]]) -> List[str]:
    """
    Coerce a path specification into a list of tokens.

    A dotted string such as ``"a.b.c"`` is split on ``"."``; a list is
    returned unchanged; any other string becomes a single-token list.

    :param path: dotted path string or pre-split token list
    :return: list of path tokens
    """
    if isinstance(path, list):
        return path
    return path.split(".") if "." in path else [path]


def get_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]]) -> Any:
    """
    Given an object and a path, return the value at the end of the path.

    Each path token is resolved with ``dict.get`` for dicts, otherwise with
    ``getattr``. Traversal stops as soon as a falsy intermediate value is
    reached, and that value is returned as-is.

    :param obj: dict or YAMLRoot object to descend into
    :param path: dotted path string or pre-split token list
    :return: value found at the end of the path, or None if absent
    """
    keys = path if isinstance(path, list) else path.split(".")
    node = obj
    for key in keys:
        if isinstance(node, dict):
            node = node.get(key, None)
        else:
            # https://github.com/linkml/linkml/issues/971
            node = getattr(node, key, None)
        if not node:
            # mirror the original short-circuit: do not descend through falsy values
            break
    return node


def set_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]], value: Any, depth=0) -> None:
    """
    Given an object, a path, and a value, set the value at the end of the path.

    Intermediate path segments that do not exist are created as empty dicts.
    Dicts are written via item assignment; other objects via ``setattr``.

    :param obj: dict or YAMLRoot object to write into
    :param path: dotted path string or pre-split token list
    :param value: value to store at the final path segment
    :param depth: recursion depth (used only for log messages)
    :return: None
    """
    keys = path if isinstance(path, list) else path.split(".")
    head, rest = keys[0], keys[1:]
    logging.debug(f"[{depth}] Setting attr {head} / {rest} in {obj} to {value}")
    target_is_dict = isinstance(obj, dict)
    if not rest:
        # terminal segment: assign the value directly
        if target_is_dict:
            obj[head] = value
        else:
            setattr(obj, head, value)
        return
    # non-terminal segment: ensure an intermediate container exists, then recurse
    if target_is_dict:
        if head not in obj:
            obj[head] = {}
            logging.info(f"Creating empty dict for: {head}")
        set_attr_via_path_accessor(obj[head], rest, value, depth + 1)
    else:
        if not hasattr(obj, head):
            setattr(obj, head, {})
        set_attr_via_path_accessor(getattr(obj, head), rest, value, depth + 1)


class SchemaSheetRowException(Exception):
    """Raised when a row of a schema sheet cannot be translated into schema elements."""
    pass

Expand Down Expand Up @@ -56,6 +120,8 @@ class SchemaMaker:

gsheet_id: str = None
"""Google sheet ID."""

gsheet_cache_dir: str = None

table_config_path: str = None
"""Path to table configuration file."""
Expand All @@ -82,7 +148,9 @@ def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDef
if not isinstance(csv_files, list):
csv_files = [csv_files]
for f in csv_files:
# reconstitute schema
self.load_and_merge_sheet(f, **kwargs)
self.schema = SchemaDefinition(**json_dumper.to_dict(self.schema))
self.schema.imports.append('linkml:types')
self.schema.prefixes['linkml'] = Prefix('linkml', 'https://w3id.org/linkml/')
self._tidy_slot_usage()
Expand All @@ -102,6 +170,7 @@ def _tidy_slot_usage(self):
:return:
"""
for cn, c in self.schema.classes.items():
logging.debug(f"Tidying {cn}")
inapplicable_slots = [sn for sn, s in c.slot_usage.items() if 'inapplicable' in s.annotations]
for sn in inapplicable_slots:
c.slots.remove(sn)
Expand Down Expand Up @@ -132,39 +201,47 @@ def load_and_merge_sheet(self, file_name: str, delimiter='\t') -> None:
try:
self.add_row(row, schemasheet.table_config)
line_num += 1
except ValueError as e:
raise SchemaSheetRowException(f'Error in line {line_num}, row={row}') from e
except (ValueError, AttributeError) as e:
raise SchemaSheetRowException(f"Error in line {line_num}, row={row}\n"
f"Exception:\n{e}") from e

def add_row(self, row: Dict[str, Any], table_config: TableConfig):
"""
Add and translate a row from a schema sheet to the current schema.

A row may represent an instance of a LinkML element, such as a class, slot, type,
or enum. The row may also represent a setting, prefix, or schema-level annotation.

This is known as the "focal element"(s) of the row.

:param row:
:param table_config:
:return:
"""
for element in self.row_focal_element(row, table_config):
if isinstance(element, Prefix):
name = element.prefix_prefix
elif isinstance(element, PermissibleValue):
name = element.text
elif isinstance(element, Setting):
# print(f"\n{element = }")
name = element.setting_key
else:
logging.debug(f'EL={element} in {row}')
name = element.name
logging.debug(f'ADDING: {row} // {name}')
for k, v in row.items():
# print(f"\n{k = }")
# iterate through all column values in the row
if k not in table_config.columns:
raise ValueError(f'Expected to find {k} in {table_config.columns.keys()}')
cc = table_config.columns[k]
# print(f"{cc = }")
v = self.normalize_value(v, cc)
if v:
# print(f"{v = }")
# special case: class-context provided by settings
if cc.settings.applies_to_class:
actual_element = list(self.row_focal_element(row, table_config, column=k))[0]
else:
actual_element = element
# print(f"{cc.maps_to = }")
# print(f"{cc = }")
logging.debug(f'SETTING {name} {cc.maps_to} = {v}')
logging.debug(f'SETTING {name}.{cc.maps_to} = {v} // IK={cc.settings.inner_key}')
if cc.maps_to == 'cardinality':
self.set_cardinality(actual_element, v)
elif cc.metaslot:
Expand All @@ -179,9 +256,10 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
anns = yaml.safe_load(v[0])
for ann_key, ann_val in anns.items():
actual_element.annotations[ann_key] = ann_val
elif isinstance(v, list):
elif isinstance(v, list) and not cc.settings.inner_key:
# append to existing list
setattr(actual_element, cc.maps_to, getattr(actual_element, cc.maps_to, []) + v)
elif isinstance(v, dict):
elif isinstance(v, dict) and not cc.settings.inner_key:
for v_k, v_v in v.items():
curr_dict = getattr(actual_element, cc.maps_to)
curr_dict[v_k] = v_v
Expand All @@ -196,15 +274,9 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
# will later be converted to a metamodel object
curr_obj = {}
setattr(actual_element, cc.maps_to, curr_obj)
if isinstance(curr_obj, dict):
curr_val = curr_obj.get(cc.settings.inner_key, None)
else:
# https://github.com/linkml/linkml/issues/971
curr_val = getattr(curr_obj, cc.settings.inner_key, None)
curr_val = get_attr_via_path_accessor(curr_obj, cc.settings.inner_key)
else:
curr_val = getattr(actual_element, cc.maps_to)
# print(f"{curr_val = }")
# print(f"{v = }")

if curr_val and curr_val != 'TEMP' and curr_val != v and \
not isinstance(actual_element, SchemaDefinition) and \
Expand All @@ -213,14 +285,18 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
logging.warning(f'Overwriting value for {k}, was {curr_val}, now {v}')
raise ValueError(f'Cannot reset value for {k}, was {curr_val}, now {v}')
if cc.settings.inner_key:
obj_to_set = getattr(actual_element, cc.maps_to)
if isinstance(getattr(actual_element, cc.maps_to), list):
if '|' in v:
vs = v.split('|')
else:
vs = [v]
setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
for v1 in vs:
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v1)
# setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
else:
getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v)
# getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
else:
setattr(actual_element, cc.maps_to, v)
elif cc.is_element_type:
Expand Down Expand Up @@ -317,7 +393,7 @@ def row_focal_element(self, row: Dict[str, Any], table_config: TableConfig,
else:
raise ValueError(f'Unknown metatype: {typ}')
if table_config.column_by_element_type is None:
raise ValueError(f'No table_config.column_by_element_type')
raise ValueError(f"""No table_config.column_by_element_type in {row}""")
for k, elt_cls in tmap.items():
if k in table_config.column_by_element_type:
col = table_config.column_by_element_type[k]
Expand Down Expand Up @@ -512,7 +588,13 @@ def normalize_value(self, v: str, column_config: ColumnConfig = None) -> Any:
v = bmap[v.lower()]
else:
v = bool(v)
if metaslot and metaslot.multivalued and not column_config.settings.inner_key:
# TODO: use inner_key to look up the actual slot
metaslot_is_multivalued = metaslot and metaslot.multivalued and not column_config.settings.inner_key
if metaslot and column_config.settings.inner_key:
if column_config.settings.internal_separator:
# print(f"ASSUMING MV FOR {column_config.name}")
metaslot_is_multivalued = True
if metaslot_is_multivalued:
if not isinstance(v, list):
if v is None:
v = []
Expand Down Expand Up @@ -646,10 +728,21 @@ def ensure_file(self, file_name: str) -> str:
def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
if self.gsheet_id:
url = gsheets_download_url(self.gsheet_id, file_name)
if self.gsheet_cache_dir:
# cache a copy of the file
dir_path = Path(self.gsheet_cache_dir)
dir_path.mkdir(parents=True, exist_ok=True)
path = dir_path / (file_name + '.csv')
stream = urlopen(url)
lines = [line for line in codecs.iterdecode(stream, 'utf-8')]
with open(path, 'w') as f:
f.write("".join(lines))
stream.close()
stream = urlopen(url)
text_stream = codecs.iterdecode(stream, 'utf-8')
reader = csv.DictReader(text_stream, delimiter=",")
yield reader

else:
with open(file_name) as file:
reader = csv.DictReader(file, delimiter=delimiter)
Expand Down Expand Up @@ -683,11 +776,13 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
help="Auto-repair schema")
@click.option("--gsheet-id",
help="Google sheets ID. If this is specified then the arguments MUST be sheet names")
@click.option("--gsheet-cache-dir",
help="Directory to cache google sheets")
@click.option("--base-schema-path",
help="Base schema yaml file, the base-schema will be merged with the generated schema")
@click.option("-v", "--verbose", count=True)
@click.argument('tsv_files', nargs=-1)
def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
def convert(tsv_files, gsheet_id, gsheet_cache_dir, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
unique_slots: bool, verbose: int, sort_keys: bool, base_schema_path: str):
"""
Convert schemasheets to a LinkML schema
Expand All @@ -712,6 +807,7 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
sm = SchemaMaker(use_attributes=use_attributes,
unique_slots=unique_slots,
gsheet_id=gsheet_id,
gsheet_cache_dir=gsheet_cache_dir,
default_name=name,
table_config_path=table_config_path,
base_schema_path=base_schema_path)
Expand All @@ -720,7 +816,6 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
schema = sm.repair_schema(schema)
schema_dict = schema_as_dict(schema)
output.write(yaml.dump(schema_dict, sort_keys=sort_keys))
# output.write(yaml_dumper.dumps(schema))


if __name__ == '__main__':
Expand Down
5 changes: 4 additions & 1 deletion schemasheets/schemasheet_datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def add_info(self, col: COL_NAME, info: Union[Dict, DESCRIPTOR]) -> None:
"""
if col not in self.columns:
self.columns[col] = ColumnConfig(col)
#print(f'ADDING: {col}')
self.columns[col].add_info(info)
if self.columns[col].maps_to == 'metatype':
if self.metatype_column and self.metatype_column != col:
Expand Down Expand Up @@ -204,6 +203,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
rows = []
line_num = 1
table_config_rows = []
descriptor_line_count = 0
for row in reader:
logging.debug(f"ROW: {row}")
# google sheets
Expand All @@ -213,6 +213,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
if row[k0].startswith('>'):
table_config_rows.append(row)
line_num += 1
descriptor_line_count += 1
for k, v in row.items():
if v is not None and v.startswith('>'):
v = v.replace('>', '')
Expand All @@ -226,6 +227,8 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
logging.debug(f'Empty val for {k} in line {line_num}')
else:
rows.append(row)
if descriptor_line_count == 0:
logging.warning(f"No descriptor line found in {line_num} lines. Start line_num = {line_num}")
return SchemaSheet(table_config=table_config,
table_config_rows=table_config_rows,
rows=rows,
Expand Down
Loading
Loading