Skip to content

Commit

Permalink
Adding Property Graph example (#144)
Browse files Browse the repository at this point in the history
Fixes #142

Also adding roundtrip when generating schema, catches issues with #141
  • Loading branch information
cmungall authored Jul 30, 2024
1 parent a55ba85 commit 530f303
Show file tree
Hide file tree
Showing 7 changed files with 1,282 additions and 1,040 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ sheets2linkml personinfo.tsv -o personinfo.yaml

We recommend using [COGS](https://linkml.io/schemasheets/howto/google-sheets/) to synchronize your Google Sheets with local files using a git-like mechanism.

## Examples

- [Person Info Schema](https://docs.google.com/spreadsheets/d/1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ/edit#gid=55566104)
- [Movies Property Graph Schema](https://docs.google.com/spreadsheets/d/1oMrzA41tg_nisdWInnqKJrcvv30dOXuwAhznJYYPSB8/edit?gid=1499822522#gid=1499822522)

## Finding out more

* [Schema Sheets Manual](https://linkml.io/schemasheets)
Expand Down
2 changes: 1 addition & 1 deletion docs/howto/google-sheets.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ E.g.
sheets2linkml --gsheet-id 1wVoaiFg47aT9YWNeRfTZ8tYHN8s8PAuDx5i2HUcDpvQ personinfo types prefixes -o personinfo.yaml
```

__Note__: due to a bug with google sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet has floats/decimals in them. It's not clear if google will ever fix this. If you need decimals, then you should either manually download the sheet to TSV, or use COGS.
__Note__: due to a bug in the Google Sheets API (see [this Stack Overflow question](https://stackoverflow.com/questions/61578295/google-spreadsheet-gviz-query-is-concatenating-first-two-rows-into-header)), this will not work if your sheet contains floats, decimals, or booleans. It's not clear whether Google will ever fix this. If you need these value types, you should either manually download the sheet as TSV or use COGS.

## COGS

Expand Down
2,114 changes: 1,103 additions & 1,011 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions schemasheets/schema_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import click
from linkml_runtime.linkml_model import Element, SlotDefinition, SubsetDefinition, ClassDefinition, EnumDefinition, \
PermissibleValue, \
TypeDefinition, Example, Annotation, Prefix
TypeDefinition, Example, Annotation, Prefix, SchemaDefinition
from linkml_runtime.utils.formatutils import underscore
from linkml_runtime.utils.schemaview import SchemaView

from schemasheets.conf.configschema import ColumnSettings
from schemasheets.schemamaker import SchemaMaker
from schemasheets.schemasheet_datamodel import TableConfig, T_CLASS, T_SLOT, SchemaSheet, T_ENUM, T_PV, T_TYPE, \
T_SUBSET, T_PREFIX
T_SUBSET, T_PREFIX, T_SCHEMA

ROW = Dict[str, Any]

Expand Down Expand Up @@ -108,7 +108,7 @@ class SchemaExporter:
Exports a schema to Schema Sheets TSV format
"""
schemamaker: SchemaMaker = field(default_factory=lambda: SchemaMaker())
delimiter = '\t'
delimiter: str = field(default_factory=lambda: '\t')
rows: List[ROW] = field(default_factory=lambda: [])

def export(self, schemaview: SchemaView, to_file: Union[str, Path], specification: str = None,
Expand Down Expand Up @@ -229,6 +229,11 @@ def export_element(self, element: Element, parent: Optional[Element], schemaview
pk_col = col_name
else:
continue
elif t == T_SCHEMA:
if isinstance(element, SchemaDefinition):
pk_col = col_name
else:
continue
else:
raise AssertionError(f"Unexpected type: {t}")
if not pk_col:
Expand Down
143 changes: 119 additions & 24 deletions schemasheets/schemamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import csv
import logging
from pathlib import Path
from urllib.request import urlopen
from copy import copy

Expand All @@ -12,11 +13,13 @@
from dataclasses import dataclass
from typing import List, Union, Any, Dict, Tuple, Generator, TextIO

from linkml_runtime.dumpers import yaml_dumper, json_dumper
from linkml_runtime.linkml_model import Annotation, Example
from linkml_runtime.linkml_model.meta import SchemaDefinition, ClassDefinition, Prefix, \
SlotDefinition, EnumDefinition, PermissibleValue, SubsetDefinition, TypeDefinition, Element, Setting
from linkml_runtime.utils.schema_as_dict import schema_as_dict
from linkml_runtime.utils.schemaview import SchemaView, re
from linkml_runtime.utils.yamlutils import YAMLRoot

from schemasheets.schemasheet_datamodel import ColumnConfig, TableConfig, get_configmodel, get_metamodel, COL_NAME, \
DESCRIPTOR, \
Expand All @@ -26,6 +29,67 @@
from schemasheets.utils.prefixtool import guess_prefix_expansion


def ensure_path_tokens(path: Union[str, List[str]]) -> List[str]:
    """
    Normalize a path accessor into a list of tokens.

    :param path: either a list of tokens, or a string whose tokens are separated by "."
    :return: the same list object when given a list; otherwise the string split on ".",
        or a single-element list holding the string when it contains no "."
    """
    if isinstance(path, list):
        # already tokenized; handed back as-is (not copied)
        return path
    return path.split(".") if "." in path else [path]


def get_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]]) -> Any:
    """
    Given an object and a path, return the value at the end of the path.

    Each token of the path is resolved against the current object: as a key
    when that object is a dict, and as an attribute otherwise.

    :param obj: object to start from (a dict, or any object with attributes)
    :param path: path, either a "."-separated string or a list of tokens;
        an empty list refers to ``obj`` itself
    :return: value at the end of the path, or None if any step along the path
        is missing or None
    """
    current = obj
    for tok in ensure_path_tokens(path):
        if isinstance(current, dict):
            current = current.get(tok, None)
        else:
            # https://github.com/linkml/linkml/issues/971
            current = getattr(current, tok, None)
        if current is None:
            # Only a missing/None step ends the walk. A falsy-but-present
            # intermediate (e.g. an empty dict) must still be descended into,
            # otherwise the container itself would be returned as if it were
            # the leaf value.
            return None
    return current


def set_attr_via_path_accessor(obj: Union[dict, YAMLRoot], path: Union[str, List[str]], value: Any, depth=0) -> None:
    """
    Given an object, a path, and a value, set the value at the end of the path.

    Each token of the path is resolved against the current object: as a key
    when that object is a dict, and as an attribute otherwise. Intermediate
    steps that are missing, or present but None, are created as empty dicts.

    :param obj: object to modify in place (a dict, or any object with attributes)
    :param path: path, either a "."-separated string or a list of tokens;
        must contain at least one token
    :param value: value to store at the end of the path
    :param depth: recursion depth (only used in the debug log message)
    :return: None
    """
    toks = ensure_path_tokens(path)
    tok = toks[0]
    toks = toks[1:]
    logging.debug(f"[{depth}] Setting attr {tok} / {toks} in {obj} to {value}")
    is_dict = isinstance(obj, dict)
    if not toks:
        # last token: store the value itself
        if is_dict:
            obj[tok] = value
        else:
            setattr(obj, tok, value)
        return
    child = obj.get(tok, None) if is_dict else getattr(obj, tok, None)
    if child is None:
        # A step that exists but holds None is treated like a missing one;
        # recursing into None would otherwise fail with AttributeError.
        child = {}
        if is_dict:
            obj[tok] = child
            logging.info(f"Creating empty dict for: {tok}")
        else:
            setattr(obj, tok, child)
    set_attr_via_path_accessor(child, toks, value, depth + 1)


class SchemaSheetRowException(Exception):
    """Error raised when a row of a schema sheet cannot be processed."""

Expand Down Expand Up @@ -56,6 +120,8 @@ class SchemaMaker:

gsheet_id: str = None
"""Google sheet ID."""

gsheet_cache_dir: str = None

table_config_path: str = None
"""Path to table configuration file."""
Expand All @@ -82,7 +148,9 @@ def create_schema(self, csv_files: Union[str, List[str]], **kwargs) -> SchemaDef
if not isinstance(csv_files, list):
csv_files = [csv_files]
for f in csv_files:
# reconstitute schema
self.load_and_merge_sheet(f, **kwargs)
self.schema = SchemaDefinition(**json_dumper.to_dict(self.schema))
self.schema.imports.append('linkml:types')
self.schema.prefixes['linkml'] = Prefix('linkml', 'https://w3id.org/linkml/')
self._tidy_slot_usage()
Expand All @@ -102,6 +170,7 @@ def _tidy_slot_usage(self):
:return:
"""
for cn, c in self.schema.classes.items():
logging.debug(f"Tidying {cn}")
inapplicable_slots = [sn for sn, s in c.slot_usage.items() if 'inapplicable' in s.annotations]
for sn in inapplicable_slots:
c.slots.remove(sn)
Expand Down Expand Up @@ -132,39 +201,47 @@ def load_and_merge_sheet(self, file_name: str, delimiter='\t') -> None:
try:
self.add_row(row, schemasheet.table_config)
line_num += 1
except ValueError as e:
raise SchemaSheetRowException(f'Error in line {line_num}, row={row}') from e
except (ValueError, AttributeError) as e:
raise SchemaSheetRowException(f"Error in line {line_num}, row={row}\n"
f"Exception:\n{e}") from e

def add_row(self, row: Dict[str, Any], table_config: TableConfig):
"""
Add and translate a row from a schema sheet to the current schema.
A row may represent an instance of a LinkML element, such as a class, slot, type,
or enum. The row may also represent a setting, prefix, or schema-level annotation.
This is known as the "focal element"(s) of the row.
:param row:
:param table_config:
:return:
"""
for element in self.row_focal_element(row, table_config):
if isinstance(element, Prefix):
name = element.prefix_prefix
elif isinstance(element, PermissibleValue):
name = element.text
elif isinstance(element, Setting):
# print(f"\n{element = }")
name = element.setting_key
else:
logging.debug(f'EL={element} in {row}')
name = element.name
logging.debug(f'ADDING: {row} // {name}')
for k, v in row.items():
# print(f"\n{k = }")
# iterate through all column values in the row
if k not in table_config.columns:
raise ValueError(f'Expected to find {k} in {table_config.columns.keys()}')
cc = table_config.columns[k]
# print(f"{cc = }")
v = self.normalize_value(v, cc)
if v:
# print(f"{v = }")
# special case: class-context provided by settings
if cc.settings.applies_to_class:
actual_element = list(self.row_focal_element(row, table_config, column=k))[0]
else:
actual_element = element
# print(f"{cc.maps_to = }")
# print(f"{cc = }")
logging.debug(f'SETTING {name} {cc.maps_to} = {v}')
logging.debug(f'SETTING {name}.{cc.maps_to} = {v} // IK={cc.settings.inner_key}')
if cc.maps_to == 'cardinality':
self.set_cardinality(actual_element, v)
elif cc.metaslot:
Expand All @@ -179,9 +256,10 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
anns = yaml.safe_load(v[0])
for ann_key, ann_val in anns.items():
actual_element.annotations[ann_key] = ann_val
elif isinstance(v, list):
elif isinstance(v, list) and not cc.settings.inner_key:
# append to existing list
setattr(actual_element, cc.maps_to, getattr(actual_element, cc.maps_to, []) + v)
elif isinstance(v, dict):
elif isinstance(v, dict) and not cc.settings.inner_key:
for v_k, v_v in v.items():
curr_dict = getattr(actual_element, cc.maps_to)
curr_dict[v_k] = v_v
Expand All @@ -196,15 +274,9 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
# will later be converted to a metamodel object
curr_obj = {}
setattr(actual_element, cc.maps_to, curr_obj)
if isinstance(curr_obj, dict):
curr_val = curr_obj.get(cc.settings.inner_key, None)
else:
# https://github.com/linkml/linkml/issues/971
curr_val = getattr(curr_obj, cc.settings.inner_key, None)
curr_val = get_attr_via_path_accessor(curr_obj, cc.settings.inner_key)
else:
curr_val = getattr(actual_element, cc.maps_to)
# print(f"{curr_val = }")
# print(f"{v = }")

if curr_val and curr_val != 'TEMP' and curr_val != v and \
not isinstance(actual_element, SchemaDefinition) and \
Expand All @@ -213,14 +285,18 @@ def add_row(self, row: Dict[str, Any], table_config: TableConfig):
logging.warning(f'Overwriting value for {k}, was {curr_val}, now {v}')
raise ValueError(f'Cannot reset value for {k}, was {curr_val}, now {v}')
if cc.settings.inner_key:
obj_to_set = getattr(actual_element, cc.maps_to)
if isinstance(getattr(actual_element, cc.maps_to), list):
if '|' in v:
vs = v.split('|')
else:
vs = [v]
setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
for v1 in vs:
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v1)
# setattr(actual_element, cc.maps_to, [{cc.settings.inner_key: v} for v in vs])
else:
getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
set_attr_via_path_accessor(obj_to_set, cc.settings.inner_key, v)
# getattr(actual_element, cc.maps_to)[cc.settings.inner_key] = v
else:
setattr(actual_element, cc.maps_to, v)
elif cc.is_element_type:
Expand Down Expand Up @@ -317,7 +393,7 @@ def row_focal_element(self, row: Dict[str, Any], table_config: TableConfig,
else:
raise ValueError(f'Unknown metatype: {typ}')
if table_config.column_by_element_type is None:
raise ValueError(f'No table_config.column_by_element_type')
raise ValueError(f"""No table_config.column_by_element_type in {row}""")
for k, elt_cls in tmap.items():
if k in table_config.column_by_element_type:
col = table_config.column_by_element_type[k]
Expand Down Expand Up @@ -512,7 +588,13 @@ def normalize_value(self, v: str, column_config: ColumnConfig = None) -> Any:
v = bmap[v.lower()]
else:
v = bool(v)
if metaslot and metaslot.multivalued and not column_config.settings.inner_key:
# TODO: use inner_key to look up the actual slot
metaslot_is_multivalued = metaslot and metaslot.multivalued and not column_config.settings.inner_key
if metaslot and column_config.settings.inner_key:
if column_config.settings.internal_separator:
# print(f"ASSUMING MV FOR {column_config.name}")
metaslot_is_multivalued = True
if metaslot_is_multivalued:
if not isinstance(v, list):
if v is None:
v = []
Expand Down Expand Up @@ -646,10 +728,21 @@ def ensure_file(self, file_name: str) -> str:
def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
if self.gsheet_id:
url = gsheets_download_url(self.gsheet_id, file_name)
if self.gsheet_cache_dir:
# cache a copy of the file
dir_path = Path(self.gsheet_cache_dir)
dir_path.mkdir(parents=True, exist_ok=True)
path = dir_path / (file_name + '.csv')
stream = urlopen(url)
lines = [line for line in codecs.iterdecode(stream, 'utf-8')]
with open(path, 'w') as f:
f.write("".join(lines))
stream.close()
stream = urlopen(url)
text_stream = codecs.iterdecode(stream, 'utf-8')
reader = csv.DictReader(text_stream, delimiter=",")
yield reader

else:
with open(file_name) as file:
reader = csv.DictReader(file, delimiter=delimiter)
Expand Down Expand Up @@ -683,11 +776,13 @@ def ensure_csvreader(self, file_name: str, delimiter=None) -> str:
help="Auto-repair schema")
@click.option("--gsheet-id",
help="Google sheets ID. If this is specified then the arguments MUST be sheet names")
@click.option("--gsheet-cache-dir",
help="Directory to cache google sheets")
@click.option("--base-schema-path",
help="Base schema yaml file, the base-schema will be merged with the generated schema")
@click.option("-v", "--verbose", count=True)
@click.argument('tsv_files', nargs=-1)
def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
def convert(tsv_files, gsheet_id, gsheet_cache_dir, output: TextIO, name, repair, table_config_path: str, use_attributes: bool,
unique_slots: bool, verbose: int, sort_keys: bool, base_schema_path: str):
"""
Convert schemasheets to a LinkML schema
Expand All @@ -712,6 +807,7 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
sm = SchemaMaker(use_attributes=use_attributes,
unique_slots=unique_slots,
gsheet_id=gsheet_id,
gsheet_cache_dir=gsheet_cache_dir,
default_name=name,
table_config_path=table_config_path,
base_schema_path=base_schema_path)
Expand All @@ -720,7 +816,6 @@ def convert(tsv_files, gsheet_id, output: TextIO, name, repair, table_config_pat
schema = sm.repair_schema(schema)
schema_dict = schema_as_dict(schema)
output.write(yaml.dump(schema_dict, sort_keys=sort_keys))
# output.write(yaml_dumper.dumps(schema))


if __name__ == '__main__':
Expand Down
5 changes: 4 additions & 1 deletion schemasheets/schemasheet_datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def add_info(self, col: COL_NAME, info: Union[Dict, DESCRIPTOR]) -> None:
"""
if col not in self.columns:
self.columns[col] = ColumnConfig(col)
#print(f'ADDING: {col}')
self.columns[col].add_info(info)
if self.columns[col].maps_to == 'metatype':
if self.metatype_column and self.metatype_column != col:
Expand Down Expand Up @@ -204,6 +203,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
rows = []
line_num = 1
table_config_rows = []
descriptor_line_count = 0
for row in reader:
logging.debug(f"ROW: {row}")
# google sheets
Expand All @@ -213,6 +213,7 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
if row[k0].startswith('>'):
table_config_rows.append(row)
line_num += 1
descriptor_line_count += 1
for k, v in row.items():
if v is not None and v.startswith('>'):
v = v.replace('>', '')
Expand All @@ -226,6 +227,8 @@ def from_dictreader(reader: csv.DictReader) -> "SchemaSheet":
logging.debug(f'Empty val for {k} in line {line_num}')
else:
rows.append(row)
if descriptor_line_count == 0:
logging.warning(f"No descriptor line found in {line_num} lines. Start line_num = {line_num}")
return SchemaSheet(table_config=table_config,
table_config_rows=table_config_rows,
rows=rows,
Expand Down
Loading

0 comments on commit 530f303

Please sign in to comment.