Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serialise datamodels to RDF based on JSON-LD context #471

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/api_reference/utils/rdf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# rdf

::: oteapi.utils.rdf
113 changes: 113 additions & 0 deletions oteapi/utils/rdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Utility functions for representing instances of pydantic models as rdf.

This module uses JSON-LD with a shared context on https://w3id.org/emmo/domain/oteio/context
"""

import io
import json
from pathlib import Path
from typing import TYPE_CHECKING

import rdflib
import yaml

if TYPE_CHECKING: # pragma: no cover
from typing import Any, Optional, TextIO, Union

# import tripper


def load_content(
    source: "Optional[Union[Path, str, TextIO]]" = None,
    data: "Optional[str]" = None,
    format: "Optional[str]" = None,
) -> "Any":
    """Load content from a YAML or JSON source.

    Arguments:
        source: File name or file-like object with data documentation to add.
        data: String containing the data documentation to add.
        format: Input format. One of: "yaml", "json".
            By default it will be inferred from `source` or `data`.

    Returns:
        Python representation of the content.

    Raises:
        TypeError: If neither `source` nor `data` is given, or if `format`
            is unsupported.
        ValueError: If the format cannot be inferred.
    """
    if not source and not data:
        raise TypeError("Either `source` or `data` must be given.")

    if source and isinstance(source, (str, Path)):
        # Open named files ourselves and recurse with the file object.
        with open(source, "rt", encoding="utf8") as f:
            return load_content(source=f, format=format)

    if format is None:
        if source:
            # Objects returned by open() expose the file name, but
            # in-memory streams (e.g. io.StringIO) may not - fall through
            # to the "cannot be inferred" error instead of crashing.
            name = getattr(source, "name", "")
            format = Path(name).suffix if name else None
        elif data.lstrip().startswith("---"):
            format = "yaml"
        elif data.lstrip().startswith("{"):
            format = "json"

    if format is None:
        raise ValueError("Format cannot be inferred. Use `format` argument.")

    format = format.lstrip(".").lower()
    if format in ("yaml", "yml"):
        if not source:
            source = io.StringIO(data)
        content = yaml.safe_load(source)
    elif format == "json":
        # NOTE: the previous `format in ("json")` (missing comma) tested
        # substring membership in the string "json" and wrongly accepted
        # e.g. "js" or "on".
        content = json.load(source) if source else json.loads(data)
    else:
        raise TypeError(f"Unsupported format: {format}")

    return content


def add_resource(
    source: "Optional[Union[Path, str, TextIO]]" = None,
    data: "Optional[Union[dict, str]]" = None,
    format: "Optional[str]" = None,
    graph: "Optional[Union[rdflib.Graph, Any]]" = None,
) -> "Union[rdflib.Graph, Any]":
    """Add documentation of data resource(s) to triplestore.

    Arguments:

        source: File name or file-like object with data documentation
            to add.
        data: Dict or string containing the data documentation to add.
        format: Input format. One of: "yaml", "json".
            By default it will be inferred from `source` or `data`.
        graph: The graph to add the documentation to. It can be a
            rdflib.Graph object or any type that has a parse() method
            that supports json-ld.
            If not given, a new rdflib.Graph object will be created.

    Returns:
        The provided graph or a new rdflib.Graph object, if `graph` is
        None.

    Raises:
        TypeError: If the loaded content is not a dict.
    """
    if isinstance(data, dict):
        # Copy so that the default-context injection below does not
        # mutate the caller's dict.
        content = data.copy()
    else:
        content = load_content(source=source, data=data, format=format)

    if not isinstance(content, dict):
        raise TypeError("Expected input content to be a dict.")

    # Fall back to the shared OTEIO context when the document does not
    # bring its own.
    if "@context" not in content:
        content["@context"] = "https://w3id.org/emmo/domain/oteio/context"

    # An rdflib.Graph with zero triples is falsy, so `if not graph:`
    # would silently discard a caller-supplied empty graph - test for
    # None explicitly.
    if graph is None:
        graph = rdflib.Graph()

    # Parse the JSON-LD in memory instead of round-tripping through the
    # "xxx.json" debug file that the previous version left in the CWD.
    graph.parse(data=json.dumps(content), format="json-ld")
    return graph
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"pydantic~=2.7",
"pydantic-settings~=2.2",
"typing-extensions~=4.11; python_version < '3.10'",
"rdflib>=6.3",

# Strategy dependencies
"celery>=5.3.5,<6",
Expand Down
237 changes: 237 additions & 0 deletions tests/models/test_jsonld.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
import json
from pathlib import Path

import rdflib
from rdflib.plugins.shared.jsonld.context import Context

from oteapi.utils import rdf

# Directory layout: this file lives in tests/models/; static test
# fixtures (e.g. resources.yaml) live in tests/static/.
thisdir = Path(__file__).resolve().parent
testdir = thisdir.parent
staticdir = testdir / "static"

# s = """
# {
# "@context": {
# "@vocab": "http://xmlns.com/foaf/0.1/",
# "knows": {"@type": "@id"}
# },
# "@id": "http://manu.sporny.org/about#manu",
# "@type": "Person",
# "name": "Manu Sporny",
# "knows": {
# "@id": "https://greggkellogg.net/foaf#me",
# "@type": "Person",
# "name": "Gregg Kellogg"
# }
# }
# """
# g = rdflib.Graph()
# g.parse(data=s, format="json-ld")
# #print(g.serialize(format="turtle"))
# #print("------------------------------------------------")
# #print()
#
#
# conf = """
# {
# "@context": {
# "oteio": "https://w3id.org/emmo/domain/oteio#",
# "dcat": "http://www.w3.org/ns/dcat#",
# "dcterms": "http://purl.org/dc/terms/",
#
# "downloadURL": "dcat:downloadURL",
# "mediaType": "dcat:mediaType",
# "license": "dcterms:license",
# "driver": "oteio:driver",
# "configuration": "oteio:configuration",
# "dataresource": "oteio:dataresource"
# },
# "http://example.com/ex/faithfull": {
# "@type": "oteio:Source",
# "@id": "http://example.com/ex/faithfull",
# "dataresource": {
# "downloadURL": "http://example.com/datasets/faithfull.csv",
# "mediaType": "application/csv",
# "license": "https://creativecommons.org/licenses/by/4.0/legalcode",
# "configuration": {
# "driver": "csv"
# }
# }
# }
# }
# """
# # "http://example.com/ex/faithfull": {
# g = rdflib.Graph()
# #g.bind("ex", "http://example.com/ex/")
# g.parse(data=conf, format="json-ld")
# #print(g.serialize(format="turtle"))
# #print("------------------------------------------------")
# #print()


conf2 = """
{
"@context": {
"@version": 1.1,

"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"dcterms": "http://purl.org/dc/terms/",
"dcat": "http://www.w3.org/ns/dcat#",
"oteio": "https://w3id.org/emmo/domain/oteio#",

"resources": "@nest",
"configuration": {
"@id": "oteio:configuration",
"@type": "@json"
},
"dataresource": {
"@id": "oteio:dataresource",
"@type": "oteio:DataResource",
"@nest": "resources"
},
"downloadURL": "dcat:downloadURL",
"mediaType": "dcat:mediaType",
"license": "dcterms:license",

"parse": "oteio:parse",
"parserType": "oteio:parserType",
"datamodel": "oteio:datamodel",

"driver": "oteio:driver"
},

"resources": [
{
"@type": "oteio:Source",
"@id": "http://example.com/ex/faithfull",
"dataresource": {
"downloadURL": "http://example.com/datasets/faithfull.csv",
"mediaType": "application/csv",
"license": "https://creativecommons.org/licenses/by/4.0/legalcode"
},
"parse": {
"parserType": "application/vnd.dlite-parse",
"datamodel": "http://onto-ns.com/meta/calm/0.1/Composition",
"configuration": {
"driver": "csv"
}
}
}
]
}
"""
# "http://example.com/ex/faithfull": {
g = rdflib.Graph()
# g.bind("ex", "http://example.com/ex/")
# print(json.loads(conf2))
g.parse(data=conf2, format="json-ld")
print(g.serialize(format="turtle"))
print("------------------------------------------------")
print()
context_data = json.loads(conf2).get("@context")
context = Context(context_data)
graph_data = json.loads(
g.serialize(format="json-ld", context_data=context_data, auto_compact=True)
).get("@graph")
# print(json.dumps(graph_data, indent=2))
# print("------------------------------------------------")
# print()


def expand(item):
    """Return a copy of `item` in which every reference to a blank node
    is replaced by the (recursively expanded) node itself.

    Blank-node references are looked up in the module-level `iris` index.
    Values carrying an rdf:JSON datatype are decoded back to Python
    objects; other ``@value`` wrappers collapse to their literal value.
    """
    result = {}
    for key, value in item.items():
        if key == "@id":
            if value.startswith("_:"):
                # Inline the referenced blank node, dropping its
                # internal identifier.
                node = dict(iris[value])
                node.pop("@id")
                result.update(expand(node))
            else:
                result[key] = value
        elif isinstance(value, dict):
            if "@value" not in value:
                result[key] = expand(value)
            elif value.get("@type") == "rdf:JSON":
                # JSON literals are stored as strings; decode them.
                result[key] = json.loads(value["@value"])
            else:
                result[key] = value["@value"]
        else:
            result[key] = value
    return result


def from_rdf(graph, context_data):
    """Reconstruct a compacted resources dict from `graph`.

    Serialises `graph` as compacted JSON-LD using `context_data` and
    returns a dict whose "resources" list holds one expanded dict per
    non-blank top-level node.

    NOTE(review): expand() resolves blank nodes through the module-level
    `iris` index, which is built from the module-level graph - confirm
    this still holds when called with a different `graph`.
    """
    # BUG FIX: the original serialised the module-level graph `g`,
    # silently ignoring the `graph` argument.
    graph_data = json.loads(
        graph.serialize(format="json-ld", context_data=context_data, auto_compact=True)
    ).get("@graph")
    resources = [
        expand(d) for d in graph_data if "@id" in d and not d["@id"].startswith("_:")
    ]
    return {"resources": resources}


# Index the compacted node objects by @id so that expand() can resolve
# blank-node references back into nested dicts.
iris = {d["@id"]: d for d in graph_data if "@id" in d}
assert "http://example.com/ex/faithfull" in iris
# resources = [
#     expand(d) for d in graph_data
#     if "@id" in d and not d["@id"].startswith("_:")
# ]
#
# json_data = {
#     "resources": resources,
# }
# print(json.dumps(json_data, indent=2))
print(json.dumps(from_rdf(g, context_data), indent=2))


# PREFIX ex: <http://example.com/ex/>
res = g.query(
"""
PREFIX ex: <http://example.com/ex/>
CONSTRUCT { ?s ?p ?o }
WHERE {
ex:faithfull (<>|!<>) ?s .
?s ?p ?o .
}
"""
)


# Earlier experiment: transitive closure from :A via the `(<>|!<>)*`
# any-predicate path, kept for reference.
# data = """
# @prefix : <urn:ex:> .
#
# :A :p :B, :C .
# :B :q :D .
# :C :r :E .
#
# :F :s :G .
# :G :t :H .
# """
# query = """
# PREFIX x: <urn:ex:>
# PREFIX : <urn:ex:>
#
# CONSTRUCT {
#     ?s ?p ?o
# }
# WHERE {
#     :A (<>|!<>)* ?s .
#     ?s ?p ?o .
# }
# """
# graph = rdflib.Graph()
# graph.parse(data=data)
# res = graph.query(query)


# with open(staticdir / "resources.yaml", "rt", encoding="utf8") as f:
#     data = yaml.safe_load(f)

# Exercise the helper under test: load the YAML resource documentation
# from the static fixtures into a fresh rdflib graph.
graph = rdf.add_resource(staticdir / "resources.yaml")
Loading