!pip install -q rdflib pyshacl
[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip

This notebook demonstrates how to convert metadata from GeoCroissant, a geospatial extension of MLCommons Croissant, into GeoDCAT (DCAT-AP for geospatial datasets).
GeoDCAT is a standardized RDF-based metadata model for publishing geospatial datasets, enabling: - Metadata interoperability (with CKAN, INSPIRE, EU portals) - Semantic web support via RDF/JSON-LD and Turtle - Cataloging of spatial, temporal, and distribution metadata
| GeoCroissant Field | GeoDCAT Field |
|---|---|
| name | dct:title |
| description | dct:description |
| license | dct:license |
| version | adms:version |
| datePublished | dct:issued |
| conformsTo | dct:conformsTo |
| keywords | dcat:keyword |
| spatialCoverage.geo.box | dct:spatial + geo:asWKT |
| temporalCoverage | dct:temporal + DCAT dates |
| geocr:coordinateReferenceSystem | geocr:coordinateReferenceSystem |
| geocr:spatialResolution | geocr:spatialResolution |
| geocr:temporalResolution | geocr:temporalResolution |
| distribution (cr:FileObject) | dcat:Distribution |
| distribution.contentUrl | dcat:accessURL |
| distribution.encodingFormat | dcat:mediaType |
| recordSet (cr:RecordSet) | geocr:RecordSet |
| recordSet.field (cr:Field) | geocr:Field |
We use: - rdflib for manipulating RDF graphs - pyshacl for validating metadata using SHACL constraints
!pip install -q rdflib pyshacl
[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip
This function converts proper GeoCroissant metadata with full compliance to GeoDCAT-AP format.
import json
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import DCTERMS, DCAT, FOAF, XSD, RDF
from urllib.parse import quote
def geocroissant_to_geodcat_jsonld(geocroissant_json, output_file="geodcat.jsonld"):
    """Convert GeoCroissant JSON-LD metadata to a GeoDCAT-AP compliant RDF graph.

    Parameters
    ----------
    geocroissant_json : dict
        Parsed GeoCroissant JSON-LD document (e.g. loaded from croissant.json).
    output_file : str
        Path for the JSON-LD serialization. A sibling Turtle file is also
        written, with ".jsonld" swapped for ".ttl".

    Returns
    -------
    rdflib.Graph
        The populated graph (also serialized to disk as a side effect).
    """
    g = Graph()

    # Namespaces used by GeoDCAT-AP plus the GeoCroissant extension.
    GEO = Namespace("http://www.opengis.net/ont/geosparql#")
    SCHEMA = Namespace("https://schema.org/")
    SPDX = Namespace("http://spdx.org/rdf/terms#")
    ADMS = Namespace("http://www.w3.org/ns/adms#")
    PROV = Namespace("http://www.w3.org/ns/prov#")
    GEOCR = Namespace("http://mlcommons.org/croissant/geo/")
    for prefix, ns in [
        ("dct", DCTERMS), ("dcat", DCAT), ("foaf", FOAF), ("geo", GEO),
        ("schema", SCHEMA), ("spdx", SPDX), ("adms", ADMS), ("prov", PROV),
        ("geocr", GEOCR),
    ]:
        g.bind(prefix, ns)

    def _as_list(value):
        # GeoCroissant allows a single value where a list is expected;
        # normalize so iteration never walks over the characters of a string.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _safe_segment(value):
        # Percent-encode a value used as a URI path segment so spaces and
        # special characters in @id / name fields yield valid IRIs.
        return quote(str(value), safe="")

    # Mint the dataset URI from the (URL-encoded) dataset name.
    dataset_name = geocroissant_json.get("name", "dataset")
    dataset_uri = URIRef(f"https://example.org/{_safe_segment(dataset_name)}")

    # Basic dataset properties (typed both as DCAT and schema.org Dataset).
    g.add((dataset_uri, RDF.type, DCAT.Dataset))
    g.add((dataset_uri, RDF.type, SCHEMA.Dataset))
    g.add((dataset_uri, DCTERMS.title, Literal(geocroissant_json["name"])))
    g.add((dataset_uri, DCTERMS.description, Literal(geocroissant_json["description"])))

    # License. Only absolute URIs become URIRefs: a bare identifier such as
    # "CC-BY-4.0" would otherwise be resolved by rdflib as a relative
    # reference against the current working directory, producing a bogus
    # file:// URI in the serialized output.
    if "license" in geocroissant_json:
        license_value = geocroissant_json["license"]
        if isinstance(license_value, str) and "://" in license_value:
            g.add((dataset_uri, DCTERMS.license, URIRef(license_value)))
        else:
            g.add((dataset_uri, DCTERMS.license, Literal(license_value)))

    # Version
    if "version" in geocroissant_json:
        g.add((dataset_uri, ADMS.version, Literal(geocroissant_json["version"])))

    # Date published
    if "datePublished" in geocroissant_json:
        g.add((dataset_uri, DCTERMS.issued,
               Literal(geocroissant_json["datePublished"], datatype=XSD.date)))

    # ConformsTo (string or list of conformance URIs)
    for conformance in _as_list(geocroissant_json.get("conformsTo")):
        g.add((dataset_uri, DCTERMS.conformsTo, URIRef(conformance)))

    # Keywords (string or list)
    for keyword in _as_list(geocroissant_json.get("keywords")):
        g.add((dataset_uri, DCAT.keyword, Literal(keyword)))

    # Spatial coverage: a schema.org GeoShape "box" ("south west north east")
    # becomes a dct:Location carrying a GeoSPARQL WKT polygon.
    spatial_coverage = geocroissant_json.get("spatialCoverage", {})
    if spatial_coverage and "geo" in spatial_coverage:
        geo_shape = spatial_coverage["geo"]
        if "box" in geo_shape:
            bbox = geo_shape["box"].split()
            if len(bbox) == 4:
                spatial_uri = URIRef(f"{dataset_uri}/spatial")
                g.add((dataset_uri, DCTERMS.spatial, spatial_uri))
                g.add((spatial_uri, RDF.type, DCTERMS.Location))
                south, west, north, east = bbox
                # Closed ring: last vertex repeats the first, per WKT rules.
                wkt_bbox = (
                    f"POLYGON(({west} {south}, {east} {south}, "
                    f"{east} {north}, {west} {north}, {west} {south}))"
                )
                g.add((spatial_uri, GEO.asWKT,
                       Literal(wkt_bbox, datatype=GEO.wktLiteral)))

    # Temporal coverage encoded as an ISO 8601 "start/end" interval.
    temporal_coverage = geocroissant_json.get("temporalCoverage")
    if temporal_coverage and "/" in temporal_coverage:
        # maxsplit=1 keeps any further "/" characters inside the end value.
        start_date, end_date = temporal_coverage.split("/", 1)
        temporal_uri = URIRef(f"{dataset_uri}/temporal")
        g.add((dataset_uri, DCTERMS.temporal, temporal_uri))
        g.add((temporal_uri, RDF.type, DCTERMS.PeriodOfTime))
        g.add((temporal_uri, DCAT.startDate, Literal(start_date, datatype=XSD.date)))
        g.add((temporal_uri, DCAT.endDate, Literal(end_date, datatype=XSD.date)))

    # Coordinate reference system, mapped onto the OGC CRS register.
    if "geocr:coordinateReferenceSystem" in geocroissant_json:
        crs_uri = URIRef(
            f"http://www.opengis.net/def/crs/{geocroissant_json['geocr:coordinateReferenceSystem']}"
        )
        g.add((dataset_uri, GEOCR.coordinateReferenceSystem, crs_uri))

    def _add_quantity(predicate, res):
        # Attach a schema:QuantitativeValue blank node (value + unitText)
        # for a spatial/temporal resolution entry.
        if isinstance(res, dict) and "@type" in res:
            node = BNode()
            g.add((dataset_uri, predicate, node))
            g.add((node, RDF.type, SCHEMA.QuantitativeValue))
            if "value" in res:
                g.add((node, SCHEMA.value, Literal(res["value"])))
            if "unitText" in res:
                g.add((node, SCHEMA.unitText, Literal(res["unitText"])))

    if "geocr:spatialResolution" in geocroissant_json:
        _add_quantity(GEOCR.spatialResolution,
                      geocroissant_json["geocr:spatialResolution"])
    if "geocr:temporalResolution" in geocroissant_json:
        _add_quantity(GEOCR.temporalResolution,
                      geocroissant_json["geocr:temporalResolution"])

    # Distributions: cr:FileObject and cr:FileSet both map to
    # dcat:Distribution (FileSet additionally typed geocr:FileSet).
    for dist in geocroissant_json.get("distribution", []):
        dist_type = dist.get("@type")
        if dist_type == "cr:FileObject":
            dist_id = dist.get("@id", "distribution")
            dist_uri = URIRef(f"{dataset_uri}/distribution/{_safe_segment(dist_id)}")
            g.add((dataset_uri, DCAT.distribution, dist_uri))
            g.add((dist_uri, RDF.type, DCAT.Distribution))
            if "name" in dist:
                g.add((dist_uri, DCTERMS.title, Literal(dist["name"])))
            if "description" in dist:
                g.add((dist_uri, DCTERMS.description, Literal(dist["description"])))
            if "contentUrl" in dist:
                g.add((dist_uri, DCAT.accessURL, URIRef(dist["contentUrl"])))
            if "encodingFormat" in dist:
                g.add((dist_uri, DCAT.mediaType, Literal(dist["encodingFormat"])))
            if "md5" in dist:
                # SPDX checksum structure: algorithm + hex digest.
                checksum_node = BNode()
                g.add((dist_uri, SPDX.checksum, checksum_node))
                g.add((checksum_node, RDF.type, SPDX.Checksum))
                g.add((checksum_node, SPDX.algorithm, SPDX.checksumAlgorithm_md5))
                g.add((checksum_node, SPDX.checksumValue, Literal(dist["md5"])))
        elif dist_type == "cr:FileSet":
            dist_id = dist.get("@id", "fileset")
            dist_uri = URIRef(f"{dataset_uri}/distribution/{_safe_segment(dist_id)}")
            g.add((dataset_uri, DCAT.distribution, dist_uri))
            g.add((dist_uri, RDF.type, DCAT.Distribution))
            g.add((dist_uri, RDF.type, GEOCR.FileSet))
            if "name" in dist:
                g.add((dist_uri, DCTERMS.title, Literal(dist["name"])))
            if "description" in dist:
                g.add((dist_uri, DCTERMS.description, Literal(dist["description"])))
            if "encodingFormat" in dist:
                g.add((dist_uri, DCAT.mediaType, Literal(dist["encodingFormat"])))
            if "includes" in dist:
                g.add((dist_uri, GEOCR.includes, Literal(dist["includes"])))

    # Record sets and their fields, kept in the geocr namespace since DCAT
    # has no native notion of tabular record structure.
    for record_set in geocroissant_json.get("recordSet", []):
        if record_set.get("@type") != "cr:RecordSet":
            continue
        rs_id = record_set.get("@id", record_set.get("name", "recordset"))
        rs_uri = URIRef(f"{dataset_uri}/recordset/{_safe_segment(rs_id)}")
        g.add((dataset_uri, GEOCR.recordSet, rs_uri))
        g.add((rs_uri, RDF.type, GEOCR.RecordSet))
        if "name" in record_set:
            g.add((rs_uri, DCTERMS.title, Literal(record_set["name"])))
        if "description" in record_set:
            g.add((rs_uri, DCTERMS.description, Literal(record_set["description"])))
        for field in record_set.get("field", []):
            if field.get("@type") != "cr:Field":
                continue
            field_id = field.get("@id", field.get("name", "field"))
            field_uri = URIRef(f"{rs_uri}/field/{_safe_segment(field_id)}")
            g.add((rs_uri, GEOCR.field, field_uri))
            g.add((field_uri, RDF.type, GEOCR.Field))
            if "name" in field:
                g.add((field_uri, DCTERMS.title, Literal(field["name"])))
            if "description" in field:
                g.add((field_uri, DCTERMS.description, Literal(field["description"])))
            if "dataType" in field:
                g.add((field_uri, GEOCR.dataType, Literal(field["dataType"])))

    # Serialize both JSON-LD and Turtle forms.
    g.serialize(destination=output_file, format="json-ld", indent=2)
    print(f"GeoDCAT JSON-LD metadata written to {output_file}")
    ttl_file = output_file.replace(".jsonld", ".ttl")
    g.serialize(destination=ttl_file, format="turtle")
    print(f"✓ GeoDCAT Turtle metadata written to {ttl_file}")
    return g

# We load the croissant.json file and convert it using our function.
This will produce: - geodcat.jsonld: GeoDCAT in JSON-LD format - geodcat.ttl: GeoDCAT in Turtle (RDF) format
# Load GeoCroissant metadata and convert to GeoDCAT.
with open("croissant.json", "r") as f:
    geocroissant = json.load(f)

# Perform conversion (writes geodcat.jsonld and geodcat.ttl as side effects).
graph = geocroissant_to_geodcat_jsonld(geocroissant, output_file="geodcat.jsonld")

# Fixed: the original used "\ " (an invalid escape sequence that printed a
# literal backslash); "\n" is the intended leading newline.
print("\nConversion complete!")
print(" - Input: croissant.json")
print(" - Output JSON-LD: geodcat.jsonld")
print(" - Output Turtle: geodcat.ttl")
✓ GeoDCAT Turtle metadata written to geodcat.ttl
\ Conversion complete!
- Input: croissant.json
- Output JSON-LD: geodcat.jsonld
- Output Turtle: geodcat.ttl
We reload and pretty-print the generated RDF in JSON-LD format to verify key fields like: - Dataset identifiers - Distributions and access URLs - Creator, license, and temporal coverage
# Reload the generated JSON-LD into a fresh graph and pretty-print it so the
# converted fields (identifiers, distributions, coverage) can be inspected.
jsonld_graph = Graph()
jsonld_graph.parse("geodcat.jsonld", format="json-ld")

print("GeoDCAT JSON-LD Output:")
print("=" * 80)
print(jsonld_graph.serialize(format="json-ld", indent=2))
================================================================================
[
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/spatial",
"@type": [
"http://purl.org/dc/terms/Location"
],
"http://www.opengis.net/ont/geosparql#asWKT": [
{
"@type": "http://www.opengis.net/ont/geosparql#wktLiteral",
"@value": "POLYGON((-180.0 -90.0, 179.375 -90.0, 179.375 90.0, -180.0 90.0, -180.0 -90.0))"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude",
"@type": [
"http://mlcommons.org/croissant/geo/Field"
],
"http://mlcommons.org/croissant/geo/dataType": [
{
"@value": "sc:Float"
}
],
"http://purl.org/dc/terms/description": [
{
"@value": "Longitude coordinate"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "longitude"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time",
"@type": [
"http://mlcommons.org/croissant/geo/Field"
],
"http://mlcommons.org/croissant/geo/dataType": [
{
"@value": "sc:Text"
}
],
"http://purl.org/dc/terms/description": [
{
"@value": "Time coordinate"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "time"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/temporal",
"@type": [
"http://purl.org/dc/terms/PeriodOfTime"
],
"http://www.w3.org/ns/dcat#endDate": [
{
"@type": "http://www.w3.org/2001/XMLSchema#date",
"@value": "2020-12-31"
}
],
"http://www.w3.org/ns/dcat#startDate": [
{
"@type": "http://www.w3.org/2001/XMLSchema#date",
"@value": "2020-01-01"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M",
"@type": [
"http://mlcommons.org/croissant/geo/Field"
],
"http://mlcommons.org/croissant/geo/dataType": [
{
"@value": "sc:Float"
}
],
"http://purl.org/dc/terms/description": [
{
"@value": "Temperature at 2 Meters"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "T2M"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude",
"@type": [
"http://mlcommons.org/croissant/geo/Field"
],
"http://mlcommons.org/croissant/geo/dataType": [
{
"@value": "sc:Float"
}
],
"http://purl.org/dc/terms/description": [
{
"@value": "Latitude coordinate"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "latitude"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data",
"@type": [
"http://mlcommons.org/croissant/geo/RecordSet"
],
"http://mlcommons.org/croissant/geo/field": [
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M"
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude"
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude"
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "t2m_data"
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020",
"@type": [
"http://www.w3.org/ns/dcat#Dataset",
"https://schema.org/Dataset"
],
"http://mlcommons.org/croissant/geo/coordinateReferenceSystem": [
{
"@id": "http://www.opengis.net/def/crs/EPSG:4326"
}
],
"http://mlcommons.org/croissant/geo/recordSet": [
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data"
}
],
"http://mlcommons.org/croissant/geo/spatialResolution": [
{
"@id": "_:N42d54b121e2640f99f08b1b2c8e217d7"
}
],
"http://mlcommons.org/croissant/geo/temporalResolution": [
{
"@id": "_:N8869c3a37bb94f9eaa99cc326ab27cc0"
}
],
"http://purl.org/dc/terms/conformsTo": [
{
"@id": "http://mlcommons.org/croissant/1.1"
},
{
"@id": "http://mlcommons.org/croissant/geo/1.0"
}
],
"http://purl.org/dc/terms/description": [
{
"@value": "Temperature at 2 Meters monthly data for 2020"
}
],
"http://purl.org/dc/terms/issued": [
{
"@type": "http://www.w3.org/2001/XMLSchema#date",
"@value": "2020-01-01"
}
],
"http://purl.org/dc/terms/license": [
{
"@id": "file:///teamspace/studios/this_studio/dcai/GeoCroissant%20to%20GeoDCAT/CC-BY-4.0"
}
],
"http://purl.org/dc/terms/spatial": [
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/spatial"
}
],
"http://purl.org/dc/terms/temporal": [
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/temporal"
}
],
"http://purl.org/dc/terms/title": [
{
"@value": "NASA POWER T2M 2020"
}
],
"http://www.w3.org/ns/adms#version": [
{
"@value": "1.0.0"
}
],
"http://www.w3.org/ns/dcat#distribution": [
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data"
}
],
"http://www.w3.org/ns/dcat#keyword": [
{
"@value": "temperature"
},
{
"@value": "climate"
},
{
"@value": "nasa power"
},
{
"@value": "t2m"
},
{
"@value": "2020"
}
]
},
{
"@id": "_:N42d54b121e2640f99f08b1b2c8e217d7",
"@type": [
"https://schema.org/QuantitativeValue"
],
"https://schema.org/unitText": [
{
"@value": "degrees"
}
],
"https://schema.org/value": [
{
"@type": "http://www.w3.org/2001/XMLSchema#double",
"@value": 0.5
}
]
},
{
"@id": "_:N8869c3a37bb94f9eaa99cc326ab27cc0",
"@type": [
"https://schema.org/QuantitativeValue"
],
"https://schema.org/unitText": [
{
"@value": "month"
}
],
"https://schema.org/value": [
{
"@type": "http://www.w3.org/2001/XMLSchema#integer",
"@value": 1
}
]
},
{
"@id": "https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data",
"@type": [
"http://www.w3.org/ns/dcat#Distribution"
],
"http://purl.org/dc/terms/title": [
{
"@value": "zarr-data"
}
],
"http://spdx.org/rdf/terms#checksum": [
{
"@id": "_:N7ef45e14da0f4e64aa48b329e73dd23f"
}
],
"http://www.w3.org/ns/dcat#accessURL": [
{
"@id": "https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/"
}
],
"http://www.w3.org/ns/dcat#mediaType": [
{
"@value": "application/zarr"
}
]
},
{
"@id": "_:N7ef45e14da0f4e64aa48b329e73dd23f",
"@type": [
"http://spdx.org/rdf/terms#Checksum"
],
"http://spdx.org/rdf/terms#algorithm": [
{
"@id": "http://spdx.org/rdf/terms#checksumAlgorithm_md5"
}
],
"http://spdx.org/rdf/terms#checksumValue": [
{
"@value": "cf42f8c8b1d4152d36857216bdfc5056"
}
]
}
]
We validate the Turtle (.ttl) file to ensure it’s properly formatted RDF and check its statistics.
# Sanity-check the Turtle output: parse it back in and report graph statistics.
print("Loading Turtle file for validation...")
validation_graph = Graph()
validation_graph.parse("geodcat.ttl", format="turtle")

# Collect the distinct terms once, then report counts.
subjects = set(validation_graph.subjects())
predicates = set(validation_graph.predicates())
objects = set(validation_graph.objects())

print("\n✓ TTL file successfully parsed!")
print(f" - Total triples: {len(validation_graph)}")
print(f" - Unique subjects: {len(subjects)}")
print(f" - Unique predicates: {len(predicates)}")
print(f" - Unique objects: {len(objects)}")

# List the bound namespace prefixes, sorted alphabetically.
print("\nNamespaces:")
for prefix, namespace in sorted(validation_graph.namespaces()):
    print(f" {prefix}: {namespace}")
✓ TTL file successfully parsed!
- Total triples: 62
- Unique subjects: 12
- Unique predicates: 27
- Unique objects: 55
Namespaces:
adms: http://www.w3.org/ns/adms#
brick: https://brickschema.org/schema/Brick#
csvw: http://www.w3.org/ns/csvw#
dc: http://purl.org/dc/elements/1.1/
dcam: http://purl.org/dc/dcam/
dcat: http://www.w3.org/ns/dcat#
dcmitype: http://purl.org/dc/dcmitype/
dct: http://purl.org/dc/terms/
doap: http://usefulinc.com/ns/doap#
foaf: http://xmlns.com/foaf/0.1/
geo: http://www.opengis.net/ont/geosparql#
geocr: http://mlcommons.org/croissant/geo/
odrl: http://www.w3.org/ns/odrl/2/
org: http://www.w3.org/ns/org#
owl: http://www.w3.org/2002/07/owl#
prof: http://www.w3.org/ns/dx/prof/
prov: http://www.w3.org/ns/prov#
qb: http://purl.org/linked-data/cube#
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
schema: https://schema.org/
sh: http://www.w3.org/ns/shacl#
skos: http://www.w3.org/2004/02/skos/core#
sosa: http://www.w3.org/ns/sosa/
spdx: http://spdx.org/rdf/terms#
ssn: http://www.w3.org/ns/ssn/
time: http://www.w3.org/2006/time#
vann: http://purl.org/vocab/vann/
void: http://rdfs.org/ns/void#
wgs: https://www.w3.org/2003/01/geo/wgs84_pos#
xml: http://www.w3.org/XML/1998/namespace
xsd: http://www.w3.org/2001/XMLSchema#
View the full RDF Turtle serialization of the GeoDCAT metadata.
# Dump the full Turtle serialization of the GeoDCAT metadata to stdout.
print("Complete GeoDCAT Turtle (TTL) Output:")
print("=" * 80)
with open("geodcat.ttl", "r", encoding="utf-8") as ttl_handle:
    ttl_text = ttl_handle.read()
print(ttl_text)
================================================================================
@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix geo: <http://www.opengis.net/ont/geosparql#> .
@prefix geocr: <http://mlcommons.org/croissant/geo/> .
@prefix schema: <https://schema.org/> .
@prefix spdx: <http://spdx.org/rdf/terms#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
<https://example.org/NASA%20POWER%20T2M%202020> a dcat:Dataset,
schema:Dataset ;
geocr:coordinateReferenceSystem <http://www.opengis.net/def/crs/EPSG:4326> ;
geocr:recordSet <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data> ;
geocr:spatialResolution [ a schema:QuantitativeValue ;
schema:unitText "degrees" ;
schema:value 5e-01 ] ;
geocr:temporalResolution [ a schema:QuantitativeValue ;
schema:unitText "month" ;
schema:value 1 ] ;
dct:conformsTo <http://mlcommons.org/croissant/1.1>,
<http://mlcommons.org/croissant/geo/1.0> ;
dct:description "Temperature at 2 Meters monthly data for 2020" ;
dct:issued "2020-01-01"^^xsd:date ;
dct:license <CC-BY-4.0> ;
dct:spatial <https://example.org/NASA%20POWER%20T2M%202020/spatial> ;
dct:temporal <https://example.org/NASA%20POWER%20T2M%202020/temporal> ;
dct:title "NASA POWER T2M 2020" ;
adms:version "1.0.0" ;
dcat:distribution <https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data> ;
dcat:keyword "2020",
"climate",
"nasa power",
"t2m",
"temperature" .
<https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data> a dcat:Distribution ;
dct:title "zarr-data" ;
spdx:checksum [ a spdx:Checksum ;
spdx:algorithm spdx:checksumAlgorithm_md5 ;
spdx:checksumValue "cf42f8c8b1d4152d36857216bdfc5056" ] ;
dcat:accessURL <https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/> ;
dcat:mediaType "application/zarr" .
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data> a geocr:RecordSet ;
geocr:field <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M>,
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude>,
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude>,
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time> ;
dct:title "t2m_data" .
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M> a geocr:Field ;
geocr:dataType "sc:Float" ;
dct:description "Temperature at 2 Meters" ;
dct:title "T2M" .
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude> a geocr:Field ;
geocr:dataType "sc:Float" ;
dct:description "Latitude coordinate" ;
dct:title "latitude" .
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude> a geocr:Field ;
geocr:dataType "sc:Float" ;
dct:description "Longitude coordinate" ;
dct:title "longitude" .
<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time> a geocr:Field ;
geocr:dataType "sc:Text" ;
dct:description "Time coordinate" ;
dct:title "time" .
<https://example.org/NASA%20POWER%20T2M%202020/spatial> a dct:Location ;
geo:asWKT "POLYGON((-180.0 -90.0, 179.375 -90.0, 179.375 90.0, -180.0 90.0, -180.0 -90.0))"^^geo:wktLiteral .
<https://example.org/NASA%20POWER%20T2M%202020/temporal> a dct:PeriodOfTime ;
dcat:endDate "2020-12-31"^^xsd:date ;
dcat:startDate "2020-01-01"^^xsd:date .
Use SPARQL queries to extract specific metadata from the GeoDCAT RDF graph.
# Query the GeoDCAT metadata using SPARQL
from rdflib import Graph, Namespace
from rdflib.namespace import DCAT, DCTERMS

# Parse the Turtle serialization into a fresh graph for querying.
query_graph = Graph()
query_graph.parse("geodcat.ttl", format="turtle")

# Query 1: dataset title, description, and (optional) license.
dataset_query = """
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX schema: <https://schema.org/>
SELECT ?dataset ?title ?description ?license
WHERE {
?dataset a dcat:Dataset .
?dataset dct:title ?title .
?dataset dct:description ?description .
OPTIONAL { ?dataset dct:license ?license }
}
"""

print("Dataset Information:")
print("=" * 80)
for row in query_graph.query(dataset_query):
    print(f"Title: {row.title}")
    print(f"Description: {row.description}")
    if row.license:
        print(f"License: {row.license}")

# Query 2: every distribution with its optional title, URL, and media type.
distribution_query = """
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?dist ?title ?url ?format
WHERE {
?dataset a dcat:Dataset .
?dataset dcat:distribution ?dist .
OPTIONAL { ?dist dct:title ?title }
OPTIONAL { ?dist dcat:accessURL ?url }
OPTIONAL { ?dist dcat:mediaType ?format }
}
"""

print("\n\nDistribution URLs:")
print("=" * 80)
for row in query_graph.query(distribution_query):
    if row.title:
        print(f"Distribution: {row.title}")
    if row.url:
        print(f" Access URL: {row.url}")
    if row.format:
        print(f" Format: {row.format}")
    print()
================================================================================
Title: NASA POWER T2M 2020
Description: Temperature at 2 Meters monthly data for 2020
License: file:///teamspace/studios/this_studio/dcai/GeoCroissant%20to%20GeoDCAT/CC-BY-4.0
Distribution URLs:
================================================================================
Distribution: zarr-data
Access URL: https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/
Format: application/zarr