GeoCroissant to GeoDCAT Conversion

GeoCroissant

This notebook demonstrates how to convert metadata from GeoCroissant, a geospatial extension of MLCommons Croissant, into GeoDCAT (DCAT-AP for geospatial datasets).

GeoDCAT is a standardized RDF-based metadata model for publishing geospatial datasets, enabling: - Metadata interoperability (with CKAN, INSPIRE, EU portals) - Semantic web support via RDF/JSON-LD and Turtle - Cataloging of spatial, temporal, and distribution metadata

Field Mapping: GeoCroissant → GeoDCAT

GeoCroissant Field GeoDCAT Field
name dct:title
description dct:description
license dct:license
version adms:version
datePublished dct:issued
conformsTo dct:conformsTo
keywords dcat:keyword
spatialCoverage.geo.box dct:spatial + geo:asWKT
temporalCoverage dct:temporal + DCAT dates
geocr:coordinateReferenceSystem geocr:coordinateReferenceSystem
geocr:spatialResolution geocr:spatialResolution
geocr:temporalResolution geocr:temporalResolution
distribution (cr:FileObject) dcat:Distribution
distribution.contentUrl dcat:accessURL
distribution.encodingFormat dcat:mediaType
recordSet (cr:RecordSet) geocr:RecordSet
recordSet.field (cr:Field) geocr:Field

Install Required Libraries

We use: - rdflib for manipulating RDF graphs - pyshacl for validating metadata using SHACL constraints

!pip install -q rdflib pyshacl

[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip

Define GeoCroissant to GeoDCAT Conversion Function

This function converts GeoCroissant metadata into a GeoDCAT-AP-compliant RDF graph.

import json
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import DCTERMS, DCAT, FOAF, XSD, RDF
from urllib.parse import quote


def geocroissant_to_geodcat_jsonld(geocroissant_json, output_file="geodcat.jsonld"):
    """Convert GeoCroissant JSON-LD metadata to a GeoDCAT-AP compliant RDF graph.

    The graph is serialized twice: to *output_file* as JSON-LD and to a
    sibling Turtle file (``output_file`` with ``.jsonld`` replaced by ``.ttl``).

    Args:
        geocroissant_json: Parsed GeoCroissant document (dict loaded from JSON-LD).
        output_file: Destination path for the JSON-LD serialization.

    Returns:
        rdflib.Graph: the populated GeoDCAT graph.
    """
    g = Graph()

    # Namespaces
    GEO = Namespace("http://www.opengis.net/ont/geosparql#")
    SCHEMA = Namespace("https://schema.org/")
    SPDX = Namespace("http://spdx.org/rdf/terms#")
    ADMS = Namespace("http://www.w3.org/ns/adms#")
    PROV = Namespace("http://www.w3.org/ns/prov#")
    GEOCR = Namespace("http://mlcommons.org/croissant/geo/")

    for prefix, ns in (("dct", DCTERMS), ("dcat", DCAT), ("foaf", FOAF),
                       ("geo", GEO), ("schema", SCHEMA), ("spdx", SPDX),
                       ("adms", ADMS), ("prov", PROV), ("geocr", GEOCR)):
        g.bind(prefix, ns)

    def _as_list(value):
        """Normalize a JSON value that may be absent, scalar, or a list."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _quoted_id(node, default):
        """URL-encode a node's @id (or name/default) so it is URI-safe."""
        return quote(str(node.get("@id", node.get("name", default))), safe='')

    # Dataset URI: URL-encode the name to handle spaces and special characters.
    dataset_name = geocroissant_json.get("name", "dataset")
    dataset_uri = URIRef(f"https://example.org/{quote(dataset_name, safe='')}")

    def _add_quantity(predicate, res):
        """Attach a schema:QuantitativeValue blank node for a resolution dict."""
        if isinstance(res, dict) and "@type" in res:
            node = BNode()
            g.add((dataset_uri, predicate, node))
            g.add((node, RDF.type, SCHEMA.QuantitativeValue))
            if "value" in res:
                g.add((node, SCHEMA.value, Literal(res["value"])))
            if "unitText" in res:
                g.add((node, SCHEMA.unitText, Literal(res["unitText"])))

    # Basic dataset properties. Reuse the .get fallback for the title so a
    # document without "name"/"description" cannot raise KeyError.
    g.add((dataset_uri, RDF.type, DCAT.Dataset))
    g.add((dataset_uri, RDF.type, SCHEMA.Dataset))
    g.add((dataset_uri, DCTERMS.title, Literal(dataset_name)))
    if "description" in geocroissant_json:
        g.add((dataset_uri, DCTERMS.description, Literal(geocroissant_json["description"])))

    # License: only absolute URIs become URIRefs. A bare identifier such as
    # "CC-BY-4.0" would otherwise be resolved against the local working
    # directory during serialization, yielding bogus file:///... URIs.
    if "license" in geocroissant_json:
        license_value = geocroissant_json["license"]
        if isinstance(license_value, str) and "://" in license_value:
            g.add((dataset_uri, DCTERMS.license, URIRef(license_value)))
        else:
            g.add((dataset_uri, DCTERMS.license, Literal(license_value)))

    # Version
    if "version" in geocroissant_json:
        g.add((dataset_uri, ADMS.version, Literal(geocroissant_json["version"])))

    # Date published
    if "datePublished" in geocroissant_json:
        g.add((dataset_uri, DCTERMS.issued,
               Literal(geocroissant_json["datePublished"], datatype=XSD.date)))

    # conformsTo / keywords may be a single string in JSON-LD; _as_list keeps
    # us from iterating over the characters of that string.
    for conformance in _as_list(geocroissant_json.get("conformsTo")):
        g.add((dataset_uri, DCTERMS.conformsTo, URIRef(conformance)))
    for keyword in _as_list(geocroissant_json.get("keywords")):
        g.add((dataset_uri, DCAT.keyword, Literal(keyword)))

    # Spatial coverage: schema.org GeoShape "box" is "south west north east"
    # (lower-left then upper-right corner, lat before lon).
    spatial_coverage = geocroissant_json.get("spatialCoverage") or {}
    geo_shape = spatial_coverage.get("geo") or {}
    if "box" in geo_shape:
        bbox = geo_shape["box"].split()
        if len(bbox) == 4:
            spatial_uri = URIRef(f"{dataset_uri}/spatial")
            g.add((dataset_uri, DCTERMS.spatial, spatial_uri))
            g.add((spatial_uri, RDF.type, DCTERMS.Location))
            # Closed WKT polygon ring built from the bounding-box corners.
            south, west, north, east = bbox
            wkt_bbox = (f"POLYGON(({west} {south}, {east} {south}, "
                        f"{east} {north}, {west} {north}, {west} {south}))")
            g.add((spatial_uri, GEO.asWKT, Literal(wkt_bbox, datatype=GEO.wktLiteral)))

    # Temporal coverage encoded as an ISO 8601 "start/end" interval.
    temporal_coverage = geocroissant_json.get("temporalCoverage")
    if temporal_coverage and "/" in temporal_coverage:
        # maxsplit=1 tolerates any extra "/" in the end component.
        start_date, end_date = temporal_coverage.split("/", 1)
        temporal_uri = URIRef(f"{dataset_uri}/temporal")
        g.add((dataset_uri, DCTERMS.temporal, temporal_uri))
        g.add((temporal_uri, RDF.type, DCTERMS.PeriodOfTime))
        g.add((temporal_uri, DCAT.startDate, Literal(start_date, datatype=XSD.date)))
        g.add((temporal_uri, DCAT.endDate, Literal(end_date, datatype=XSD.date)))

    # GeoCroissant specific properties
    if "geocr:coordinateReferenceSystem" in geocroissant_json:
        crs_uri = URIRef(f"http://www.opengis.net/def/crs/{geocroissant_json['geocr:coordinateReferenceSystem']}")
        g.add((dataset_uri, GEOCR.coordinateReferenceSystem, crs_uri))

    # Spatial / temporal resolution (same QuantitativeValue shape for both).
    if "geocr:spatialResolution" in geocroissant_json:
        _add_quantity(GEOCR.spatialResolution, geocroissant_json["geocr:spatialResolution"])
    if "geocr:temporalResolution" in geocroissant_json:
        _add_quantity(GEOCR.temporalResolution, geocroissant_json["geocr:temporalResolution"])

    # Distributions: cr:FileObject and cr:FileSet share most of their mapping.
    for dist in _as_list(geocroissant_json.get("distribution")):
        dist_type = dist.get("@type")
        if dist_type not in ("cr:FileObject", "cr:FileSet"):
            continue
        default_id = "distribution" if dist_type == "cr:FileObject" else "fileset"
        # URL-encode the id segment for consistency with the dataset URI.
        dist_uri = URIRef(f"{dataset_uri}/distribution/{_quoted_id(dist, default_id)}")
        g.add((dataset_uri, DCAT.distribution, dist_uri))
        g.add((dist_uri, RDF.type, DCAT.Distribution))

        if "name" in dist:
            g.add((dist_uri, DCTERMS.title, Literal(dist["name"])))
        if "description" in dist:
            g.add((dist_uri, DCTERMS.description, Literal(dist["description"])))
        if "encodingFormat" in dist:
            g.add((dist_uri, DCAT.mediaType, Literal(dist["encodingFormat"])))

        if dist_type == "cr:FileObject":
            if "contentUrl" in dist:
                g.add((dist_uri, DCAT.accessURL, URIRef(dist["contentUrl"])))
            if "md5" in dist:
                checksum_node = BNode()
                g.add((dist_uri, SPDX.checksum, checksum_node))
                g.add((checksum_node, RDF.type, SPDX.Checksum))
                g.add((checksum_node, SPDX.algorithm, SPDX.checksumAlgorithm_md5))
                g.add((checksum_node, SPDX.checksumValue, Literal(dist["md5"])))
        else:
            # FileSet: typed additionally as geocr:FileSet; "includes" may be
            # a single glob pattern or a list of patterns.
            g.add((dist_uri, RDF.type, GEOCR.FileSet))
            for pattern in _as_list(dist.get("includes")):
                g.add((dist_uri, GEOCR.includes, Literal(pattern)))

    # Record sets and fields (as additional metadata)
    for record_set in _as_list(geocroissant_json.get("recordSet")):
        if record_set.get("@type") != "cr:RecordSet":
            continue
        rs_uri = URIRef(f"{dataset_uri}/recordset/{_quoted_id(record_set, 'recordset')}")
        g.add((dataset_uri, GEOCR.recordSet, rs_uri))
        g.add((rs_uri, RDF.type, GEOCR.RecordSet))

        if "name" in record_set:
            g.add((rs_uri, DCTERMS.title, Literal(record_set["name"])))
        if "description" in record_set:
            g.add((rs_uri, DCTERMS.description, Literal(record_set["description"])))

        # Handle fields
        for field in _as_list(record_set.get("field")):
            if field.get("@type") != "cr:Field":
                continue
            field_uri = URIRef(f"{rs_uri}/field/{_quoted_id(field, 'field')}")
            g.add((rs_uri, GEOCR.field, field_uri))
            g.add((field_uri, RDF.type, GEOCR.Field))

            if "name" in field:
                g.add((field_uri, DCTERMS.title, Literal(field["name"])))
            if "description" in field:
                g.add((field_uri, DCTERMS.description, Literal(field["description"])))
            if "dataType" in field:
                g.add((field_uri, GEOCR.dataType, Literal(field["dataType"])))

    # Serialize outputs
    g.serialize(destination=output_file, format="json-ld", indent=2)
    print(f"GeoDCAT JSON-LD metadata written to {output_file}")

    ttl_file = output_file.replace(".jsonld", ".ttl")
    g.serialize(destination=ttl_file, format="turtle")
    print(f"✓ GeoDCAT Turtle metadata written to {ttl_file}")

    return g

Load GeoCroissant Metadata and Generate GeoDCAT RDF

We load the croissant.json file and convert it using our function.

This will produce: - geodcat.jsonld: GeoDCAT in JSON-LD format - geodcat.ttl: GeoDCAT in Turtle (RDF) format

# Load GeoCroissant metadata and convert to GeoDCAT
with open("croissant.json", "r") as f:
    geocroissant = json.load(f)

# Perform conversion
graph = geocroissant_to_geodcat_jsonld(geocroissant, output_file="geodcat.jsonld")

# "\n" (newline), not "\ " — the original printed a literal backslash and
# "\ " is an invalid escape sequence (DeprecationWarning since Python 3.6).
print("\nConversion complete!")
print("  - Input: croissant.json")
print("  - Output JSON-LD: geodcat.jsonld")
print("  - Output Turtle: geodcat.ttl")
GeoDCAT JSON-LD metadata written to geodcat.jsonld
✓ GeoDCAT Turtle metadata written to geodcat.ttl
\ Conversion complete!
  - Input: croissant.json
  - Output JSON-LD: geodcat.jsonld
  - Output Turtle: geodcat.ttl

Inspect GeoDCAT JSON-LD

We reload and pretty-print the generated RDF in JSON-LD format to verify key fields like: - Dataset identifiers - Distributions and access URLs - Creator, license, and temporal coverage

# Reload the generated GeoDCAT JSON-LD and pretty-print it for inspection.
jsonld_graph = Graph()
jsonld_graph.parse("geodcat.jsonld", format="json-ld")

print("GeoDCAT JSON-LD Output:")
print("=" * 80)
print(jsonld_graph.serialize(format="json-ld", indent=2))
GeoDCAT JSON-LD Output:
================================================================================
[
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/spatial",
    "@type": [
      "http://purl.org/dc/terms/Location"
    ],
    "http://www.opengis.net/ont/geosparql#asWKT": [
      {
        "@type": "http://www.opengis.net/ont/geosparql#wktLiteral",
        "@value": "POLYGON((-180.0 -90.0, 179.375 -90.0, 179.375 90.0, -180.0 90.0, -180.0 -90.0))"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude",
    "@type": [
      "http://mlcommons.org/croissant/geo/Field"
    ],
    "http://mlcommons.org/croissant/geo/dataType": [
      {
        "@value": "sc:Float"
      }
    ],
    "http://purl.org/dc/terms/description": [
      {
        "@value": "Longitude coordinate"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "longitude"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time",
    "@type": [
      "http://mlcommons.org/croissant/geo/Field"
    ],
    "http://mlcommons.org/croissant/geo/dataType": [
      {
        "@value": "sc:Text"
      }
    ],
    "http://purl.org/dc/terms/description": [
      {
        "@value": "Time coordinate"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "time"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/temporal",
    "@type": [
      "http://purl.org/dc/terms/PeriodOfTime"
    ],
    "http://www.w3.org/ns/dcat#endDate": [
      {
        "@type": "http://www.w3.org/2001/XMLSchema#date",
        "@value": "2020-12-31"
      }
    ],
    "http://www.w3.org/ns/dcat#startDate": [
      {
        "@type": "http://www.w3.org/2001/XMLSchema#date",
        "@value": "2020-01-01"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M",
    "@type": [
      "http://mlcommons.org/croissant/geo/Field"
    ],
    "http://mlcommons.org/croissant/geo/dataType": [
      {
        "@value": "sc:Float"
      }
    ],
    "http://purl.org/dc/terms/description": [
      {
        "@value": "Temperature at 2 Meters"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "T2M"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude",
    "@type": [
      "http://mlcommons.org/croissant/geo/Field"
    ],
    "http://mlcommons.org/croissant/geo/dataType": [
      {
        "@value": "sc:Float"
      }
    ],
    "http://purl.org/dc/terms/description": [
      {
        "@value": "Latitude coordinate"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "latitude"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data",
    "@type": [
      "http://mlcommons.org/croissant/geo/RecordSet"
    ],
    "http://mlcommons.org/croissant/geo/field": [
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M"
      },
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude"
      },
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude"
      },
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "t2m_data"
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020",
    "@type": [
      "http://www.w3.org/ns/dcat#Dataset",
      "https://schema.org/Dataset"
    ],
    "http://mlcommons.org/croissant/geo/coordinateReferenceSystem": [
      {
        "@id": "http://www.opengis.net/def/crs/EPSG:4326"
      }
    ],
    "http://mlcommons.org/croissant/geo/recordSet": [
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data"
      }
    ],
    "http://mlcommons.org/croissant/geo/spatialResolution": [
      {
        "@id": "_:N42d54b121e2640f99f08b1b2c8e217d7"
      }
    ],
    "http://mlcommons.org/croissant/geo/temporalResolution": [
      {
        "@id": "_:N8869c3a37bb94f9eaa99cc326ab27cc0"
      }
    ],
    "http://purl.org/dc/terms/conformsTo": [
      {
        "@id": "http://mlcommons.org/croissant/1.1"
      },
      {
        "@id": "http://mlcommons.org/croissant/geo/1.0"
      }
    ],
    "http://purl.org/dc/terms/description": [
      {
        "@value": "Temperature at 2 Meters monthly data for 2020"
      }
    ],
    "http://purl.org/dc/terms/issued": [
      {
        "@type": "http://www.w3.org/2001/XMLSchema#date",
        "@value": "2020-01-01"
      }
    ],
    "http://purl.org/dc/terms/license": [
      {
        "@id": "file:///teamspace/studios/this_studio/dcai/GeoCroissant%20to%20GeoDCAT/CC-BY-4.0"
      }
    ],
    "http://purl.org/dc/terms/spatial": [
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/spatial"
      }
    ],
    "http://purl.org/dc/terms/temporal": [
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/temporal"
      }
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "NASA POWER T2M 2020"
      }
    ],
    "http://www.w3.org/ns/adms#version": [
      {
        "@value": "1.0.0"
      }
    ],
    "http://www.w3.org/ns/dcat#distribution": [
      {
        "@id": "https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data"
      }
    ],
    "http://www.w3.org/ns/dcat#keyword": [
      {
        "@value": "temperature"
      },
      {
        "@value": "climate"
      },
      {
        "@value": "nasa power"
      },
      {
        "@value": "t2m"
      },
      {
        "@value": "2020"
      }
    ]
  },
  {
    "@id": "_:N42d54b121e2640f99f08b1b2c8e217d7",
    "@type": [
      "https://schema.org/QuantitativeValue"
    ],
    "https://schema.org/unitText": [
      {
        "@value": "degrees"
      }
    ],
    "https://schema.org/value": [
      {
        "@type": "http://www.w3.org/2001/XMLSchema#double",
        "@value": 0.5
      }
    ]
  },
  {
    "@id": "_:N8869c3a37bb94f9eaa99cc326ab27cc0",
    "@type": [
      "https://schema.org/QuantitativeValue"
    ],
    "https://schema.org/unitText": [
      {
        "@value": "month"
      }
    ],
    "https://schema.org/value": [
      {
        "@type": "http://www.w3.org/2001/XMLSchema#integer",
        "@value": 1
      }
    ]
  },
  {
    "@id": "https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data",
    "@type": [
      "http://www.w3.org/ns/dcat#Distribution"
    ],
    "http://purl.org/dc/terms/title": [
      {
        "@value": "zarr-data"
      }
    ],
    "http://spdx.org/rdf/terms#checksum": [
      {
        "@id": "_:N7ef45e14da0f4e64aa48b329e73dd23f"
      }
    ],
    "http://www.w3.org/ns/dcat#accessURL": [
      {
        "@id": "https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/"
      }
    ],
    "http://www.w3.org/ns/dcat#mediaType": [
      {
        "@value": "application/zarr"
      }
    ]
  },
  {
    "@id": "_:N7ef45e14da0f4e64aa48b329e73dd23f",
    "@type": [
      "http://spdx.org/rdf/terms#Checksum"
    ],
    "http://spdx.org/rdf/terms#algorithm": [
      {
        "@id": "http://spdx.org/rdf/terms#checksumAlgorithm_md5"
      }
    ],
    "http://spdx.org/rdf/terms#checksumValue": [
      {
        "@value": "cf42f8c8b1d4152d36857216bdfc5056"
      }
    ]
  }
]

Validate TTL Output

We validate the Turtle (.ttl) file to ensure it’s properly formatted RDF and check its statistics.

# Validate the Turtle output by re-parsing it and reporting graph statistics.
print("Loading Turtle file for validation...")
ttl_graph = Graph()
ttl_graph.parse("geodcat.ttl", format="turtle")

# Distinct-term counts give a quick sanity check on graph structure.
subject_count = len(set(ttl_graph.subjects()))
predicate_count = len(set(ttl_graph.predicates()))
object_count = len(set(ttl_graph.objects()))

print("\n✓ TTL file successfully parsed!")
print(f"  - Total triples: {len(ttl_graph)}")
print(f"  - Unique subjects: {subject_count}")
print(f"  - Unique predicates: {predicate_count}")
print(f"  - Unique objects: {object_count}")

# List the namespace prefixes bound in the parsed graph.
print("\nNamespaces:")
for prefix, namespace in sorted(ttl_graph.namespaces()):
    print(f"  {prefix}: {namespace}")
Loading Turtle file for validation...

✓ TTL file successfully parsed!
  - Total triples: 62
  - Unique subjects: 12
  - Unique predicates: 27
  - Unique objects: 55

Namespaces:
  adms: http://www.w3.org/ns/adms#
  brick: https://brickschema.org/schema/Brick#
  csvw: http://www.w3.org/ns/csvw#
  dc: http://purl.org/dc/elements/1.1/
  dcam: http://purl.org/dc/dcam/
  dcat: http://www.w3.org/ns/dcat#
  dcmitype: http://purl.org/dc/dcmitype/
  dct: http://purl.org/dc/terms/
  doap: http://usefulinc.com/ns/doap#
  foaf: http://xmlns.com/foaf/0.1/
  geo: http://www.opengis.net/ont/geosparql#
  geocr: http://mlcommons.org/croissant/geo/
  odrl: http://www.w3.org/ns/odrl/2/
  org: http://www.w3.org/ns/org#
  owl: http://www.w3.org/2002/07/owl#
  prof: http://www.w3.org/ns/dx/prof/
  prov: http://www.w3.org/ns/prov#
  qb: http://purl.org/linked-data/cube#
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  rdfs: http://www.w3.org/2000/01/rdf-schema#
  schema: https://schema.org/
  sh: http://www.w3.org/ns/shacl#
  skos: http://www.w3.org/2004/02/skos/core#
  sosa: http://www.w3.org/ns/sosa/
  spdx: http://spdx.org/rdf/terms#
  ssn: http://www.w3.org/ns/ssn/
  time: http://www.w3.org/2006/time#
  vann: http://purl.org/vocab/vann/
  void: http://rdfs.org/ns/void#
  wgs: https://www.w3.org/2003/01/geo/wgs84_pos#
  xml: http://www.w3.org/XML/1998/namespace
  xsd: http://www.w3.org/2001/XMLSchema#

Display Complete Turtle Output

View the full RDF Turtle serialization of the GeoDCAT metadata.

# Display the complete Turtle output
print("Complete GeoDCAT Turtle (TTL) Output:")
print("=" * 80)
with open("geodcat.ttl", "r", encoding="utf-8") as f:
    print(f.read())
Complete GeoDCAT Turtle (TTL) Output:
================================================================================
@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix geo: <http://www.opengis.net/ont/geosparql#> .
@prefix geocr: <http://mlcommons.org/croissant/geo/> .
@prefix schema: <https://schema.org/> .
@prefix spdx: <http://spdx.org/rdf/terms#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://example.org/NASA%20POWER%20T2M%202020> a dcat:Dataset,
        schema:Dataset ;
    geocr:coordinateReferenceSystem <http://www.opengis.net/def/crs/EPSG:4326> ;
    geocr:recordSet <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data> ;
    geocr:spatialResolution [ a schema:QuantitativeValue ;
            schema:unitText "degrees" ;
            schema:value 5e-01 ] ;
    geocr:temporalResolution [ a schema:QuantitativeValue ;
            schema:unitText "month" ;
            schema:value 1 ] ;
    dct:conformsTo <http://mlcommons.org/croissant/1.1>,
        <http://mlcommons.org/croissant/geo/1.0> ;
    dct:description "Temperature at 2 Meters monthly data for 2020" ;
    dct:issued "2020-01-01"^^xsd:date ;
    dct:license <CC-BY-4.0> ;
    dct:spatial <https://example.org/NASA%20POWER%20T2M%202020/spatial> ;
    dct:temporal <https://example.org/NASA%20POWER%20T2M%202020/temporal> ;
    dct:title "NASA POWER T2M 2020" ;
    adms:version "1.0.0" ;
    dcat:distribution <https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data> ;
    dcat:keyword "2020",
        "climate",
        "nasa power",
        "t2m",
        "temperature" .

<https://example.org/NASA%20POWER%20T2M%202020/distribution/zarr-data> a dcat:Distribution ;
    dct:title "zarr-data" ;
    spdx:checksum [ a spdx:Checksum ;
            spdx:algorithm spdx:checksumAlgorithm_md5 ;
            spdx:checksumValue "cf42f8c8b1d4152d36857216bdfc5056" ] ;
    dcat:accessURL <https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/> ;
    dcat:mediaType "application/zarr" .

<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data> a geocr:RecordSet ;
    geocr:field <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M>,
        <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude>,
        <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude>,
        <https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time> ;
    dct:title "t2m_data" .

<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/T2M> a geocr:Field ;
    geocr:dataType "sc:Float" ;
    dct:description "Temperature at 2 Meters" ;
    dct:title "T2M" .

<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/latitude> a geocr:Field ;
    geocr:dataType "sc:Float" ;
    dct:description "Latitude coordinate" ;
    dct:title "latitude" .

<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/longitude> a geocr:Field ;
    geocr:dataType "sc:Float" ;
    dct:description "Longitude coordinate" ;
    dct:title "longitude" .

<https://example.org/NASA%20POWER%20T2M%202020/recordset/t2m_data/field/time> a geocr:Field ;
    geocr:dataType "sc:Text" ;
    dct:description "Time coordinate" ;
    dct:title "time" .

<https://example.org/NASA%20POWER%20T2M%202020/spatial> a dct:Location ;
    geo:asWKT "POLYGON((-180.0 -90.0, 179.375 -90.0, 179.375 90.0, -180.0 90.0, -180.0 -90.0))"^^geo:wktLiteral .

<https://example.org/NASA%20POWER%20T2M%202020/temporal> a dct:PeriodOfTime ;
    dcat:endDate "2020-12-31"^^xsd:date ;
    dcat:startDate "2020-01-01"^^xsd:date .

Query GeoDCAT Metadata with SPARQL

Use SPARQL queries to extract specific metadata from the GeoDCAT RDF graph.

# Query the GeoDCAT metadata using SPARQL
from rdflib import Graph, Namespace
from rdflib.namespace import DCAT, DCTERMS

# Load the Turtle serialization produced by the conversion step.
g = Graph()
g.parse("geodcat.ttl", format="turtle")

# Query 1: dataset title, description, and (optionally) license.
dataset_query = """
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX schema: <https://schema.org/>

SELECT ?dataset ?title ?description ?license
WHERE {
    ?dataset a dcat:Dataset .
    ?dataset dct:title ?title .
    ?dataset dct:description ?description .
    OPTIONAL { ?dataset dct:license ?license }
}
"""
print("Dataset Information:")
print("=" * 80)
for result in g.query(dataset_query):
    print(f"Title: {result.title}")
    print(f"Description: {result.description}")
    if result.license:
        print(f"License: {result.license}")

# Query 2: each distribution with its optional title, access URL, and media type.
distribution_query = """
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>

SELECT ?dist ?title ?url ?format
WHERE {
    ?dataset a dcat:Dataset .
    ?dataset dcat:distribution ?dist .
    OPTIONAL { ?dist dct:title ?title }
    OPTIONAL { ?dist dcat:accessURL ?url }
    OPTIONAL { ?dist dcat:mediaType ?format }
}
"""
print("\n\nDistribution URLs:")
print("=" * 80)
for result in g.query(distribution_query):
    if result.title:
        print(f"Distribution: {result.title}")
    if result.url:
        print(f"  Access URL: {result.url}")
    if result.format:
        print(f"  Format: {result.format}")
    print()
Dataset Information:
================================================================================
Title: NASA POWER T2M 2020
Description: Temperature at 2 Meters monthly data for 2020
License: file:///teamspace/studios/this_studio/dcai/GeoCroissant%20to%20GeoDCAT/CC-BY-4.0


Distribution URLs:
================================================================================
Distribution: zarr-data
  Access URL: https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/
  Format: application/zarr