GeoCroissant to STAC Conversion

GeoCroissant

This notebook demonstrates how to convert GeoCroissant metadata (a geospatial extension of the Croissant metadata format) into a STAC (SpatioTemporal Asset Catalog) Item.

Key Conversion Mappings

STAC Field GeoCroissant Field
id name
type @type
bbox spatialCoverage.geo.box
geometry spatialCoverage.geo.box
properties.start_datetime temporalCoverage (start)
properties.end_datetime temporalCoverage (end)
properties.proj:code (formerly proj:epsg) geocr:coordinateReferenceSystem
properties.gsd geocr:spatialResolution.value
assets distribution (FileObject/FileSet)
assets[key].href contentUrl
assets[key].type encodingFormat
assets[key].raster:bands geocr:spectralBandMetadata

Install Required Libraries

!pip install -q pystac stac-validator

[notice] A new release of pip is available: 26.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip

GeoCroissant to STAC Converter

This converter handles:

- Spatial Coverage: Parses the GeoCroissant box format (south west north east) → STAC bbox (west, south, east, north)
- Temporal Coverage: Parses ISO 8601 temporal ranges → start/end datetime
- CRS: Extracts coordinate reference systems (e.g., EPSG:4326)
- Resolution: Spatial and temporal resolution metadata
- Distribution: FileObject and FileSet assets → STAC assets
- Spectral Bands: Band metadata for raster data
- Conformance: GeoCroissant version compliance

import json
from datetime import datetime
from pystac import Item, Asset, MediaType
from pystac.extensions.projection import ProjectionExtension

def _bbox_and_geometry(box_text):
    """Return ``(bbox, geometry)`` for a GeoCroissant box string.

    The box lists its values as "south west north east"; the returned bbox
    is reordered to ``[west, south, east, north]``. An empty box yields the
    global extent.
    """
    if box_text:
        # Tolerate commas as well as whitespace between the four numbers.
        values = [float(part) for part in box_text.replace(",", " ").split()]
        if len(values) != 4:
            raise ValueError(
                "spatialCoverage.geo.box must hold four numbers "
                f"(south west north east), got {box_text!r}"
            )
        south, west, north, east = values
    else:
        # No spatial coverage supplied: fall back to the whole globe.
        west, south, east, north = -180, -90, 180, 90

    ring = [[west, south], [west, north], [east, north], [east, south], [west, south]]
    return [west, south, east, north], {"type": "Polygon", "coordinates": [ring]}


def _parse_naive_utc(text):
    """Parse an ISO 8601 timestamp and return it as a naive datetime in UTC.

    Values without an offset are taken to be UTC already; values with an
    offset (including a trailing "Z") are converted to UTC first.
    """
    from datetime import timezone

    text = text.strip()
    if text.endswith(("Z", "z")):
        # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11.
        text = text[:-1] + "+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
    return parsed


def _asset_media_type(encoding_format):
    """Map a GeoCroissant ``encodingFormat`` string to a STAC asset media type."""
    fmt = encoding_format.lower()
    if "tif" in fmt:  # also matches "tiff"
        return MediaType.GEOTIFF
    if "json" in fmt:
        return MediaType.JSON
    if "parquet" in fmt:
        return MediaType.PARQUET
    if "zarr" in fmt:
        return "application/zarr"
    # Unknown formats pass through unchanged; an empty one becomes "no type".
    return encoding_format or None


def _raster_bands(spectral_bands):
    """Build the ``raster:bands`` list from ``geocr:spectralBandMetadata`` entries."""
    bands = []
    for band_info in spectral_bands:
        band = {"name": band_info.get("name", "")}
        for source_key, target_key in (
            ("geocr:centerWavelength", "center_wavelength"),
            ("geocr:bandwidth", "bandwidth"),
        ):
            quantity = band_info.get(source_key, {})
            if isinstance(quantity, dict) and quantity.get("value"):
                band[target_key] = float(quantity["value"])
        bands.append(band)
    return bands


def geocroissant_to_stac(geocroissant_data):
    """Convert GeoCroissant metadata to a STAC Item.

    Args:
        geocroissant_data: Parsed GeoCroissant JSON-LD document (a dict).

    Returns:
        The STAC Item as a plain dict (``Item.to_dict()``).

    Raises:
        ValueError: If ``spatialCoverage.geo.box`` does not contain exactly
            four numbers, or a ``temporalCoverage`` bound is not a valid
            ISO 8601 timestamp.
    """
    from datetime import timezone

    # Basic metadata
    item_id = geocroissant_data.get("name", "unknown").replace(" ", "_")
    properties = {
        "title": geocroissant_data.get("name", ""),
        "description": geocroissant_data.get("description", ""),
        "license": geocroissant_data.get("license", "proprietary"),
        "keywords": geocroissant_data.get("keywords", []),
    }

    # Spatial coverage (GeoCroissant order: "south west north east")
    spatial_coverage = geocroissant_data.get("spatialCoverage", {})
    geo_info = spatial_coverage.get("geo", {}) if isinstance(spatial_coverage, dict) else {}
    box_text = geo_info.get("box", "") if isinstance(geo_info, dict) else ""
    bbox, geometry = _bbox_and_geometry(box_text)

    # Temporal coverage (ISO 8601 interval: "start/end")
    temporal_coverage = geocroissant_data.get("temporalCoverage", "")
    if isinstance(temporal_coverage, str) and "/" in temporal_coverage:
        start_text, end_text = temporal_coverage.split("/", 1)
        start_dt = _parse_naive_utc(start_text)
        end_dt = _parse_naive_utc(end_text)
        # Both bounds are naive UTC here, so appending "Z" is correct.
        properties["start_datetime"] = start_dt.isoformat() + "Z"
        properties["end_datetime"] = end_dt.isoformat() + "Z"
        item_datetime = start_dt + (end_dt - start_dt) / 2
    else:
        # No usable range: stamp the item with the current time, explicitly
        # in UTC so local wall-clock time is never mislabelled as UTC.
        item_datetime = datetime.now(timezone.utc)

    # GeoCroissant-specific metadata
    spatial_res = geocroissant_data.get("geocr:spatialResolution", {})
    if isinstance(spatial_res, dict) and spatial_res.get("value"):
        properties["gsd"] = float(spatial_res["value"])

    temporal_res = geocroissant_data.get("geocr:temporalResolution", {})
    if isinstance(temporal_res, dict) and temporal_res.get("value"):
        properties["geocr:temporalResolution"] = f"{temporal_res['value']} {temporal_res.get('unitText', '')}"

    sampling_strategy = geocroissant_data.get("geocr:samplingStrategy")
    if sampling_strategy:
        properties["geocr:samplingStrategy"] = sampling_strategy

    conforms_to = geocroissant_data.get("conformsTo", [])
    if conforms_to:
        properties["conformsTo"] = conforms_to

    item = Item(
        id=item_id,
        geometry=geometry,
        bbox=bbox,
        datetime=item_datetime,
        properties=properties,
    )

    # Projection extension, only when a numeric EPSG code can be extracted.
    crs = geocroissant_data.get("geocr:coordinateReferenceSystem")
    if isinstance(crs, str) and "EPSG:" in crs:
        # Taking the text after the last colon handles both "EPSG:4326"
        # and URN forms such as "urn:ogc:def:crs:EPSG::4326".
        epsg_code = crs.rsplit(":", 1)[-1].strip()
        if epsg_code.isdecimal():
            ProjectionExtension.ext(item, add_if_missing=True).epsg = int(epsg_code)

    # Distribution entries become assets. A lone object is treated as a
    # one-element list.
    distribution = geocroissant_data.get("distribution") or []
    if isinstance(distribution, dict):
        distribution = [distribution]

    for dist_item in distribution:
        content_url = dist_item.get("contentUrl", "")
        encoding_format = dist_item.get("encodingFormat", "")

        # Skip entries without a URL, file:// URLs and directory entries.
        if not content_url or content_url.startswith("file://"):
            continue
        if "directory" in encoding_format.lower():
            continue

        asset_id = dist_item.get("@id", dist_item.get("name", "asset")).replace(" ", "_").lower()

        roles = ["data"]
        if "FileSet" in dist_item.get("@type", ""):
            roles.append("collection")

        asset = Asset(
            href=content_url,
            media_type=_asset_media_type(encoding_format),
            title=dist_item.get("description", dist_item.get("name", "")),
            roles=roles,
        )

        # FileSets carry a glob pattern describing their members.
        includes = dist_item.get("includes")
        if includes:
            asset.extra_fields["file_pattern"] = includes

        item.add_asset(asset_id, asset)

    # Attach spectral band metadata to GeoTIFF / COG assets only.
    spectral_bands = geocroissant_data.get("geocr:spectralBandMetadata", [])
    if spectral_bands:
        raster_bands = _raster_bands(spectral_bands)
        for asset in item.assets.values():
            if asset.media_type in (MediaType.GEOTIFF, MediaType.COG):
                # Give each asset its own copy so they do not share state.
                asset.extra_fields["raster:bands"] = [dict(band) for band in raster_bands]

    return item.to_dict()

Save and Display STAC Item

# Load the GeoCroissant metadata and convert it to a STAC Item dict.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open("croissant.json", "r", encoding="utf-8") as f:
    geocroissant_data = json.load(f)

stac_item = geocroissant_to_stac(geocroissant_data)

# Serialize once so the saved file and the printed output are identical.
stac_item_json = json.dumps(stac_item, indent=2)

# Save to file
with open("stac_item.json", "w", encoding="utf-8") as f:
    f.write(stac_item_json)

print(stac_item_json)
{
  "type": "Feature",
  "stac_version": "1.1.0",
  "stac_extensions": [
    "https://stac-extensions.github.io/projection/v2.0.0/schema.json"
  ],
  "id": "NASA_POWER_T2M_2020",
  "geometry": {
    "type": "Polygon",
    "coordinates": [
      [
        [
          -180.0,
          -90.0
        ],
        [
          -180.0,
          90.0
        ],
        [
          179.375,
          90.0
        ],
        [
          179.375,
          -90.0
        ],
        [
          -180.0,
          -90.0
        ]
      ]
    ]
  },
  "bbox": [
    -180.0,
    -90.0,
    179.375,
    90.0
  ],
  "properties": {
    "title": "NASA POWER T2M 2020",
    "description": "Temperature at 2 Meters monthly data for 2020",
    "license": "CC-BY-4.0",
    "keywords": [
      "temperature",
      "climate",
      "nasa power",
      "t2m",
      "2020"
    ],
    "start_datetime": "2020-01-01T00:00:00Z",
    "end_datetime": "2020-12-31T00:00:00Z",
    "gsd": 0.5,
    "geocr:temporalResolution": "1 month",
    "conformsTo": [
      "http://mlcommons.org/croissant/1.1",
      "http://mlcommons.org/croissant/geo/1.0"
    ],
    "proj:code": "EPSG:4326",
    "datetime": "2020-07-01T12:00:00Z"
  },
  "links": [],
  "assets": {
    "zarr-data": {
      "href": "https://nasa-power.s3.us-west-2.amazonaws.com/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/",
      "type": "application/zarr",
      "title": "zarr-data",
      "roles": [
        "data"
      ]
    }
  }
}

Validate STAC Item

Validate the generated STAC Item using the stac-validator tool to ensure it conforms to the STAC specification.

!stac-validator stac_item.json

Thanks for using STAC version 1.1.0!

[
    {
        "version": "1.1.0",
        "path": "stac_item.json",
        "schema": [
            "https://stac-extensions.github.io/projection/v2.0.0/schema.json",
            "https://schemas.stacspec.org/v1.1.0/item-spec/json-schema/item.json"
        ],
        "valid_stac": true,
        "asset_type": "ITEM",
        "validation_method": "default"
    }
]

Validation completed in 2.68s