Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement v2.0.3 conversion #122

Merged
merged 24 commits into from
Mar 22, 2025
Merged
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
80204db
fix conversion function name
jh-RLI Jan 27, 2025
0916cd7
Add convert to 202 and 203 #121
vismayajochem Jan 27, 2025
dfc3900
update poetry lock
jh-RLI Feb 19, 2025
c8c5e7a
fix error in readme
jh-RLI Feb 19, 2025
332f89b
Add notice on conversions for major and minor but not patch versions …
jh-RLI Feb 19, 2025
93b5f67
update imports to match oemetadata v203 module
jh-RLI Feb 19, 2025
1fe7204
remove patch version for any oem-v2 related code / comments
jh-RLI Feb 19, 2025
cbbaeab
Fix test that broke after a typo was fixed in the oem version 152 sch…
jh-RLI Feb 19, 2025
de7e440
Module-Docs: describe the purpose of the conversions module and docum…
jh-RLI Feb 24, 2025
df399bb
#121 license check is not optional but enabled by default
jh-RLI Mar 17, 2025
572be5a
#121 oemetadata patch version will not reflect any structural changes…
jh-RLI Mar 17, 2025
804326d
#121 move functionality to conversions submodule
jh-RLI Mar 17, 2025
f779417
#121 add conversion v15 to v16
jh-RLI Mar 17, 2025
762da12
#121 add conversions module
jh-RLI Mar 17, 2025
f4fa3e4
add template license to handle already published metadata documents b…
jh-RLI Mar 17, 2025
8488678
add additional federal legislation license initiatives #121
jh-RLI Mar 17, 2025
64bea25
adapt license check to match more oep specific special cases.
jh-RLI Mar 18, 2025
32ccef6
move function to conversion module #121:
jh-RLI Mar 22, 2025
c727fc9
Raise error is metadata version is not of type string
jh-RLI Mar 22, 2025
95482e6
#121: When parsing the function signature indicates and the purpose o…
jh-RLI Mar 22, 2025
6a95969
lint
jh-RLI Mar 22, 2025
864dd4a
lint
jh-RLI Mar 22, 2025
bfc34b6
pre commit
jh-RLI Mar 22, 2025
e969eea
#121: deactivate test as test data broken
jh-RLI Mar 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
@@ -86,6 +86,9 @@ of the oemetadata-specification to help users stick with the latest enhancements
To ease the conversion of oemetadata from any outdated version to the latest version, we provide a
conversion functionality. The following example shows how to convert the oemetadata from v1.6 to v2.0.

Starting from v2, we do not support conversions for patch versions. This means you can convert from v1.6 to v2.0 but not from v2.0.0 to v2.0.1.
The oemetadata release procedure requires that breaking changes are only introduced in major or minor versions. Only these changes will require a conversion.

CLI - oemetadata conversion::

# Not implemented yet
@@ -112,7 +115,7 @@ Module usage - In python scripts you can use the conversion::
meta = read_json_file(file_path)

# use omi to convert it to the latest release
converted = convert_metadata(meta, "OEMetadata-2.0.1")
converted = convert_metadata(meta, "OEMetadata-2.0")

# now you can store the result as json file
with open("result.json", "w", encoding="utf-8") as json_file:
@@ -129,7 +132,7 @@ two arguments the first one is the metadata and the second optional one is the s
the validation will try to get the matching schema for the current metadata.


CLI - oemetadata conversion::
CLI - oemetadata validation::

# Not implemented yet

8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 21 additions & 4 deletions src/omi/base.py
Original file line number Diff line number Diff line change
@@ -4,15 +4,17 @@

import json
import pathlib
import re
from dataclasses import dataclass

import requests
from metadata import v20, v152, v160
from oemetadata.v1 import v152, v160
from oemetadata.v2 import v20

from .settings import OEP_URL

# Order matters! First entry equals latest version of metadata format
METADATA_FORMATS = {"OEP": ["OEMetadata-2.0.1", "OEP-1.6.0", "OEP-1.5.2"], "INSPIRE": []}
METADATA_FORMATS = {"OEP": ["OEMetadata-2.0", "OEP-1.6.0", "OEP-1.5.2"], "INSPIRE": []}
METADATA_VERSIONS = {version: md_format for md_format, versions in METADATA_FORMATS.items() for version in versions}


@@ -70,13 +72,28 @@ def get_metadata_version(metadata: dict) -> str:
"""
# For OEP metadata
try:
return metadata["metaMetadata"]["metadataVersion"]
return __normalize_metadata_version(metadata["metaMetadata"]["metadataVersion"])
except KeyError:
pass
msg = "Could not extract metadata version from metadata."
raise MetadataError(msg)


def __normalize_metadata_version(version: str) -> str:
    """
    Strip the patch number from an OEMetadata v2 version string.

    Strings of the form "OEMetadata-2.<minor>.<patch>" are reduced to
    "OEMetadata-2.<minor>" (e.g. "OEMetadata-2.0.4" -> "OEMetadata-2.0").
    Every other string is handed back unchanged.

    Parameters
    ----------
    version: str
        Version string to normalize.

    Returns
    -------
    str
        Version string without patch number.

    Raises
    ------
    MetadataError
        If `version` is not a string.
    """
    if isinstance(version, str):
        # Group 1 holds "OEMetadata-2.<minor>"; the optional patch suffix is matched outside of it.
        matched = re.match(r"^(OEMetadata-2\.\d+)(?:\.\d+)?$", version)
        return version if matched is None else matched.group(1)
    raise MetadataError(f"Metadata version must be a string, not {type(version)}.")


def get_latest_metadata_version(metadata_format: str) -> str:
"""
Return the latest metadata version of a given metadata format.
@@ -148,7 +165,7 @@ def __get_metadata_specs_for_oep(metadata_version: str) -> MetadataSpecification
MetadataSpecification
Metadata schema for given metadata version including template and example.
"""
metadata_modules = {"OEP-1.5.2": v152, "OEP-1.6.0": v160, "OEMetadata-2.0.1": v20}
metadata_modules = {"OEP-1.5.2": v152, "OEP-1.6.0": v160, "OEMetadata-2.0": v20}
metadata_module = metadata_modules[metadata_version]
module_path = pathlib.Path(metadata_module.__file__).parent
specs = {}
182 changes: 5 additions & 177 deletions src/omi/conversion.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,9 @@

from copy import deepcopy

from omi.base import get_metadata_specification, get_metadata_version
from omi.base import get_metadata_version
from omi.conversions.v152_to_v160 import convert_oep_152_to_160
from omi.conversions.v160_to_v20 import convert_oep_160_to_20


class ConversionError(Exception):
@@ -77,181 +79,7 @@ def get_chain(current_version: str) -> list[str] | None:
raise ConversionError(f"No conversion chain found from {source_version} to {target_version}.")


def __convert_oep_152_to_160(metadata: dict) -> dict:
    """
    Turn "OEP-1.5.2" metadata into "OEP-1.6.0" metadata.

    Only the version entry in the "metaMetadata" section is rewritten; all
    other fields are left untouched. The given dictionary is modified in
    place and returned.

    Parameters
    ----------
    metadata: dict
        Metadata in version "OEP-1.5.2"

    Returns
    -------
    dict
        The same dictionary, now tagged as "OEP-1.6.0"
    """
    meta_section = metadata["metaMetadata"]
    meta_section["metadataVersion"] = "OEP-1.6.0"
    return metadata


def __convert_oep_160_to_200(metadata: dict) -> dict:
    """
    Build "OEMetadata-2.0.1" metadata from "OEP-1.6.0" metadata.

    A copy of the v2 template is used as the starting point. Each entry of
    the v1.6 "resources" list is transferred into the v2 resource at the same
    position, and the metadata license is carried over into "metaMetadata".

    Parameters
    ----------
    metadata: dict
        Metadata dictionary in v1.6 format

    Returns
    -------
    dict
        New metadata dictionary in v2.0 format
    """
    target_version = "OEMetadata-2.0.1"
    converted = deepcopy(get_metadata_specification(target_version).template)

    # The top-level descriptive keys are blanked; their v1.6 values are stored per resource.
    for blank_key in ("name", "title", "id", "description"):
        converted[blank_key] = None

    for position, old_resource in enumerate(metadata.get("resources", [])):
        new_resource = ___v2_ensure_resource_entry(converted, position)
        ___v2_populate_resource_v2(new_resource, metadata, old_resource)

    meta_section = converted["metaMetadata"]
    meta_section["metadataVersion"] = target_version
    meta_section["metadataLicense"] = metadata.get("metaMetadata", {}).get("metadataLicense")

    return converted


def ___v2_ensure_resource_entry(metadata_v2: dict, index: int) -> dict:
    """
    Return the v2 resource at `index`, creating it first if it is missing.

    If `index` lies beyond the current end of the "resources" list, one copy
    of the first resource entry is appended before the lookup.
    """
    entries = metadata_v2["resources"]
    if len(entries) <= index:
        entries.append(deepcopy(entries[0]))
    return entries[index]


def ___v2_populate_resource_v2(resource_v2: dict, metadata: dict, resource: dict) -> None:
    """
    Populate resource_v2 fields based on metadata and resource from v1.6.

    Parameters
    ----------
    resource_v2: dict
        Resource entry of the v2 metadata; modified in place.
    metadata: dict
        Complete v1.6 metadata the resource belongs to.
    resource: dict
        Single entry of the v1.6 "resources" list.

    Notes
    -----
    The v1.6 resource name is split on ".": the first part becomes the only
    topic, the second part becomes the v2 name. A name without "." (or a
    missing name) is kept as is and yields an empty topic list.

    The v1.6 spatial resolution is split at the first blank into value and
    unit. A resolution without blank (e.g. "national") is used as value with
    an empty unit; a missing resolution yields two empty strings.
    """
    raw_name = resource.get("name")
    name_parts = (raw_name or "").split(".")
    if len(name_parts) > 1:
        name_v2, topics_v2 = name_parts[1], [name_parts[0]]
    else:
        # No "<topic>." prefix (or no name at all): there is nothing to split off.
        name_v2, topics_v2 = raw_name, []

    schema_v1 = resource.get("schema", {})

    # Bulk update of all keys that map directly from v1.6 to v2
    resource_v2.update(
        {
            "@id": metadata.get("@id"),
            "@context": metadata.get("@context"),
            "name": name_v2,
            "topics": topics_v2,
            "title": metadata.get("title"),
            "path": metadata.get("id"),
            "description": metadata.get("description"),
            "languages": metadata.get("language", []),
            "subject": metadata.get("subject", []),
            "keywords": metadata.get("keywords", []),
            "publicationDate": metadata.get("publicationDate"),
            "context": metadata.get("context", {}),
            "temporal": metadata.get("temporal", {}),
            "type": None,
            "format": resource.get("format"),
            "encoding": resource.get("encoding"),
            "schema": {
                "fields": schema_v1.get("fields", []),
                "primaryKey": schema_v1.get("primaryKey", []),
                "foreignKeys": schema_v1.get("foreignKeys", []),
            },
            "dialect": resource.get("dialect", {}),
            "review": metadata.get("review", {}),
        },
    )

    resource_v2["context"]["publisher"] = None

    resource_v2["embargoPeriod"]["start"] = None
    resource_v2["embargoPeriod"]["end"] = None

    # An explicit null for "spatial" is treated like a missing section.
    spatial_v1 = metadata.get("spatial") or {}

    location_v2 = resource_v2["spatial"]["location"]
    # Set to null to avoid validation errors: URI
    location_v2["@id"] = None
    location_v2["address"] = spatial_v1.get("location")
    location_v2["latitude"] = None
    location_v2["longitude"] = None

    extent_v2 = resource_v2["spatial"]["extent"]
    extent_v2["name"] = spatial_v1.get("extent")
    # Set to null to avoid validation errors: URI
    extent_v2["@id"] = None
    # partition always yields three parts, so a resolution without blank no longer breaks the unpacking.
    resolution_value, _, resolution_unit = (spatial_v1.get("resolution") or "").partition(" ")
    extent_v2["resolutionValue"] = resolution_value
    extent_v2["resolutionUnit"] = resolution_unit
    extent_v2["crs"] = None

    ___v2_populate_sources(resource_v2, metadata.get("sources", []))
    ___v2_populate_contributors(resource_v2, metadata.get("contributors", []))
    ___v2_populate_licenses(resource_v2, metadata.get("licenses", []))
    ___v2_populate_schema_fields(resource_v2, resource)


def ___v2_populate_sources(resource_v2: dict, sources: list) -> None:
    """
    Populate sources in resource_v2 from sources in v1.6.

    Parameters
    ----------
    resource_v2: dict
        v2 resource entry; its "sources" list is modified in place.
    sources: list
        Source entries of the v1.6 metadata.

    Notes
    -----
    Additional source entries are created from a snapshot of the first v2
    source taken before anything is filled in. Cloning the first entry after
    it was populated would hand its licenses down to every later source.
    """
    sources_v2 = resource_v2["sources"]
    blank_source = deepcopy(sources_v2[0]) if sources_v2 else {}
    for i_source, source in enumerate(sources):
        if i_source >= len(sources_v2):
            sources_v2.append(deepcopy(blank_source))
        source_v2 = sources_v2[i_source]
        source_v2.update(
            {
                "title": source.get("title"),
                "description": source.get("description"),
                "path": source.get("path"),
                "publicationYear": None,
                "authors": [],
            },
        )
        ___v2_populate_source_licenses(source_v2, source.get("licenses") or [])


def ___v2_populate_source_licenses(source_v2: dict, licenses: list) -> None:
    """
    Populate licenses in source_v2 from licenses in v1.6.

    Parameters
    ----------
    source_v2: dict
        v2 source entry; its "licenses" list is modified in place (and
        created if the entry has none).
    licenses: list
        License entries of the v1.6 source.

    Notes
    -----
    Additional license entries start from a snapshot of the unfilled first
    entry, so keys set by an earlier license do not carry over to later ones.
    """
    licenses_v2 = source_v2.setdefault("licenses", [])
    blank_license = deepcopy(licenses_v2[0]) if licenses_v2 else {}
    for i_license, license_entry in enumerate(licenses):
        if i_license >= len(licenses_v2):
            licenses_v2.append(deepcopy(blank_license))
        licenses_v2[i_license].update(license_entry)
        licenses_v2[i_license]["copyrightStatement"] = None


def ___v2_populate_contributors(resource_v2: dict, contributors: list) -> None:
    """
    Transfer the v1.6 contributors into the contributors list of resource_v2.

    Every v1.6 contributor is written to the v2 entry at the same position.
    Missing v2 entries are appended as copies of the first v2 entry. Keys
    absent from a v1.6 contributor are set to None.
    """
    copied_keys = ("title", "path", "organization", "date", "object", "comment")
    for position, contributor in enumerate(contributors):
        if len(resource_v2["contributors"]) <= position:
            resource_v2["contributors"].append(deepcopy(resource_v2["contributors"][0]))
        resource_v2["contributors"][position].update({key: contributor.get(key) for key in copied_keys})


def ___v2_populate_licenses(resource_v2: dict, licenses: list) -> None:
    """
    Populate licenses in resource_v2 from licenses in v1.6.

    Parameters
    ----------
    resource_v2: dict
        v2 resource entry; its "licenses" list is modified in place.
    licenses: list
        License entries of the v1.6 metadata.

    Notes
    -----
    Additional license entries start from a snapshot of the first v2 entry
    taken before it is filled in. Cloning the already populated first entry
    would leak its values into later licenses that do not set the same keys.
    """
    licenses_v2 = resource_v2["licenses"]
    blank_license = deepcopy(licenses_v2[0]) if licenses_v2 else {}
    for i_license, license_entry in enumerate(licenses):
        if i_license >= len(licenses_v2):
            licenses_v2.append(deepcopy(blank_license))
        licenses_v2[i_license].update(license_entry)
        licenses_v2[i_license]["copyrightStatement"] = None


def ___v2_populate_schema_fields(resource_v2: dict, resource: dict) -> None:
    """
    Populate schema fields in resource_v2 from resource in v1.6.

    Parameters
    ----------
    resource_v2: dict
        v2 resource entry; its schema fields are modified in place.
    resource: dict
        v1.6 resource providing the schema fields.

    Notes
    -----
    Additional field entries start from a snapshot of the first v2 field
    taken before it is filled in, so values of the first field do not carry
    over to later fields that lack the same keys. "nullable" is always reset
    to None.
    """
    fields_v2 = resource_v2["schema"]["fields"]
    blank_field = deepcopy(fields_v2[0]) if fields_v2 else {}
    for i_field, field in enumerate(resource.get("schema", {}).get("fields", [])):
        if i_field >= len(fields_v2):
            fields_v2.append(deepcopy(blank_field))
        schema_field_v2 = fields_v2[i_field]
        schema_field_v2.update(field)
        schema_field_v2["nullable"] = None


METADATA_CONVERSIONS = {
("OEP-1.5.2", "OEP-1.6.0"): __convert_oep_152_to_160,
("OEP-1.6.0", "OEMetadata-2.0.1"): __convert_oep_160_to_200,
("OEP-1.5.2", "OEP-1.6.0"): convert_oep_152_to_160,
("OEP-1.6.0", "OEMetadata-2.0"): convert_oep_160_to_20,
}
5 changes: 5 additions & 0 deletions src/omi/conversions/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Conversions

This module collects all existing OEMetaData version conversions. Each step in the conversion chain is stored in its own submodule. OMI supports OEMetaData starting from v1.5.2; previous versions are only supported by omi version > v1.0.0.

Since OEMetaData version 2, we decided to use patch versions only to update content or documentation parts of the metadata specification. Therefore, OMI will only implement conversion steps for minor versions, since they include all minor structural changes like changing JSON key names or adding new key:value pairs. More substantial changes to the JSON structure, such as changing the nested structure of the metadata, will be reflected in a major version change.
1 change: 1 addition & 0 deletions src/omi/conversions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "1.0.0"
89 changes: 89 additions & 0 deletions src/omi/conversions/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Utility functions for data conversion."""

import re


def find_temporal_resolution_value_and_unit(resolution: str) -> tuple[str, str]:
    """
    Split a temporal resolution string into value and unit.

    A string consisting of a number (optionally with decimals), optionally
    followed by a purely alphabetical unit, is split into these two parts;
    blanks around and between them are ignored. Any other string is treated
    as a descriptive resolution: it is returned stripped as the value, with
    an empty unit.

    Examples of accepted inputs:
    - "yearly"  -> ("yearly", "")
    - "hourly"  -> ("hourly", "")
    - "1 h"     -> ("1", "h")
    - "5 years" -> ("5", "years")
    - "1h"      -> ("1", "h")

    Parameters
    ----------
    resolution: str
        Temporal resolution string.

    Returns
    -------
    tuple[str, str]
        Temporal resolution value and unit.
    """
    numeric = re.match(r"^\s*(\d+(?:\.\d+)?)(?:\s*([a-zA-Z]+))?\s*$", resolution)
    if numeric is None:
        # Not a "<number> <unit>" pattern: the text as a whole is the value.
        return resolution.strip(), ""

    # Group 2 is None when the number comes without a unit.
    return numeric.group(1), numeric.group(2) or ""


def find_spatial_resolution_value_and_unit(resolution: str) -> tuple[str, str]:
    """
    Split a spatial resolution string into value and unit.

    The string is searched for a number directly followed (optionally after
    blanks) by a stand-alone "m" or "M". This also works inside a longer
    string such as "vector, 10 m". On success the number is returned as value
    with the unit "m". Otherwise the stripped string is returned as a
    descriptive value with an empty unit.

    Examples of accepted inputs:
    - "vector, 10 m" -> ("10", "m")
    - "100 m"        -> ("100", "m")
    - "Germany"      -> ("Germany", "")
    - "NUTS-0"       -> ("NUTS-0", "")
    - "MVGD"         -> ("MVGD", "")
    - "Regionale Planungsgemeinschaften und Berlin"
    - "national"
    - "country"

    Parameters
    ----------
    resolution: str
        Spatial resolution string.

    Returns
    -------
    tuple[str, str]
        Spatial resolution value and unit (unit is "m" when a numeric resolution was found, else "").
    """
    metres = re.search(r"(\d+(?:\.\d+)?)\s*m\b", resolution, re.IGNORECASE)
    # The word boundary after "m" keeps units such as "mm" or "km" from matching.
    return (metres.group(1), "m") if metres else (resolution.strip(), "")


# License entry describing CC-BY-4.0 with the keys name, title, path, instruction,
# attribution and copyrightStatement; attribution and copyrightStatement are empty strings.
# NOTE(review): presumably used as a template/fallback license by the conversion steps - confirm against callers.
license_cc_by_4 = {
"name": "CC-BY-4.0",
"title": "Creative Commons Attribution 4.0 International",
"path": "https://creativecommons.org/licenses/by/4.0/legalcode",
"instruction": "You are free to share and adapt, but you must attribute and cant add additional restrictions. See https://creativecommons.org/licenses/by/4.0/deed.en for further information.", # noqa: E501
"attribution": "",
"copyrightStatement": "",
}
Loading