diff --git a/datacube/drivers/postgis/_api.py b/datacube/drivers/postgis/_api.py index a55287201..4b9fd5a77 100644 --- a/datacube/drivers/postgis/_api.py +++ b/datacube/drivers/postgis/_api.py @@ -45,8 +45,7 @@ from . import _core from ._fields import parse_fields, Expression, PgField, PgExpression, DateRangeDocField # noqa: F401 from ._fields import NativeField, DateDocField, SimpleDocField, UnindexableValue -from ._schema import MetadataType, Product, \ - Dataset, DatasetLineage, DatasetLocation, SelectedDatasetLocation, \ +from ._schema import MetadataType, Product, Dataset, DatasetLineage, \ search_field_index_map, search_field_indexes, DatasetHome from ._spatial import geom_alchemy, generate_dataset_spatial_values, extract_geometry_from_eo3_projection from .sql import escape_pg_identifier @@ -67,24 +66,6 @@ def _base_known_fields(): 'Archived date', Dataset.archived ) - fields["uris"] = NativeField( - "uris", - "all uris", - func.array( - select( - SelectedDatasetLocation.uri - ).where( - and_( - SelectedDatasetLocation.dataset_ref == Dataset.id, - SelectedDatasetLocation.archived == None - ) - ).order_by( - SelectedDatasetLocation.added.desc(), - SelectedDatasetLocation.id.desc() - ).label('uris') - ), - alchemy_table=Dataset.__table__ # type: ignore[attr-defined] - ) return fields @@ -102,23 +83,7 @@ def _dataset_fields() -> tuple: 'Archived date', Dataset.archived ), - NativeField("uris", - "all uris", - func.array( - select( - SelectedDatasetLocation.uri - ).where( - and_( - SelectedDatasetLocation.dataset_ref == Dataset.id, - SelectedDatasetLocation.archived == None - ) - ).order_by( - SelectedDatasetLocation.added.desc(), - SelectedDatasetLocation.id.desc() - ).label('uris') - ), - alchemy_table=Dataset.__table__ # type: ignore[attr-defined] - ) + native_flds["uri"] ) @@ -126,20 +91,7 @@ def _dataset_bulk_select_fields() -> tuple: return ( Dataset.product_ref, Dataset.metadata_doc, - # All active URIs, from newest to oldest - func.array( - select( - SelectedDatasetLocation.uri - ).where( - and_( - SelectedDatasetLocation.dataset_ref == Dataset.id, - SelectedDatasetLocation.archived == None - ) - ).order_by( - SelectedDatasetLocation.added.desc(), - SelectedDatasetLocation.id.desc() - ).label('uris') - ).label('uris') + Dataset.uri ) @@ -188,17 +140,11 @@ def get_native_fields() -> dict[str, NativeField]: 'Full metadata document', Dataset.metadata_doc ), - # Fields that can affect row selection - - # Note that this field is a single uri: selecting it will result in one-result per uri. - # (ie. 
duplicate datasets if multiple uris, no dataset if no uris) 'uri': NativeField( 'uri', "Dataset URI", - DatasetLocation.uri_body, - alchemy_expression=DatasetLocation.uri, - join_clause=(DatasetLocation.dataset_ref == Dataset.id), - affects_row_selection=True + Dataset.uri_body, + alchemy_expression=Dataset.uri, ), } return fields @@ -396,22 +342,16 @@ def insert_dataset_location(self, dataset_id, uri): scheme, body = split_uri(uri) r = self._connection.execute( - insert(DatasetLocation).on_conflict_do_nothing( - index_elements=['uri_scheme', 'uri_body', 'dataset_ref'] + update(Dataset).returning(Dataset.uri).where( + Dataset.id == dataset_id ).values( - dataset_ref=dataset_id, uri_scheme=scheme, - uri_body=body, + uri_body=body ) ) return r.rowcount > 0 - def insert_dataset_location_bulk(self, values): - requested = len(values) - res = self._connection.execute(insert(DatasetLocation), values) - return res.rowcount, requested - res.rowcount - def insert_dataset_search(self, search_table, dataset_id, key, value): """ Add/update a search field index entry for a dataset @@ -526,19 +466,17 @@ def get_datasets_for_location(self, uri, mode=None): mode = 'exact' if body.count('#') > 0 else 'prefix' if mode == 'exact': - body_query = DatasetLocation.uri_body == body + body_query = Dataset.uri_body == body elif mode == 'prefix': - body_query = DatasetLocation.uri_body.startswith(body) + body_query = Dataset.uri_body.startswith(body) else: raise ValueError('Unsupported query mode {}'.format(mode)) return self._connection.execute( select( *_dataset_select_fields() - ).join( - Dataset.locations ).where( - and_(DatasetLocation.uri_scheme == scheme, body_query) + and_(Dataset.uri_scheme == scheme, body_query) ) ).fetchall() @@ -577,11 +515,6 @@ def restore_dataset(self, dataset_id): return r.rowcount > 0 def delete_dataset(self, dataset_id): - self._connection.execute( - delete(DatasetLocation).where( - DatasetLocation.dataset_ref == dataset_id - ) - ) for table in search_field_indexes.values(): self._connection.execute( delete(table).where(table.dataset_ref == dataset_id) @@ -1258,90 +1191,30 @@ def get_all_metadata_type_defs(self): for r in self._connection.execute(select(MetadataType.definition).order_by(MetadataType.name.asc())): yield r[0] - def get_locations(self, dataset_id): - return [ - record[0] - for record in self._connection.execute( - select( - DatasetLocation.uri - ).where( - DatasetLocation.dataset_ref == dataset_id - ).where( - DatasetLocation.archived == None - ).order_by( - DatasetLocation.added.desc(), - DatasetLocation.id.desc() - ) - ).fetchall() - ] - - def get_archived_locations(self, dataset_id): - """ - Return a list of uris and archived_times for a dataset - """ - return [ - (location_uri, archived_time) - for location_uri, archived_time in self._connection.execute( - select( - DatasetLocation.uri, DatasetLocation.archived - ).where( - DatasetLocation.dataset_ref == dataset_id - ).where( - DatasetLocation.archived != None - ).order_by( - DatasetLocation.added.desc() - ) - ).fetchall() - ] + def get_location(self, dataset_id): + return self._connection.execute( + select(Dataset.uri).where( + Dataset.id == dataset_id + ) + ).first() def remove_location(self, dataset_id, uri): """ - Remove the given location for a dataset + Remove a dataset's location :returns bool: Was the location deleted? 
""" scheme, body = split_uri(uri) res = self._connection.execute( - delete(DatasetLocation).where( - DatasetLocation.dataset_ref == dataset_id - ).where( - DatasetLocation.uri_scheme == scheme - ).where( - DatasetLocation.uri_body == body - ) - ) - return res.rowcount > 0 - - def archive_location(self, dataset_id, uri): - scheme, body = split_uri(uri) - res = self._connection.execute( - update(DatasetLocation).where( - DatasetLocation.dataset_ref == dataset_id - ).where( - DatasetLocation.uri_scheme == scheme - ).where( - DatasetLocation.uri_body == body - ).where( - DatasetLocation.archived == None - ).values( - archived=func.now() - ) - ) - return res.rowcount > 0 - - def restore_location(self, dataset_id, uri): - scheme, body = split_uri(uri) - res = self._connection.execute( - update(DatasetLocation).where( - DatasetLocation.dataset_ref == dataset_id - ).where( - DatasetLocation.uri_scheme == scheme - ).where( - DatasetLocation.uri_body == body - ).where( - DatasetLocation.archived != None + update(Dataset).where( + and_( + Dataset.id == dataset_id, + Dataset.uri_scheme == scheme, + Dataset.uri_body == body + ) ).values( - archived=None + uri_scheme=None, + uri_body=None ) ) return res.rowcount > 0 diff --git a/datacube/drivers/postgis/_core.py b/datacube/drivers/postgis/_core.py index a105c471e..799913ac8 100644 --- a/datacube/drivers/postgis/_core.py +++ b/datacube/drivers/postgis/_core.py @@ -130,7 +130,6 @@ def ensure_db(engine, with_permissions=True): grant select on all tables in schema {SCHEMA_NAME} to odc_user; grant insert on {SCHEMA_NAME}.dataset, - {SCHEMA_NAME}.location, {SCHEMA_NAME}.dataset_lineage to odc_manage; grant usage, select on all sequences in schema {SCHEMA_NAME} to odc_manage; diff --git a/datacube/drivers/postgis/_schema.py b/datacube/drivers/postgis/_schema.py index 755225a94..fa079dd14 100644 --- a/datacube/drivers/postgis/_schema.py +++ b/datacube/drivers/postgis/_schema.py @@ -10,10 +10,10 @@ from typing import Type from sqlalchemy.dialects.postgresql import NUMRANGE, TSTZRANGE -from sqlalchemy.orm import aliased, registry, relationship, column_property -from sqlalchemy import ForeignKey, UniqueConstraint, PrimaryKeyConstraint, CheckConstraint, SmallInteger, Text, Index, \ +from sqlalchemy.orm import registry, relationship, column_property +from sqlalchemy import ForeignKey, PrimaryKeyConstraint, CheckConstraint, SmallInteger, Text, Index, \ literal -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, String, DateTime from sqlalchemy.dialects import postgresql as postgres from sqlalchemy.sql import func @@ -105,40 +105,8 @@ class Dataset: updated = Column(DateTime(timezone=True), server_default=func.now(), nullable=False, index=True, comment="when last updated") - locations = relationship("DatasetLocation", viewonly=True) - active_locations = relationship("DatasetLocation", - primaryjoin="and_(Dataset.id==DatasetLocation.dataset_ref, " - "DatasetLocation.archived==None)", - viewonly=True, - order_by="desc(DatasetLocation.added)") - archived_locations = relationship("DatasetLocation", - viewonly=True, - primaryjoin="and_(Dataset.id==DatasetLocation.dataset_ref, " - "DatasetLocation.archived!=None)" - ) - - -Index("ix_ds_prod_active", Dataset.product_ref, postgresql_where=(Dataset.archived == None)) -Index("ix_ds_mdt_active", Dataset.metadata_type_ref, postgresql_where=(Dataset.archived == None)) - - -@orm_registry.mapped -class DatasetLocation: - __tablename__ = "location" - __table_args__ = ( - _core.METADATA, - 
UniqueConstraint('uri_scheme', 'uri_body', 'dataset_ref'), - Index("ix_loc_ds_added", "dataset_ref", "added"), - { - "schema": sql.SCHEMA_NAME, - "comment": "Where data for the dataset can be found (uri)." - } - ) - id = Column(Integer, primary_key=True, autoincrement=True) - dataset_ref = Column(postgres.UUID(as_uuid=True), ForeignKey(Dataset.id), nullable=False, - comment="The product this dataset belongs to") - uri_scheme = Column(String, nullable=False, comment="The scheme of the uri.") - uri_body = Column(String, nullable=False, comment="""The body of the uri. + uri_scheme = Column(String, comment="The scheme of the uri.") + uri_body = Column(String, comment="""The body of the uri. The uri scheme and body make up the base URI to find the dataset. @@ -147,15 +115,11 @@ class DatasetLocation: eg 'file:///g/data/datasets/LS8_NBAR/odc-metadata.yaml' or 'ftp://eo.something.com/dataset' 'file' is a scheme, '///g/data/datasets/LS8_NBAR/odc-metadata.yaml' is a body.""") - added = Column(DateTime(timezone=True), server_default=func.now(), nullable=False, comment="when added") - added_by = Column(Text, server_default=func.current_user(), nullable=False, comment="added by whom") - archived = Column(DateTime(timezone=True), default=None, nullable=True, index=True, - comment="when archived, null for the active location") uri = column_property(uri_scheme + literal(':') + uri_body) - dataset = relationship("Dataset") -SelectedDatasetLocation = aliased(DatasetLocation, name="sel_loc") +Index("ix_ds_prod_active", Dataset.product_ref, postgresql_where=(Dataset.archived == None)) +Index("ix_ds_mdt_active", Dataset.metadata_type_ref, postgresql_where=(Dataset.archived == None)) @orm_registry.mapped @@ -329,7 +293,7 @@ class DatasetSearchDateTime: ALL_STATIC_TABLES = [ MetadataType.__table__, Product.__table__, # type: ignore[attr-defined] - Dataset.__table__, DatasetLocation.__table__, # type: ignore[attr-defined] + Dataset.__table__, # type: ignore[attr-defined] DatasetLineage.__table__, DatasetHome.__table__, # type: ignore[attr-defined] SpatialIndexRecord.__table__, # type: ignore[attr-defined] DatasetSearchString.__table__, DatasetSearchNumeric.__table__, # type: ignore[attr-defined] diff --git a/datacube/drivers/postgis/alembic/versions/d27eed82e1f6_remove_datasetlocation_table.py b/datacube/drivers/postgis/alembic/versions/d27eed82e1f6_remove_datasetlocation_table.py new file mode 100644 index 000000000..5d8dd3f65 --- /dev/null +++ b/datacube/drivers/postgis/alembic/versions/d27eed82e1f6_remove_datasetlocation_table.py @@ -0,0 +1,81 @@ +# This file is part of the Open Data Cube, see https://opendatacube.org for more information +# +# Copyright (c) 2015-2024 ODC Contributors +# SPDX-License-Identifier: Apache-2.0 +"""remove DatasetLocation table + +Revision ID: d27eed82e1f6 +Revises: 610f32dca3cb +Create Date: 2024-11-13 02:39:55.671819 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, UniqueConstraint, Index +from sqlalchemy.dialects import postgresql as postgres +from sqlalchemy.sql import func +from datacube.drivers.postgis._core import METADATA + + +# revision identifiers, used by Alembic. 
+revision: str = 'd27eed82e1f6'
+down_revision: Union[str, None] = '610f32dca3cb'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.add_column("dataset",
+                  Column("uri_scheme", String, comment="The scheme of the uri."),
+                  schema="odc")
+    op.add_column("dataset",
+                  Column("uri_body", String, comment="The body of the uri."),
+                  schema="odc")
+    # select first active location from DatasetLocation and copy it onto the Dataset row
+    conn = op.get_bind()
+    conn.execute(
+        sa.text("""UPDATE odc.dataset d
+                SET uri_scheme=(
+                    SELECT l.uri_scheme FROM odc.location l
+                    WHERE l.dataset_ref=d.id AND l.archived IS NULL
+                    ORDER BY added LIMIT 1),
+                uri_body=(
+                    SELECT l.uri_body FROM odc.location l
+                    WHERE l.dataset_ref=d.id AND l.archived IS NULL
+                    ORDER BY added LIMIT 1)""")
+    )
+
+    op.drop_table("location", schema="odc", if_exists=True)
+
+
+def downgrade() -> None:
+    loc = op.create_table(
+        "location",
+        Column("id", Integer, primary_key=True, autoincrement=True),
+        Column("dataset_ref", postgres.UUID(as_uuid=True), ForeignKey("odc.dataset.id"), nullable=False,
+               comment="The product this dataset belongs to"),
+        Column("uri_scheme", String, nullable=False, comment="The scheme of the uri."),
+        Column("uri_body", String, nullable=False, comment="The body of the uri."),
+        Column("added", DateTime(timezone=True), server_default=func.now(), nullable=False, comment="when added"),
+        Column("added_by", Text, server_default=func.current_user(), nullable=False, comment="added by whom"),
+        Column("archived", DateTime(timezone=True), default=None, nullable=True, index=True,
+               comment="when archived, null for the active location"),
+        UniqueConstraint('uri_scheme', 'uri_body', 'dataset_ref'),
+        Index("ix_loc_ds_added", "dataset_ref", "added"),
+        METADATA,
+        schema="odc",
+        comment="Where data for the dataset can be found (uri).",
+        if_not_exists=True,
+    )
+
+    conn = op.get_bind()
+    res = conn.execute(
+        sa.text("SELECT id, uri_scheme, uri_body FROM odc.dataset WHERE uri_body IS NOT NULL")
+    ).fetchall()
+    values = [{"dataset_ref": r.id, "uri_scheme": r.uri_scheme, "uri_body": r.uri_body} for r in res]
+    op.bulk_insert(loc, values)
+
+    op.drop_column("dataset", "uri_scheme", schema="odc")
+    op.drop_column("dataset", "uri_body", schema="odc")
diff --git a/datacube/index/abstract.py b/datacube/index/abstract.py
index d890e7762..f4a185182 100644
--- a/datacube/index/abstract.py
+++ b/datacube/index/abstract.py
@@ -1085,7 +1085,7 @@ class DatasetTuple(NamedTuple):
     A named tuple representing a complete dataset:
     - product: A Product model.
     - metadata: The dataset metadata document
-    - uris: A list of locations (uris)
+    - uri_: The dataset location or list of locations
     """
     product: Product
     metadata: JsonDict
diff --git a/datacube/index/postgis/_datasets.py b/datacube/index/postgis/_datasets.py
index 70b607a56..324f506ee 100755
--- a/datacube/index/postgis/_datasets.py
+++ b/datacube/index/postgis/_datasets.py
@@ -172,7 +172,9 @@ def add(self, dataset: Dataset,
             transaction.update_spindex(dsids=[dataset.id])
             transaction.update_search_index(dsids=[dataset.id])
         # 1c.
Store locations - if dataset.uris is not None: + if dataset.uri is not None: + if len(dataset.uris) > 1: + raise ValueError('Postgis driver does not support multiple locations for a dataset.') self._ensure_new_locations(dataset, transaction=transaction) if archive_less_mature is not None: self.archive_less_mature(dataset, archive_less_mature) @@ -207,26 +209,22 @@ class BatchRep(NamedTuple): spatial_indexes={crs: [] for crs in crses} ) dsids = [] - for prod, metadata_doc, uris in batch_ds: + for prod, metadata_doc, uri in batch_ds: dsid = UUID(str(metadata_doc["id"])) dsids.append(dsid) + if isinstance(uri, list): + uri = uri[0] + scheme, body = split_uri(uri) batch.datasets.append( { "id": dsid, "product_ref": prod.id, "metadata": metadata_doc, - "metadata_type_ref": prod.metadata_type.id + "metadata_type_ref": prod.metadata_type.id, + "uri_scheme": scheme, + "uri_body": body, } ) - for uri in uris: - scheme, body = split_uri(uri) - batch.uris.append( - { - "dataset_ref": dsid, - "uri_scheme": scheme, - "uri_body": body, - } - ) extent = extract_geometry_from_eo3_projection( metadata_doc["grid_spatial"]["projection"] # type: ignore[misc,call-overload,index] ) @@ -254,8 +252,6 @@ class BatchRep(NamedTuple): with self._db_connection(transaction=True) as connection: if batch.datasets: b_added, b_skipped = connection.insert_dataset_bulk(batch.datasets) - if batch.uris: - connection.insert_dataset_location_bulk(batch.uris) for crs in crses: crs_values = batch.spatial_indexes[crs] if crs_values: @@ -309,6 +305,9 @@ def can_update(self, dataset, updates_allowed=None): dataset.product.name, dataset.id)) + if len(dataset.uris) > 1: + raise ValueError('Postgis driver does not support multiple locations for a dataset.') + # TODO: figure out (un)safe changes from metadata type? allowed = { # can always add more metadata @@ -331,11 +330,10 @@ def update(self, dataset: Dataset, updates_allowed=None, archive_less_mature=Fal delta in timestamp comparison :rtype: Dataset """ - existing = self.get(dataset.id) can_update, safe_changes, unsafe_changes = self.can_update(dataset, updates_allowed) if not safe_changes and not unsafe_changes: - self._ensure_new_locations(dataset, existing) + self._ensure_new_locations(dataset) _LOG.info("No changes detected for dataset %s", dataset.id) return dataset @@ -364,31 +362,16 @@ def update(self, dataset: Dataset, updates_allowed=None, archive_less_mature=Fal if archive_less_mature is not None: self.archive_less_mature(dataset, archive_less_mature) - self._ensure_new_locations(dataset, existing) + self._ensure_new_locations(dataset) return dataset - def _ensure_new_locations(self, dataset, existing=None, transaction=None): - old_uris = set() - if existing: - old_uris.update(existing._uris) - new_uris = dataset._uris - - def ensure_locations_in_transaction(old_uris, new_uris, transaction): - if len(old_uris) <= 1 and len(new_uris) == 1: - # Only one location, so treat as an update. 
- if len(old_uris): - transaction.remove_location(dataset.id, old_uris.pop()) - transaction.insert_dataset_location(dataset.id, new_uris[0]) - else: - for uri in new_uris[::-1]: - transaction.insert_dataset_location(dataset.id, uri) - + def _ensure_new_locations(self, dataset, transaction=None): if transaction: - ensure_locations_in_transaction(old_uris, new_uris, transaction) + transaction.insert_dataset_location(dataset.id, dataset.uri) else: with self._db_connection(transaction=True) as tr: - ensure_locations_in_transaction(old_uris, new_uris, tr) + tr.insert_dataset_location(dataset.id, dataset.uri) def archive(self, ids): """ @@ -457,8 +440,7 @@ def get_locations(self, id_): :param typing.Union[UUID, str] id_: dataset id :rtype: list[str] """ - with self._db_connection() as connection: - return connection.get_locations(id_) + return [self.get_location(id_)] def get_location(self, id_): """ @@ -468,9 +450,9 @@ def get_location(self, id_): :rtype: list[str] """ with self._db_connection() as connection: - locations = connection.get_locations(id_) - if locations: - return locations[0] + location = connection.get_location(id_) + if location: + return location[0] else: return None @@ -487,8 +469,7 @@ def get_archived_locations(self, id_): :param typing.Union[UUID, str] id_: dataset id :rtype: list[str] """ - with self._db_connection() as connection: - return [uri for uri, archived_dt in connection.get_archived_locations(id_)] + return [] @deprecat( reason="Multiple locations per dataset are now deprecated. " @@ -503,8 +484,7 @@ def get_archived_location_times(self, id_): :param typing.Union[UUID, str] id_: dataset id :rtype: list[Tuple[str, datetime.datetime]] """ - with self._db_connection() as connection: - return list(connection.get_archived_locations(id_)) + return [] @deprecat( reason="Multiple locations per dataset are now deprecated. " @@ -524,6 +504,13 @@ def add_location(self, id_, uri): warnings.warn("Cannot add empty uri. (dataset %s)" % id_) return False + existing = self.get_location(id_) + if existing == uri: + warnings.warn(f"Dataset {id_} already has uri {uri}") + return False + elif existing is not None and existing != uri: + raise ValueError("Postgis index does not support multiple dataset locations.") + with self._db_connection() as connection: return connection.insert_dataset_location(id_, uri) @@ -557,8 +544,7 @@ def remove_location(self, id_, uri): return was_removed @deprecat( - reason="Multiple locations per dataset are now deprecated. " - "Archived locations may not be accessible in future releases. " + reason="The PostGIS index does not support archived locations. " "Dataset location can be set or updated with the update() method.", version="1.9.0", category=ODC2DeprecationWarning @@ -571,13 +557,10 @@ def archive_location(self, id_, uri): :param str uri: fully qualified uri :return bool: location was able to be archived """ - with self._db_connection() as connection: - was_archived = connection.archive_location(id_, uri) - return was_archived + return False @deprecat( - reason="Multiple locations per dataset are now deprecated. " - "Archived locations may not be restorable in future releases. " + reason="The PostGIS index does not support archived locations. 
" "Dataset location can be set or updated with the update() method.", version="1.9.0", category=ODC2DeprecationWarning @@ -590,9 +573,7 @@ def restore_location(self, id_, uri): :param str uri: fully qualified uri :return bool: location was able to be restored """ - with self._db_connection() as connection: - was_restored = connection.restore_location(id_, uri) - return was_restored + return False def _make(self, dataset_res, full_info=False, product=None, source_tree: LineageTree | None = None, @@ -605,12 +586,8 @@ def _make(self, dataset_res, full_info=False, product=None, kwargs = {} if not isinstance(dataset_res, dict): dataset_res = dataset_res._asdict() - if "uris" in dataset_res: - uris = [uri for uri in dataset_res["uris"] if uri] - if len(uris) == 1: - kwargs["uri"] = uris[0] - else: - kwargs["uris"] = uris + if "uri" in dataset_res: + kwargs["uri"] = dataset_res["uri"] return Dataset( product=product or self.products.get(dataset_res["product_id"]), diff --git a/datacube/index/postgres/_datasets.py b/datacube/index/postgres/_datasets.py index 72a76e5b1..76faf1d91 100755 --- a/datacube/index/postgres/_datasets.py +++ b/datacube/index/postgres/_datasets.py @@ -223,6 +223,8 @@ def _add_batch(self, batch_ds: Iterable[DatasetTuple], cache: Mapping[str, Any]) "metadata_type_ref": prod.metadata_type.id } ) + if isinstance(uris, str): + uris = [uris] for uri in uris: scheme, body = split_uri(uri) batch["uris"].append( diff --git a/datacube/model/model.py b/datacube/model/model.py deleted file mode 100644 index b83aa15d2..000000000 --- a/datacube/model/model.py +++ /dev/null @@ -1,223 +0,0 @@ -# This file is part of the Open Data Cube, see https://opendatacube.org for more information -# -# Copyright (c) 2015-2024 ODC Contributors -# SPDX-License-Identifier: Apache-2.0 -from pathlib import Path -from uuid import UUID -import re - -import affine -import attr -from ruamel.yaml.comments import CommentedMap -from shapely.geometry.base import BaseGeometry - -from .properties import Eo3Dict, Eo3Interface -from datacube.model import Dataset - -DEA_URI_PREFIX = "https://collections.dea.ga.gov.au" -ODC_DATASET_SCHEMA_URL = "https://schemas.opendatacube.org/dataset" - -# Either a local filesystem path or a string URI. -# (the URI can use any scheme supported by rasterio, such as tar:// or https:// or ...) -Location = Path | str - - -@attr.s(auto_attribs=True, slots=True) -class ProductDoc: - """ - The product that this dataset belongs to. - - "name" is the local name in ODC. - - href is intended as a more global unique "identifier" uri for the product. - """ - - name: str | None = None - href: str | None = None - - -@attr.s(auto_attribs=True, slots=True, hash=True) -class GridDoc: - """The grid describing a measurement/band's pixels""" - - shape: tuple[int, int] - transform: affine.Affine - - -@attr.s(auto_attribs=True, slots=True) -class MeasurementDoc: - """ - A Dataset's reference to a measurement file. - """ - - path: str - band: int | None = 1 - layer: str | None = None - grid: str = "default" - - name: str | None = attr.ib(metadata=dict(doc_exclude=True), default=None) - alias: str | None = attr.ib(metadata=dict(doc_exclude=True), default=None) - - -@attr.s(auto_attribs=True, slots=True) -class AccessoryDoc: - """ - An accessory is an extra file included in the dataset that is not - a measurement/band. - - For example: thumbnails, alternative metadata documents, or checksum files. 
- """ - - path: str - type: str | None = None - name: str | None = attr.ib(metadata=dict(doc_exclude=True), default=None) - - -@attr.s(auto_attribs=True, slots=True) -class DatasetDoc(Eo3Interface): - """ - An EO3 dataset document - - Includes :class:`.Eo3Interface` methods for metadata access:: - - >>> from dateutil.tz import tzutc - >>> p = DatasetDoc() - >>> p.platform = 'LANDSAT_8' - >>> p.processed = '2018-04-03' - >>> p.properties['odc:processing_datetime'] - datetime.datetime(2018, 4, 3, 0, 0, tzinfo=tzutc()) - - """ - - #: Dataset UUID - id: UUID | None = None - #: Human-readable identifier for the dataset - label: str | None = None - #: The product name (local) and/or url (global) - product: ProductDoc | None = None - #: Location(s) where this dataset is stored. - #: - #: (ODC supports multiple locations when the same dataset is stored in multiple places) - #: - #: They are fully qualified URIs (``file://...`, ``https://...``, ``s3://...``) - #: - #: All other paths in the document (measurements, accessories) are relative to the - #: chosen location. - locations: list[str] | None = None - - #: CRS string. Eg. ``epsg:3577`` - crs: str | None = None - #: Shapely geometry of the valid data coverage - #: - #: (it must contain all non-empty pixels of the image) - geometry: BaseGeometry | None = None - #: Grid specifications for measurements - grids: dict[str, GridDoc] | None = None - #: Raw properties - properties: Eo3Dict = attr.ib(factory=Eo3Dict) - #: Loadable measurements of the dataset - measurements: dict[str, MeasurementDoc] | None = None - #: References to accessory files - #: - #: Such as thumbnails, checksums, other kinds of metadata files. - #: - #: (any files included in the dataset that are not measurements) - accessories: dict[str, AccessoryDoc] = attr.ib(factory=CommentedMap) - #: Links to source dataset uuids - lineage: dict[str, list[UUID]] = attr.ib(factory=CommentedMap) - - -# Conversion functions copied from datacube-alchemist for ease of transition - -def dataset_to_datasetdoc(ds: Dataset) -> DatasetDoc: - """ - Convert to the DatasetDoc format that eodatasets expects. - """ - if ds.metadata_type.name in {"eo_plus", "eo_s2_nrt", "gqa_eo"}: - # Handle S2 NRT metadata identically to eo_plus files. - # gqa_eo is the S2 ARD with extra quality check fields. - return _convert_eo_plus(ds) - - if ds.metadata_type.name == "eo": - return _convert_eo(ds) - - # Else we have an already mostly eo3 style dataset - product = ProductDoc(name=ds.type.name) - # Wrap properties to avoid typos and the like - properties = Eo3Dict(ds.metadata_doc.get("properties", {})) - if properties.get("eo:gsd"): - del properties["eo:gsd"] - return DatasetDoc( - id=ds.id, - product=product, - crs=str(ds.crs), - properties=properties, - geometry=ds.extent, - ) - - -def _convert_eo_plus(ds) -> DatasetDoc: - # Definitely need: # - 'datetime' # - 'eo:instrument' # - 'eo:platform' # - 'odc:region_code' - region_code = _guess_region_code(ds) - properties = Eo3Dict( - { - "odc:region_code": region_code, - "datetime": ds.center_time, - "eo:instrument": ds.metadata.instrument, - "eo:platform": ds.metadata.platform, - "landsat:landsat_scene_id": ds.metadata_doc.get( - "tile_id", "??" 
- ), # Used to find abbreviated instrument id - "sentinel:sentinel_tile_id": ds.metadata_doc.get("tile_id", "??"), - } - ) - product = ProductDoc(name=ds.type.name) - return DatasetDoc(id=ds.id, product=product, crs=str(ds.crs), properties=properties) - - -def _convert_eo(ds) -> DatasetDoc: - # Definitely need: # - 'datetime' # - 'eo:instrument' # - 'eo:platform' # - 'odc:region_code' - region_code = _guess_region_code(ds) - properties = Eo3Dict( - { - "odc:region_code": region_code, - "datetime": ds.center_time, - "eo:instrument": ds.metadata.instrument, - "eo:platform": ds.metadata.platform, - "landsat:landsat_scene_id": ds.metadata.instrument, # Used to find abbreviated instrument id - } - ) - product = ProductDoc(name=ds.type.name) - return DatasetDoc(id=ds.id, product=product, crs=str(ds.crs), properties=properties) - - -# Regex for extracting region codes from tile IDs. -RE_TILE_REGION_CODE = re.compile(r".*A\d{6}_T(\w{5})_N\d{2}\.\d{2}") - - -def _guess_region_code(ds: Dataset) -> str: - """ - Get the region code of a dataset. - """ - try: - # EO plus - return ds.metadata.region_code - except AttributeError: - # Not EO plus - pass - - try: - # EO - return ds.metadata_doc["region_code"] - except KeyError: - # No region code! - pass - - # Region code not specified, so get it from the tile ID. - # An example of such a tile ID for S2A NRT is: - # S2A_OPER_MSI_L1C_TL_VGS1_20201114T053541_A028185_T50JPP_N02.09 - # The region code is 50JPP. - tile_match = RE_TILE_REGION_CODE.match(ds.metadata_doc["tile_id"]) - if not tile_match: - raise ValueError("No region code for dataset {}".format(ds.id)) - return tile_match.group(1) diff --git a/datacube/model/properties.py b/datacube/model/properties.py deleted file mode 100644 index 480f8afde..000000000 --- a/datacube/model/properties.py +++ /dev/null @@ -1,771 +0,0 @@ -# This file is part of the Open Data Cube, see https://opendatacube.org for more information -# -# Copyright (c) 2015-2024 ODC Contributors -# SPDX-License-Identifier: Apache-2.0 -import collections.abc -import warnings -from abc import abstractmethod -from collections import defaultdict -from datetime import datetime -from enum import Enum, EnumMeta -from textwrap import dedent -from typing import Any, Callable, Dict, Mapping, Optional, Set, Tuple, Union -from urllib.parse import urlencode - -from ruamel.yaml.timestamp import TimeStamp as RuamelTimeStamp - -from datacube.utils.dates import parse_time, tz_aware - - -class FileFormat(Enum): - GeoTIFF = 1 - NetCDF = 2 - Zarr = 3 - JPEG2000 = 4 - - -def nest_properties(d: Mapping[str, Any], separator=":") -> dict[str, Any]: - """ - Split keys with embedded colons into sub dictionaries. - - Intended for stac-like properties - - >>> nest_properties({'landsat:path':1, 'landsat:row':2, 'clouds':3}) - {'landsat': {'path': 1, 'row': 2}, 'clouds': 3} - """ - out: dict[str, Any] = defaultdict(dict) - for key, val in d.items(): - section, *remainder = key.split(separator, 1) - if remainder: - [sub_key] = remainder - out[section][sub_key] = val - else: - out[section] = val - - for key, val in out.items(): - if isinstance(val, dict): - out[key] = nest_properties(val, separator=separator) - - return dict(out) - - -def datetime_type(value): - # Ruamel's TimeZone class can become invalid from the .replace(utc) call. - # (I think it no longer matches the internal ._yaml fields.) - # Convert to a regular datetime. - if isinstance(value, RuamelTimeStamp): - value = value.isoformat() - - # Store all dates with a timezone. 
- # yaml standard says all dates default to UTC. - # (and ruamel normalises timezones to UTC itself) - return tz_aware(parse_time(value)) - - -def of_enum_type( - vals: EnumMeta | tuple[str, ...] | None = None, - lower: bool = False, - strict: bool = True -) -> Callable[[str], str]: - if isinstance(vals, EnumMeta): - vals = tuple(vals.__members__.keys()) - - def normalise(v: str | Enum) -> str: - if isinstance(v, Enum): - v = v.name - - if lower: - v = v.lower() - - if vals is not None and v not in vals: - msg = f"Unexpected value {v!r}. Expected one of: {', '.join(vals)}," - if strict: - raise ValueError(msg) - else: - warnings.warn(msg) - return v - - return normalise - - -def percent_type(value): - value = float(value) - - if not (0.0 <= value <= 100.0): - raise ValueError("Expected percent between 0,100") - return value - - -def normalise_platforms(value: Union[str, list, set]): - """ - >>> normalise_platforms('LANDSAT_8') - 'landsat-8' - >>> # Multiple can be comma-separated. They're normalised independently and sorted. - >>> normalise_platforms('LANDSAT_8,Landsat-5,landsat-7') - 'landsat-5,landsat-7,landsat-8' - >>> # Can be given as a list. - >>> normalise_platforms(['sentinel-2b','SENTINEL-2a']) - 'sentinel-2a,sentinel-2b' - >>> # Deduplicated too - >>> normalise_platforms('landsat-5,landsat-5,LANDSAT-5') - 'landsat-5' - """ - if not isinstance(value, (list, set, tuple)): - value = value.split(",") - - platforms = sorted({s.strip().lower().replace("_", "-") for s in value if s}) - if not platforms: - return None - - return ",".join(platforms) - - -def degrees_type(value): - value = float(value) - - if not (-360.0 <= value <= 360.0): - raise ValueError("Expected degrees between -360,+360") - - return value - - -def identifier_type(v: str): - v = v.replace("-", "_") - if not v.isidentifier() or not v.islower(): - warnings.warn( - f"{v!r} is expected to be an identifier " - "(alphanumeric with underscores, typically lowercase)" - ) - return v - - -def producer_check(value): - if "." not in value: - warnings.warn( - "Property 'odc:producer' is expected to be a domain name, " - "eg 'usgs.gov' or 'ga.gov.au'" - ) - return value - - -def parsed_sentinel_tile_id(tile_id) -> Tuple[str, Dict]: - """Extract useful extra fields from a sentinel tile id - - >>> from dateutil.tz import tzutc - >>> val, props = parsed_sentinel_tile_id("S2B_OPER_MSI_L1C_TL_EPAE_20201011T011446_A018789_T55HFA_N02.09") - >>> val - 'S2B_OPER_MSI_L1C_TL_EPAE_20201011T011446_A018789_T55HFA_N02.09' - >>> props - {'sentinel:datatake_start_datetime': datetime.datetime(2020, 10, 11, 1, 14, 46, tzinfo=tzutc())} - """ - extras = {} - split_tile_id = tile_id.split("_") - try: - datatake_sensing_time = datetime_type(split_tile_id[-4]) - extras["sentinel:datatake_start_datetime"] = datatake_sensing_time - except IndexError: - pass - - # TODO: we could extract other useful fields? 
- - return tile_id, extras - - -def parsed_sentinel_datastrip_id(tile_id) -> Tuple[str, Dict]: - """Extract useful extra fields from a sentinel datastrip id - - >>> from dateutil.tz import tzutc - >>> val, props = parsed_sentinel_datastrip_id("S2B_OPER_MSI_L1C_DS_EPAE_20201011T011446_S20201011T000244_N02.09") - >>> val - 'S2B_OPER_MSI_L1C_DS_EPAE_20201011T011446_S20201011T000244_N02.09' - >>> props - {'sentinel:datatake_start_datetime': datetime.datetime(2020, 10, 11, 1, 14, 46, tzinfo=tzutc())} - """ - extras = {} - split_tile_id = tile_id.split("_") - try: - datatake_sensing_time = datetime_type(split_tile_id[-3]) - extras["sentinel:datatake_start_datetime"] = datatake_sensing_time - except IndexError: - pass - - # TODO: we could extract other useful fields? - - return tile_id, extras - - -# The primitive types allowed as stac values. -PrimitiveType = Union[str, int, float, datetime] - -ExtraProperties = Dict -# A function to normalise a value. -# (eg. convert to int, or make string lowercase). -# They throw a ValueError if not valid. -NormaliseValueFn = Callable[ - [Any], - # It returns the normalised value, but can optionally also return extra property values extracted from it. - Union[PrimitiveType, Tuple[PrimitiveType, ExtraProperties]], -] - -# Extras typically on the ARD product. -_GQA_FMASK_PROPS = { - "fmask:clear": float, - "fmask:cloud": float, - "fmask:cloud_shadow": float, - "fmask:snow": float, - "fmask:water": float, - "s2cloudless:clear": float, - "s2cloudless:cloud": float, - "gqa:abs_iterative_mean_x": float, - "gqa:abs_iterative_mean_xy": float, - "gqa:abs_iterative_mean_y": float, - "gqa:abs_x": float, - "gqa:abs_xy": float, - "gqa:abs_y": float, - "gqa:cep90": float, - "gqa:error_message": None, - "gqa:final_gcp_count": int, - "gqa:iterative_mean_x": float, - "gqa:iterative_mean_xy": float, - "gqa:iterative_mean_y": float, - "gqa:iterative_stddev_x": float, - "gqa:iterative_stddev_xy": float, - "gqa:iterative_stddev_y": float, - "gqa:mean_x": float, - "gqa:mean_xy": float, - "gqa:mean_y": float, - "gqa:ref_source": None, - "gqa:stddev_x": float, - "gqa:stddev_xy": float, - "gqa:stddev_y": float, -} - -# Typically only from LPGS (ie. 
Level 1 products) -_LANDSAT_EXTENDED_PROPS = { - "landsat:algorithm_source_surface_reflectance": None, - "landsat:collection_category": None, - "landsat:collection_number": int, - "landsat:data_type": None, - "landsat:earth_sun_distance": None, - "landsat:ephemeris_type": None, - "landsat:geometric_rmse_model": None, - "landsat:geometric_rmse_model_x": None, - "landsat:geometric_rmse_model_y": None, - "landsat:geometric_rmse_verify": None, - "landsat:ground_control_points_model": None, - "landsat:ground_control_points_verify": None, - "landsat:ground_control_points_version": None, - "landsat:image_quality_oli": None, - "landsat:image_quality_tirs": None, - "landsat:processing_software_version": None, - "landsat:scan_gap_interpolation": float, - "landsat:station_id": None, - # Landsat USGS Properties - "landsat:rmse": None, - "landsat:rmse_x": None, - "landsat:rmse_y": None, - "landsat:wrs_type": None, - "landsat:correction": None, - "landsat:cloud_cover_land": None, -} - -_SENTINEL_EXTENDED_PROPS = { - "sentinel:sentinel_tile_id": parsed_sentinel_tile_id, - "sentinel:datatake_start_datetime": datetime_type, - "sentinel:datastrip_id": parsed_sentinel_datastrip_id, - "sentinel:datatake_type": None, - "sentinel:processing_baseline": None, - "sentinel:processing_center": None, - "sentinel:product_name": None, - "sentinel:reception_station": None, - "sentinel:utm_zone": int, - "sentinel:latitude_band": None, - "sentinel:grid_square": None, - "sinergise_product_id": None, -} - - -_STAC_MISC_PROPS = { - "providers": None, # https://github.com/radiantearth/stac-spec/blob/master/item-spec/common-metadata.md#provider, - # Projection extension - "proj:epsg": int, - "proj:shape": None, - "proj:transform": None, -} - - -class Eo3Dict(collections.abc.MutableMapping): - """ - This acts like a dictionary, but will normalise known properties (consistent - case, types etc) and warn about common mistakes. - - It wraps an inner dictionary. By default it will normalise the fields in - the input dictionary on creation, but you can disable this with `normalise_input=False`. - """ - - # Every property we've seen or dealt with so far. Feel free to expand with abandon... - # This is to minimise minor typos, case differences, etc, which plagued previous systems. - # Keep sorted. - KNOWN_PROPERTIES: Mapping[str, Optional[NormaliseValueFn]] = { - "datetime": datetime_type, - "dea:dataset_maturity": of_enum_type(("final", "interim", "nrt"), lower=True), - "dea:product_maturity": of_enum_type(("stable", "provisional"), lower=True), - "dtr:end_datetime": datetime_type, - "dtr:start_datetime": datetime_type, - "eo:azimuth": float, - "eo:cloud_cover": percent_type, - "eo:epsg": None, - "eo:gsd": None, - "eo:instrument": None, - "eo:off_nadir": float, - "eo:platform": normalise_platforms, - "eo:constellation": None, - "eo:sun_azimuth": degrees_type, - "eo:sun_elevation": degrees_type, - "sat:orbit_state": None, - "sat:relative_orbit": int, - "sat:absolute_orbit": int, - "landsat:landsat_product_id": None, - "landsat:scene_id": None, - "landsat:landsat_scene_id": None, - "landsat:wrs_path": int, - "landsat:wrs_row": int, - "odc:dataset_version": None, - "odc:collection_number": int, - "odc:naming_conventions": None, - # Not strict as there may be more added in ODC... 
- "odc:file_format": of_enum_type(FileFormat, strict=False), - "odc:processing_datetime": datetime_type, - "odc:producer": producer_check, - "odc:product": None, - "odc:product_family": identifier_type, - "odc:region_code": None, - "odc:sat_row": None, # When a dataset has a range of rows (... telemetry) - **_LANDSAT_EXTENDED_PROPS, - **_GQA_FMASK_PROPS, - **_SENTINEL_EXTENDED_PROPS, - **_STAC_MISC_PROPS, - } - - # For backwards compatibility, in case users are extending at runtime. - KNOWN_STAC_PROPERTIES = KNOWN_PROPERTIES - - def __init__(self, properties: dict[str, Any] | None = None, normalise_input: bool = True) -> None: - if properties is None: - properties = {} - self._props = properties - # We normalise the properties they gave us. - for key in list(self._props): - # We always want to normalise dates as datetime objects rather than strings - # for consistency. - if normalise_input or ("datetime" in key): - self.normalise_and_set(key, self._props[key], expect_override=True) - self._finished_init_ = True - - def __setattr__(self, name: str, value: Any) -> None: - """ - Prevent against users accidentally setting new properties (it has happened multiple times). - """ - if hasattr(self, "_finished_init_") and not hasattr(self, name): - raise TypeError( - f"Cannot set new field '{name}' on a dict. " - f"(Perhaps you meant to set it as a dictionary field??)" - ) - super().__setattr__(name, value) - - def __getitem__(self, item): - return self._props[item] - - def __iter__(self): - return iter(self._props) - - def __len__(self): - return len(self._props) - - def __delitem__(self, name: str) -> None: - del self._props[name] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._props!r})" - - def __setitem__(self, key, value): - self.normalise_and_set( - key, - value, - # They can override properties but will receive a warning. - allow_override=True, - ) - - def normalise_and_set(self, key, value, allow_override=True, expect_override=False): - """ - Set a property with the usual normalisation. - - This has some options that are not available on normal dictionary item - setting (``self[key] = val``) - - The default behaviour of this class is very conservative in order to catch common errors - of users. You can loosen the settings here. - - :argument allow_override: Is it okay to overwrite an existing value? (if not, error will be thrown) - :argument expect_override: We expect to overwrite a property, so don't produce a warning or error. - """ - if key not in self.KNOWN_PROPERTIES: - warnings.warn( - f"Unknown Stac property {key!r}. " - f"If this is valid property, please tell us on Github here so we can add it: " - f"\n\t{_github_suggest_new_property_url(key, value)}" - ) - - if value is not None: - normalise = self.KNOWN_PROPERTIES.get(key) - if normalise: - value = normalise(value) - # If the normaliser has extracted extra properties, we'll get two return values. 
- if isinstance(value, Tuple): - value, extra_properties = value - for k, v in extra_properties.items(): - if k == key: - raise RuntimeError( - f"Infinite loop: writing key {k!r} from itself" - ) - self.normalise_and_set(k, v, allow_override=allow_override) - - if key in self._props and value != self[key] and (not expect_override): - message = ( - f"Overriding property {key!r} " f"(from {self[key]!r} to {value!r})" - ) - if allow_override: - warnings.warn(message, category=PropertyOverrideWarning) - else: - raise KeyError(message) - - self._props[key] = value - - def nested(self): - return nest_properties(self._props) - - -class StacPropertyView(Eo3Dict): - """ - Backwards compatibility class name. Deprecated. - - Use the identical 'Eo3Dict' instead. - - These were called "StacProperties" in Stac 0.6, but many of them have - changed in newer versions and we're sticking to the old names for consistency - and backwards-compatibility. So they're now EO3 Properties. - - (The eo3-to-stac tool to convert EO3 properties to real Stac properties.) - """ - - def __init__(self, properties=None) -> None: - super().__init__(properties) - warnings.warn( - "The class name 'StacPropertyView' is deprecated as it's misleading. " - "Please change your import to the (identical) 'Eo3Dict'.", - category=DeprecationWarning, - ) - - -class PropertyOverrideWarning(UserWarning): - """A warning that a property was set twice with different values.""" - - ... - - -class Eo3Interface: - """ - These are convenience properties for common metadata fields. They are available - on DatasetAssemblers and within other naming APIs. - - (This is abstract. If you want one of these of your own, you probably want to create - an :class:`eodatasets3.DatasetDoc`) - - """ - - @property - @abstractmethod - def properties(self) -> Eo3Dict: - raise NotImplementedError - - @property - def platform(self) -> Optional[str]: - """ - Unique name of the specific platform the instrument is attached to. - - For satellites this would be the name of the satellite (e.g., ``landsat-8``, ``sentinel-2a``), - whereas for drones this would be a unique name for the drone. - - In derivative products, multiple platforms can be specified with a comma: ``landsat-5,landsat-7``. - - Shorthand for ``eo:platform`` property - """ - return self.properties.get("eo:platform") - - @platform.setter - def platform(self, value: str): - self.properties["eo:platform"] = value - - @property - def platforms(self) -> Set[str]: - """ - Get platform as a set (containing zero or more items). - - In EO3, multiple platforms are specified by comma-separating them. - """ - if not self.platform: - return set() - return set(self.properties.get("eo:platform", "").split(",")) - - @platforms.setter - def platforms(self, value: Set[str]): - # The normaliser supports sets/lists - self.properties["eo:platform"] = value - - @property - def instrument(self) -> str | None: - """ - Name of instrument or sensor used (e.g., MODIS, ASTER, OLI, Canon F-1). - - Shorthand for ``eo:instrument`` property - """ - return self.properties.get("eo:instrument") - - @instrument.setter - def instrument(self, value: str): - self.properties["eo:instrument"] = value - - @property - def constellation(self) -> str | None: - """ - Constellation. Eg ``sentinel-2``. 
- """ - return self.properties.get("eo:constellation") - - @constellation.setter - def constellation(self, value: str): - self.properties["eo:constellation"] = value - - @property - def product_name(self) -> str | None: - """ - The ODC product name - """ - return self.properties.get("odc:product") - - @product_name.setter - def product_name(self, value: str): - self.properties["odc:product"] = value - - @property - def producer(self) -> str | None: - """ - Organisation that produced the data. - - eg. ``usgs.gov`` or ``ga.gov.au`` - - Shorthand for ``odc:producer`` property - """ - return self.properties.get("odc:producer") - - @producer.setter - def producer(self, domain: str): - self.properties["odc:producer"] = domain - - @property - def datetime_range(self) -> tuple[datetime, datetime] | None: - """ - An optional date range for the dataset. - - The ``datetime`` is still mandatory when this is set. - - This field is a shorthand for reading/setting the datetime-range - stac 0.6 extension properties: ``dtr:start_datetime`` and ``dtr:end_datetime`` - """ - sdt = self.properties.get("dtr:start_datetime") - edt = self.properties.get("dtr:end_datetime") - if sdt and edt: - return (sdt, edt) - elif sdt: - return (sdt, sdt) - elif edt: - return (edt, edt) - else: - return None - - @datetime_range.setter - def datetime_range(self, val: Tuple[datetime, datetime]): - # TODO: string type conversion, better validation/errors - start, end = val - self.properties["dtr:start_datetime"] = start - self.properties["dtr:end_datetime"] = end - - @property - def processed(self) -> datetime | None: - """When the dataset was created (Defaults to UTC if not specified) - - Shorthand for the ``odc:processing_datetime`` field - """ - return self.properties.get("odc:processing_datetime") - - @processed.setter - def processed(self, value: Union[str, datetime]): - self.properties["odc:processing_datetime"] = value - - def processed_now(self): - """ - Shorthand for when the dataset was processed right now on the current system. - """ - self.properties["odc:processing_datetime"] = datetime.utcnow() - - @property - def dataset_version(self) -> str | None: - """ - The version of the dataset. - - Typically digits separated by a dot. Eg. `1.0.0` - - The first digit is usually the collection number for - this 'producer' organisation, such as USGS Collection 1 or - GA Collection 3. - """ - return self.properties.get("odc:dataset_version") - - @dataset_version.setter - def dataset_version(self, value): - self.properties["odc:dataset_version"] = value - - @property - def collection_number(self) -> int | None: - """ - The version of the collection. - - Eg.:: - - metadata: - product_family: wofs - dataset_version: 1.6.0 - collection_number: 3 - - """ - return self.properties.get("odc:collection_number") - - @collection_number.setter - def collection_number(self, value): - self.properties["odc:collection_number"] = value - - @property - def naming_conventions(self) -> str | None: - return self.properties.get("odc:naming_conventions") - - @naming_conventions.setter - def naming_conventions(self, value): - self.properties["odc:naming_conventions"] = value - - @property - def product_family(self) -> str | None: - """ - The identifier for this "family" of products, such as ``ard``, ``level1`` or ``fc``. - It's used for grouping similar products together. - - They products in a family are usually produced the same way but have small variations: - they come from different sensors, or are written in different projections, etc. 
- - ``ard`` family of products: ``ls7_ard``, ``ls5_ard`` .... - - On older versions of Open Data Cube this was called ``product_type``. - - Shorthand for ``odc:product_family`` property. - """ - return self.properties.get("odc:product_family") - - @product_family.setter - def product_family(self, value): - self.properties["odc:product_family"] = value - - @product_family.deleter - def product_family(self): - del self.properties["odc:product_family"] - - @property - def region_code(self) -> str | None: - """ - The "region" of acquisition. This is a platform-agnostic representation of things like - the Landsat Path+Row. Datasets with the same Region Code will *roughly* (but usually - not *exactly*) cover the same spatial footprint. - - It's generally treated as an opaque string to group datasets and process as stacks. - - For Landsat products it's the concatenated ``{path}{row}`` (both numbers formatted to three digits). - - For Sentinel 2, it's the MGRS grid (TODO presumably?). - - Shorthand for ``odc:region_code`` property. - """ - return self.properties.get("odc:region_code") - - @region_code.setter - def region_code(self, value: str): - self.properties["odc:region_code"] = value - - @property - def maturity(self) -> str | None: - """ - The dataset maturity. The same data may be processed multiple times -- becoming more - mature -- as new ancillary data becomes available. - - Typical values (from least to most mature): ``nrt`` (near real time), ``interim``, ``final`` - """ - return self.properties.get("dea:dataset_maturity") - - @maturity.setter - def maturity(self, value): - self.properties["dea:dataset_maturity"] = value - - @property - def product_maturity(self) -> str | None: - """ - Classification: is this a 'provisional' or 'stable' release of the product? - """ - return self.properties.get("dea:product_maturity") - - @product_maturity.setter - def product_maturity(self, value): - self.properties["dea:product_maturity"] = value - - # Note that giving a method the name 'datetime' will override the 'datetime' type - # for class-level declarations (ie, for any types on functions!) - # So we make an alias: - from datetime import datetime as datetime_ - - @property - def datetime(self) -> datetime_ | None: - """ - The searchable date and time of the assets. (Default to UTC if not specified) - """ - return self.properties.get("datetime") - - @datetime.setter - def datetime(self, val: datetime_): - self.properties["datetime"] = val - - -def _github_suggest_new_property_url(key: str, value: object) -> str: - """Get a URL to create a Github issue suggesting new properties to be added.""" - issue_parameters = urlencode( - dict( - title=f"Include property {key!r}", - labels="known-properties", - body=dedent( - f"""\ - Hello! The property {key!r} does not appear to be in the KNOWN_PROPERTIES list, - but I believe it to be valid. - - An example value of this property is: {value!r} - - Thank you! - """ - ), - ) - ) - return f"https://github.com/GeoscienceAustralia/eo-datasets/issues/new?{issue_parameters}" diff --git a/docs/about/whats_new.rst b/docs/about/whats_new.rst index d7181bf9c..b29ad8b27 100644 --- a/docs/about/whats_new.rst +++ b/docs/about/whats_new.rst @@ -8,6 +8,8 @@ What's New v1.9.next ========= +- Remove multiple location support from postgis driver. 
(:pull:`1658`) +- Remove unnecessary logic copied from eodatasets3 (:pull:`1658`) * Documentation fixes (:pull:`1659`) * Don't use importlib_metadata (:pull:`1657`) * Pin upstream libraries to get CI tests running with numpy2 (:pull:`1661`) diff --git a/integration_tests/index/test_location.py b/integration_tests/index/test_location.py index 18667d519..e51acd293 100644 --- a/integration_tests/index/test_location.py +++ b/integration_tests/index/test_location.py @@ -3,9 +3,11 @@ # Copyright (c) 2015-2024 ODC Contributors # SPDX-License-Identifier: Apache-2.0 +import pytest from datacube.model import Dataset +@pytest.mark.parametrize('datacube_env_name', ('datacube',)) def test_legacy_location_behaviour(index, ls8_eo3_dataset): locations = index.datasets.get_locations(ls8_eo3_dataset.id) # Test of deprecated method assert locations == [ls8_eo3_dataset.uri] @@ -39,6 +41,37 @@ def test_legacy_location_behaviour(index, ls8_eo3_dataset): assert index.datasets.get_location(ls8_eo3_dataset.id) is None +@pytest.mark.parametrize('datacube_env_name', ('experimental',)) +def test_postgis_no_multiple_locations(index, ls8_eo3_dataset): + locations = index.datasets.get_locations(ls8_eo3_dataset.id) # Test of deprecated method + assert locations == [ls8_eo3_dataset.uri] + + update = Dataset( + ls8_eo3_dataset.product, + ls8_eo3_dataset.metadata_doc, + uris=locations + ["file:/tmp/foo"]) + with pytest.raises(ValueError): + index.datasets.update(update) + assert index.datasets.get_location(ls8_eo3_dataset.id) == ls8_eo3_dataset.uri + + index.datasets.remove_location(ls8_eo3_dataset.id, "file:/tmp/foo") + assert index.datasets.get_location(ls8_eo3_dataset.id) == ls8_eo3_dataset.uri + + index.datasets.remove_location(ls8_eo3_dataset.id, ls8_eo3_dataset.uri) + ls8_eo3_dataset = index.datasets.get(ls8_eo3_dataset.id) + assert ls8_eo3_dataset.uri is None + assert index.datasets.get_location(ls8_eo3_dataset.id) is None + + index.datasets.add_location(ls8_eo3_dataset.id, "file:/tmp/foo") + location = index.datasets.get_location(ls8_eo3_dataset.id) + assert location == "file:/tmp/foo" + + with pytest.raises(ValueError): + index.datasets.add_location(ls8_eo3_dataset.id, "s3:/bucket/hole/straw.axe") + + assert index.datasets.get_archived_locations(ls8_eo3_dataset.id) == [] + + def test_dataset_tuple_uris(ls8_eo3_product): from datacube.index.abstract import DatasetTuple dst1 = DatasetTuple(ls8_eo3_product, {"dummy": True}, "file:///uri1") diff --git a/integration_tests/index/test_memory_index.py b/integration_tests/index/test_memory_index.py index 9d6d973ce..74d2db619 100644 --- a/integration_tests/index/test_memory_index.py +++ b/integration_tests/index/test_memory_index.py @@ -707,6 +707,7 @@ def test_default_clone_bulk_ops(mem_index_fresh: Datacube, index, extended_eo3_m assert mem_index_fresh.index.datasets.has(ls8_eo3_dataset4.id) +@pytest.mark.parametrize('datacube_env_name', ('datacube', )) def test_default_clone_bulk_ops_multiloc( mem_index_fresh: Datacube, index, extended_eo3_metadata_type, ls8_eo3_product, wo_eo3_product, africa_s2_eo3_product, diff --git a/integration_tests/index/test_search_eo3.py b/integration_tests/index/test_search_eo3.py index cccb8bfcd..0f66c0896 100644 --- a/integration_tests/index/test_search_eo3.py +++ b/integration_tests/index/test_search_eo3.py @@ -510,7 +510,7 @@ def test_search_returning_rows_eo3(index, ls8_eo3_dataset, ls8_eo3_dataset2): dataset = ls8_eo3_dataset uri = eo3_ls8_dataset_doc[1] - uri3 = eo3_ls8_dataset2_doc[1] + uri2 = eo3_ls8_dataset2_doc[1] results = 
list(index.datasets.search_returning( ('id', 'uri'), platform='landsat-8', @@ -545,11 +545,48 @@ def test_search_returning_rows_eo3(index, assert len(results) == 1 assert 1.31 < results[0].cloud_shadow < 1.32 - index.datasets.archive_location(dataset.id, uri) # Test of deprecated method - index.datasets.remove_location(dataset.id, uri) # Test of deprecated method + # A second dataset already has a location: + results = set(index.datasets.search_returning( + ('id', 'uri'), + platform='landsat-8', + dataset_maturity='final', + )) + assert len(results) == 2 + assert results == { + (dataset.id, uri), + (ls8_eo3_dataset2.id, uri2), + } + + +@pytest.mark.parametrize('datacube_env_name', ('experimental', )) +def test_search_returning_uri(index, eo3_ls8_dataset_doc, + ls8_eo3_dataset): + dataset = ls8_eo3_dataset + uri = eo3_ls8_dataset_doc[1] + + # If returning a field like uri, there will be one result per dataset. + index.datasets.remove_location(dataset.id, uri) # deprecated method + results = list(index.datasets.search_returning( + ('id', 'uri'), + platform='landsat-8', + instrument='OLI_TIRS', + )) + assert len(results) == 1 + + +@pytest.mark.parametrize('datacube_env_name', ('datacube', )) +def test_search_returning_uris_legacy(index, + eo3_ls8_dataset_doc, + eo3_ls8_dataset2_doc, + ls8_eo3_dataset, ls8_eo3_dataset2): + dataset = ls8_eo3_dataset + uri = eo3_ls8_dataset_doc[1] + uri3 = eo3_ls8_dataset2_doc[1] # If returning a field like uri, there will be one result per location. # No locations + index.datasets.archive_location(dataset.id, uri) + index.datasets.remove_location(dataset.id, uri) results = list(index.datasets.search_returning( ('id', 'uri'), platform='landsat-8', @@ -558,9 +595,9 @@ def test_search_returning_rows_eo3(index, assert len(results) == 0 # Add a second location and we should get two results - index.datasets.add_location(dataset.id, uri) # Test of deprecated method + index.datasets.add_location(dataset.id, uri) uri2 = 'file:///tmp/test2' - index.datasets.add_location(dataset.id, uri2) # Test of deprecated method + index.datasets.add_location(dataset.id, uri2) results = set(index.datasets.search_returning( ('id', 'uri'), platform='landsat-8', @@ -855,8 +892,6 @@ def test_cli_info_eo3(index: Index, """ Search datasets using the cli. """ - index.datasets.add_location(ls8_eo3_dataset.id, 'file:///tmp/location1') # Test deprecated method - opts = [ 'dataset', 'info', str(ls8_eo3_dataset.id) ] @@ -877,7 +912,7 @@ def test_cli_info_eo3(index: Index, "id: " + str(ls8_eo3_dataset.id), 'product: ga_ls8c_ard_3', 'status: active', - 'location: file:///tmp/location1', + 'location: ' + str(ls8_eo3_dataset.uri), 'fields:', ' creation_time: 2019-10-07 20:19:19.218290', ' format: GeoTIFF', diff --git a/wordlist.txt b/wordlist.txt index 2590deff4..ba17c92bb 100644 --- a/wordlist.txt +++ b/wordlist.txt @@ -4,6 +4,7 @@ AbstractProductResource AbstractReaderDriver AbstractWriterDriver acca +accessor Affine affine africa @@ -39,6 +40,7 @@ autoselectionlabel autosummary auxuser aws +backend backends backported backquote @@ -123,6 +125,7 @@ DatasetTypeResource DataSource datasource datetime +dbs dea deafrica dem @@ -165,7 +168,7 @@ EPSG epsg ESPA ESRI -evironments +executor f'file fd feedstock @@ -204,6 +207,7 @@ GeoTiff geotiff GeoTIFFs GeoTiffs +gg gh GHA GIS @@ -220,6 +224,7 @@ gridWorkflow GroupBy HDF hdf +hhBQVas hl hoc hostname @@ -281,6 +286,7 @@ libyaml linux literalinclude localhost +logoColor lon lonlat lr