Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSW harvester OutputSchema config support #258 #259

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
22 changes: 17 additions & 5 deletions ckanext/spatial/harvesters/csw.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,26 @@ def fetch_stage(self,harvest_object):
self._save_object_error('Error contacting the CSW server: %s' % e,
harvest_object)
return False


# load config
self._set_source_config(harvest_object.source.config)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document the new output_schema option and its default value in here so others are aware of it?

https://github.com/ckan/ckanext-spatial/blob/master/doc/harvesters.rst

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a fallback to the default output schema in case the server does not support the iso19139 -> 19115 transformation.
The fallback logs the failure and switches back to the default, asking for iso19139 -> iso19139.

# get output_schema from config
output_schema = self.source_config.get('output_schema', self.output_schema())
identifier = harvest_object.guid
try:
record = self.csw.getrecordbyid([identifier], outputschema=self.output_schema())
record = self.csw.getrecordbyid([identifier], outputschema=output_schema)
except Exception as e:
self._save_object_error('Error getting the CSW record with GUID %s' % identifier, harvest_object)
return False

try:
log.warn('Unable to fetch GUID {} with output schema: {}'.format(identifier, output_schema))
if output_schema == self.output_schema():
raise e
log.info('Fetching GUID {} with output schema: {}'.format(identifier, self.output_schema()))
# retry with default output schema
record = self.csw.getrecordbyid([identifier], outputschema=self.output_schema())
except Exception as e:
self._save_object_error('Error getting the CSW record with GUID {}'.format(identifier), harvest_object)
return False

if record is None:
self._save_object_error('Empty record for GUID %s' % identifier,
harvest_object)
Expand Down
63 changes: 51 additions & 12 deletions ckanext/spatial/lib/csw_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,40 @@ class CswService(OwsService):
def __init__(self, endpoint=None):
    """Initialise the service and record the server's output schemas.

    After the base-class setup, the capabilities document is requested for
    *endpoint*, parsed, and kept on ``self.capabilities``. The output
    schemas listed for the ``GetRecords`` and ``GetRecordById`` operations
    are then stored on ``self.output_schemas``, keyed by operation name.

    :param endpoint: CSW endpoint handed to the base class and to
        ``getcapabilities``; defaults to ``None``.
    """
    super(CswService, self).__init__(endpoint)
    self.sortby = SortBy([SortProperty('dc:identifier')])

    # Parse the capabilities response once so that the per-operation
    # lookups below all work on the same tree.
    capabilities_xml = self.getcapabilities(endpoint)['response']
    capabilities_root = etree.fromstring(capabilities_xml)
    self.capabilities = etree.ElementTree(capabilities_root)

    self.output_schemas = {
        operation: self._get_output_schemas(operation)
        for operation in ('GetRecords', 'GetRecordById')
    }

def _get_output_schemas(self, operation):
    """Return the output schemas advertised for a CSW operation.

    Finds the ``Operation`` element named *operation* in the parsed
    capabilities document (``self.capabilities``) and reads the ``Value``
    children of its ``outputSchema`` parameter.

    :param operation: operation name as it appears in the capabilities
        document, e.g. ``'GetRecords'`` or ``'GetRecordById'``.
    :returns: dict mapping namespace prefix to namespace URI, one entry
        for each item of the parameter element's ``nsmap`` whose URI is
        listed as an allowed ``outputSchema`` value. Empty when the
        operation, or its ``outputSchema`` parameter, is not advertised.
    :raises CswError: if the capabilities root element declares no
        ``ows`` namespace prefix.
    """
    root_nsmap = self.capabilities.getroot().nsmap
    ows_ns = root_nsmap.get('ows')
    if not ows_ns:
        raise CswError('Bad getcapabilities response: OWS namespace not found ' + str(root_nsmap))

    operation_el = self.capabilities.find(
        "//{{{}}}Operation[@name='{}']".format(ows_ns, operation))
    if operation_el is None:
        # Without this guard the lookup below fails with an opaque
        # AttributeError on None.
        log.warning('Operation %s not found in CSW capabilities', operation)
        return {}

    schema_param = operation_el.find(
        "{{{}}}Parameter[@name='outputSchema']".format(ows_ns))
    if schema_param is None:
        log.warning('No outputSchema parameter advertised for operation %s',
                    operation)
        return {}

    # Materialise the allowed values as a set: on Python 3 ``map`` returns
    # a one-shot iterator, so repeated ``in`` tests against it would
    # exhaust it after the first check and drop the remaining schemas.
    allowed_uris = {
        value_el.text
        for value_el in schema_param.findall("{{{}}}Value".format(ows_ns))
    }
    return {
        prefix: uri
        for prefix, uri in schema_param.nsmap.items()
        if uri in allowed_uris
    }

def getrecords(self, qtype=None, keywords=[],
typenames="csw:Record", esn="brief",
skip=0, count=10, outputschema="gmd", **kw):
from owslib.csw import namespaces

constraints = []
csw = self._ows(**kw)

# check target csw server capabilities for requested output schema
output_schemas = self.output_schemas['GetRecords']
if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))

if qtype is not None:
constraints.append(PropertyIsEqualTo("dc:type", qtype))

Expand All @@ -87,7 +113,7 @@ def getrecords(self, qtype=None, keywords=[],
"esn": esn,
"startposition": skip,
"maxrecords": count,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
"sortby": self.sortby
}
log.info('Making CSW request: getrecords2 %r', kwa)
Expand All @@ -102,10 +128,15 @@ def getrecords(self, qtype=None, keywords=[],
def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
keywords=[], limit=None, page=10, outputschema="gmd",
startposition=0, cql=None, **kw):
from owslib.csw import namespaces

constraints = []
csw = self._ows(**kw)

# check target csw server capabilities for requested output schema
output_schemas = self.output_schemas['GetRecords']
if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))

if qtype is not None:
constraints.append(PropertyIsEqualTo("dc:type", qtype))

Expand All @@ -115,7 +146,7 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
"esn": esn,
"startposition": startposition,
"maxrecords": page,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
"cql": cql,
"sortby": self.sortby
}
Expand All @@ -129,7 +160,6 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
err = 'Error getting identifiers: %r' % \
csw.exceptionreport.exceptions
#log.error(err)
raise CswError(err)

if matches == 0:
matches = csw.results['matches']
Expand All @@ -154,11 +184,17 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
kwa["startposition"] = startposition

def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw):
from owslib.csw import namespaces

csw = self._ows(**kw)

# fetch target csw server capabilities for requested output schema
output_schemas=output_schemas = self.output_schemas['GetRecordById']
if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should probably be more tolerant here: log an ERROR and return.


kwa = {
"esn": esn,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
}
# Ordinary Python version's don't support the metadata argument
log.info('Making CSW request: getrecordbyid %r %r', ids, kwa)
Expand All @@ -168,14 +204,17 @@ def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw):
csw.exceptionreport.exceptions
#log.error(err)
raise CswError(err)
if not csw.records:
elif csw.records:
record = self._xmd(list(csw.records.values())[0])
elif csw.response:
record = self._xmd(etree.fromstring(csw.response))
else:
return
record = self._xmd(list(csw.records.values())[0])

## strip off the enclosing results container, we only want the metadata
#md = csw._exml.find("/gmd:MD_Metadata")#, namespaces=namespaces)
# Ordinary Python version's don't support the metadata argument
md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata")
# '/{schema}*' expression should be safe enough and is able to match the
# desired schema followed by both MD_Metadata or MI_Metadata (iso19115[-2])
md = csw._exml.find("/{{{schema}}}*".format(schema=output_schemas[outputschema]))
mdtree = etree.ElementTree(md)
try:
record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=str)
Expand Down
2 changes: 2 additions & 0 deletions doc/harvesters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ The currently supported configuration options are:
and spaces replaced with dashes. Setting this option to False gives the same effect as leaving it unset.
* ``validator_profiles``: A list of strings that specifies the validators that will be applied to the
current harvester, overriding the global ones defined by the 'ckan.spatial.validator.profiles' option.
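* ``output_schema``: the namespace prefix (for example ``gmd``) of the schema to request as outputSchema_ when fetching records from the CSW server. The prefix must be one of the output schemas advertised in the server's capabilities; if a record cannot be fetched with it, the harvester retries with its default output schema.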

.. _outputSchema: https://docs.opengeospatial.org/is/12-176r7/12-176r7.html#72

Customizing the harvesters
--------------------------
Expand Down