-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add generic fetcher with support for ckan dataset
- Loading branch information
Showing
5 changed files
with
167 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
80 changes: 80 additions & 0 deletions
80
datacity_ckan_dgp/generic_fetchers/ckan_dataset_fetcher.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import os | ||
|
||
import requests | ||
|
||
from .. import ckan | ||
from ..utils import http_stream_download | ||
|
||
|
||
def fetch(source_url, target_instance_name, target_package_id, target_organization_id, tmpdir):
    """Copy the resources of a source CKAN dataset into a target CKAN package.

    The source dataset is located from ``source_url``: the part before
    ``/dataset/`` is used as the source instance base url and the path
    segment right after it as the source package id. Every source resource
    that has both an id and a url is downloaded into ``tmpdir``. A resource
    is uploaded to the target when the target has no resource with the same
    ``<name>.<format>`` key and the same hash. If anything needs uploading
    and ``ckan.package_show`` returned nothing for the target package, the
    package is created first.

    Args:
        source_url: url of the source dataset page, must contain ``/dataset/``.
        target_instance_name: instance name passed through to the ``ckan`` helpers.
        target_package_id: id (name) of the package to create / update.
        target_organization_id: ``owner_org`` used when the package is created.
        tmpdir: existing directory used for the downloaded files.

    Raises:
        AssertionError: when the source ``package_show`` call or any target
            create / update call does not report success.
    """
    res = ckan.package_show(target_instance_name, target_package_id)
    target_package_exists = bool(res)
    # Maps '<name>.<format>' to the hash and id of the matching target resource.
    # Target resources missing any of these fields cannot be compared, so they are skipped.
    existing_target_resources = {}
    if res:
        for resource in res['resources']:
            format_ = resource.get('format') or ''
            name = resource.get('name') or ''
            hash_ = resource.get('hash') or ''
            id_ = resource.get('id') or ''
            if format_ and name and hash_ and id_:
                existing_target_resources[f'{name}.{format_}'] = {'hash': hash_, 'id': id_}
    source_instance_baseurl, _, source_path = source_url.partition('/dataset/')
    if not _:
        # Same failure mode as the original split(...)[1] on a url without '/dataset/'.
        raise IndexError('list index out of range')
    source_package_id = source_path.split('/')[0]
    if 'data.gov.il' in source_instance_baseurl:
        headers = {'user-agent': 'datagov-external-client'}
    else:
        headers = None
    res = requests.get(f'{source_instance_baseurl}/api/3/action/package_show?id={source_package_id}', headers=headers).json()
    assert res['success']
    package_title = res['result']['title']
    resources_to_update = []
    for resource in res['result']['resources']:
        id_ = resource.get('id') or ''
        url = resource.get('url') or ''
        if url and id_:
            if 'e.data.gov.il' in url:
                url = url.replace('e.data.gov.il', 'data.gov.il')
            # A url ending with '/' has no last path segment; fall back to the
            # resource id so the upload still has a usable file name.
            filename = url.split('/')[-1] or id_
            # The download is stored under the resource id; the returned value
            # is what gets compared with / stored as the target resource hash.
            source_hash = http_stream_download(f'{tmpdir}/{id_}', {'url': url, 'headers': headers})
            source_format = resource.get('format') or ''
            source_name = resource.get('name') or ''
            description = resource.get('description') or ''
            if existing_target_resources.get(f'{source_name}.{source_format}', {}).get('hash') != source_hash:
                resources_to_update.append((id_, source_name, source_format, source_hash, description, filename))
    if resources_to_update:
        print(f'updating {len(resources_to_update)} resources')
        if not target_package_exists:
            print('creating target package')
            res = ckan.package_create(target_instance_name, {
                'name': target_package_id,
                'title': package_title,
                'owner_org': target_organization_id
            })
            assert res['success'], str(res)
        for id_, name, format_, hash_, description, filename in resources_to_update:
            resource_key = f'{name}.{format_}'
            print(resource_key)
            download_path = f'{tmpdir}/{id_}'
            upload_path = f'{tmpdir}/{filename}'
            # Move the download to its final file name. When both paths are the
            # same there is nothing to move, and unlinking would delete the download.
            if upload_path != download_path:
                if os.path.exists(upload_path):
                    os.unlink(upload_path)
                os.rename(download_path, upload_path)
            # The context manager closes the file handle once the upload call returns.
            with open(upload_path, 'rb') as upload_file:
                if resource_key in existing_target_resources:
                    print('existing resource found, but hash is different, updating resource data')
                    res = ckan.resource_update(target_instance_name, {
                        'id': existing_target_resources[resource_key]['id'],
                        'hash': hash_,
                        'description': description
                    }, files=[('upload', upload_file)])
                else:
                    print('no existing resource found, creating new resource')
                    res = ckan.resource_create(target_instance_name, {
                        'package_id': target_package_id,
                        'format': format_,
                        'name': name,
                        'hash': hash_,
                        'description': description
                    }, files=[('upload', upload_file)])
            assert res['success'], str(res)
        print('done, all resources created/updated')
    else:
        print('no resources to create/update')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import os | ||
import sys | ||
import json | ||
import tempfile | ||
import contextlib | ||
from importlib import import_module | ||
|
||
|
||
# The source url is checked against the following fetchers, in order, to determine which type of source it is.
# Each entry names a fetcher (resolved by the operator to the module
# datacity_ckan_dgp.generic_fetchers.<fetcher>_fetcher) and the rule used to match it to a source url.
FETCHERS = [
    {
        # Example invocation which is handled by this fetcher:
        # python3 -m datacity_ckan_dgp.operators.generic_fetcher '{"source_url": "https://data.gov.il/dataset/automated-devices", "target_instance_name": "LOCAL_DEVELOPMENT", "target_package_id": "automated-devices", "target_organization_id": "israel-gov", "tmpdir": ".data/ckan_fetcher_tmpdir"}'
        'fetcher': 'ckan_dataset',
        'match': {
            # matches when this text appears anywhere in the source url
            'url_contains': '/dataset/'
        }
    }
]
|
||
|
||
@contextlib.contextmanager
def tempdir(tmpdir):
    """Yield a working directory for the duration of the ``with`` block.

    When ``tmpdir`` is falsy a fresh ``tempfile.TemporaryDirectory`` is
    created and removed again on exit. Otherwise ``tmpdir`` itself is
    created if needed, yielded, and left in place afterwards.
    """
    if not tmpdir:
        with tempfile.TemporaryDirectory() as scratch_dir:
            yield scratch_dir
        return
    os.makedirs(tmpdir, exist_ok=True)
    yield tmpdir
|
||
|
||
def operator(name, params):
    """Run the first fetcher from FETCHERS whose match rule applies to the source url.

    ``params`` must contain ``source_url``, ``target_instance_name``,
    ``target_package_id`` and ``target_organization_id``; ``tmpdir`` is
    optional. ``name`` is accepted but not used. The matching fetcher module
    is imported on demand and its ``fetch`` function is called with the
    params and the working directory. If no fetcher matches, nothing is
    fetched.
    """
    source_url, target_instance_name, target_package_id, target_organization_id = (
        params[key]
        for key in ('source_url', 'target_instance_name', 'target_package_id', 'target_organization_id')
    )
    with tempdir(params.get('tmpdir')) as workdir:
        print('starting generic_fetcher operator')
        print(f'source_url={source_url} target_instance_name={target_instance_name} target_package_id={target_package_id} target_organization_id={target_organization_id}')
        print(f'tmpdir={workdir}')
        for spec in FETCHERS:
            match_rule = spec['match']
            assert match_rule.keys() == {'url_contains'}, 'only url_contains match is supported at the moment'
            if match_rule['url_contains'] not in source_url:
                continue
            fetcher_module = import_module(f'datacity_ckan_dgp.generic_fetchers.{spec["fetcher"]}_fetcher')
            fetcher_module.fetch(source_url, target_instance_name, target_package_id, target_organization_id, workdir)
            # only the first matching fetcher is used
            break
|
||
|
||
# Example invocation; the single command-line argument is a JSON object holding the operator params:
# python3 -m datacity_ckan_dgp.operators.generic_fetcher '{"source_url": "https://data.gov.il/dataset/automated-devices", "target_instance_name": "LOCAL_DEVELOPMENT", "target_package_id": "automated-devices", "target_organization_id": "israel-gov", "tmpdir": ".data/ckan_fetcher_tmpdir"}'
if __name__ == '__main__':
    # operator() does not use its name argument, so a placeholder is passed
    operator('_', json.loads(sys.argv[1]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters