From 57bb4f7af3cd7684664e380b50ee6cc95a185cc6 Mon Sep 17 00:00:00 2001 From: Aditya Chhabra Date: Sun, 1 Oct 2023 13:04:47 +0530 Subject: [PATCH] v1.0.0 pre-release --- .gitignore | 196 ++- LICENSE.txt => LICENSE | 42 +- README.md | 2856 ++++------------------------------ data/README.md | 3 + datagovindia/__init__.py | 1290 --------------- datagovindia/util.py | 252 --- pyproject.toml | 58 + requirements.txt | 4 + setup.py | 47 - src/datagovindia/__init__.py | 617 ++++++++ src/datagovindia/cli.py | 332 ++++ 11 files changed, 1483 insertions(+), 4214 deletions(-) rename LICENSE.txt => LICENSE (92%) create mode 100644 data/README.md delete mode 100644 datagovindia/__init__.py delete mode 100644 datagovindia/util.py create mode 100644 pyproject.toml create mode 100644 requirements.txt delete mode 100644 setup.py create mode 100644 src/datagovindia/__init__.py create mode 100644 src/datagovindia/cli.py diff --git a/.gitignore b/.gitignore index 46e205f6..46382168 100644 --- a/.gitignore +++ b/.gitignore @@ -1,31 +1,165 @@ -*.log -*.pot -*.pyc -*.mypy_cache -__pycache__ -__pycache__/ -local_settings.py -media -*.DS_Store -.AppleDouble -.LSOverride -Icon -._* -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns -.com.apple.timemachine.donotpresent -datagovindia.egg-info -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk -build -build/ -dist -dist/ -*.egg-info - +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Mac OS +.DS_Store +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# .vscode +.vscode/ \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE similarity index 92% rename from LICENSE.txt rename to LICENSE index 40a4b4b5..f99203ef 100644 --- a/LICENSE.txt +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2021 ADITYA KARAN CHHABRA and ABHISHEK ARORA - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +MIT License + +Copyright (c) 2023 Aditya Karan Chhabra, Abhishek Arora, Arijit Basu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index fe25543c..55e8ea66 100644 --- a/README.md +++ b/README.md @@ -1,2622 +1,332 @@ -# **datagovindia** - -[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/addypy/datagovindia/blob/master/LICENSE.txt) [![Downloads](https://static.pepy.tech/personalized-badge/datagovindia?period=total&units=international_system&left_color=grey&left_text=Downloads)](https://pepy.tech/project/datagovindia) - - -### A Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/) -**`datagovindia`** is an API wrapper for `198428` (and counting) APIs available at Government of India’s *[Open Government Data OGD platform](https://data.gov.in/ogpl_apis)* - -------- - -## Features -> - **DISCOVERY** ->> *Find the right API resource.* -> - **INFORMATION** ->> *Retrieve information about an API resource.* -> - **DATA** ->> *Download data in a convenient pandas DataFrame from the chosen API.* - -## Prerequisites - -> - An account on *data.gov.in* -> - An API key from the My Account page - - (Instructions here : [Official Guide](https://data.gov.in/help/how-use-datasets-apis)) - -## Installation -> - Using PIP -```sh -pip install -U datagovindia -``` -------- -> - Clone the Git-Repository -```sh -git clone https://github.com/addypy/datagovindia - -python setup.py install - -``` - -## Basic Usage - -### Import Library -```python -from datagovindia import DataGovIndia -``` +
-### Initialize Class -```python -datagovin = DataGovIndia("579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b") -``` +# **datagovindia** -> Performs : ->> 1) Tests datagov.in API-server status. ->> 2) Validates API-Key. You only need to set this once. ->> 2) Fetches latest details about available APIs. +[![MIT license](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/addypy/datagovindia/blob/master/LICENSE) ![PyPI - Version](https://img.shields.io/pypi/v/datagovindia?color=green) [![Downloads](https://static.pepy.tech/personalized-badge/datagovindia?period=total&units=international_system&left_color=gray&left_text=Downloads)](https://pepy.tech/project/datagovindia) -### Search -```python -datagovin.search(description="Wheat",max_results=1,print_results=True) -``` -> Output: -``` -# Returns: -1 of 395 results +## Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/) -================================================================================== +**`datagovindia`** is a client library for accessing resources from the Government of India’s Open Government Data OGD platform. It provides a simple and intuitive interface to search, discover and download data from the platform. -Resource-ID: 4c88fba5e3174e06a34af33194ab4b2d - -Daily FCI Stock postion of the commodity Wheat, for the Haryana region in 2019 (till last week) +
-================================================================================== -``` +## Prerequisites -> Returns: +A data.gov.in API key is required to use this library. You can get your API key from [data.gov.in](https://data.gov.in). -```json - -[{"4c88fba5e3174e06a34af33194ab4b2d": "Daily FCI Stock postion of the commodity Wheat, for the Haryana region in 2019 (till last week)"}] +## Setup +```sh +### Install from PyPI +pip install datagovindia ``` +## Setting Up Your API Key +Saving your `API_KEY` as an environment variable named `DATAGOVINDIA_API_KEY` will allow the library to automatically detect your API key without the need to specify it in every command. -### Download Data -```python -data = datagovin.get_data("b7ea044ea17149ed886c37ed5729b75a",num_results='all') -data.head() +```bash +export DATAGOVINDIA_API_KEY=your_api_key_here ### Linux/Mac ``` -> Returns: - -|date |code |commodityid|commodityname |districtname|districtcode|stock |commoditystock|totalstock | -|--------------------|--------------------|-----------|--------------------|------------|------------|--------------|--------------|--------------| -|2019-07-20T00:00:00Z|Region Name: Haryana|01 |Wheat(Including URS)|FARIDABAD |NC12 |2214591.87343 |35769407.44149|35769407.44149| -|2019-07-20T00:00:00Z|Region Name: Haryana|01 |Wheat(Including URS)|HISSAR |NC13 |17954629.80074|35769407.44149|35769407.44149| -|2019-07-20T00:00:00Z|Region Name: Haryana|01 |Wheat(Including URS)|KARNAL |NC14 |1787375.5789 |35769407.44149|35769407.44149| -|2019-07-20T00:00:00Z|Region Name: Haryana|01 |Wheat(Including URS)|KURUKSHETRA |NC15 |3552965.00293 |35769407.44149|35769407.44149| -|2019-07-20T00:00:00Z|Region Name: Haryana|01 |Wheat(Including URS)|ROHTAK |NC16 |10259845.18549|35769407.44149|35769407.44149| - -------- -________ +or you can specify your API key in every command using the `--api-key` flag. -## Detailed Examples - --------- - -> ## A. 
**SETUP** -> ### Import *`DataGovIndia`* from *`datagovindia`* +## Sync latest resource data from OGD (`Optional`) ```python +# In a python environment from datagovindia import DataGovIndia -``` -> ### Get `API-KEY` from *[data.gov.in/user](https://data.gov.in/user)* -> -> See : [Official Guide](https://data.gov.in/help/how-use-datasets-apis) - -```python -api_key = "579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b" -``` - -> ### Initialize Class - -```python -# Initializing the library - -# 1) Tests datagov.in API-server status. -# 2) Validates API-Key. You only need to set this once. -# 2) Fetches latest details about available APIs. - - -datagovin = DataGovIndia(api_key) - -# The API key you provided is valid. You won't need to set it again. -# Latest resources loaded. You may begin. -``` - --------- - -> ## B. **DISCOVERY** - -## Check available ***`attributes`*** - -### 1. List of ***`Organization-Names`*** - -```python -datagovin.list_org_names() - -# Returns: -['Adi Dravidar and Tribal Welfare Department, Tamil Nadu', - 'Agriculture Department', - 'Agriculture Department, Meghalaya', -     ..., - 'Department of AIDS Control', - 'Department of Agricultural Research and Education (DARE)', - 'Department of Animal Husbandry, Dairying and Fisheries', - 'Department of Atomic Energy', -     ...., - 'Micro Small and Medium Enterprises Department, Tamil Nadu', - 'Ministry of Agriculture and Farmers Welfare', -    ...., -] -``` - -### 2. List of ***`Organization-Types`*** - -```python -datagovin.list_org_types() - -# Returns: -['Central', - 'City', - 'State'] - -``` -### 3. List of ***`Sectors`*** -```python -datagovin.list_sectors() - -# Returns: -['Adult Education', -'Agricultural', -'Agricultural Marketing', -'Agricultural Research & Extension', -'Agriculture', - . - ., -'Water Quality', -'Water Resources', -'Water and Sanitation', -'Water ways'] -``` - -### 4. 
List of ***`Sources`*** - -```python -datagovin.list_sources() - -# Returns: -['data.gov.in', 'smartcities.data.gov.in', 'tn.data.gov.in'] -``` - -### 5. List of ***`All Attributes`*** -```python -datagovin.list_all_attributes() -# Returns: -``` -```json - { "org_types": ["Central", "City", "State"], - "sources": ["data.gov.in", "smartcities.data.gov.in", "tn.data.gov.in"], - "org_names": [ "Adi Dravidar and Tribal Welfare Department, Tamil Nadu", - "Agricultural Census, New Delhi", - "Agriculture Department", - , - , - , - "Department of Agriculture, Cooperation and Farmers Welfare", - "Department of Animal Husbandry, Dairying and Fisheries", - "Department of Atomic Energy", - "Department of Ayurveda, Yoga and Naturopathy, Unani, Siddha " - , - , - , - "Tourism, Culture and Religious Endowments Department", - "Transport Department, Madhya Pradesh", - "Transport Department, Tamil Nadu", - , - , - "West Bengal"], - "sectors": [ "Adult Education", - "Agricultural", - "Agricultural Marketing", - "Agriculture", - , - , - "Atmospheric Science", - "Aviation", - "Banking", - "Biotechnology", - "Broadcasting", - "Census", - , - , - "District Adminstration", - "Drinking Water", - "Earth Sciences",, - "Education", - "Employment", - "Environment and Forest", - , - , - "Municipal Waste", - "National Population Register", - "Natural Resources", - "Noise Pollution", - "Panchayati Raj", - "Parliament Of india", - "Passport", - "Power and Energy", - , - , - "Water Quality", - "Water Resources", - "Water and Sanitation", - "Water ways"] - } -``` - -### 6. 
List of ***`recently created resources`*** - -```python -datagovin.list_recently_created(days=5,max_results=5,print_results=True) -``` -``` -# Prints: - -5 of 1443 results that were created in the last - `5` days - -================================================================================== - -Resource-ID: 52d2933f69be46fda28855c08134fc7f -18 June 2021, 09:57 AM -Allocations for The Welfare of Schedule Caste from 2019-20 to 2021-22 - -================================================================================== - -Resource-ID: 2ef7903b77f04609af93bb54516c125c -18 June 2021, 09:57 AM -Allocations for The Welfare of Schedule Tribes from 2019-20 to 2021-22 - -================================================================================== - -Resource-ID: 8a679d8db6d94605a1d160150fe22b77 -18 June 2021, 09:57 AM -Allocations for the Welfare of Children from 2019-20 to 2021-22 - -================================================================================== - -Resource-ID: 243825f60f304a10877dd1f86ad49598 -18 June 2021, 09:27 AM -Monthly Range-wise Performance of Public Facilities for Deliveries conducted at facility for May 2013-14 - -================================================================================== - -Resource-ID: a5d0bd7d39e84392b65abe5e4737f865 -18 June 2021, 09:26 AM -Monthly Range-wise Performance of Public Facilities for Deliveries conducted at facility for September 2018-19 - -================================================================================== -``` -```json -# Returns: -[{"resourceid": "52d2933f69be46fda28855c08134fc7f", - "timestamp": 1623990466, - "title": "Allocations for The Welfare of Schedule Caste from 2019-20 to 2021-22"}, - {"resourceid": "2ef7903b77f04609af93bb54516c125c", - "timestamp": 1623990466, - "title": "Allocations for The Welfare of Schedule Tribes from 2019-20 to 2021-22"}, - {"resourceid": "8a679d8db6d94605a1d160150fe22b77", - "timestamp": 1623990441, - "title": "Allocations for the Welfare of 
Children from 2019-20 to 2021-22"}, - {"resourceid": "243825f60f304a10877dd1f86ad49598", - "timestamp": 1623988620, - "title": "Monthly Range-wise Performance of Public Facilities for Deliveries conducted at facility for May 2013-14"}, - {"resourceid": "a5d0bd7d39e84392b65abe5e4737f865", - "timestamp": 1623988618, - "title": "Monthly Range-wise Performance of Public Facilities for Deliveries conducted at facility for September 2018-19"}] -``` - -### 7. List of ***`recently updated resources`*** -```python -datagovin.list_recently_updated(days=3,max_results=5,print_results=True) - -``` -``` -# Prints: - -5 of 303 results that were updated in the last - `3` days - -================================================================================== - -Resource-ID: 9ef84268d588465aa308a864a43d0070 -21 June 2021, 02:05 PM -Current Daily Price of Various Commodities from Various Markets (Mandi) - -================================================================================== - -Resource-ID: 3b01bcb80b144abfb6f2c1bfd384ba69 -21 June 2021, 12:03 PM -Real time Air Quality Index from various location - -================================================================================== - -Resource-ID: d76a86b16a2a4ab39201cb9f6bc61fa4 -21 June 2021, 08:50 AM -District Wise Total MSME Registered Service Enterprises till last date - -================================================================================== - -Resource-ID: 925bb7dd50f048768a1da5e45c4a989a -21 June 2021, 08:50 AM -District Wise Total MSME Registered Manufacturing and Service Enterprises till last date - -================================================================================== - -Resource-ID: 201b66f27fda40b8b613ffb7789c4341 -21 June 2021, 08:50 AM -District Wise Total MSME Registered Manufacturing Enterprises till last date - -================================================================================== -``` - -```json -# Returns: -[{"resourceid": "9ef84268d588465aa308a864a43d0070", - 
"timestamp": 1624264506, - "title": "Current Daily Price of Various Commodities from Various Markets (Mandi)"}, - {"resourceid": "3b01bcb80b144abfb6f2c1bfd384ba69", - "timestamp": 1624257197, - "title": "Real time Air Quality Index from various location"}, - {"resourceid": "d76a86b16a2a4ab39201cb9f6bc61fa4", - "timestamp": 1624245637, - "title": "District Wise Total MSME Registered Service Enterprises till last date"}, - {"resourceid": "925bb7dd50f048768a1da5e45c4a989a", - "timestamp": 1624245633, - "title": "District Wise Total MSME Registered Manufacturing and Service Enterprises till last date"}, - {"resourceid": "201b66f27fda40b8b613ffb7789c4341", - "timestamp": 1624245629, - "title": "District Wise Total MSME Registered Manufacturing Enterprises till last date"}] -``` -## Searching for a dataset (API-Resource) ---- -### 1. *Search* for resource using **`TITLE`** - -```python -results = datagovin.search_by_title("MGNREGA",max_results=5,print_results=True) -``` - -``` -# Returns: -5 of 45 results for : `MGNREGA` - -================================================================================== - -Resource-ID: bf1da9fc565045c3be3b0ba006377869 - -Expenditure under MGNREGA on Schedule Caste (SC) Persondays during 2015-16 and 2018-19 (From: Ministry of Rural Development) - -================================================================================== - -Resource-ID: 9aa66b7abb1d4e20bd4be5e68539cdfc - -Central Fund Released to Jammu and Kashmir under MGNREGA from 2016-17 to 2018-19 (From: Ministry of Rural Development) - -================================================================================== - -Resource-ID: 57bff16a642345b29700ebcde6709937 - -State/UT-wise Expenditure Reported in Management Information System (MIS) under MGNREGA from 2014-15 to 2018-19 (From: Ministry of Labour and Employment) - -================================================================================== - -Resource-ID: 8e7b41bec79044958339c8da0a7f287e - -State/UT-wise 
Expenditure made on Water Related Works Taken up under MGNREGA from 2016-17 to 2019-20 (From: Ministry of Jal Shakti) - -================================================================================== - -Resource-ID: 7371da1e4c5e4c529223f85e1756d24d - -District-wise expenditure under the Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) in the state Punjab from 2017-18 to 2019-20 (From: Ministry of Rural Development) - -================================================================================== -``` - -### 2. *Search* for resource using **`DESCRIPTION`** - -```python -results = datagovin.search_by_description("Swachh Bharat Mission",max_results=5,print_results=True) -``` - -``` -# Returns: -5 of 25 results for : `Swachh Bharat Mission` - -================================================================================== - -Resource-ID: 22f496bb32a84b6da4124f03c4b3ea62 - -District-wise Target vs Achievement of Construction of Toilets in State of Chhattisgarh under Swachh Bharat Mission (SBM) from 2013-14 to 2017-18 (From : Ministry of Tribal Affairs) - -================================================================================== - -Resource-ID: 673d72fc1c8a497d80477c3c72196e74 - -State/UT-wise Number of IHHLs Constructed under Swachh Bharat Mission - Gramin (SBM-G) from 02 October, 2014 to 17 July, 2019 (From : Ministry of Jal Shakti) - -================================================================================== - -Resource-ID: 2235bc9138cc4a4dbf5413e485596d5c - -Funds Sanctioned, Allocated and Utilised under Swachh Bharat Mission (SBM) in Chhattisgarh from 2016-17 to 2018-19 (From: Ministry of Jal Shakti, Department of Drinking Water and Sanitation) - -================================================================================== - -Resource-ID: 45bb18686df44011b5fbbd5d74a01eda - -Details of Fund (including Swachh Bharat Cess) Allocated & Released under Swachh Bharat Mission (Rural/Urban) from 2016-17 to 2018-19 (From: 
Ministry of Finance) - -================================================================================== - -Resource-ID: 5329bcc7f75f4a87be6a0bdaa6ebb4b4 - -Funds Allocated, Released, Balance and Utilization Certificate received under Swachh Bharat Mission (Urban) as on 30th November, 2019 (From: Ministry of Housing and Urban Affairs) - -================================================================================== -``` - -### 3. ***Search*** for resources by `SOURCE` - -```python -results = datagovin.search_by_source("tn.data.gov.in",max_results=3,print_results=True) -``` - -``` -# Returns: -3 of 526 results for `source` : `tn.data.gov.in` - -================================================================================== - -Resource-ID: 952da80341cd41e990bcbcb760ffbf90 - -Area, Production & Productivity of Snake Gourd (Vegetables) by District-wise in Tamil Nadu for the Year 2015-16 - -================================================================================== - -Resource-ID: 0bd2498df63c456a9f336e242e9abe82 - -Area, Production & Productivity of Chrysanthimum (Flowers) by District-wise in Tamil Nadu for the Year 2015-16 - -================================================================================== - -Resource-ID: 921f5b1f093146399c96a00195e17881 - -Area, Production & Productivity of Jadhi Malli (Flowers) by District-wise in Tamil Nadu for the Year 2015-16 - -================================================================================== -``` - -### 4. 
***Search for resources by*** `SECTOR` - -```python -results = datagovin.search_by_sector("Banking",max_results=3,print_results=True) -``` - -``` -# Returns: -3 of 45 results for `sector` : `Banking` - -================================================================================== - -Resource-ID: 4b9dd94d36be4f968578f8981857773c - -Month-wise Progress Report of PMJDY by Public Sectors Banks/Regional Rural Banks/Private Banks upto 24-Feb-2016 - -================================================================================== - -Resource-ID: f719ee5c50254643aa54157d707d6077 - -Liabilities and assets of different classes of banks - scheduled commercial banks as on 31st March - State Bank of India from 2001 to 2014 - -================================================================================== - -Resource-ID: 371020a7a43747df8946fbd030b53459 - -Liabilities And Assets Of State Financial Corporations (State-wise) upto 2012-13 - -================================================================================== -``` - -### 5. 
***Search for resources by*** `ORG-NAME` - -```python -results = datagovin.search_by_org_name("Ministry of Road Transport and Highways",max_results=5,print_results=True) -``` - -``` -# Returns: -5 of 417 results for `organization` - `Ministry of Road Transport and Highways` - -================================================================================== - -Resource-ID: 37b1f841f44c490682fb2442b0f2bd25 - -State/UT-wise Length of Roads under Coal Fields/Coal units of Coal India Limited by Type of Surface as on 31st March, 2017 - -================================================================================== - -Resource-ID: b10ac9f5c1fd42c78c19e74a1fe64c04 - -State/UT-wise Length of Roads under Forest Departments by Type of Surface in India as on 31st March, 2017 - -================================================================================== - -Resource-ID: 8ebce90f62e8421592672bf22bac7f94 - -State-wise Length of Roads in Major Ports by Type of Surface as on 31st March, 2017 - -================================================================================== - -Resource-ID: 888f4d498c864f1c825feef9db674cc8 - -State/UT-wise Length of Military Engineering Service Roads by Type of Surface as on 31st March, 2017 - -================================================================================== - -Resource-ID: 068ecf9440694838981b3529c3a48edc - -State/UT-wise Length of PMGSY Roads by type of Surface as on 31st March, 2017 - -================================================================================== -``` - -### 6. 
*Search* for resources by `ORG-TYPE` - -```python -results = datagovin.search_by_org_type("State",max_results=5,print_results=True) -``` - -``` -# Returns: -5 of 645 results for `organization type` - `State` - -================================================================================== - -Resource-ID: 4200eb5f17294fee8477af5feb715b3c - -Details of Vehicle Tax collected by Surat Municipal Corporation from Year 1989 onward - -================================================================================== - -Resource-ID: fbdf3432b88a4592bbc4d0f60a0ac140 - -Surat City Bus and BRTS Passenger Information from April 2015 (daily) - -================================================================================== - -Resource-ID: 993acfe3b72e4e07895915aa34bc226d - -Building Plan Applications at Surat Municipal Corporation from April 2015 onward (daily) - -================================================================================== - -Resource-ID: 8addc59332b54531a2346057209f35a0 - -Surat City Complaint Statistics from April 2015 onward (daily) - -================================================================================== - -Resource-ID: 3968cb03596842c9ac43cba988a964c7 - -Garbage Collection in Surat City (in KG) from April 2015 onward (daily) - -================================================================================== -``` - -### 7. 
*Search* for resources with **`Multiple Filters`** - -```python -results = datagovin.search(title="COVID", - description="Postiive Case", - org_name="Surat", - org_type="City", - sector="All", - source="smartcities.data.gov.in", - max_results=5, - print_results=True, - ) -``` - -``` -# Returns: -2 of 2 results - -================================================================================== - -Resource-ID: b9cfed4ca1a24f7aaffa88a8e1a2149c - -COVID-19 Positive Case Details - -================================================================================== - -Resource-ID: ee35f0724d804b418c17fd74414907be - -COVID-19 Cluster / Containment Zone Details - -================================================================================== -``` - - --------- - - -> ## C. **Learn more about an API-resource.** - ->> ### 1. Get all available `meta-data` for an API resource - -> Meta-Data includes - -> -> - Resource-ID -> - Title -> - Description -> - Total records available -> - Date-Created -> - Data-Updated -> - Organization-Type -> - Organization-Name -> - Source -> - Sector -> - Fields - -```python -datagovin.get_resource_info("b9cfed4ca1a24f7aaffa88a8e1a2149c") -``` - -```json -{"ResourceID": "b9cfed4ca1a24f7aaffa88a8e1a2149c", - "Title": "COVID-19 Positive Case Details", - "Description": "COVID-19 Positive Case Details", - "TotalRecords": 3592, - "DateCreated": "08 May 2020, 09:00 PM", - "DateUdpated": "10 January 2021, 11:04 PM", - "OrganizationNames": ["Gujarat", "Surat"], - "OrganizationTypes": "City", - "Sector": "All", - "Source": "smartcities.data.gov.in", - "Fields": ["sr_no", - "city", - "zone", - "age", - "gender", - "latitude", - "longitude", - "result", - "sample_result", - "resultdate"]} -``` - ->> ### 2. Get details of `fields` (variables) available for a resource. - -```python -datagovin.get_resource_fields("b9cfed4ca1a24f7aaffa88a8e1a2149c") -``` - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
field_codefield_labelfield_type
0sr_noSr.Nokeyword
1cityCitykeyword
2zonezonedouble
3ageagedouble
4genderGenderkeyword
5latitudelatitudedouble
6longitudelongitudedouble
7resultResultkeyword
8sample_resultSample_Resultkeyword
9resultdateResultDatedate
-
- --------- - -> ## D. **Download DATA** - -```python -data = datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c") -data.head(20) +datagovin = DataGovIndia() # Specify API key if not set as an environment variable +datagovin.sync_metadata() ``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sr_nocityzoneagegenderlatitudelongituderesultsample_resultresultdate
01SuratSouth West Zone21F21.169772.7933Cured/DischargedPositive19/03/2020
12SuratCentral Zone67M21.186972.816DeathPositive20/03/2020
23SuratEast Zone - B50F21.2113017372.86820564Cured/DischargedPositive10/06/2020
34SuratSouth Zone26M21.139772.8241Cured/DischargedPositive28/03/2020
45SuratWest Zone55M21.205612472.804538Cured/DischargedPositive11/06/2020
56SuratNorth Zone47M21.241942672.8287933Cured/DischargedPositive13/06/2020
67SuratEast Zone - B34M21.222530972.8918084Cured/DischargedPositive17/06/2020
78SuratNorth Zone39M21.233408272.8046628Cured/DischargedPositive19/06/2020
89SuratSouth East Zone20F21.168172.8672Cured/DischargedPositive18/04/2020
910SuratWest Zone32M21.226572.7927Cured/DischargedPositive21/03/2020
1011SuratCentral Zone45M21.185272.8209Cured/DischargedPositive22/03/2020
1112SuratSouth Zone22M21.161372.8305Cured/DischargedPositive01/04/2020
1213SuratSouth East Zone62M21.18672.863Cured/DischargedPositive23/03/2020
1314SuratWest Zone67M21.221272.7954Cured/DischargedPositive29/03/2020
1415SuratSouth West Zone23M21.173872.8141Cured/DischargedPositive20/03/2020
1516SuratNorth Zone29M21.226472.8189Cured/DischargedPositive31/03/2020
1617SuratWest Zone61F21.207872.7732DeathPositive03/04/2020
1718SuratSouth Zone40F21.161272.8303Cured/DischargedPositive04/04/2020
1819SuratCentral Zone65M21.195672.8353DeathPositive04/04/2020
1920SuratWest Zone50M21.201572.8085Cured/DischargedPositive05/04/2020
-
- - --------- +**Note**: Updating the library's metadata from data.gov.in ensures synchronization with the latest data. While this step is optional (especially if you're focused only on data downloads), it's beneficial due to the OGD platform's lack of a search API for resources. -> ## E. Filtering - -```python -# First, let's take a look at valid `fields`. - -datagovin.get_resource_fields("b9cfed4ca1a24f7aaffa88a8e1a2149c") +```sh +# To update metadata from the command line: +$ datagovindia sync-metadata # Specify API key if not set as an environment variable ``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
field_codefield_labelfield_type
0sr_noSr.Nokeyword
1cityCitykeyword
2zonezonedouble
3ageagedouble
4genderGenderkeyword
5latitudelatitudedouble
6longitudelongitudedouble
7resultResultkeyword
8sample_resultSample_Resultkeyword
9resultdateResultDatedate
-
- ->> ### 1. Filtering with a *Single* ***`Field`*** - *Single* ***`Value`*** pair +### `Output`: -```python -data = datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c",filters={"result":"Active"}) -data ``` - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sr_nocityzoneagegenderlatitudelongituderesultsample_resultresultdate
0511SuratSouth East Zone25M21.17900472.808405ActivePositive25/04/2020
1951SuratSouth East Zone35M21.190477372.849517ActivePositive13/05/2020
21111Out CityNA70F21.15055472.802457ActivePositive18/05/2020
31164Out CityNA73M21.15055472.802457ActivePositive19/05/2020
41166SuratSouth Zone41M21.15372672.839782ActivePositive20/05/2020
51247SuratSouth Zone55M21.15321572.8267782ActivePositive24/05/2020
61361SuratSouth West Zone50F21.1326897472.74215644ActivePositive24/05/2020
71520Out CityNA72M21.221749272.7830429ActivePositive28/05/2020
81530Out CityNA56F21.157772.7768399ActivePositive28/05/2020
91594Out CityNA53F21.156315172.766301ActivePositive30/05/2020
102327SuratSouth Zone63M21.122313772.8491477ActivePositive10/06/2020
112485Out CityNA41M21.2907972.9001ActivePositive13/06/2020
122609SuratNorth Zone61M21.236675172.8350334ActivePositive14/06/2020
132748Out CityNA3F21.1348874572.76593804ActivePositive16/06/2020
-
- ->> ### 2. Filtering with a *Single* ***`Field`*** - *Multiple* ***`Values`*** - -```python -datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c",filters={"result":["Active",'Cured/Discharged']}) +Updated 198465/198465 resources: [===============================================>] - ETA: 0s +Finished updating 198465 records in 62 seconds. ``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sr_nocityzoneagegenderlatitudelongituderesultsample_resultresultdate
0511SuratSouth East Zone25M21.17900472.808405ActivePositive25/04/2020
1951SuratSouth East Zone35M21.190477372.849517ActivePositive13/05/2020
21111Out CityNA70F21.15055472.802457ActivePositive18/05/2020
31164Out CityNA73M21.15055472.802457ActivePositive19/05/2020
41166SuratSouth Zone41M21.15372672.839782ActivePositive20/05/2020
.................................
30093189SuratNorth Zone50M21.22621772.817604Cured/DischargedPositive21/06/2020
30103190SuratNorth Zone42M21.226809972.8256378Cured/DischargedPositive21/06/2020
30113191SuratWest Zone52M21.20512472.776736Cured/DischargedPositive22/06/2020
30123193SuratNorth Zone26F21.239808472.8500394Cured/DischargedPositive21/06/2020
30133194SuratNorth Zone49M21.229016872.808571Cured/DischargedPositive21/06/2020
-

3014 rows × 10 columns

-
- ->> ### 3. Filtering with *Multiple* ***`Field(s)`*** - *Multiple* ***`Value(s)`*** +## Search for resources ```python -datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c", - filters={ - "gender":["F","M"], - "result":['Cured/Discharged',"Death"], - }) +search_data = datagovin.search('mgnrega') # Returns a dataframe with search results. Searches in resource title by default -# Note: -# Filtering returns a UNION of matching results, and NOT an INTERSECTION. +search_data = datagovin.search('mgnrega', search_fields=['title', 'description']) # Search in multiple fields ``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sr_nocityzoneagegenderlatitudelongituderesultsample_resultresultdate
01SuratSouth West Zone21F21.169772.7933Cured/DischargedPositive19/03/2020
13SuratEast Zone - B50F21.2113017372.86820564Cured/DischargedPositive10/06/2020
29SuratSouth East Zone20F21.168172.8672Cured/DischargedPositive18/04/2020
317SuratWest Zone61F21.207872.7732DeathPositive03/04/2020
418SuratSouth Zone40F21.161272.8303Cured/DischargedPositive04/04/2020
.................................
58073506SuratWest Zone47M21.205796272.7998015Cured/DischargedPositive23/06/2020
58083508SuratSouth Zone78M21.15974772.838655Cured/DischargedPositive23/06/2020
58093509SuratEast Zone - A30M21.197507472.8450123Cured/DischargedPositive24/06/2020
58103510SuratNorth Zone43M21.228400272.8283048Cured/DischargedPositive23/06/2020
58113511SuratNorth Zone53M21.244012172.8502404Cured/DischargedPositive23/06/2020
-

3592 rows × 10 columns

-
- - --------- -> ## F. Restricting Variables/ Columns - `fields` - -```python -datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c", - fields = ["city","zone","age","gender","result"], - ) -# Get only the fields you need, by passing a list of valid fields in `fields` -``` - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +```sh +# Search for resources with the keyword 'mgnrega' +$ datagovindia search mgnrega # Returns a dataframe with search results + +# Search for resources in the title and description fields only and save the results to a csv/xlsx/json file +$ datagovindia search mgnrega -f title -f description --output mgnrega.csv + +# Preview the first n results of a search +$ datagovindia search mgnrega --preview --limit 5 +``` + +### `Output`: + +
cityzoneagegenderresult
0SuratSouth West Zone21FCured/Discharged
1SuratCentral Zone67MDeath
2SuratEast Zone - B50FCured/Discharged
3SuratSouth Zone26MCured/Discharged
4SuratWest Zone55MCured/Discharged
5SuratNorth Zone47MCured/Discharged
6SuratEast Zone - B34MCured/Discharged
7SuratNorth Zone39MCured/Discharged
8SuratSouth East Zone20FCured/Discharged
9SuratWest Zone32MCured/Discharged
10SuratCentral Zone53MCured/Discharged
11SuratSouth East Zone45FCured/Discharged
12SuratSouth East Zone60FCured/Discharged
13SuratNorth Zone65MDeath
14SuratSouth East Zone18MCured/Discharged
15SuratSouth Zone40MCured/Discharged
16SuratEast Zone - A28FCured/Discharged
17SuratNorth Zone77FCured/Discharged
18SuratEast Zone - A62MCured/Discharged
19SuratEast Zone - A24FCured/Discharged
20SuratNorth Zone63MCured/Discharged
22SuratSouth East Zone33MCured/Discharged
23SuratNorth Zone34MCured/Discharged
24SuratCentral Zone24MCured/Discharged
25SuratSouth East Zone34MCured/Discharged
26SuratNorth Zone34FCured/Discharged
27SuratSouth Zone43MCured/Discharged
28SuratNorth Zone52FCured/Discharged
30SuratNorth Zone33MCured/Discharged
31SuratWest Zone46MCured/Discharged
32SuratEast Zone - B38MCured/Discharged
33SuratSouth West Zone70MCured/Discharged
34SuratWest Zone44MCured/Discharged
35SuratSouth West Zone45MCured/Discharged
36SuratNorth Zone36MCured/Discharged
37SuratCentral Zone40MCured/Discharged
39SuratEast Zone - A37MCured/Discharged
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
resource_idtitledescriptionorg_typefieldsorgssourcesectorsdate_createddate_updated
ee03643a-ee4c-48c2-ac30-9f2ff26ab722District-wise MGNREGA Data at a Glance from 01.04.2023 to 31.08.2023District-wise MGNREGA Data at a Glance from 01.04.2023 to 31.08.2023Central['document_id', 'sno_', 'state_name', 'district_name', 'total_no__of_jobcards_issued', 'total_no__of_workers', 'total_no__of_active_job_cards', 'total_no__of_active_workers', 'sc_workers_against_active_workers', 'st_workers_against_active_workers', 'approved_labour_budget', 'persondays_of_central_liability_so_far', 'sc_persondays', 'st_persondays', 'women_persondays', 'average_days_of_employment_provided_per_household', 'average_wage_rate_per_day_per_person_rs__', 'total_no_of_hhs_completed_100_days_of_wage_employment', 'total_households_worked', 'total_individuals_worked', 'differently_abled_persons_worked', 'number_of_gps_with_nil_exp', 'total_no__of_works_takenup__new_spill_over_', 'number_of_ongoing_works', 'number_of_completed_works', '__of_nrm_expenditure_public___individual_', '__of_category_b_works', '__of_expenditure_on_agriculture___agriculture_allied_works', 'total_exp_rs__in_lakhs__', 'wages_rs__in_lakhs_', 'material_and_skilled_wages_rs__in_lakhs_', 'total_adm_expenditure__rs__in_lakhs__', 'resource_uuid']['Ministry of Rural Development', 'Department of Land Resources (DLR)']data.gov.in['Rural', 'Land Resources']2023-09-19T06:43:03+00:002023-09-19T10:39:44+00:00
d1d29e37-1d60-46da-9902-52340abbfb13State/UTs-wise Expenditure on Water Related Works under Mahatma Gandhi National Rural Employment Guarantee Scheme (MGNREGA) from 2019-20 to 2021-22State/UTs-wise Expenditure on Water Related Works under Mahatma Gandhi National Rural Employment Guarantee Scheme (MGNREGA) from 2019-20 to 2021-22Central['document_id', 'sl__no_', 'state_ut', '_2019_2020___water_conservation_and_water_harvesting___completed___number_of_works', '_2019_2020___water_conservation_and_water_harvesting___completed___expenditure__rs__in_lakh_', '_2019_2020___water_conservation_and_water_harvesting___ongoing___number_of_works', '_2019_2020___water_conservation_and_water_harvesting___ongoing___expenditure__rs__in_lakh_', '_2020_2021___water_conservation_and_water_harvesting___completed___number_of_works', '_2020_2021___water_conservation_and_water_harvesting___completed___expenditure__rs__in_lakh_', '_2020_2021___water_conservation_and_water_harvesting___ongoing___number', '_2020_2021___water_conservation_and_water_harvesting___ongoing___expenditure__rs__in_lakh_', '_2021_2022__as_on_10_03_2022____water_conservation_and_water_harvesting___completed___number_of_works', '_2021_2022__as_on_10_03_2022____water_conservation_and_water_harvesting___completed___expenditure__rs__in_lakh_', '_2021_2022__as_on_10_03_2022____water_conservation_and_water_harvesting___ongoing___number_of_works', '_2021_2022__as_on_10_03_2022____water_conservation_and_water_harvesting___ongoing___expenditure__rs__in_lakh_', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2022-09-15T07:24:33+00:002022-09-15T12:37:43+00:00
c0350589-65a7-4166-996a-ba5845c398feState/UT-wise Central Funds Sanctioned/Released for Wage, Material & Admin Component under MGNREGA from 2018-19 to 2021-22State/UT-wise Central Funds Sanctioned/Released for Wage, Material & Admin Component under MGNREGA from 2018-19 to 2021-22Central['document_id', 'sl__no_', 'state_ut', 'fy_2018_19', 'fy_2019_20', 'fy_2020_21', 'fy_2021_22__as_on_26_07_2021_', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2022-04-01T05:41:11+00:002022-04-29T14:13:43+00:00
0fecf99b-2c7c-46db-9f7d-c4bdacf040fcState/UT-wise List of Total Number of Active ST Worker and ST Person Days Generated under Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) during 2019-20 and 2020-21 (From: Ministry of Tribal Affairs)State/UT-wise List of Total Number of Active ST Worker and ST Person Days Generated under Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) during 2019-20 and 2020-21 (From: Ministry of Tribal Affairs)Central['document_id', '_sl__no_', 'state_ut', 'total_number_of_active_st_worker__in_lakh_', 'st_person_days_generated__in_lakh___2019_20_', 'st_person_days_generated__in_lakhs___2020_21_', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-12-15T14:23:27+00:002022-02-28T10:15:14+00:00
aeca8112-5fd4-4c91-92dc-d72b2c7b969eState/UT-wise Central Fund Released and Expenditure Reported under MGNREGA from 2017-18 to 2019-20 (From: Ministry of Rural Development)State/UT-wise Central Fund Released and Expenditure Reported under MGNREGA from 2017-18 to 2019-20 (From: Ministry of Rural Development)Central['document_id', '_s__no_', 'state_ut', 'central_fund_released___2017_18__', 'central_fund_released___2018_19__', 'central_fund_released___2019_20', '_expenditure___2017_18', '_expenditure___2018_19', '_expenditure___2019_20', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T07:17:46+00:002021-03-23T15:15:05+00:00
7efb084d-b562-4b9f-8a3a-d0808a54d609State/UT-wise Persondays Generated under MGNREGA including West Bengal from 2017-18 to 2019-20 (From: Ministry of Rural Development)State/UT-wise Persondays Generated under MGNREGA including West Bengal from 2017-18 to 2019-20 (From: Ministry of Rural Development)Central['document_id', '_sl_no', 'state_ut', '_2017_18', '_2018_19', '_2019_20', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T06:52:05+00:002021-03-23T14:53:45+00:00
6ae541ca-903e-4a6a-be62-48dedea02223Average Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) wages from 2014-15 to 2018-19 (From : Ministry of Rural Development)Average Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) wages from 2014-15 to 2018-19 (From : Ministry of Rural Development)Central['document_id', 'financial_year', 'average_wage_rate_per_day_per_person__rs__', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T04:45:12+00:002021-03-23T13:10:05+00:00
a1c9803c-d405-4edf-a298-ee42e7e65d0bState/UT-wise Households Completing 100 days of Employment under Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) from 2016-17 to 2018-19 (From : Ministry of Rural Development)State/UT-wise Households Completing 100 days of Employment under Mahatma Gandhi National Rural Employment Guarantee Act (MGNREGA) from 2016-17 to 2018-19 (From : Ministry of Rural Development)Central['document_id', 'sl__no_', 'states_uts', '__2016_17', '_2017_18', '_2018_19', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T04:17:04+00:002021-03-23T12:47:05+00:00
bf1da9fc-5650-45c3-be3b-0ba006377869Expenditure under MGNREGA on Schedule Caste (SC) Persondays during 2015-16 and 2018-19 (From: Ministry of Rural Development)Expenditure under MGNREGA on Schedule Caste (SC) Persondays during 2015-16 and 2018-19 (From: Ministry of Rural Development)Central['document_id', 'financial_year', 'approximate_expenditure', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T03:47:13+00:002021-03-23T12:22:45+00:00
7b502fe1-6684-4c4c-9347-57d20af7d4dcDetails of Funds Allocated under MGNREGA from 2014-15 to 2018-19 (From: Ministry of Rural Development)Details of Funds Allocated under MGNREGA from 2014-15 to 2018-19 (From: Ministry of Rural Development)Central['document_id', 's_no_', '_year', 'budget_allocation', 'resource_uuid']['Rajya Sabha']data.gov.in['All']2021-03-04T03:30:03+00:002021-03-23T12:08:45+00:00
-
- --------- - -> ## G. Request data sorted by a valid `field` - -```python -datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c", - fields = ["city","zone","age","gender","result"], - sort_key = 'age', - sort_order = 'asc' - ) - -# Sort `field` in Ascending order using `asc`=`Ascending` -``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cityzoneagegenderresult
0SuratSouth East Zone1MCured/Discharged
1SuratEast Zone - A1FCured/Discharged
2SuratSouth Zone1MCured/Discharged
3SuratNorth Zone1MCured/Discharged
4SuratNorth Zone2FCured/Discharged
5SuratCentral Zone2FCured/Discharged
6SuratSouth East Zone2MCured/Discharged
7SuratEast Zone - A2MCured/Discharged
8SuratNorth Zone2MCured/Discharged
9SuratNorth Zone3MCured/Discharged
10SuratNorth Zone34FCured/Discharged
11SuratNorth Zone34MCured/Discharged
12SuratSouth East Zone34MCured/Discharged
17SuratEast Zone - A34MDeath
20SuratSouth East Zone47FCured/Discharged
21SuratWest Zone47MCured/Discharged
22SuratEast Zone - B47MCured/Discharged
23SuratEast Zone - A47MCured/Discharged
25SuratNorth Zone47MCured/Discharged
26SuratSouth West Zone47MDeath
30SuratSouth East Zone60MCured/Discharged
31SuratSouth East Zone60FCured/Discharged
33SuratSouth East Zone60MDeath
35SuratSouth East Zone60FDeath
36SuratNorth Zone60FCured/Discharged
37SuratSouth Zone60FCured/Discharged
-
+## Get information about a resource ```python -datagovin.get_data("b9cfed4ca1a24f7aaffa88a8e1a2149c", - fields = ["city","zone","age","gender","result"], - sort_key = 'age', - sort_order = 'desc' - ) -# Sort `field` in Descending order using `desc`=`Descending` +# In a python environment +datagovin.get_resource_info("5c2f62fe-5afa-4119-a499-fec9d604d5bd") ``` -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cityzoneagegenderresult
0SuratNorth Zone94MCured/Discharged
1SuratNorth Zone90FDeath
2SuratEast Zone - B89MCured/Discharged
3SuratNorth Zone88MCured/Discharged
4SuratNorth Zone88FDeath
5SuratNorth Zone86MCured/Discharged
6SuratSouth East Zone86MDeath
7SuratNorth Zone85MCured/Discharged
8SuratNorth Zone85FCured/Discharged
10SuratSouth East Zone54MCured/Discharged
11SuratNorth Zone54FCured/Discharged
12SuratSouth West Zone54MCured/Discharged
16SuratSouth Zone54MCured/Discharged
17SuratCentral Zone54FCured/Discharged
18SuratSouth Zone54FCured/Discharged
19SuratCentral Zone54MCured/Discharged
20SuratCentral Zone42MCured/Discharged
21SuratCentral Zone42FCured/Discharged
22SuratEast Zone - A42MCured/Discharged
23SuratSouth West Zone42FCured/Discharged
24SuratSouth Zone42MCured/Discharged
27SuratEast Zone - B42MCured/Discharged
28SuratSouth East Zone42MCured/Discharged
30SuratCentral Zone27FCured/Discharged
31SuratWest Zone27MCured/Discharged
32SuratSouth East Zone27MCured/Discharged
33SuratSouth West Zone27FCured/Discharged
35SuratCentral Zone27MCured/Discharged
39SuratSouth Zone27MCured/Discharged
-
- --------- - -> ## H. `ADVANCED` : Multi-Threading API-requests ->> ->> ### - Multi-Threading is disabled by default. ->> ### - You can enable multi-threading for faster performance on large datasets. ->> - -```python -datagovin.get_resource_info("dad7a738fd3b437dad31e1f844e9a575")['TotalRecords'] - -# Returns: -# 20197 +```sh +# From the command line +$ datagovindia get-resource-info 5c2f62fe-5afa-4119-a499-fec9d604d5bd ``` ->> ### To Enable Multi-threading - -```python -datagovin.enable_multithreading() - -# Returns: -# Multi-Threaded API requests enabled. +### `Output`: +```json +{ + "index_name": "5c2f62fe-5afa-4119-a499-fec9d604d5bd", + "title": "All India Pincode Directory till last month", + "desc": "All India Pincode Directory till last month", + "org_type": "Central", + "org": [ + "Ministry of Communications", + "Department of Posts" + ], + "sector": [ + "Post" + ], + "source": "data.gov.in", + "catalog_uuid": "709e9d78-bf11-487d-93fd-d547d24cc0ef", + "visualizable": false, + "active": false, + "created": 1608423011, + "updated": 1659003955, + "created_date": "2020-12-20", + "updated_date": "2022-07-28T10:25:55Z", + "field": [ + { + "id": "circlename", + "name": "circlename", + "type": "keyword" + }, + { + "id": "regionname", + "name": "regionname", + "type": "keyword" + }, + { + "id": "divisionname", + "name": "divisionname", + "type": "keyword" + }, + { + "id": "officename", + "name": "officename", + "type": "keyword" + }, + { + "id": "pincode", + "name": "pincode", + "type": "double" + }, + { + "id": "officetype", + "name": "officetype", + "type": "keyword" + }, + { + "id": "delivery", + "name": "delivery", + "type": "keyword" + }, + { + "id": "district", + "name": "district", + "type": "keyword" + }, + { + "id": "statename", + "name": "statename", + "type": "keyword" + }, + { + "id": "latitude", + "name": "latitude", + "type": "keyword" + }, + { + "id": "longitude", + "name": "longitude", + "type": "keyword" + } + ], + "total": 165307 +} +``` + +## 
Download data from a resource +```python +# In a python environment +data = datagovin.get_data("5c2f62fe-5afa-4119-a499-fec9d604d5bd") ``` -```python -%%timeit -datagovin.get_data("dad7a738fd3b437dad31e1f844e9a575",num_results='all') - -# Returns: -# 258 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) +```sh +# Download data as a json, csv or xlsx file by specifying the --output filepath +$ datagovindia get-data 5c2f62fe-5afa-4119-a499-fec9d604d5bd --output pincode.csv ``` ->> ### To Disable Multi-threading - -```python -datagovin.disable_multithreading() -# Returns: -# Multi-Threaded API requests disabled. -``` +## License +`datagovindia` is licensed under the MIT License. See the [LICENSE](https://github.com/addypy/datagovindia/blob/master/LICENSE) file for more details. -```python -%%timeit -datagovin.get_data("dad7a738fd3b437dad31e1f844e9a575",num_results='all') -# Returns: -# 2.74 s ± 194 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) -``` +## **Authors**: +- [Aditya Karan Chhabra](mailto:aditya0chhabra@gmail.com) --------- +- [Abhishek Arora](https://econabhishek.github.io/) -> ## **Documentation** -> -> - For the Python library, visit - -> -> > [pypi.org/project/datagovindia/](https://pypi.org/project/datagovindia/) -> -> > [github.com/addypy/datagovindia](https://github.com/addypy/datagovindia) -> -> - For the R/CRAN package, visit - -> -> > [cran.r-project.org/web/packages/datagovindia](https://cran.r-project.org/web/packages/datagovindia) -> -> > [github.com/econabhishek/datagovindia](https://github.com/econabhishek/datagovindia) -> -> ### **Authors** : -> -> > - [Aditya Karan Chhabra](mailto:aditya0chhabra@gmail.com) -> -> > - [Abhishek Arora](mailto:abhishek.arora1996@gmail.com) -> -> ## **Meta-Data Updates** : -> -> > Last Updated: **September 18, 2023**, `08:32 IST` -> -> > Number of active APIs: **198428** -> \ No newline at end of file +- [Arijit Basu](https://arijitbasu.in/) \ No newline at end of file diff --git 
a/data/README.md b/data/README.md new file mode 100644 index 00000000..2016319b --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +# Archive + +Archived data files for compatibility with `datagovindia<=1.0.0`, will be removed later. \ No newline at end of file diff --git a/datagovindia/__init__.py b/datagovindia/__init__.py deleted file mode 100644 index b55582aa..00000000 --- a/datagovindia/__init__.py +++ /dev/null @@ -1,1290 +0,0 @@ -""" -Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/) - -datagovindia` is an API wrapper for the over 100,000 APIs available at Government of India’s -[Open Government Data OGD platform](https://data.gov.in/ogpl_apis) - -============================================================================== - LICENSE -============================================================================== - -MIT License - -Copyright (c) 2021 ADITYA KARAN CHHABRA and ABHISHEK ARORA - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -============================================================================== - -""" -# Libraries -import os -import re -import time -import difflib -import requests -import concurrent.futures -import numpy as np -import pandas as pd -from . import util - -# Functions -def extract_keys(json_map): - return np.ravel([list(j.keys()) for j in json_map]) - - -def intersect(list_of_arrays): - return list(set.intersection(*[set(x) for x in list_of_arrays])) - - -def wipe_resource_id(rsrc_id): - """Basic cleaning of resource-id string.""" - rsrc_id = "".join([c for c in str(rsrc_id) if c.isalnum()]).strip() - assert len(rsrc_id) == 32, "{} is not a valid Resource-ID".format(rsrc_id) - return rsrc_id - - -def scrub_resource_id(rsrc_id): - """Converts Resource-ID in the correct format - acceptable format : 8,4,4,4,12 - """ - rsrc_id = wipe_resource_id(rsrc_id) - rsrc_id = "-".join( - [rsrc_id[:8], rsrc_id[8:12], rsrc_id[12:16], rsrc_id[16:20], rsrc_id[20:32]] - ) - assert len(rsrc_id) == 36, "{} is not a valid Resource-ID".format(rsrc_id) - return rsrc_id - - -def calc_loop_steps(n, step_size, offset=0): - """Calculates a list of 2-tuples for looping - through `n` results with steps of size `step_size`. 
- """ - steps = (n - offset) // step_size - remainder = (n - offset) % step_size - moves = [ - (offset + (step_size * s), offset + (step_size * (s + 1))) for s in range(steps) - ] - if remainder > 0: - moves.append( - (offset + (step_size * steps), (step_size * steps) + remainder + offset) - ) - return moves - - -def regexmatch(s, pat): - """Search `title` and `desc` fields - - Args: - s : `title` | `desc` field - pat : search-query - - Removes whitespace and special characters in - both `s` and `pat` before matching regular - expressions. - - Returns: - `bool` --> True/False - """ - m = False - special_chars = "[^a-zA-Z0-9\.]" - if isinstance(s, str) == True: - pat = re.sub(special_chars, "", pat) - s = re.sub(special_chars, "", s) - if re.search(pat, s, flags=re.I): - m = True - return m - - -def quickdict(json_map): - """Quickly convert a mapping list to a dictionary.""" - D = {} - for j in json_map: - D.update(j) - return D - - -def filtertime(timestamp, interval): - """Check if timestamp is between timestamp_range - (time1,time2) - - Args: - timestamp --> UNIX timestamp value. - interval --> `Tuple` of 2 UNIX timestamp values. - - Returns: - `bool` --> True/False - """ - T0, T1 = interval - if (timestamp <= T1) and (timestamp >= T0): - return True - else: - return False - - -def search_json(attr_map, query, n, error_handle=True): - """Search imported JSON-object - - Args: - attr_map : List((Dictionaries)) - query : string to be matched - - Returns: - - """ - query = str(query) - if len(query) > 0: - results = list( - filter(lambda x: regexmatch(list(x.values())[0], query), attr_map) - ) - R = len(results) - if R == 0: - - if error_handle == True: - print("Found 0 results. 
Try searching with shorter queries") - else: - pass - return [] - else: - if error_handle == True: - print("{} of {} results for : `{}`".format(min(R, n), R, query)) - else: - pass - return results[:n] - else: - if error_handle == True: - print("Empty Query : Please enter a valid query") - else: - pass - return [] - - -def pp_results(results): - if len(results) > 0: - print( - "\n==================================================================================\n" - ) - for result in results: - idx = list(result.keys())[0] - val = list(result.values())[0] - print("Resource-ID:\t{}\n\n{}".format(idx, val), end="\n\n") - print( - "==================================================================================\n" - ) - - -def pp_time_results(results): - if len(results) > 0: - print( - "\n==================================================================================\n" - ) - for result in results: - idx = result["resourceid"] - title = result["title"] - ftime = util.format_time(result["timestamp"]) - print("Resource-ID:\t{}\n{}\n{}".format(idx, ftime, title), end="\n\n") - print( - "==================================================================================\n" - ) - - -def standard_fetch(url_seq, time_out=31): - session = requests.Session() - responses = [] - url_seq = np.unique(url_seq) - error_count = 0 - error_thresh = np.ceil(len(url_seq) * 0.3) - for url in url_seq: - if error_count < error_thresh: - try: - response = session.get(url, timeout=(time_out, time_out + 15)) - resp_dict = response.json() - if len(resp_dict["records"]) == 0: - error_count += 1 - continue - elif len(resp_dict["records"]) > 0: - responses.append(resp_dict) - except: # Enable better logging - continue - else: - break - session.close() - return responses - - -def advanced_fetch(url_seq, time_out=31): - def request_get(url, time_out=time_out): - return requests.get(url, timeout=(time_out, time_out + 15)) - - responses = [] - url_seq = np.unique(url_seq) - with 
concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(request_get, url) for url in url_seq] - executor.shutdown() - for future in futures: - try: - resp_dict = future.result().json() - responses.append(resp_dict) - except: - continue - return responses - - -# Classes - - -class URLTool: - def __init__(self, api_key, max_results=500): - self.base_url_str = "https://api.data.gov.in/resource/" - self.format_arg_str = "&format=json" - self.resource_id_str = "" - self.api_key = api_key.lower().strip() - self.api_key_str = "?api-key={}".format(self.api_key) - self.max_results = max_results - self.offset_str = "&offset=0" - self.limit_str = "&limit={}".format(self.max_results) - self.filter_str = "" - self.field_str = "" - self.sort_str = "" - - def reset_params(self, max_results=500): - self.base_url_str = "https://api.data.gov.in/resource/" - self.format_arg_str = "&format=json" - self.max_results = max_results - self.offset_str = "&offset=0" - self.limit_str = "&limit={}".format(self.max_results) - self.filter_str = "" - self.field_str = "" - self.sort_str = "" - - def reset_resource(self, resource_id="", max_results=500): - self.base_url_str = "https://api.data.gov.in/resource/" - self.format_arg_str = "&format=json" - self.resource_id_str = resource_id - self.max_results = max_results - self.offset_str = "&offset=0" - self.limit_str = "&limit={}".format(self.max_results) - self.filter_str = "" - self.field_str = "" - self.sort_str = "" - - def add_resource_id(self, resource_id): - self.resource_id_str = scrub_resource_id(resource_id) - - def add_api_key(self, api_key): - self.api_key = api_key.lower().strip() - self.api_key_str = "?api-key={}".format(self.api_key) - - def add_offset(self, o): - if isinstance(o, int): - self.offset_str = "&offset={}".format(int(o)) - else: - self.offset_str = "&offset=0" - - def add_limit(self, l): - if isinstance(l, int): - self.limit_str = "&limit={}".format(int(l)) - else: - self.limit_str = 
"&limit={}".format(self.max_results) - - def add_filters(self, filter_key, filter_val): - self.filter_str = "&filters[{}]={}".format(filter_key, filter_val) - - def add_fields(self, field_list): - if (isinstance(field_list, list) or (isinstance(field_list, np.array))) & ( - len(field_list) > 0 - ): - self.field_str = "&fields={}".format(",".join([f for f in field_list])) - else: - self.field_str = "" - - def add_sort_key(self, sortby_key, sort_order="asc"): - if isinstance(sortby_key, str): - if len(sortby_key) > 0: - self.sort_str = "&sort[{}]={}".format(sortby_key, sort_order) - else: - self.sort_str = "" - else: - self.sort_str = "" - - def build(self): - url_params = [ - self.base_url_str, - self.resource_id_str, - self.api_key_str, - self.format_arg_str, - self.offset_str, - self.limit_str, - self.filter_str, - self.field_str, - self.sort_str, - ] - url = "".join([u.strip() for u in url_params]) - return url - - def build_url_seq(self, offset=0, num_results=10000, max_results=1000): - self.max_results = max_results - loop_seq = calc_loop_steps(num_results, self.max_results, offset=offset) - url_seq = [] - for (o, l) in loop_seq: - self.add_offset(o) - self.add_limit(min(self.max_results, num_results - o)) - url = self.build() - url_seq.append(url) - self.reset_params() - return url_seq - - -class Resource: - def __init__(self, DataGovIndia): - self.parent = DataGovIndia - self.resource_fields = [] - self.resource_index_string = "" - self.rsrc_id = "" - self.count = np.inf - self.multi_thread = DataGovIndia.multi_thread - self.sortkey = "" - self.sort_order = "asc" - self.offset = 0 - self.filters = [] - self.fields = [] - - def set_resource_id(self, resource_index_string): - self.resource_index_string = wipe_resource_id(resource_index_string) - if self.resource_index_string not in self.parent.assets.resource_ids: - print("{} is not a valid Resource-ID".format(resource_index_string)) - else: - self.rsrc_id = scrub_resource_id(self.resource_index_string) - try: 
- self.count = util.fetch_nrecords( - self.resource_index_string, self.parent.api_key - ) - except KeyError as err: - self.count = np.inf - - def fetch_true_fields(self): - if len(self.resource_fields) == 0: - self.resource_fields = self.parent.idxfieldmap[self.resource_index_string] - self.correct_field_list_str = "\n".join([f for f in self.resource_fields]) - else: - pass - - def set_filters(self, filters): - if isinstance(filters, list): - if len(filters) > 0: - filter_field_codes = np.unique( - np.hstack([list(f.keys()) for f in filters]) - ) - self.fetch_true_fields() - potential_incorrect_fields = np.setdiff1d( - filter_field_codes, self.resource_fields, assume_unique=True - ) - if len(potential_incorrect_fields) > 0: - incorrect_field_list_str = "; ".join( - [p for p in potential_incorrect_fields] - ) - print( - "These field(s) are invalid for this resource - {}".format( - incorrect_field_list_str - ), - end="\n\n", - ) - print( - "Valid field(s) are - \n{}".format(self.correct_field_list_str) - ) - - else: - self.filters = filters - else: - self.filters = [] - - def set_fields(self, fields): - if len(fields) > 0: - self.fetch_true_fields() - potential_incorrect_fields = np.setdiff1d( - fields, self.resource_fields, assume_unique=True - ) - if len(potential_incorrect_fields) > 0: - incorrect_field_list_str = "; ".join( - [p for p in potential_incorrect_fields] - ) - - print( - "These field(s) are invalid for this resource - {}".format( - incorrect_field_list_str - ), - end="\n\n", - ) - print("Valid field(s) are - \n{}".format(self.correct_field_list_str)) - else: - self.fields = fields - - def set_sort_key(self, sort_key, sort_order): - if len(sort_key) > 0: - self.fetch_true_fields() - if sort_key not in self.resource_fields: - close_matches = difflib.get_close_matches( - sort_key, self.resource_fields, n=1, cutoff=0.75 - ) - print( - "This field is invalid for this resource - {}".format(sort_key), - end="\n\n", - ) - if len(close_matches) > 0: - 
print("Did you mean - {}?".format(close_matches[0])) - else: - print("Valid fields are - \n{}".format(self.correct_field_list_str)) - - else: - self.sortkey = sort_key - self.sort_order = sort_order - - def set_req_method(self, num=np.inf): - self.num_results = min(self.count, num) - if isinstance(self.num_results, int) == False: - self.num_results = 10 ** 10 - - def make_urls(self): - urltool = URLTool(self.parent.api_key) - urltool.add_resource_id(self.rsrc_id) - self.url_seq = [] - if len(self.filters) > 0: - for filter_dict in self.filters: - for filter_key in filter_dict: - filter_values = filter_dict[filter_key] - if isinstance(filter_values, list) == False: - filter_values = [filter_values] - else: - pass - for val in filter_values: - urltool.add_fields(self.fields) - urltool.add_filters(filter_key, val) - urls = urltool.build_url_seq( - offset=self.offset, - num_results=self.num_results, - max_results=self.parent.max_results_per_req, - ) - urltool.reset_params( - max_results=self.parent.max_results_per_req - ) - self.url_seq.extend(urls) - else: - if len(self.fields) > 0: - urltool.add_fields(self.fields) - pass - if len(self.sortkey) > 0: - if self.sort_order in ["asc", "desc"]: - urltool.add_sort_key(self.sortkey, self.sort_order) - else: - urltool.add_sort_key(self.sortkey, "asc") - pass - urls = urltool.build_url_seq( - offset=self.offset, - num_results=self.num_results, - max_results=self.parent.max_results_per_req, - ) - self.url_seq.extend(urls) - - def get_data(self): - self.data = pd.DataFrame() - if self.multi_thread == True: - responses = advanced_fetch(self.url_seq, time_out=19) - elif self.multi_thread == False: - responses = standard_fetch(self.url_seq, time_out=19) - for resp in responses: - try: - df = pd.DataFrame(resp["records"]) - self.data = self.data.append(df, ignore_index=True) - except KeyError as err: - continue - return self.data.drop_duplicates() - - -def test_server(n=3): - """ - Checks server status at datagovin. 
- Returns list of n working-apis if server is functional. - """ - - server_response = {} - working_api_url = "https://api.data.gov.in/lists?format=json¬filters[source]=visualize.data.gov.in&filters[active]=1&offset=0&sort[updated]=desc&limit={}".format( - n - ) - working_api_response = requests.get(working_api_url, timeout=30) - working_api_content = working_api_response.json() - - if working_api_content["status"] == "ok": - records = working_api_content["records"] - working_apis = [record.get("index_name", "") for record in records] - working_apis = [w for w in working_apis if len(w) > 0] - server_response["working_apis"] = working_apis - server_response["status"] = True - else: - server_response["working_apis"] = [] - server_response["status"] = False - return server_response - - -def validate_key(api_key, attempts=1): - """ - Runs a quick test on server - If server is UP, uses working-api-indices to fetch a dataframe - """ - api_validity = False - server_status = False - if len(api_key) == 56: - server_response = test_server(n=3) - server_status = server_response["status"] - test_api_idx_list = server_response["working_apis"] - if server_status == True: - for _ in range(attempts): - test_api_idx = np.random.choice(test_api_idx_list) - try: - urltool = URLTool( - api_key, max_results=10 - ) # Dependency on previous func - urltool.add_resource_id(test_api_idx) - test_api_url = urltool.build() - test_response = requests.get(test_api_url, timeout=30) - test_content = test_response.json() - records = test_content["records"] - if len(records) > 0: - api_validity = True - break - except: - api_validity = False - continue - else: - api_validity = False - server_status = True - pass - return {"APIKEY": api_validity, "SERVER": server_status} - - -class DataGovIndia: - """ - datagovindia - ============ - - A Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/) - - datagovindia` is an API wrapper for the over 80,000 APIs 
available at Government of India’s - [Open Government Data OGD platform](https://data.gov.in/ogpl_apis) - - Features - ======== - > DISCOVERY - Find the right API resource. - > INFORMATION - Retrieve information about an API resource. - > DATA - Download data in a convenient pandas DataFrame from the chosen API. - - For more documentation, visit - - https://github.com/addypy/datagovindia - - For the R/CRAN package, visit - - https://github.com/econabhishek/datagovindia - - https://github.com/cran/datagovindia - """ - - sample_key = "579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b" - - def __init__(self, api_key, enable_multithreading=False): - """ - Initialize `class` by providing a valid `api-key` from `data.gov.in/user` - - Args: - ===== - `api_key` : API-KEY (str) - - Optional: - ========= - `enable_multithreading`: (bool) - When `True`, enables faster downloads - with multi-threaded requests - - Note: - ===== - Initializing this class may take a few seconds, depending on the speed - of your internet connection. - - Initialization performs two key tasks: - 1) Tests server - - Tests data.gov.in server to check if APIs are functional. - 2) Validates the API-key provided - - Once validated, the API-key is stored and does not need to be entered again. - 3) Loads latest API meta-data. - - Downloads and loads data containing the latest details of available APIs. - - """ - self.api_key = "".join([a for a in api_key if a.isalnum()]).lower().strip() - print( - ".... Step (1/2) - Validating API-Key \r", - end="", - ) - validation_response = validate_key(self.api_key) - self.is_key_valid, self.is_server_up = ( - validation_response["APIKEY"], - validation_response["SERVER"], - ) - if self.is_server_up == True: - if self.is_key_valid == False: - print( - "This key - {} is INVALID! 
Please generate a valid API key here - https://data.gov.in/user".format( - api_key - ) - ) - elif self.is_key_valid == True: - if api_key == self.sample_key: - print( - "Step (1/2) : This API key is a sample-key with limited access. ", - end="\n", - ) - print( - "\tFor full access, generate a valid API key here - https://data.gov.in/user", - end="\n", - ) - self.is_sample_key = True - pass - else: - print( - "Step (1/2) : API key is VALID ", - end="\n", - ) - print( - "\tYou don't need to enter it again ", - end="\n", - ) - self.is_sample_key = False - pass - self.max_results_per_req = 1000 - self.assets = util.git_assets() - self.attributes = self.assets.attribute_dict - self.org_types = self.attributes["org_types"] - self.org_names = self.attributes["org_names"] - self.sources = self.attributes["sources"] - self.sectors = self.attributes["sectors"] - self.idxtitlemap = quickdict(self.assets.idx_title_map) - self.idxfieldmap = quickdict(self.assets.idx_field_map) - self.resource = None - self.error_handle = True - self.multi_thread = enable_multithreading - print( - "Step (2/2) : Latest API meta-data loaded! You may begin. \r", - end="\n", - ) - else: - print( - "The `data.gov.in` server appears to be down. Please try a little while later." - ) - - def enable_multithreading(self): - """Enables multi-thread API-requests for fast downloads of - large datasets. 
- """ - print("Multi-Threaded API requests enabled.") - self.multi_thread = True - - def disable_multithreading(self): - """Disables multi-thread API-requests.""" - print("Multi-Threaded API requests disabled.") - self.multi_thread = False - - def list_org_types(self): - """ - Returns list of organization-types as available on `data.gov.in` - """ - return self.org_types - - def list_org_names(self): - """ - Returns list of organizations-names as available on `data.gov.in` - """ - return self.org_names - - def list_sectors(self): - """ - Returns list of sectors listed on `data.gov.in` - """ - return self.sectors - - def list_sources(self): - """ - Returns list of data-sources listed on `data.gov.in` - """ - return self.sources - - def list_all_attributes(self): - """ - Returns a dictionary of lists - - - Sectors - - Sources - - Organization-Types - - Organization-Names - """ - return self.attributes - - def list_recently_updated(self, days=7, max_results=10, print_results=True): - """ - Returns list of resources updated in the last N days. - - Args: - days : Number of days. Defaults to 7. (int) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of resources with `titles` and `Resource-ID`s that were updated - in the last `N` days. 
- """ - TimeNow = int(time.time()) - TimePast = TimeNow - int(86400 * days) - TimeInterval = (TimePast, TimeNow) - - filtered_json = list( - filter( - lambda x: filtertime(list(x.values())[0], TimeInterval), - self.assets.idx_updationtime_map, - ) - ) - num_total_results = len(filtered_json) - timedict = quickdict(filtered_json) - - finalitems = [ - {k: v} for k, v in sorted(timedict.items(), key=lambda item: item[1]) - ][-max_results:] - timedict = quickdict(finalitems) - - resourceids = list(timedict.keys()) - timestamps = list(timedict.values()) - titles = [self.idxtitlemap[r] for r in resourceids] - - results = [ - { - "resourceid": resourceids[r], - "timestamp": timestamps[r], - "title": titles[r], - } - for r in range(len(resourceids)) - ] - results.reverse() - - print( - "{} of {} results that were updated in the last - `{}` days".format( - len(results), num_total_results, days - ) - ) - if print_results == True: - pp_time_results(results) - return results - - def list_recently_created(self, days=7, max_results=10, print_results=True): - """ - Returns list of resources created in the last N days. - - Args: - days : Number of days. Defaults to 7. (int) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of resources with `titles` and `Resource-ID`s that were created - in the last `N` days. 
- """ - TimeNow = int(time.time()) - TimePast = TimeNow - int(86400 * days) - TimeInterval = (TimePast, TimeNow) - - filtered_json = list( - filter( - lambda x: filtertime(list(x.values())[0], TimeInterval), - self.assets.idx_creationtime_map, - ) - ) - num_total_results = len(filtered_json) - timedict = quickdict(filtered_json) - - finalitems = [ - {k: v} for k, v in sorted(timedict.items(), key=lambda item: item[1]) - ][-max_results:] - timedict = quickdict(finalitems) - - resourceids = list(timedict.keys()) - timestamps = list(timedict.values()) - titles = [self.idxtitlemap[r] for r in resourceids] - - results = [ - { - "resourceid": resourceids[r], - "timestamp": timestamps[r], - "title": titles[r], - } - for r in range(len(resourceids)) - ] - results.reverse() - - print( - "{} of {} results that were created in the last - `{}` days".format( - len(results), num_total_results, days - ) - ) - if print_results == True: - pp_time_results(results) - return results - - def search_by_title(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` data-resource in `titles` of resources. - - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources with `titles` and `Resource-ID`s. - """ - results = search_json( - self.assets.idx_title_map, query, max_results, self.error_handle - ) - if print_results == True: - pp_results(results) - return results - - def search_by_description(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` dataset resource in the descriptions of datasets. - - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources with `descriptions` and `Resource-ID`s. 
- """ - results = search_json( - self.assets.idx_desc_map, query, max_results, self.error_handle - ) - if print_results == True: - pp_results(results) - return results - - def search_by_org_name(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` dataset resource using the name of the organization. - - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources with `org-names` and `Resource-ID`s. - """ - if query in self.org_names: - result_indices = np.ravel( - [ - list(item.values()) - for item in self.assets.org_idx_map - if list(item.keys())[0] == query - ] - ) - if self.error_handle == True: - print( - "{} of {} results for `organization` - `{}`".format( - min(len(result_indices), max_results), - len(result_indices), - query, - ) - ) - else: - pass - results = [{r: self.idxtitlemap[r]} for r in result_indices][:max_results] - if print_results == True: - pp_results(results) - return results - else: - try: - close_match = difflib.get_close_matches( - query, self.org_names, n=1, cutoff=0.75 - )[0] - if self.error_handle == True: - print( - "No organization named - `{}`. Did you mean - `{}`?".format( - query, close_match - ) - ) - else: - pass - except: - print( - "No organization named - `{}`".format(query), - "Try using `.list_org_names()` to see a list of available organizations", - sep="\n", - ) - return [] - - def search_by_org_type(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` dataset resource using an organization type. - - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources by `org-types` and `Resource-ID`s. 
- """ - if query in self.org_types: - result_indices = np.ravel( - [ - list(item.values()) - for item in self.assets.orgtype_idx_map - if list(item.keys())[0] == query - ] - ) - if self.error_handle == True: - print( - "{} of {} results for `organization type` - `{}`".format( - min(len(result_indices), max_results), - len(result_indices), - query, - ) - ) - else: - pass - results = [{r: self.idxtitlemap[r]} for r in result_indices][:max_results] - if print_results == True: - pp_results(results) - return results - else: - try: - close_match = difflib.get_close_matches( - query, self.org_types, n=1, cutoff=0.5 - )[0] - if self.error_handle == True: - print( - "No `organization type` named - `{}`. Did you mean - `{}`?".format( - query, close_match - ) - ) - else: - pass - except: - if self.error_handle == True: - print( - "No `organization type` named - `{}`".format(query), - "Try using `.list_org_types()` to see a list of available organization types", - sep="\n", - ) - else: - pass - return [] - - def search_by_sector(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` dataset resource using the `sector` attribute. - - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10 (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources by `sector` and their `Resource-IDs`. 
- - """ - if query in self.sectors: - result_indices = self.assets.sector_idx_map[query] - if self.error_handle == True: - print( - "{} of {} results for `sector` : `{}`".format( - min(len(result_indices), max_results), - len(result_indices), - query, - ) - ) - else: - pass - results = [{r: self.idxtitlemap[r]} for r in result_indices][:max_results] - if print_results == True: - pp_results(results) - return results - else: - try: - close_match = difflib.get_close_matches( - query, self.sectors, n=1, cutoff=0.7 - )[0] - if self.error_handle == True: - print( - "No `sector` : `{}`. Did you mean : `{}`?".format( - query, close_match - ) - ) - else: - pass - except: - if self.error_handle == True: - print( - "No `sector` : `{}`".format(query), - "Try using `.list_sectors()` to see a list of available sectors", - sep="\n", - ) - else: - pass - return [] - - def search_by_source(self, query, max_results=10, print_results=True): - """Search for a `data.gov.in` dataset resource using the `source` attribute. - Args: - query : the query string to search for. (str) - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results when enabled. (bool) - Returns: - List of matching resources by `source` and their `Resource-IDs`. - - """ - if query in self.sources: - result_indices = self.assets.source_idx_map[query] - if self.error_handle == True: - print( - "{} of {} results for `source` : `{}`".format( - min(len(result_indices), max_results), - len(result_indices), - query, - ) - ) - else: - pass - results = [{r: self.idxtitlemap[r]} for r in result_indices][:max_results] - if print_results == True: - pp_results(results) - return results - else: - try: - close_match = difflib.get_close_matches( - query, self.sources, n=1, cutoff=0.75 - )[0] - if self.error_handle == True: - print( - "No `source` : `{}`. 
Did you mean : `{}`?".format( - query, close_match - ) - ) - else: - pass - except: - if self.error_handle == True: - print( - "No `source` : `{}`".format(query), - "Try using `.list_sources()` to see a list of available sources", - sep="\n", - ) - else: - pass - - def search( - self, - title=None, - description=None, - org_name=None, - org_type=None, - sector=None, - source=None, - max_results=10, - print_results=True, - ): - """Search for resource using multiple filters. - - Args: - title : title query string. (str) - description : description query string. (str) - org_type : organization-type query string. (str) - org_name : organization-name query string. (str) - sector : sector query string. (str) - source : source query string. (str) - - - max_results : number of results to return. Defaults to 10. (int) - print_results : prints results if enabled. (bool) - - """ - print("Searching ... \r", end="") - self.error_handle = False - - if pd.isnull(org_type) == False: - org_type_matches = extract_keys( - self.search_by_org_type( - org_type, max_results=10 ** 10, print_results=False - ) - ) - else: - org_type_matches = self.assets.resource_ids - - if pd.isnull(org_name) == False: - org_name_matches = extract_keys( - self.search_by_org_name( - org_name, max_results=10 ** 10, print_results=False - ) - ) - else: - org_name_matches = self.assets.resource_ids - - if pd.isnull(sector) == False: - sector_matches = extract_keys( - self.search_by_sector(sector, max_results=10 ** 10, print_results=False) - ) - else: - sector_matches = self.assets.resource_ids - - if pd.isnull(source) == False: - source_matches = extract_keys( - self.search_by_source(source, max_results=10 ** 10, print_results=False) - ) - else: - source_matches = self.assets.resource_ids - - if pd.isnull(title) == False: - title_matches = extract_keys( - self.search_by_title(title, max_results=10 ** 10, print_results=False) - ) - else: - title_matches = self.assets.resource_ids - - if pd.isnull(description) == 
False: - description_matches = extract_keys( - self.search_by_description( - description, max_results=10 ** 10, print_results=False - ) - ) - else: - description_matches = self.assets.resource_ids - match_list_1 = [ - org_type_matches, - org_name_matches, - sector_matches, - source_matches, - title_matches, - description_matches, - ] - match_list_2 = [ - m if len(m) > 0 else self.assets.resource_ids for m in match_list_1 - ] - matches = intersect(match_list_2) - M = len(matches) - self.error_handle = True - - if M > 0: - print("{} of {} results".format(min(M, max_results), M)) - results = [{match: self.idxtitlemap[match]} for match in matches][ - :max_results - ] - if print_results == True: - pp_results(results) - return results - elif M == 0: - print("Found 0 results. Try searching with fewer parameters.") - - def get_resource_info(self, rsrc_id): - """Get all available meta-data for a `data.gov.in` data resource - Meta-Data includes - - - Resource-ID - - Title - - Description - - Total records available - - Date-Created - - Data-Updated - - Organization-Type - - Organization-Name - - Source - - Sector - - Fields - """ - rsrc_id = wipe_resource_id(rsrc_id) - results = self.assets.compile_all_information(rsrc_id, self.api_key) - - return results - - def get_resource_fields(self, rsrc_id): - """Get details of fields (variables) available for a `data.gov.in` data resource.""" - try: - rsrc_id = wipe_resource_id(rsrc_id) - if rsrc_id in self.assets.resource_ids: - fieldcodes = self.idxfieldmap[rsrc_id] - fieldlabels = [self.assets.field_label_map[f] for f in fieldcodes] - fielddtypes = [self.assets.field_dtype_map[f] for f in fieldcodes] - fieldinfo = [ - { - "field_code": fieldcodes[f], - "field_label": fieldlabels[f], - "field_type": fielddtypes[f], - } - for f in range(len(fieldcodes)) - ] - return pd.DataFrame(fieldinfo) - else: - print("{} is not a valid Resource-ID".format(rsrc_id)) - except AssertionError: - print("{} is not a valid 
Resource-ID".format(rsrc_id)) - - def get_last_resource(self): - """Returns the last collected data.""" - try: - return self.resource.data - except: - return None - - def get_data( - self, - resource_id, - start_from=0, - num_results="all", - filters=[], - fields=[], - sort_key="", - sort_order="asc", - ): - """Returns requested data in a dataframe format. - - Args: - resource_id : Resource-ID selected using search functionality (str) - start_from : Start Index. Defaults to 0. (int) - num_results : Total number of results desired. Defaults to 'all'. (int) - filters : Filters for request (List of dicts/ dict) - fields : List of fields (variables) to return - sort_key : Sort dataframe using an available field (variable) - sort_order : Ascending- 'asc' , Descending - 'desc' - - """ - self.resource = Resource(self) - self.resource.set_resource_id(resource_id) - if isinstance(filters, dict) or isinstance(filters, list): - if isinstance(filters, dict): - self.resource.set_filters([filters]) - elif isinstance(filters, list): - self.resource.set_filters(filters) - else: - self.resource.set_filters([]) - - self.resource.set_fields(fields) - self.resource.set_sort_key(sort_key, sort_order) - n = np.inf - if isinstance(num_results, int) == True: - if n > 0: - n = num_results - self.resource.set_req_method(num=n) - self.resource.make_urls() - data = self.resource.get_data() - if self.is_sample_key == True: - print( - "*Warning*\nYou are using a sample API-key. Some observations may be missing." - ) - else: - pass - return data diff --git a/datagovindia/util.py b/datagovindia/util.py deleted file mode 100644 index ef345f10..00000000 --- a/datagovindia/util.py +++ /dev/null @@ -1,252 +0,0 @@ -import os -import time -import numpy as np -import json -import gzip -import concurrent.futures -import requests - - -def fetch_assets_from_github(url_seq, time_out=60): - """ - Downloads reference data from github rep, updated periodically. 
- """ - - def fetch_asset(url, time_out=time_out): - response = requests.get(url, timeout=(time_out, time_out + 15)) - return json.loads(gzip.decompress(response.content)) - - datasets = [] - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(fetch_asset, url) for url in url_seq] - executor.shutdown() - return [future.result() for future in futures] - - -def format_time(ts): - """Converts UNIX timestamp to local timestring with human-readable format""" - return time.strftime("%d %B %Y, %I:%M %p", time.localtime(int(ts))) - - -def wipe_resource_id(rsrc_id): - """Basic cleaning of resource-id string.""" - rsrc_id = "".join([c for c in str(rsrc_id) if c.isalnum()]).strip() - assert len(rsrc_id) == 32, "{} is not a valid Resource-ID".format(rsrc_id) - return rsrc_id - - -def scrub_resource_id(rsrc_id): - """Converts Resource-ID in the correct format - acceptable format : 8,4,4,4,12 - """ - rsrc_id = wipe_resource_id(rsrc_id) - rsrc_id = "-".join( - [rsrc_id[:8], rsrc_id[8:12], rsrc_id[12:16], rsrc_id[16:20], rsrc_id[20:32]] - ) - assert len(rsrc_id) == 36, "{} is not a valid Resource-ID".format(rsrc_id) - return rsrc_id - - -def fetch_nrecords(resourceid, api_key): - """ - Fetch number of records in realtime - resourceid must be in in `scrub_resource_id` format - """ - try: - url = "https://api.data.gov.in/resource/{}?api-key={}&format=json&offset=0&limit=0".format( - scrub_resource_id(resourceid), api_key - ) - response = requests.get(url).json() - total = response.get("total", np.inf) - except: - total = np.inf - return total - - -class git_assets: - """ - Holds json serialized static files for `data.gov.in` resources, - indexed by resource-id(s). - """ - - def __init__(self): - # Change absolute path to relative path - # - print( - ".... Step (2/2) - Loading latest API reference data. This may take a few seconds......... 
\r", - end="", - ) - util_base_url = "https://raw.github.com/addypy/datagovindia/master/data/" - attribute_fp = util_base_url + "attributes.gz" - idx_map_fp = util_base_url + "idx_maps.gz" - field_label_fp = util_base_url + "fieldlabel_map.gz" - field_dtype_fp = util_base_url + "fielddtype_map.gz" - org_idx_fp = util_base_url + "orgidx_map.gz" - orgtype_idx_fp = util_base_url + "orgtypeidx_map.gz" - source_idx_fp = util_base_url + "sourceidx_map.gz" - sector_idx_fp = util_base_url + "sectoridx_map.gz" - - asset_url_seq = [ - attribute_fp, - idx_map_fp, - org_idx_fp, - orgtype_idx_fp, - source_idx_fp, - sector_idx_fp, - field_label_fp, - field_dtype_fp, - ] - print( - ".... Step (2/2) - Loading latest API meta-data. This may take a few seconds........ \r", - end="", - ) - asset_data = fetch_assets_from_github(asset_url_seq) - print( - ".... Step (2/2) - Loading latest API meta-data. This may take a few seconds...... \r", - end="", - ) - self.attribute_dict = asset_data[0] - self.idx_map = asset_data[1] - self.org_idx_map = asset_data[2] - self.orgtype_idx_map = asset_data[3] - self.source_idx_map = asset_data[4] - self.sector_idx_map = asset_data[5] - self.field_label_map = asset_data[6] - self.field_dtype_map = asset_data[7] - print( - ".... Step (2/2) - Loading latest API meta-data. This may take a few seconds..... 
\r", - end="", - ) - self.resource_ids = np.array([k.get("resourceid") for k in self.idx_map]) - self.idx_title_map = [ - {k.get("resourceid"): k.get("title", "")} for k in self.idx_map - ] - self.idx_desc_map = [ - {k.get("resourceid"): k.get("desc", "")} for k in self.idx_map - ] - self.idx_creationtime_map = [ - {k.get("resourceid"): k.get("date_created", np.nan)} for k in self.idx_map - ] - self.idx_updationtime_map = [ - {k.get("resourceid"): k.get("date_updated", np.nan)} for k in self.idx_map - ] - self.idx_field_map = [ - {k.get("resourceid"): k.get("fields", [])} for k in self.idx_map - ] - self.idx_source_map = [ - {k.get("resourceid"): k.get("source", "")} for k in self.idx_map - ] - self.idx_orgname_map = [ - {k.get("resourceid"): k.get("orgnames", [])} for k in self.idx_map - ] - self.idx_orgtype_map = [ - {k.get("resourceid"): k.get("orgtype", "")} for k in self.idx_map - ] - self.idx_sector_map = [ - {k.get("resourceid"): k.get("sectors", [])} for k in self.idx_map - ] - print( - ".... Step (2/2) - Loading latest API meta-data. This may take a few seconds... \r", - end="", - ) - - def compile_resource_fields(self, rsrc_id): - """ - Compile Field Information specific to resource. 
- """ - fields = list( - np.ravel( - [ - next(iter(d.values())) - for d in self.idx_field_map - if next(iter(d)) == rsrc_id - ] - ) - ) - labels = [self.field_label_map[f] for f in fields] - return {fields[f]: labels[f] for f in range(len(fields))} - - def compile_all_information(self, rsrc_id, api_key): - """ """ - if rsrc_id in self.resource_ids: - title = [ - list(item.values())[0] - for item in self.idx_title_map - if list(item.keys())[0] == rsrc_id - ][0] - desc = [ - list(item.values())[0] - for item in self.idx_desc_map - if list(item.keys())[0] == rsrc_id - ][0] - nrecords = fetch_nrecords(scrub_resource_id(rsrc_id), api_key) - created_on = format_time( - [ - list(item.values())[0] - for item in self.idx_creationtime_map - if list(item.keys())[0] == rsrc_id - ][0] - ) - updated_on = format_time( - [ - list(item.values())[0] - for item in self.idx_updationtime_map - if list(item.keys())[0] == rsrc_id - ][0] - ) - orgnames = list( - np.ravel( - [ - list(item.values()) - for item in self.idx_orgname_map - if list(item.keys())[0] == rsrc_id - ] - ) - ) - org_type = np.ravel( - [ - list(item.values()) - for item in self.idx_orgtype_map - if list(item.keys())[0] == rsrc_id - ][0] - )[0] - sector = np.ravel( - [ - list(item.values()) - for item in self.idx_sector_map - if list(item.keys())[0] == rsrc_id - ][0] - )[0] - source = np.ravel( - [ - list(item.values()) - for item in self.idx_source_map - if list(item.keys())[0] == rsrc_id - ][0] - )[0] - fields = list( - np.ravel( - [ - list(item.values()) - for item in self.idx_field_map - if list(item.keys())[0] == rsrc_id - ][0] - ) - ) - ApiInformation = { - "ResourceID": rsrc_id, - "Title": title, - "Description": desc, - "TotalRecords": nrecords, - "DateCreated": created_on, - "DateUdpated": updated_on, - "OrganizationNames": orgnames, - "OrganizationTypes": org_type, - "Sector": sector, - "Source": source, - "Fields": fields, - } - else: - print("{} is not a valid Resource-ID".format(rsrc_id)) - ApiInformation = 
{} - return ApiInformation diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..795786e2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,58 @@ +[build-system] +requires = ["setuptools>=61.2", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["datagovindia"] +package-dir = {"" = "src"} + +[project] +name = "datagovindia" +version = "1.0.0" +description = "Python API wrapper for Government of India Open Government Data (OGD) platform data.gov.in" +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Aditya Karan Chhabra", email = "aditya0chhabra@gmail.com" }, + { name = "Arijit Basu", email = "hi@arijitbasu.in" }, + { name = "Abhishek Arora" } +] +maintainers = [ + { name = "Aditya Karan Chhabra", email = "aditya0chhabra@gmail.com" }, + { name = "Arijit Basu", email = "hi@arijitbasu.in" } +] + +keywords = ["data-gov-in","indian-government-data","open-data-india","public-datasets","indian-datasets","government-api","india-statistics","ogd-platform"] + +classifiers = [ + "License :: OSI Approved :: MIT License", + "Topic :: Database", + "Intended Audience :: Education", + "Intended Audience :: Financial and Insurance Industry", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Natural Language :: English", + "Operating System :: OS Independent" +] +dependencies = [ + "click >=8.0.0", + "pandas >=1.2.0", + "python-dateutil >=2.8.0", + "requests >=2.25.0", + "numpy" +] + +[project.urls] +homepage = "https://pypi.org/project/datagovindia/" +repository = "https://github.com/addypy/datagovindia/" + 
+[project.entry-points.console_scripts] +datagovindia = "datagovindia.cli:cli" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..00960214 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +click>=8.0.0 +pandas>=1.2.0 +python-dateutil>=2.8.0 +requests>=2.25.0 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 82804a36..00000000 --- a/setup.py +++ /dev/null @@ -1,47 +0,0 @@ -from setuptools import setup, find_packages -from os import path - -directory = path.abspath(path.dirname(__file__)) - -with open(path.join(directory, 'README.md')) as f: - long_description = f.read() - -setup( - name="datagovindia", - version='0.4', - author='Aditya Karan Chhabra', - maintainer = 'Abhishek Arora', - author_email='aditya0chhabra@gmail.com', - packages=find_packages(include=['datagovindia', 'datagovindia.*'],exclude=['data','.*gz']), - scripts=['datagovindia/__init__.py','datagovindia/util.py'], - url='https://pypi.org/project/datagovindia/', - download_url = "https://github.com/addypy/datagovindia/", - license='MIT', - description='Python API wrapper for Government of India Open Government Data (OGD) platform data.gov.in', - long_description=long_description, - long_description_content_type='text/markdown', - classifiers=[ - "License :: OSI Approved :: MIT License", - "Topic :: Database", - "Intended Audience :: Science/Research", - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Natural Language :: English", - "Operating System :: OS Independent", - ], - 
keywords='india government opendata ogd ogdindia datagovin', - install_requires=[ - "requests", - "numpy", - "pandas" - ], -) \ No newline at end of file diff --git a/src/datagovindia/__init__.py b/src/datagovindia/__init__.py new file mode 100644 index 00000000..8c3e3837 --- /dev/null +++ b/src/datagovindia/__init__.py @@ -0,0 +1,617 @@ +"""Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/) +`datagovindia` is an API wrapper for APIs available at Government of India’s [Open Government Data OGD platform](https://data.gov.in/ogpl_apis)""" + +import os +import re +import sys +import time +import requests +import sqlite3 +import pandas as pd +import multiprocessing as mp +from typing import List, Dict +from urllib.parse import urlencode +from datetime import datetime +from dateutil.parser import parse as dateutil_parse + +def construct_url(params: dict) -> str: + """ + Construct URL with query parameters. + """ + return "https://api.data.gov.in/lists" + "?" + "&".join([f"{k}={v}" for k, v in params.items()]) + +def remove_special_chars(s: str) -> str: + """ + Remove special characters from string. 
+ """ + return re.sub("[^a-zA-Z0-9\.]", "", s).strip().lower() # type: ignore + +def regexmatch(text: str, query: str) -> bool: + """Search for 'query' within 'text' using regex""" + if text and query: # Check only non-null + text, query = remove_special_chars(text), remove_special_chars(query) + if re.search(query, text, flags=re.I): + return True + else: + return False + else: + return False + +def format_date(date_string: str): + """Parse date string with given format and return ISO 8601 formatted date string""" + try: + return dateutil_parse(date_string).isoformat(timespec="seconds") + except (ValueError, TypeError): + return None + +def current_datetime() -> str: + """Get the current datetime as a string in ISO 8601 format.""" + return datetime.now().isoformat(timespec="seconds") + +def is_nested(lst: list) -> bool: + """Check if list is nested""" + return any(isinstance(i, list) for i in lst) + +def format_seconds(seconds: int, padding: int = 8) -> str: + """Format seconds into a readable format""" + hours, remainder = divmod(seconds, 3600) + minutes, seconds = divmod(remainder, 60) + parts = [] + if hours: + parts.append(f"{int(hours)}h{'' if hours > 1 else ''}") + if minutes: + parts.append(f"{int(minutes)}m{'' if minutes > 1 else ''}") + if seconds or not parts: + parts.append(f"{int(seconds)}s{'' if seconds > 1 else ''}") + eta_string = ":".join(parts) + return eta_string.ljust(padding) + +def display_progress_bar(iteration, total, bar_length=50, char="=", fill_char=".", eta=None): + """ + Display a progress bar in the console. + + Parameters: + ------------ + iteration (int): Current iteration. + total (int): Total number of iterations. + bar_length (int): Length of the progress bar. + char (str): Character for completed progress. + fill_char (str): Character for remaining progress. + eta (str or None): Estimated time of arrival. 
+ """ + progress = iteration / total + arrow = char * int(round(progress * bar_length) - 1) + ">" + spaces = fill_char * (bar_length - len(arrow)) + + if eta: + sys.stdout.write(f"\rUpdated {iteration}/{total} resources: [{arrow + spaces}] - ETA: {eta}") + else: + sys.stdout.write(f"\rUpdated {iteration}/{total} resources: [{arrow + spaces}]") + sys.stdout.flush() + +def compile_record_info(record: dict) -> dict: + """Compile record info into a dictionary""" + return { + "resource_id": record.get("index_name"), + "title": record.get("title"), + "description": record.get("desc"), + "org_type": record.get("org_type"), + "fields": " | ".join([f.get("id", "") for f in record.get("field", [])]), + "orgs": " | ".join(record.get("org", [])), + "source": record.get("source"), + "sectors": " | ".join(record.get("sector", [])), + "date_created": format_date(record.get("created_date")), + "date_updated": format_date(record.get("updated_date")), + } + +def get_total_available_resources() -> int: + """ + Retrieve total number of available records. + """ + params = { + "format": "json", + "notfilters[source]": "visualize.data.gov.in", + "filters[active]": 1, + "offset": 0, + "limit": 0, + } + api_url = construct_url(params) + api_response = requests.get(api_url, timeout=(5, 10)) + return api_response.json()["total"] + +def _fetch_metadata(api_key: str, start: int = 0, end: int = 1000) -> list: + """ + Retrieve records using single thread. 
+ """ + params = { + "api-key": api_key, + "notfilters[source]": "visualize.data.gov.in", + "filters[active]": 1, + "sort[updated]": "desc", + "format": "json", + "offset": start, + "limit": end - start, + } + api_url = construct_url(params) + resp = requests.get(api_url, timeout=(5, 10)) + return [compile_record_info(record) for record in resp.json()["records"]] + +def fetch_metadata_records( + api_key:str, start: int = 0, end: int = 1000000, batch_size: int = 100, njobs: int = None +) -> list: + """Retrieve records using multiple threads.""" + with mp.Pool(njobs) as pool: + data = pool.starmap( + _fetch_metadata, [(api_key, i, min(end, i + batch_size)) for i in range(start, end, batch_size)] + ) + return [item for sublist in data for item in sublist] + +def get_api_info(url) -> dict: + """Get json data from url""" + response = requests.get(url, timeout=(5, 10)).json() + skip_keys = [ + "message", + "version", + "status", + "offset", + "limit", + "count", + "records", + "external_ws", + "external_ws_url", + "target_bucket", + ] + boolean_keys = ["visualizable", "active"] + for key in boolean_keys: + response[key] = True if response[key] == "1" else False + response = {k: v for k, v in response.items() if k not in skip_keys} + return response + +def save_dataframe(df, filepath): + """Save dataframe to filepath""" + def get_file_extension(filepath) -> str: + """Get file extension from filepath""" + return os.path.splitext(filepath)[-1] + file_extension = get_file_extension(filepath) + if file_extension == ".csv": + df.to_csv(filepath, index=False) + elif file_extension == ".json": + df.to_json(filepath, orient="records") + elif file_extension == ".xlsx": + df.to_excel(filepath, index=False) + else: + raise ValueError(f"Invalid file extension: {file_extension}") + +def get_api_records(url:str, **kwargs) -> list: + """Get json data from url""" + response = requests.get(url, **kwargs) + response.raise_for_status() + data = response.json() + if "records" not in data: + 
return [] + else: + return data["records"] + +def get_data_njobs(url_list: list, njobs=None) -> list: + """Get record data from url_list using njobs""" + if njobs is None: + njobs = mp.cpu_count() + with mp.Pool(njobs) as pool: + data = pool.map(get_api_records, url_list) + # Flatten list of lists + data = [item for sublist in data for item in sublist] + return data + +def build_url( + api_key: str, + resource_id: str, + offset: int = 0, + limit: int = 1000, + filters: Dict[str, str] = None, + fields: List[str] = None, + sort_by: str = None, + sort_order: str = "asc", +) -> str: + """Build url to fetch data from data.gov.in + + Parameters + ---------- + + api_key: (str) (required) + API key for data.gov.in + + resource_id: (str) (required) + Unique identifier of the resource. + + offset: (int) (optional) + Offset of the records to be fetched. Defaults to 0. + + limit: (int) (optional) + Number of records to be fetched. Defaults to 1000. + + filters: (dict) (optional) + Filters to be applied on the records, should be a list of dicts of the form {:}. + Defaults to {}. + + fields: (list) (optional) + Fields to be fetched. Defaults to []. + + sort_by: (str) (optional) + Field to sort results by. Defaults to None. + + sort_order: (str) (optional) + Order of sorting. Defaults to "asc". Only applicable if sort_by is not None. + + Returns + ------- + + url: str + Url to fetch data from data.gov.in + """ + params = {"api-key": api_key, "format": "json", "offset": offset, "limit": limit} + if fields: + params["fields"] = ",".join(fields) + if sort_by: + params[f"sort[{sort_by}]"] = sort_order or "asc" + if filters: + params.update({f"filters[{k}]": v for k, v in filters.items()}) + url = f"https://api.data.gov.in/resource/{resource_id}" + "?" 
+ urlencode(params, doseq=True, safe="\],\[") + return url + +class DataGovIndia: + """Python API-wrapper for Government of India’s [Open Government Data OGD platform](https://data.gov.in/)""" + def __init__(self, api_key:str=None, db_path:str=None) -> None: + """ + Initialize DataGovIndia object + + Parameters + ---------- + + api_key: str (optional) + API key for data.gov.in. If not provided, it will be read from the environment variable DATAGOVINDIA_API_KEY + If not found, it will raise an error. + + db_path: str (optional) + Required only for searching the database. + Path to the database file. If not provided, it will be read from the environment variable DATAGOVINDIA_DB_PATH + If not found, it will be set to ~/datagovindia.db + """ + if api_key: + self.api_key = api_key + else: + self.api_key = os.environ.get("DATAGOVINDIA_API_KEY") + + if db_path: + self.db_path = db_path + else: + self.db_path = os.environ.get( + "DATAGOVINDIA_DB_PATH", os.path.join(os.path.expanduser("~"), "datagovindia.db") + ) + self.connect(verify=False) + + def validate_api_key(self): + if not self.api_key: + raise ValueError("API key not found. Please set it as an environment variable `DATAGOVINDIA_API_KEY` or pass it as an argument while initializing the DataGovIndia object.") + + def connect(self, verify:bool=False): + """Connect to datagovindia.db sqlite database""" + self.conn = sqlite3.connect(self.db_path) + self.conn.create_function("regexmatch", 2, regexmatch) + self.cursor = self.conn.cursor() + # check whether the table exists + if verify: + self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='resources'") + if (self.cursor.fetchone() is None): + raise ValueError(f""" + Could not find tables in {self.db_path}. 
+ If this is the first time you are using this package, please run the following commands: + >>> from datagovindia import DataGovIndia + >>> data_gov = DataGovIndia() + >>> data_gov.update_metadata() + """ + ) + def close(self): + """Close connection to datagovindia.db sqlite database""" + self.conn.close() + + def search(self, query:str, search_fields: list=["title"], sort_by:str=None, ascending:bool=True) -> pd.DataFrame: + """Search for a query in the database. + + Parameters + ---------- + + query: str (required) + Search query to be searched in the database. + + search_fields: list (optional) + List of fields to search in. Defaults to ['title']. + Valid fields are: ['title', 'description', 'org_type', 'fields', 'orgs', 'source', 'sectors', 'date_created', 'date_updated'] + + sort_by: str (optional) + Field to sort results by. Defaults to None. + Valid fields are: ['title', 'description', 'org_type', 'fields', 'orgs', 'source', 'sectors', 'date_created', 'date_updated'] + + ascending: bool (optional) + Sort results in ascending order. Defaults to True. + Set to False to sort in descending order. Only applicable if sort_by is not None. + + Returns + ------- + df: pd.DataFrame + Dataframe of search results. 
+ + Examples + -------- + >>> from datagovindia import DataGovIndia + >>> datagovin = DataGovIndia() + + ### Simple search + >>> datagovin.search("covid") + + ### Search in specific fields + >>> datagovin.search("pollution", search_fields=['title', 'description']) + + ### Search and sort results by date_created in descending order + >>> datagovin.search("MGNREGA", search_fields=['title', 'description'], sort_by='date_created', ascending=False) + """ + self.connect(verify=True) + sql_query = self.gen_sql_query(query, search_fields, sort_by, ascending) + self.cursor.execute(sql_query) + keys = list(map(lambda x: x[0], self.cursor.description)) + values = self.cursor.fetchall() + records = list(map(lambda x: dict(zip(keys, x)), values)) + data = pd.DataFrame(records) + if len(data) > 0: + for col in ["fields", "orgs", "sectors"]: + data[col] = data[col].str.split(" | ", regex=False) + self.close() + return data + + def gen_sql_query(self, query : str, search_fields:list=["title"], sort_by:str=None, ascending:bool=True) -> str: + """Construct sql query for searching the database""" + searchable_attributes = [ + "title", + "description", + "org_type", + "fields", + "orgs", + "source", + "sectors", + "date_created", + "date_updated", + ] + sql_query = "SELECT * FROM resources WHERE " + for field in search_fields: + if field not in searchable_attributes: + # Raise error and print statement + raise ValueError( + f"Invalid search field {field}, valid fields are {searchable_attributes}" + ) + sql_query += f"regexmatch({field}, '{query}') OR " + sql_query = sql_query[:-4] # Remove the last " OR " + if sort_by: + assert sort_by in searchable_attributes, f"Invalid sort_by field {sort_by}, valid fields are {searchable_attributes}" + sql_query += f" ORDER BY {sort_by} {'ASC' if ascending else 'DESC'}" + return sql_query + + def get_resource_info(self, resource_id: str) -> dict: + """Fetches information about a resource. 
+ + Parameters + ---------- + resource_id: (str) (required) + Unique identifier of the resource. + + Returns + ------- + info: dict + Dictionary containing information about the resource. + """ + self.validate_api_key() + url = build_url( + api_key=self.api_key, + resource_id=resource_id, + filters={}, + fields=[], + sort_by="", + sort_order="asc", + offset=0, + limit=0, + ) + api_info = get_api_info(url) + return api_info + + def get_data( + self, + resource_id: str, + sort_by: str = None, + ascending: bool = True, + offset: int = 0, + batch_size: int = 2000, + njobs: int = None, + limit: int = None, + filters: Dict[str, str] = None, + fields: List = None, + ) -> pd.DataFrame: + """Returns requested data as a pandas dataframe. + + Parameters + ---------- + + resource_id: (str) (required) + Unique identifier of the resource. + + sort_by: (str) (optional) + Field to sort results by. Defaults to None. + + ascending: (bool) (optional) + Whether to sort results in ascending order. Defaults to True. + Only applicable if sort_by is not None. + + offset: (int) (optional) + Offset of the records to be fetched. Defaults to 0. + + batch_size: (int) (optional) + Number of records to be fetched in a single request. Defaults to 2000. + Increasing batch_size will increase the speed of data collection but will also increase the memory usage. + reduce batch_size if you are facing memory issues or timeout errors. + + limit: (int) (optional) + Number of records to be fetched. Defaults to None. + If None, it will be set to the total number of records available in the resource. + + filters: (dict) (optional) + Filters to be applied on the records, should be a dict of the form {:}. + + fields: (list) (optional) + Fields to be fetched. Defaults to []. Use `.get_resource_info` to get a list of all available fields for a resource. + + njobs: (int) (optional) + Number of threads to use for collecting data. Defaults to None. + None will use all available threads. 
+ + Returns + ------- + + df: pd.DataFrame + Dataframe with requested data. + """ + self.validate_api_key() + + if limit is None: + limit = self.get_resource_info(resource_id)["total"] + params_ = { + "resource_id": resource_id, + "sort_by": sort_by, + "sort_order": "asc" if ascending else "desc", + "filters": filters, + "fields": fields, + } + param_list = [ + {**params_, **{"offset": i, "limit": min(batch_size, limit - i)}} + for i in range(offset, limit, batch_size) + ] + + url_list = [build_url(api_key=self.api_key, **params) for params in param_list] + data = get_data_njobs(url_list, njobs=njobs) + return pd.DataFrame(data) + + def get_update_info(self): + """Fetches information about the last update of the database. + + Returns + ------- + info: dict + Dictionary containing information about the metadata in the database. + """ + self.connect(verify=True) + + self.cursor.execute(""" + SELECT last_updated, number_of_resources FROM metadata + """) + info_ = dict(zip(["last_updated", "number_of_resources"], self.cursor.fetchone())) + self.close() + return info_ + + def create_tables(self): + """Create tables in database if they don't exist. + Tables: resources, metadata + """ + + self.connect() + self.cursor.execute(""" + CREATE TABLE IF NOT EXISTS resources( + resource_id TEXT PRIMARY KEY, + title TEXT, + description TEXT, + org_type TEXT, + fields TEXT, + orgs TEXT, + source TEXT, + sectors TEXT, + date_created TEXT, + date_updated TEXT + ) + """) + self.cursor.execute(""" + CREATE TABLE IF NOT EXISTS metadata( + id INTEGER PRIMARY KEY, + last_updated TEXT, + number_of_resources INTEGER + ) + """) + self.conn.commit() + self.close() + + def _save_update_info(self , _num_updated:int): + """Save info about last update to database""" + self.connect() + self.cursor.execute(""" + DELETE FROM metadata + """) + sql = """ + INSERT INTO metadata + (last_updated, number_of_resources) + VALUES (?, ?) 
+ """ + last_refreshed = current_datetime() + self.cursor.execute(sql, (last_refreshed, _num_updated)) + self.conn.commit() + self.close() + + def upsert_records(self, table_name, data_dicts): + """Insert or replace records in database""" + self.connect(verify=True) + placeholders = ", ".join(["?"] * len(data_dicts[0])) + columns = ", ".join(data_dicts[0].keys()) + sql = f"""INSERT OR REPLACE INTO {table_name} + ({columns}) + VALUES ({placeholders})""" + self.cursor.executemany(sql, [tuple(data_dict.values()) for data_dict in data_dicts]) + self.conn.commit() + self.close() + + def sync_metadata(self, batch_size=2500, njobs=None): + """Updates metadata in datagovindia.db sqlite database + + Parameters + ---------- + + batch_size: int (optional) + Number of records to be fetched in a single request. Defaults to 2500. + + njobs: int (optional) + Number of threads to use for collecting data. Defaults to None. + None will use all available threads. + """ + self.validate_api_key() + + start_time = time.time() + + _num_available = get_total_available_resources() + _num_updated = 0 + + self.create_tables() + njobs = mp.cpu_count() if njobs is None else njobs + _batch = njobs * batch_size + + display_progress_bar(_num_updated, _num_available) + + for start in range(0, _num_available, _batch): + end = min(_num_available, start + _batch) + records = fetch_metadata_records( + self.api_key, start=start, end=end, batch_size=batch_size, njobs=njobs + ) + self.upsert_records("resources", records) + _num_updated += len(records) + + # Calculate ETA + elapsed_time = time.time() - start_time + avg_time = elapsed_time / _num_updated + _num_remaining = (_num_available - _num_updated) + eta = avg_time * _num_remaining + display_progress_bar(_num_updated, _num_available, eta=format_seconds(eta)) + + total_time = time.time() - start_time + print(f"\nFinished updating {_num_updated} records in {round(total_time)} seconds.") + self._save_update_info(_num_updated) \ No newline at end of file 
diff --git a/src/datagovindia/cli.py b/src/datagovindia/cli.py new file mode 100644 index 00000000..9df6b3e2 --- /dev/null +++ b/src/datagovindia/cli.py @@ -0,0 +1,332 @@ +import json +import click +from datagovindia import DataGovIndia, save_dataframe + +################## datagovindia cli ################## +@click.group() +def cli(): + """Command-line interface for the DataGovIndia API wrapper. + + `sync-metadata`: Fetches and syncs metadata from the OGD platform into an SQLite database. + + `get-update-info`: Fetches info for the last database update from the OGD platform. + + `search`: Searches the database based on a query and displays or saves the results. + + `get-resource-info`: Fetches info for a given resource ID from the OGD platform and displays it in the terminal. + + `get-data`: Fetches data for a given resource ID from the OGD platform and saves it to a specified file. + + """ + +################## sync-metadata ################## +@cli.command(name="sync-metadata") +@click.option( + "--api-key", + default=None, + type=str, + help=( + "API key to be used for fetching data. Optional if already set in environment variable" + " 'DATAGOVINDIA_API_KEY'." + ), +) +@click.option( + "--db-path", + default=None, + type=str, + help=( + "Path to the SQLite database. Optional if already set in environment variable" + " 'DATAGOVINDIA_DB_PATH'." + ), +) +@click.option( + "--batch-size", + default=5000, + type=int, + help="Number of records to be fetched in a single request. Increase this value to improve performance.", +) +@click.option( + "--njobs", + default=None, + type=int, + help="Number of threads to use for collecting data. (default is all cores)", +) +def sync_metadata_cli(api_key, db_path, batch_size, njobs): + """Fetches and syncs metadata from the OGD platform into the SQLite database. + + usage: `datagovindia sync-metadata [--api-key ] [--db-path ] [--batch-size ] [--njobs `] + + optional arguments: + + - `api-key`: `API key for the OGD platform. 
Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.` + + - `db-path`: `Path to the SQLite database. Defaults to 'DATAGOVINDIA_DB_PATH' environment variable or '~/datagovindia.db'.` + + - `batch-size`: `Number of records to fetch in one request. Increase for better performance but be wary of potential memory issues.` + + - `njobs`: `Number of parallel threads for fetching data. Defaults to using all available cores.` + + """ + click.echo("Syncing latest metadata from the OGD platform...") + datagovin = DataGovIndia(api_key=api_key, db_path=db_path) + datagovin.sync_metadata(batch_size=batch_size, njobs=njobs) + click.echo("Metadata updated successfully.") + +################## get-update-info ################## +@cli.command(name="get-update-info") +@click.option( + "--api-key", + default=None, + type=str, + help="API key for the OGD platform. Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.", +) +@click.option( + "--db-path", + default=None, + type=str, + help=( + "Path to the SQLite database. Optional if already set in environment variable" + " 'DATAGOVINDIA_DB_PATH'." + ), +) +def get_update_info_cli(api_key, db_path): + """Fetches info for the last metadata update from the OGD platform. + + usage: `datagovindia get-update-info [--api-key ] [--db-path ]` + + optional arguments: + + - `api-key`: `API key for the OGD platform. Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.` + + - `db-path`: `Path to the SQLite database. 
Defaults to 'DATAGOVINDIA_DB_PATH' environment variable or '~/datagovindia.db'.` + """ + datagovin = DataGovIndia(api_key=api_key, db_path=db_path) + click.echo("Fetching info for the last metadata update...") + info = datagovin.get_update_info() + click.echo(json.dumps(info, indent=4)) + +################## search ################## +@cli.command(name="search") +@click.argument("query", required=True) +@click.option( + "--api-key", + default=None, + type=str, + help=( + "API key to be used for fetching data. Optional if already set in environment variable" + " 'DATAGOVINDIA_API_KEY'." + ), +) +@click.option( + "--db-path", + default=None, + type=str, + help=( + "Path to the SQLite database. Optional if already set in environment variable" + " 'DATAGOVINDIA_DB_PATH'." + ), +) +@click.option( + "-o", "--output", default=None, type=str, help="Path to the output file if you want to save the results." +) +@click.option("--preview", is_flag=True, help="Display the results in the terminal.", default=False) +@click.option("-n", "--limit", default=5, type=int, show_default=True, help="Number of results to show.") +@click.option( + "-f", + "--fields", + default=["title"], + multiple=True, + type=str, + show_default=True, + help="List of fields to search in.", +) +@click.option("-s", "--sort-by", default=None, type=str, help="Field to sort results by.") +@click.option("--asc", is_flag=True, help="Sort results in ascending order.") +def search_cli(query, api_key, db_path, output, preview, limit, fields, sort_by, asc): + """Searches the metadata database based on a query and displays or saves the results. + + usage: `datagovindia search [--api-key ] [--db-path ] [--output ] [--show] [--limit ] [--fields ] [--sort-by ] [--asc]` + + positional arguments: + + - `query`: `Search term or phrase for querying the metadata database.` + + optional arguments: + + - `api-key`: `API key for the OGD platform. 
Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.` + + - `db-path`: `Path to the SQLite database. Defaults to 'DATAGOVINDIA_DB_PATH' environment variable or '~/datagovindia.db'.` + + - `output`: `File path to save the search results in CSV format. If not provided, results will be displayed.` + + - `preview`: `Display the search results in the terminal.` + + - `limit`: `Limit the number of displayed results.` + + - `fields`: `Database fields to search in. Multiple fields can be specified.` + + - `sort-by`: `Field to sort the search results by.` + + - `asc`: `Sort the results in ascending order. If not provided, defaults to ascending.` + """ + datagovin = DataGovIndia(api_key=api_key, db_path=db_path) + click.echo(f"Searching for '{query}' in fields {fields}...") + search_df = datagovin.search(query, search_fields=fields, sort_by=sort_by, ascending=asc) + + if output: + save_dataframe(search_df, output) + click.echo(f"{len(search_df)} results saved to '{output}'.") + + if preview: + click.echo(search_df.head(limit)) + click.echo(f"{len(search_df)} results found.") + + else: + click.echo(f"{len(search_df)} results found.") + +################## resource-info ################## +@cli.command(name="get-resource-info") +@click.argument("resource_id", required=True, type=str) +@click.option( + "--api-key", + default=None, + type=str, + help=( + "API key to be used for fetching data. Optional if already set in environment variable" + " 'DATAGOVINDIA_API_KEY'." + ), +) +@click.option( + "--db-path", + default=None, + type=str, + help=( + "Path to the SQLite database. Optional if already set in environment variable" + " 'DATAGOVINDIA_DB_PATH'." + ), +) +def get_resource_info_cli(resource_id, api_key, db_path): + """Fetches info for a given resource ID from the OGD platform and displays it in the terminal. 

+
+    usage: `datagovindia get-resource-info [--api-key ] [--db-path ]`
+
+    positional arguments:
+
+    - `resource_id`: `Unique identifier for the data resource to be fetched.`
+
+    optional arguments:
+
+    - `api-key`: `API key for the OGD platform. Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.`
+
+    - `db-path`: `Path to the SQLite database. Defaults to 'DATAGOVINDIA_DB_PATH' environment variable or '~/datagovindia.db'.`
+
+    """
+    datagovin = DataGovIndia(api_key=api_key, db_path=db_path)
+    click.echo(f"Fetching info for resource_id '{resource_id}'...")
+    info = datagovin.get_resource_info(resource_id)
+    click.echo(json.dumps(info, indent=4))
+
+################## get-data ##################
+
+@cli.command(name="get-data")
+@click.argument("resource_id", required=True, type=str)
+@click.option(
+    "--api-key",
+    default=None,
+    type=str,
+    help=(
+        "API key to be used for fetching data. Optional if already set in environment variable"
+        " 'DATAGOVINDIA_API_KEY'."
+    ),
+)
+@click.option(
+    "--db-path",
+    default=None,
+    type=str,
+    help=(
+        "Path to the SQLite database. Optional if already set in environment variable"
+        " 'DATAGOVINDIA_DB_PATH'."
+    ),
+)
+@click.option("-o", "--output", default=None, type=str, help="Path to the output file.", required=True)
+@click.option("--filters", default={}, type=dict, help="Filters to be applied on the records.")
+@click.option(
+    "--fields",
+    default=[],
+    multiple=True,
+    type=str,
+    help="Fields to be fetched. Keep empty to fetch all fields.",
+)
+@click.option("--offset", default=0, type=int, help="Offset of the records to be fetched.")
+@click.option(
+    "--limit", default=None, type=int, help="Number of records to be fetched. (default is all records)"
+)
+@click.option(
+    "--batch-size",
+    default=2000,
+    type=int,
+    help="Number of records to be fetched in a single request. 
Increase this value to improve performance.",
+)
+@click.option("--sort-by", default=None, type=str, help="Field to sort results by.")
+@click.option("--asc", is_flag=True, help="Sort results in ascending order (default is descending).")
+@click.option(
+    "--njobs",
+    default=None,
+    type=int,
+    help="Number of threads to use for collecting data. (default is all cores)",
+)
+def get_data_cli(
+    resource_id, api_key, db_path, output, sort_by, asc, offset, batch_size, limit, filters, fields, njobs
+):
+    """Fetches data for a given resource ID from the OGD platform and saves it to a specified file.
+
+    usage: `datagovindia get-data [--api-key ] [--db-path ] [--output ] [--filters ] [--fields ] [--offset ] [--limit ] [--batch-size ] [--sort-by ] [--asc] [--njobs ]`
+
+    positional arguments:
+
+    - `resource_id`: `Unique identifier for the data resource to be fetched.`
+
+    optional arguments:
+
+    - `api-key`: `API key for the OGD platform. Uses 'DATAGOVINDIA_API_KEY' environment variable if not provided.`
+
+    - `db-path`: `Path to the SQLite database. Defaults to 'DATAGOVINDIA_DB_PATH' environment variable or '~/datagovindia.db'.`
+
+    - `output`: `File path to save the fetched data in CSV format.`
+
+    - `filters`: `Filter the fetched records based on specified criteria in the format field:value.`
+
+    - `fields`: `Specific fields to fetch from the data resource. Multiple fields can be specified.`
+
+    - `offset`: `Starting offset for fetching the records.`
+
+    - `limit`: `Maximum number of records to fetch. If not specified, fetches all available records.`
+
+    - `batch-size`: `Number of records to fetch in one request. Adjust based on performance and memory considerations.`
+
+    - `sort-by`: `Field to sort the fetched records by.`
+
+    - `asc`: `Sort the fetched records in ascending order. If not provided, defaults to descending.`
+
+    - `njobs`: `Number of parallel threads for fetching data. 
Defaults to using all available cores.` + + """ + datagovin = DataGovIndia(api_key=api_key, db_path=db_path) + click.echo(f"Fetching data for resource_id '{resource_id}'...") + data = datagovin.get_data( + resource_id, + sort_by=sort_by, + ascending=asc, + offset=offset, + batch_size=batch_size, + limit=limit, + filters=filters, + fields=fields, + njobs=njobs, + ) + save_dataframe(data, output) + click.echo(f"{len(data)} records fetched and saved to '{output}'.") + +if __name__ == "__main__": + cli() + \ No newline at end of file