generated from habedi/template-python-library
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Improved the README file * Updated the version * Separated the production and development dependencies in the `pyproject.toml` file
- Loading branch information
Showing
8 changed files
with
178 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,41 @@ | ||
# Mongo Analyser | ||
|
||
<img src="https://github.com/habedi/mongo-analyser/blob/main/assets/logo_v1.png" align="right" width="25%"/> | ||
<img src="assets/logo_v1.png" align="right" width="25%"/> | ||
|
||
[![Tests](https://github.com/habedi/mongo-analyser/actions/workflows/tests.yml/badge.svg)](https://github.com/habedi/mongo-analyser/actions/workflows/tests.yml) | ||
[![codecov](https://codecov.io/gh/habedi/mongo-analyser/graph/badge.svg?token=HOTAZKP3V7)](https://codecov.io/gh/habedi/mongo-analyser) | ||
[![CodeFactor](https://www.codefactor.io/repository/github/habedi/mongo-analyser/badge)](https://www.codefactor.io/repository/github/habedi/mongo-analyser) | ||
[![PyPI version](https://badge.fury.io/py/mongo-analyser.svg)](https://badge.fury.io/py/mongo-analyser) | ||
[![License](https://img.shields.io/github/license/habedi/mongo-analyser)](https://github.com/habedi/mongo-analyser/blob/main/LICENSE) | ||
[![Python version](https://img.shields.io/badge/Python-%3E=3.9-blue)](https://github.com/habedi/mongo-analyser) | ||
[![Pip downloads](https://img.shields.io/pypi/dm/mongo-analyser.svg)](https://pypi.org/project/mongo-analyser) | ||
[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md) | ||
[![CodeFactor](https://www.codefactor.io/repository/github/habedi/mongo-analyser/badge)](https://www.codefactor.io/repository/github/habedi/mongo-analyser) | ||
[![Python version](https://img.shields.io/badge/Python-%3E=3.9-blue)](https://github.com/habedi/mongo-analyser) | ||
[![Documentation](https://img.shields.io/badge/docs-latest-blue)](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md) | ||
[![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/habedi/mongo-analyser/blob/main/LICENSE) | ||
|
||
Mongo Analyser is a tool that helps you to analyse the structure of a MongoDB collection. It can help you extract the | ||
schema of a collection, find the data types of the fields, and also extract sample data from the collection based on the | ||
schema. | ||
Mongo Analyser is a tool that helps you analyse and infer a MongoDB collection's structure. It can help you extract the | ||
schema of a collection, find the data types of the fields and also export data from the collection based on the | ||
schema it found. | ||
|
||
Mongo Analyser can be used as a command-line tool or as a Python library. | ||
|
||
## Installation | ||
|
||
You can install Mongo Analyser using pip (mainly to use it as a library): | ||
You can install Mongo Analyser using `pip` (mainly to use it as a library): | ||
|
||
```bash | ||
pip install mongo-analyser | ||
``` | ||
|
||
You can also install it as a standalone executable using pipx: | ||
You can also install it as a standalone executable using `pipx` or `uv`: | ||
|
||
```bash | ||
pipx install mongo-analyser | ||
``` | ||
|
||
After installing it using pipx, you can run it from the command line: | ||
|
||
```bash | ||
mongo-analyser <command> [<args>] | ||
uv tool install mongo-analyser | ||
``` | ||
|
||
After the installation is complete, you can use the `mongo-analyser` command in your terminal. | ||
|
||
See the [documentation](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md) for more information and | ||
examples. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/bin/bash

# Make sure `pipx install mongo-analyser` or `uv tool install mongo-analyser`
# is run before executing this script :)
# Append the per-user bin directory to PATH so the installed command is found.
export PATH="$PATH:$HOME/.local/bin"

# Each constant holds "<program> <subcommand>" in a single string; callers
# expand it unquoted so it splits into separate words.
ANALYSE_SCHEMA="mongo_analyser analyse_schema"
EXTRACT_DATA="mongo_analyser extract_data"
|
||
# Print the command-line usage summary and the default value of each
# positional argument to stdout.
print_usage() {
  printf 'Usage: %s [DB_NAME] [COLLECTION_NAME] [SAMPLE_SIZE]\n' "$0"
  printf '%s\n' \
    'Default values:' \
    '  DB_NAME: admin' \
    '  COLLECTION_NAME: system.version' \
    '  SAMPLE_SIZE: 100'
}
|
||
# Show the usage text and stop when help is requested.
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
  print_usage
  exit 0
fi

# Positional arguments, each falling back to its default when omitted or empty.
DB_NAME=${1:-admin}
COLLECTION_NAME=${2:-system.version}
SAMPLE_SIZE=${3:-100}
TIME_ZONE="CET"
DATA_DIR="data"

# All output files go under <DATA_DIR>/<DB_NAME>; stop early if that
# directory cannot be created, since nothing could be written there.
if ! mkdir -p "$DATA_DIR/$DB_NAME"; then
  echo "Error: cannot create directory $DATA_DIR/$DB_NAME" >&2
  exit 1
fi

# Output file names are derived from the collection name.
PREFIX="$COLLECTION_NAME"
SCHEMA_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_schema.json"
METADATA_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_metadata.csv"
OUTPUT_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_data.json.gz"

# Echo the effective configuration before doing any work.
echo "Database: $DB_NAME"
echo "Collection: $COLLECTION_NAME"
echo "Sample Size: $SAMPLE_SIZE"
echo "Timezone: $TIME_ZONE"
echo "Schema File: $SCHEMA_FILE"
echo "Metadata File: $METADATA_FILE"
echo "Output File: $OUTPUT_FILE"
|
||
# Run the schema-analysis command for the selected collection.
# Globals (read): ANALYSE_SCHEMA, DB_NAME, COLLECTION_NAME, SAMPLE_SIZE,
#                 SCHEMA_FILE, METADATA_FILE
# Outputs:        a progress line on stdout; the `time` report on stderr
# Returns:        the exit status of the schema-analysis command
extract_schema() {
  echo "Extracting schema..."
  # ANALYSE_SCHEMA is expanded unquoted on purpose so that a value made of a
  # program name plus a subcommand splits into separate words.
  # shellcheck disable=SC2086
  time $ANALYSE_SCHEMA \
    --database "$DB_NAME" \
    --collection "$COLLECTION_NAME" \
    --sample_size "$SAMPLE_SIZE" \
    --schema_file "$SCHEMA_FILE" \
    --metadata_file "$METADATA_FILE"
}
|
||
# Run the data-extraction command for the selected collection.
# Globals (read): EXTRACT_DATA, DB_NAME, COLLECTION_NAME, TIME_ZONE,
#                 OUTPUT_FILE, SCHEMA_FILE
# Arguments:      $1 - optional value passed to --limit (default: 100000)
# Outputs:        a progress line on stdout; the `time` report on stderr
# Returns:        the exit status of the data-extraction command
extract_data() {
  local limit=${1:-100000}
  echo "Extracting data..."
  # EXTRACT_DATA is expanded unquoted on purpose so that a value made of a
  # program name plus a subcommand splits into separate words.
  # shellcheck disable=SC2086
  time $EXTRACT_DATA \
    --database "$DB_NAME" \
    --collection "$COLLECTION_NAME" \
    --timezone "$TIME_ZONE" \
    --output_file "$OUTPUT_FILE" \
    --schema_file "$SCHEMA_FILE" \
    --limit "$limit"
}
|
||
# Extract schema and data.
# extract_data is handed the same schema file that extract_schema is told to
# write, so do not attempt the export when schema extraction fails.
if ! extract_schema; then
  echo "Error: schema extraction failed; skipping data extraction" >&2
  exit 1
fi
extract_data
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from mongo_analyser import SchemaAnalyser

# MongoDB connection details
mongo_uri = SchemaAnalyser.build_mongo_uri('localhost', 27017)
collection = SchemaAnalyser.connect_mongo(
    mongo_uri, 'admin', 'system.version'
)

# Infer the schema and statistics
schema, stats = SchemaAnalyser.infer_schema_and_stats(collection, sample_size=1000)

# Print the inferred schema
print(f"Schema: {schema}")

# Print the statistics
print(f"Stats: {stats}")

# Print the schema and statistics as a table
headers = ["Field", "Type", "Cardinality", "Missing (%)"]
rows = []
for field, details in schema.items():
    field_stats = stats.get(field, {})
    cardinality = field_stats.get("cardinality", "N/A")
    missing_percentage = field_stats.get("missing_percentage", "N/A")
    # Round only real numbers: when the statistic is absent the "N/A"
    # placeholder is used, and round() on a string raises TypeError.
    if isinstance(missing_percentage, (int, float)):
        missing_percentage = round(missing_percentage, 1)
    rows.append([field, details['type'], cardinality, missing_percentage])

SchemaAnalyser.draw_unicode_table(headers, rows)

# Save the schema to a JSON file
SchemaAnalyser.save_schema_to_json(schema, 'schema.json')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import json

import pytz

from mongo_analyser import DataExtractor

# Build MongoDB URI
mongo_uri = DataExtractor.build_mongo_uri("localhost", 27017)

# Load the schema from the JSON file
with open("schema.json", 'r', encoding='utf-8') as f:
    schema = json.load(f)

# Extract data from the MongoDB collection
# NOTE(review): this reads "admin.system.users", while the companion schema
# example writes schema.json from "admin.system.version" -- confirm that the
# schema file loaded above matches this collection.
DataExtractor.extract_data(
    mongo_uri, "admin", "system.users", schema,
    "output.json.gz", pytz.timezone("UTC"), batch_size=1000, limit=100
)

# Output: Data should be extracted and saved to the output.json.gz file on success
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,51 @@ | ||
# Documentation | ||
# Mongo Analyser Documentation | ||
|
||
Put the documentation for the library in this folder. Typically, you would write the documentation in Markdown or | ||
reStructuredText files. | ||
Mongo Analyser consists of a command-line tool and a Python library that helps you analyse the structure of a MongoDB | ||
collection and extract data from it. | ||
|
||
## Example | ||
## Command-Line Interface | ||
|
||
Example of including an image is shown below: | ||
Mongo Analyser can be used as a command-line tool. The general interface for the command-line tool is: | ||
|
||
<p style="text-align: center;"> | ||
<img width="512" height="512" src="../assets/logo_v1.png" alt="" style="width: 75%; height: 75%;"> | ||
</p> | ||
```bash | ||
Usage: mongo_analyser <command> [<args>] | ||
|
||
Commands: | ||
analyse_schema Analyse the structure of a MongoDB collection and infer schema and statistics from a sample of documents | ||
extract_data Extract data from MongoDB and store it to a compressed JSON file | ||
``` | ||
|
||
Run the following command to get help on a specific command: | ||
|
||
```bash | ||
mongo_analyser <command> --help # or -h | ||
``` | ||
|
||
## Python Interface | ||
|
||
See the [examples](examples/) directory for example code snippets on how to use Mongo Analyser as a Python library. | ||
|
||
## Supported Field Types | ||
|
||
Mongo Analyser supports the following field types: | ||
|
||
| Field Type | Python Equivalent | MongoDB Equivalent | Comments | | ||
|--------------------|-------------------|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| | ||
| `int32` | `int` | `int32` | | | ||
| `int64` | `int` | `int64` | | | ||
| `double` | `float` | `double` | | | ||
| `str` | `str` | `string` | | | ||
| `bool` | `bool` | `bool` | | | ||
| `datetime` | `datetime` | `date` | | | ||
| `dict` | `dict` | `document` | Equivalent to a BSON document (which is a MongoDB object or subdocument) | | ||
| `empty` | `None` or `[]` | `null` or `array` | The empty type is used when a field has no value (`null`) or is an empty array. | | ||
| `array<type>` | `list` | `array` | The type of the elements in the array is inferred from the sample of documents and can be any of the supported types except for `array<type>` | | ||
| `binary<UUID>` | `bytes` | `binary (subtype 4)` | The UUID is stored as a 16-byte binary value | | ||
| `binary<MD5>` | `bytes` | `binary (subtype 5)` | The MD5 hash is stored as a 16-byte binary value | | ||
| `binary<ObjectId>` | `bytes` | `objectId` | The ObjectId is stored as a 12-byte binary value | | ||
## Notes | ||
|
||
- At the moment, Mongo Analyser does not support arrays whose elements have different types. Such arrays will be treated as | ||
arrays with a single element type. For example, if an array contains both integers and strings, it will be treated | ||
as either an array of integers or an array of strings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
[tool.poetry] | ||
name = "mongo-analyser" | ||
version = "0.1.2" | ||
version = "0.1.4" | ||
description = "A minimalistic tool for analysing and extracting the schema of a MongoDB collection." | ||
authors = ["Hassan Abedi <[email protected]>"] | ||
maintainers = ["Hassan Abedi <[email protected]>"] | ||
license = "MIT" | ||
readme = "README.md" | ||
include = ["README.md", "LICENSE", 'logo.png'] | ||
include = ["README.md", "LICENSE"] | ||
packages = [{ include = "mongo_analyser", from = "." }] | ||
repository = "https://github.com/habedi/mongo-analyser" | ||
documentation = "https://github.com/habedi/mongo-analyser/blob/main/docs/index.md" | ||
|
@@ -19,14 +19,15 @@ keywords = ["mongodb", "python", "nosql", "command-line tool", "json"] | |
|
||
[tool.poetry.dependencies] | ||
python = "^3.9" | ||
pylint = "^3.0.3" | ||
pymongo = "^4.10.1" | ||
pytz = "^2024.2" | ||
|
||
[tool.poetry.dev-dependencies] | ||
pytest = "^8.0.1" | ||
pytest-cov = "^5.0.0" | ||
pytest-mock = "^3.14.0" | ||
mypy = "^1.11.1" | ||
poetry-dynamic-versioning = "^1.4.0" | ||
pymongo = "^4.10.1" | ||
pytz = "^2024.2" | ||
ruff = "^0.7.0" | ||
|
||
[tool.poetry.scripts] | ||
|
@@ -51,7 +52,7 @@ enable = true | |
vcs = "git" | ||
versioning = "semver" # Semantic Versioning | ||
|
||
# Ruff configuration (Edit as needed) | ||
# Ruff configuration | ||
[tool.ruff] | ||
exclude = [ | ||
".bzr", | ||
|