Apply minor improvements

* Improved the README file * Updated the version * Separated the production and development dependencies in the `pyproject.toml` file
habedi · Oct 21, 2024 · 8b5b783 · 8b5b783
1 parent 13fc5a3
commit 8b5b783
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -1,41 +1,41 @@
 # Mongo Analyser
 
-<img src="https://github.com/habedi/mongo-analyser/blob/main/assets/logo_v1.png" align="right" width="25%"/>
+<img src="assets/logo_v1.png" align="right" width="25%"/>
 
 [![Tests](https://github.com/habedi/mongo-analyser/actions/workflows/tests.yml/badge.svg)](https://github.com/habedi/mongo-analyser/actions/workflows/tests.yml)
 [![codecov](https://codecov.io/gh/habedi/mongo-analyser/graph/badge.svg?token=HOTAZKP3V7)](https://codecov.io/gh/habedi/mongo-analyser)
+[![CodeFactor](https://www.codefactor.io/repository/github/habedi/mongo-analyser/badge)](https://www.codefactor.io/repository/github/habedi/mongo-analyser)
 [![PyPI version](https://badge.fury.io/py/mongo-analyser.svg)](https://badge.fury.io/py/mongo-analyser)
-[![License](https://img.shields.io/github/license/habedi/mongo-analyser)](https://github.com/habedi/mongo-analyser/blob/main/LICENSE)
-[![Python version](https://img.shields.io/badge/Python-%3E=3.9-blue)](https://github.com/habedi/mongo-analyser)
 [![Pip downloads](https://img.shields.io/pypi/dm/mongo-analyser.svg)](https://pypi.org/project/mongo-analyser)
-[![Documentation](https://img.shields.io/badge/docs-latest-green)](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md)
-[![CodeFactor](https://www.codefactor.io/repository/github/habedi/mongo-analyser/badge)](https://www.codefactor.io/repository/github/habedi/mongo-analyser)
+[![Python version](https://img.shields.io/badge/Python-%3E=3.9-blue)](https://github.com/habedi/mongo-analyser)
+[![Documentation](https://img.shields.io/badge/docs-latest-blue)](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md)
+[![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/habedi/mongo-analyser/blob/main/LICENSE)
 
-Mongo Analyser is a tool that helps you to analyse the structure of a MongoDB collection. It can help you extract the
-schema of a collection, find the data types of the fields, and also extract sample data from the collection based on the
-schema.
+Mongo Analyser is a tool that helps you analyse and infer a MongoDB collection's structure. It can help you extract the
+schema of a collection, find the data types of the fields and also export data from the collection based on the
+schema it found.
 
 Mongo Analyser can be used as a command-line tool or as a Python library.
 
 ## Installation
 
-You can install Mongo Analyser using pip (mainly to use it as a library):
+You can install Mongo Analyser using `pip` (mainly to use it as a library):
 
 ```bash
 pip install mongo-analyser
 ```
 
-You can also install it as a standalone executable using pipx:
+You can also install it as a standalone executable using `pipx` or `uv`:
 
 ```bash
 pipx install mongo-analyser
 ```
 
-After installing it using pipx, you can run it from the command line:
-
 ```bash
-mongo-analyser <command> [<args>]
+uv tool install mongo-analyser
 ```
 
+After the installation is complete, you can use the `mongo-analyser` command in your terminal.
+
 See the [documentation](https://github.com/habedi/mongo-analyser/blob/main/docs/index.md) for more information and
 examples.
diff --git a/docs/examples/analyse_and_extract.sh b/docs/examples/analyse_and_extract.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Example: run `analyse_schema` and then `extract_data` for one MongoDB collection.
+# Make sure `pipx install mongo-analyser` or `uv tool install mongo-analyser` is run before executing this script :)
+export PATH=$PATH:~/.local/bin
+
+ANALYSE_SCHEMA="mongo_analyser analyse_schema"
+EXTRACT_DATA="mongo_analyser extract_data"
+
+# Print the usage line and the default argument values
+print_usage() {
+    echo "Usage: $0 [DB_NAME] [COLLECTION_NAME] [SAMPLE_SIZE]"
+    echo "Default values:"
+    echo "  DB_NAME: admin"
+    echo "  COLLECTION_NAME: system.version"
+    echo "  SAMPLE_SIZE: 100"
+}
+
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    print_usage
+    exit 0
+fi
+
+DB_NAME=${1:-admin}
+COLLECTION_NAME=${2:-system.version}
+SAMPLE_SIZE=${3:-100}
+TIME_ZONE="CET"
+DATA_DIR="data"
+
+mkdir -p "$DATA_DIR/$DB_NAME"
+
+PREFIX="$COLLECTION_NAME"
+SCHEMA_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_schema.json"
+METADATA_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_metadata.csv"
+OUTPUT_FILE="$DATA_DIR/$DB_NAME/${PREFIX}_data.json.gz"
+
+echo "Database: $DB_NAME"
+echo "Collection: $COLLECTION_NAME"
+echo "Sample Size: $SAMPLE_SIZE"
+echo "Timezone: $TIME_ZONE"
+echo "Schema File: $SCHEMA_FILE"
+echo "Metadata File: $METADATA_FILE"
+echo "Output File: $OUTPUT_FILE"
+
+# Run analyse_schema (timed); $ANALYSE_SCHEMA is left unquoted so it splits into command and subcommand
+extract_schema() {
+    echo "Extracting schema..."
+    time $ANALYSE_SCHEMA --database "$DB_NAME" --collection "$COLLECTION_NAME" --sample_size "$SAMPLE_SIZE" \
+    --schema_file "$SCHEMA_FILE" --metadata_file "$METADATA_FILE"
+}
+
+# Run extract_data (timed); $EXTRACT_DATA is left unquoted so it splits into command and subcommand
+extract_data() {
+    echo "Extracting data..."
+    time $EXTRACT_DATA --database "$DB_NAME" --collection "$COLLECTION_NAME" --timezone "$TIME_ZONE" \
+    --output_file "$OUTPUT_FILE" --schema_file "$SCHEMA_FILE" --limit 100000
+}
+
+# Extract schema and data; the data step is skipped when schema extraction exits
+# with an error, because both steps are given the same $SCHEMA_FILE path
+extract_schema && extract_data
diff --git a/docs/examples/analyse_and_extract_schema.py b/docs/examples/analyse_and_extract_schema.py
diff --git a/docs/examples/analyse_schema.py b/docs/examples/analyse_schema.py
@@ -0,0 +1,33 @@
+from mongo_analyser import SchemaAnalyser
+
+# MongoDB connection details
+mongo_uri = SchemaAnalyser.build_mongo_uri('localhost', 27017)
+collection = SchemaAnalyser.connect_mongo(
+    mongo_uri, 'admin', 'system.version'
+)
+
+# Infer the schema and statistics
+schema, stats = SchemaAnalyser.infer_schema_and_stats(collection, sample_size=1000)
+
+# Print the inferred schema
+print(f"Schema: {schema}")
+
+# Print the inferred statistics
+print(f"Stats: {stats}")
+
+# Print the schema and statistics as a table
+headers = ["Field", "Type", "Cardinality", "Missing (%)"]
+rows = []
+for field, details in schema.items():
+    field_stats = stats.get(field, {})
+    cardinality = field_stats.get("cardinality", "N/A")
+    missing_percentage = field_stats.get("missing_percentage", "N/A")
+    # Only round real numbers; rounding the "N/A" placeholder would raise TypeError
+    if isinstance(missing_percentage, (int, float)):
+        missing_percentage = round(missing_percentage, 1)
+    rows.append([field, details['type'], cardinality, missing_percentage])
+
+SchemaAnalyser.draw_unicode_table(headers, rows)
+
+# Save the schema to a JSON file
+SchemaAnalyser.save_schema_to_json(schema, 'schema.json')
diff --git a/docs/examples/extract_data.py b/docs/examples/extract_data.py
@@ -0,0 +1,22 @@
+import json
+
+import pytz
+
+from mongo_analyser import DataExtractor
+
+# Build MongoDB URI
+mongo_uri = DataExtractor.build_mongo_uri("localhost", 27017)
+
+# Load the schema from the JSON file
+with open("schema.json", 'r', encoding='utf-8') as f:
+    schema = json.load(f)
+
+# Extract data from the MongoDB collection
+# NOTE(review): the analyse_schema.py example saves schema.json for `system.version`;
+# confirm that reading `system.users` with that schema is intended.
+DataExtractor.extract_data(
+    mongo_uri, "admin", "system.users", schema,
+    "output.json.gz", pytz.timezone("UTC"), batch_size=1000, limit=100
+)
+
+# Output: Data should be extracted and saved to the output.json.gz file on success
diff --git a/docs/index.md b/docs/index.md
@@ -1,12 +1,51 @@
-# Documentation
+# Mongo Analyser Documentation
 
-Put the documentation for the library in this folder. Typically, you would write the documentation in Markdown or
-reStructuredText files.
+Mongo Analyser consists of a command-line tool and a Python library that help you analyse the structure of a MongoDB
+collection and extract data from it.
 
-## Example
+## Command-Line Interface
 
-Example of including an image is shown below:
+Mongo Analyser can be used as a command-line tool. The general interface for the command-line tool is:
 
-<p style="text-align: center;">
-  <img width="512" height="512" src="../assets/logo_v1.png" alt="" style="width: 75%; height: 75%;">
-</p>
+```bash
+Usage: mongo_analyser <command> [<args>]
+
+Commands:
+  analyse_schema  Analyse the structure of a MongoDB collection and infer schema and statistics from a sample of documents
+  extract_data    Extract data from MongoDB and store it to a compressed JSON file
+```
+
+Run the following command to get help on a specific command:
+
+```bash
+mongo_analyser <command> --help # or -h
+```
+
+## Python Interface
+
+See the [examples](examples/) directory for example code snippets on how to use Mongo Analyser as a Python library.
+
+## Supported Field Types
+
+Mongo Analyser supports the following field types:
+
+| Field Type         | Python Equivalent | MongoDB Equivalent   | Comments                                                                                                                                      |
+|--------------------|-------------------|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|
+| `int32`            | `int`             | `int32`              |                                                                                                                                               |
+| `int64`            | `int`             | `int64`              |                                                                                                                                               |
+| `double`           | `float`           | `double`             |                                                                                                                                               |
+| `str`              | `str`             | `string`             |                                                                                                                                               |
+| `bool`             | `bool`            | `bool`               |                                                                                                                                               |
+| `datetime`         | `datetime`        | `date`               |                                                                                                                                               |
+| `dict`             | `dict`            | `document`           | Equivalent to a BSON document (which is a MongoDB object or subdocument)                                                                      |
+| `empty`            | `None` or `[]`    | `null` or `array`    | The empty type is used when a field has no value (`null`) or is an empty array.                                                               |
+| `array<type>`      | `list`            | `array`              | The type of the elements in the array is inferred from the sample of documents and can be any of the supported types except for `array<type>` |
+| `binary<UUID>`     | `bytes`           | `binary (subtype 4)` | The UUID is stored as a 16-byte binary value                                                                                                  |
+| `binary<MD5>`      | `bytes`           | `binary (subtype 5)` | The MD5 hash is stored as a 16-byte binary value                                                                                              |
+| `binary<ObjectId>` | `bytes`           | `objectId`           | The ObjectId is stored as a 12-byte binary value                                                                                              |
+                                                                                             
+## Notes
+
+- At the moment, Mongo Analyser does not support arrays whose elements have different types. Such an array will be
+  treated as an array with a single element type. For example, if an array contains both integers and strings, it will
+  be treated as either an array of integers or an array of strings.
diff --git a/mongo_analyser/cli.py b/mongo_analyser/cli.py
@@ -70,8 +70,9 @@ def print_custom_help(parser, subparsers):
 
 
 def main():
-    analyse_schema_description = 'Analyse the structure of a MongoDB collection and infer schema and statistics from a sample of documents'
-    extract_data_description = 'Extract data from MongoDB and store it to a compressed JSON file'
+    analyse_schema_description = ('Analyse the structure of a MongoDB collection and infer schema '
+                                  'and statistics from a sample of documents')
+    extract_data_description = 'Extract data from MongoDB and store it in a compressed JSON file'
 
     # Create the top-level parser
     parser = argparse.ArgumentParser(

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,12 +1,12 @@
 [tool.poetry]
 name = "mongo-analyser"
-version = "0.1.2"
+version = "0.1.4"
 description = "A minimalistic tool for analysing and extracting the schema of a MongoDB collection."
 authors = ["Hassan Abedi <[email protected]>"]
 maintainers = ["Hassan Abedi <[email protected]>"]
 license = "MIT"
 readme = "README.md"
-include = ["README.md", "LICENSE", 'logo.png']
+include = ["README.md", "LICENSE"]
 packages = [{ include = "mongo_analyser", from = "." }]
 repository = "https://github.com/habedi/mongo-analyser"
 documentation = "https://github.com/habedi/mongo-analyser/blob/main/docs/index.md"
@@ -19,14 +19,15 @@ keywords = ["mongodb", "python", "nosql", "command-line tool", "json"]
 
 [tool.poetry.dependencies]
 python = "^3.9"
-pylint = "^3.0.3"
+pymongo = "^4.10.1"
+pytz = "^2024.2"
+
+[tool.poetry.dev-dependencies]
 pytest = "^8.0.1"
 pytest-cov = "^5.0.0"
 pytest-mock = "^3.14.0"
 mypy = "^1.11.1"
 poetry-dynamic-versioning = "^1.4.0"
-pymongo = "^4.10.1"
-pytz = "^2024.2"
 ruff = "^0.7.0"
 
 [tool.poetry.scripts]
@@ -51,7 +52,7 @@ enable = true
 vcs = "git"
 versioning = "semver"  # Semantic Versioning
 
-# Ruff configuration (Edit as needed)
+# Ruff configuration
 [tool.ruff]
 exclude = [
     ".bzr",