Add files via upload

Sohammhatre10 · Aug 15, 2024 · b383720 · b383720
1 parent e4a309f
commit b383720
Show file tree

Hide file tree

Showing 10 changed files with 318 additions and 1 deletion.
diff --git a/DataHorseUI/app.py b/DataHorseUI/app.py
diff --git a/DataHorseUI/png.png b/DataHorseUI/png.png
diff --git a/DataHorseUI/png1.png b/DataHorseUI/png1.png
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Dedolphins Tec
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1 +1,49 @@
-# DataHorse
+# 🎉 Do data science and data analysis in plain English 🌟
+
+<p align="center">
+  <a href="https://datahorse.ai/">
+    <img src="image.png">
+  </a>
+  <h1 align="center">
+    <a href="https://datahorse.ai/">DataHorse</a>
+  </h1>
+</p>
+
+<p align="center">
+  <a href="https://www.linkedin.com/showcase/data-horse"> 
+    <img
+      src="https://img.shields.io/badge/LINKEDIN-blue.svg?style=for-the-badge&logo=read-the-docs&logoColor=white&labelColor=000000&logoWidth=20">
+  </a>
+</p>
+
+🚀 **DataHorse** is an open-source tool and Python library that simplifies data science for everyone. It lets users interact with data in plain English 📝, without needing technical skills or having to watch tutorials 🎥 to learn how to use it. With DataHorse, you can create graphs 📊, modify data 🛠️, and even build machine learning models 🤖 to get answers or make predictions. It’s designed to help businesses and individuals 💼, regardless of technical background, quickly understand their data and make smart, data-driven decisions with ease. ✨
+
+## Quick Installation
+
+```bash
+pip install datahorse
+```
+
+## Examples
+
+Setup and usage examples are available in this **[Google Colab notebook](https://colab.research.google.com/drive/1brAw2Qj_VnlTbzcfjm5sCOaQbNl7Disd?usp=sharing)**.
+
+```python
+import datahorse
+
+df = datahorse.read('https://raw.githubusercontent.com/plotly/datasets/master/iris-data.csv')
+
+# Data transformation
+df = df.chat('convert species names to numeric codes')
+df = df.chat('add a new column "petal_area" calculated as petal_length * petal_width')
+
+# Queries
+average_measurements = df.chat('what are the average sepal length and petal width for each species?')
+species_count = df.chat('how many samples are there for each species?')
+largest_petal_length = df.chat('which species has the largest petal length?')
+
+# Plotting
+df.chat('scatter plot of sepal length vs petal length by species')
+df.chat('histogram of petal width')
+df.chat('box plot of sepal length distribution by species')
+```
diff --git a/datahorse/__init__.py b/datahorse/__init__.py
@@ -0,0 +1,141 @@
+import os
+
+import pandas as pd
+from groq import Groq
+
+verbose = False 
+mutable = False 
+
+model = 'llama3-8b-8192'
+groq_api_key = "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"
+client = Groq(api_key=groq_api_key)
+
+template = '''
+Write a Python function `process({arg_name})` which takes the following input value:
+
+{arg_name} = {arg}
+
+This is the function's purpose: {goal}
+'''
+
+_ask_cache = {}
+
+class Ask:
+    def __init__(self, *, verbose=None, mutable=None):
+        self.verbose = verbose if verbose is not None else globals()['verbose']
+        self.mutable = mutable if mutable is not None else globals()['mutable']
+
+    @staticmethod
+    def _fill_template(template, **kw):
+        import re
+        from textwrap import dedent
+        result = dedent(template.lstrip('\n').rstrip())
+        for k, v in kw.items():
+            result = result.replace(f'{{{k}}}', v)
+        m = re.match(r'\{[a-zA-Z0-9_]*\}', result)
+        if m:
+            raise Exception(f'Expected variable: {m.group(0)}')
+        return result
+
+    def _get_prompt(self, goal, arg):
+        if isinstance(arg, pd.DataFrame) or isinstance(arg, pd.Series):
+            import io
+            buf = io.StringIO()
+            arg.info(buf=buf)
+            arg_summary = buf.getvalue()
+        else:
+            arg_summary = repr(arg)
+        arg_name = 'df' if isinstance(arg, pd.DataFrame) else 'index' if isinstance(arg, pd.Index) else 'data'
+
+        return self._fill_template(template, arg_name=arg_name, arg=arg_summary.strip(), goal=goal.strip())
+
+    def _run_prompt(self, prompt):
+        cache = _ask_cache
+        completion = cache.get(prompt) or client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "Write the function in a Python code block with all necessary imports and no example usage.",
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ],
+            model=model,
+        )
+        cache[prompt] = completion
+        return completion.choices[0].message.content
+
+    def _extract_code_block(self, text):
+        import re
+        pattern = r'```(\s*(py|python)\s*\n)?([\s\S]*?)```'
+        m = re.search(pattern, text)
+        if not m:
+            return text
+        return m.group(3)
+
+    def _eval(self, source, *args):
+        _args_ = args
+        scope = dict(_args_=args)
+        exec(self._fill_template('''
+            {source}
+            _result_ = process(*_args_)
+        ''', source=source), scope)
+        return scope['_result_']
+
+    def _code(self, goal, arg):
+        prompt = self._get_prompt(goal, arg)
+        result = self._run_prompt(prompt)
+        if self.verbose:
+            print()
+            print(result)
+        return self._extract_code_block(result)
+
+    def code(self, *args):
+        print(self._code(*args))
+
+    def prompt(self, *args):
+        print(self._get_prompt(*args))
+
+    def __call__(self, goal, *args):
+        source = self._code(goal, *args)
+        return self._eval(source, *args)
+
+
+@pd.api.extensions.register_dataframe_accessor('chat')
+@pd.api.extensions.register_series_accessor('chat')
+@pd.api.extensions.register_index_accessor('chat')
+class AskAccessor:
+    def __init__(self, pandas_obj):
+        self._validate(pandas_obj)
+        self._obj = pandas_obj
+
+    @staticmethod
+    def _validate(obj):
+        pass
+
+    def _ask(self, **kw):
+        return Ask(**kw)
+
+    def _data(self, **kw):
+        if not mutable and not kw.get('mutable') and hasattr(self._obj, 'copy'):
+            return self._obj.copy() 
+        return self._obj
+
+    def __call__(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask(goal, data, *args)
+
+    def code(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask.code(goal, data, *args)
+
+    def prompt(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask.prompt(goal, data, *args)
+
+
+def read(file_path):
+    return pd.read_csv(file_path)
diff --git a/datahorse/__version__.py b/datahorse/__version__.py
@@ -0,0 +1,7 @@
+__title__ = "DataHorse"
+__description__ = "Do data science and data analysis in plain english"
+__version__ = "0.0.0"
+__author__ = "DeDolphins"
+__author_email__ = "[email protected]"
+__license__ = "creativeml-openrail-m"
+__url__ = "https://datahorse.ai"
diff --git a/image.png b/image.png
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,14 @@
+[tool:pytest]
+addopts =
+    -vv
+testpaths = tests
+
+[aliases]
+test = pytest
+
+[metadata]
+description-file = README.md
+license_file = LICENSE
+
+[wheel]
+universal = 1
diff --git a/setup.py b/setup.py
@@ -0,0 +1,86 @@
+import os
+import re
+import setuptools
+from typing import AnyStr, List
+
+
+def read_file(path_parts: List[str], encoding: str = "utf-8") -> AnyStr:
+    """
+    Read a file from the project directory
+    Args:
+        path_parts: List of parts of the path to the file
+        encoding: Encoding of the file
+    Returns:
+        Content of the file as a string
+    """
+    with open(
+        os.path.join(os.path.dirname(__file__), *path_parts), "r", encoding=encoding
+    ) as file:
+        return file.read()
+
+
+version_contents = read_file(["datahorse", "__version__.py"])
+about = {}
+
+for key in [
+    "__author__",
+    "__author_email__",
+    "__description__",
+    "__license__",
+    "__title__",
+    "__url__",
+    "__version__",
+]:
+    key_match = re.search(f"{key} = ['\"]([^'\"]+)['\"]", version_contents)
+    if key_match:
+        about[key] = key_match.group(1)
+
+readme = read_file(["README.md"])
+
+# Include only pandas and groq
+required_packages = [
+    "pandas",
+    "groq",
+]
+
+extras = {
+    "test": [
+        "black",
+        "coverage",
+        "flake8",
+        "mock",
+        "pydocstyle",
+        "pytest",
+        "pytest-cov",
+        "tox",
+    ]
+}
+
+setuptools.setup(
+    name=about.get("__title__", "unknown"),
+    version=about.get("__version__", "0.0.0"),
+    description=about.get("__description__", "unknown"),
+    long_description=readme,
+    author=about.get("__author__", "unknown"),
+    author_email=about.get("__author_email__", "unknown"),
+    url=about.get("__url__", "unknown"),
+    packages=setuptools.find_packages("datahorse"),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+    ],
+    license=about.get("__license__", "unknown"),
+    package_dir={"": "datahorse"},
+    package_data={"": ["*.txt"]},
+    extras_require=extras,
+    install_requires=required_packages,
+    long_description_content_type="text/markdown",
+    python_requires=">=3.7.0",
+)