diff --git a/DataHorseUI/app.py b/DataHorseUI/app.py
new file mode 100644
index 0000000..e69de29
diff --git a/DataHorseUI/png.png b/DataHorseUI/png.png
new file mode 100644
index 0000000..0f04105
Binary files /dev/null and b/DataHorseUI/png.png differ
diff --git a/DataHorseUI/png1.png b/DataHorseUI/png1.png
new file mode 100644
index 0000000..1b7ea3d
Binary files /dev/null and b/DataHorseUI/png1.png differ
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6df264c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Dedolphins Tec
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index c1573a0..3ab05e2 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,49 @@
-# DataHorse
\ No newline at end of file
+# 🎉 Do data science and data analysis in plain English 🌟
+
+

+ + + +

+ DataHorse +

+

+ +

+ + + +

+
+🚀 **DataHorse** is an open-source tool and Python library that simplifies data science for everyone. It lets users interact with data in plain English 📝, without needing technical skills or watching tutorials 🎥 to learn how to use it. With DataHorse, you can create graphs 📊, modify data 🛠️, and even create smart systems called machine learning models 🤖 to get answers or make predictions. It’s designed to help businesses and individuals 💼, regardless of their background, quickly understand their data and make smart, data-driven decisions, all with ease. ✨
+
+## Quick Installation
+
+```bash
+pip install datahorse
+```
+
+## Examples
+
+Setup and usage examples are available in this **[Google Colab notebook](https://colab.research.google.com/drive/1brAw2Qj_VnlTbzcfjm5sCOaQbNl7Disd?usp=sharing)**.
+
+```python
+import datahorse
+
+df = datahorse.read('https://raw.githubusercontent.com/plotly/datasets/master/iris-data.csv')
+
+# Data transformation
+df = df.chat('convert species names to numeric codes')
+df = df.chat('add a new column "petal_area" calculated as petal_length * petal_width')
+
+# Queries
+average_measurements = df.chat('what are the average sepal length and petal width for each species?')
+species_count = df.chat('how many samples are there for each species?')
+largest_petal_length = df.chat('which species has the largest petal length?')
+
+# Plotting
+df.chat('scatter plot of sepal length vs petal length by species')
+df.chat('histogram of petal width')
+df.chat('box plot of sepal length distribution by species')
+```
\ No newline at end of file
diff --git a/datahorse/__init__.py b/datahorse/__init__.py
new file mode 100644
index 0000000..f3cfd81
--- /dev/null
+++ b/datahorse/__init__.py
@@ -0,0 +1,200 @@
+"""DataHorse: query and transform pandas objects in plain English via an LLM."""
+import os
+
+import pandas as pd
+from groq import Groq
+
+verbose = False
+mutable = False
+
+model = 'llama3-8b-8192'
+# Never commit credentials: the key is read from the GROQ_API_KEY environment
+# variable, or can be assigned to `datahorse.groq_api_key` before the first call.
+groq_api_key = os.environ.get('GROQ_API_KEY')
+# Created lazily by _get_client() so that importing the package needs no key.
+client = None
+
+
+def _get_client():
+    """Return the shared Groq client, creating it on first use."""
+    global client
+    if client is None:
+        client = Groq(api_key=groq_api_key)
+    return client
+
+
+template = '''
+Write a Python function `process({arg_name})` which takes the following input value:
+
+{arg_name} = {arg}
+
+This is the function's purpose: {goal}
+'''
+
+_ask_cache = {}
+
+
+class Ask:
+    """Turn a plain-English goal into generated Python code and run it."""
+
+    def __init__(self, *, verbose=None, mutable=None):
+        """Store per-instance options; None falls back to the module default."""
+        self.verbose = verbose if verbose is not None else globals()['verbose']
+        self.mutable = mutable if mutable is not None else globals()['mutable']
+
+    @staticmethod
+    def _fill_template(template, **kw):
+        """Dedent ``template`` and substitute its ``{name}`` placeholders from ``kw``.
+
+        Substitution happens in a single pass, so brace-delimited text inside a
+        substituted value (e.g. generated source code) is never re-expanded.
+
+        Raises:
+            ValueError: if the template names a placeholder missing from ``kw``.
+        """
+        import re
+        from textwrap import dedent
+
+        def substitute(match):
+            name = match.group(1)
+            if name not in kw:
+                raise ValueError(f'Expected variable: {match.group(0)}')
+            return kw[name]
+
+        result = dedent(template.lstrip('\n').rstrip())
+        return re.sub(r'\{([a-zA-Z0-9_]+)\}', substitute, result)
+
+    def _get_prompt(self, goal, arg):
+        """Build the prompt describing ``arg`` and the requested ``goal``."""
+        if isinstance(arg, (pd.DataFrame, pd.Series)):
+            import io
+            buf = io.StringIO()
+            # Send the .info() summary rather than the data itself.
+            arg.info(buf=buf)
+            arg_summary = buf.getvalue()
+        else:
+            arg_summary = repr(arg)
+        if isinstance(arg, pd.DataFrame):
+            arg_name = 'df'
+        elif isinstance(arg, pd.Index):
+            arg_name = 'index'
+        else:
+            arg_name = 'data'
+
+        return self._fill_template(template, arg_name=arg_name, arg=arg_summary.strip(), goal=goal.strip())
+
+    def _run_prompt(self, prompt):
+        """Return the model reply for ``prompt``, reusing a cached completion."""
+        completion = _ask_cache.get(prompt)
+        if completion is None:
+            completion = _get_client().chat.completions.create(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "Write the function in a Python code block with all necessary imports and no example usage.",
+                    },
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    },
+                ],
+                model=model,
+            )
+            _ask_cache[prompt] = completion
+        return completion.choices[0].message.content
+
+    def _extract_code_block(self, text):
+        """Return the first fenced code block in ``text``, or ``text`` if none."""
+        import re
+        pattern = r'```(\s*(py|python)\s*\n)?([\s\S]*?)```'
+        m = re.search(pattern, text)
+        if not m:
+            return text
+        return m.group(3)
+
+    def _eval(self, source, *args):
+        """Execute ``source`` and return the result of its ``process(*args)``.
+
+        SECURITY: ``source`` is model-generated code run through exec() with the
+        privileges of the current process; use only with trusted prompts/data.
+        """
+        scope = dict(_args_=args)
+        exec(self._fill_template('''
+            {source}
+            _result_ = process(*_args_)
+        ''', source=source), scope)
+        return scope['_result_']
+
+    def _code(self, goal, arg):
+        """Ask the model for code achieving ``goal`` on ``arg``; return its source."""
+        prompt = self._get_prompt(goal, arg)
+        result = self._run_prompt(prompt)
+        if self.verbose:
+            print()
+            print(result)
+        return self._extract_code_block(result)
+
+    def code(self, *args):
+        """Print the generated source without running it."""
+        print(self._code(*args))
+
+    def prompt(self, *args):
+        """Print the prompt that would be sent to the model."""
+        print(self._get_prompt(*args))
+
+    def __call__(self, goal, *args):
+        """Generate code for ``goal`` and run it on ``args``."""
+        source = self._code(goal, *args)
+        return self._eval(source, *args)
+
+
+@pd.api.extensions.register_dataframe_accessor('chat')
+@pd.api.extensions.register_series_accessor('chat')
+@pd.api.extensions.register_index_accessor('chat')
+class AskAccessor:
+    """``.chat`` accessor registered on DataFrame, Series and Index."""
+
+    def __init__(self, pandas_obj):
+        self._validate(pandas_obj)
+        self._obj = pandas_obj
+
+    @staticmethod
+    def _validate(obj):
+        """Accept any object; placeholder for future validation."""
+        pass
+
+    def _ask(self, **kw):
+        return Ask(**kw)
+
+    def _data(self, **kw):
+        """Return the wrapped object, copied unless mutation is allowed.
+
+        An explicit ``mutable=`` keyword overrides the module-level default,
+        matching how ``Ask.__init__`` resolves its options.
+        """
+        allow_mutation = kw.get('mutable')
+        if allow_mutation is None:
+            allow_mutation = mutable
+        if not allow_mutation and hasattr(self._obj, 'copy'):
+            return self._obj.copy()
+        return self._obj
+
+    def __call__(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask(goal, data, *args)
+
+    def code(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask.code(goal, data, *args)
+
+    def prompt(self, goal, *args, **kw):
+        ask = self._ask(**kw)
+        data = self._data(**kw)
+        return ask.prompt(goal, data, *args)
+
+
+def read(file_path, **kwargs):
+    """Read a CSV file or URL into a DataFrame; ``kwargs`` go to pd.read_csv."""
+    return pd.read_csv(file_path, **kwargs)
diff --git a/datahorse/__version__.py b/datahorse/__version__.py
new file mode 100644
index 0000000..918a58a
--- /dev/null
+++ b/datahorse/__version__.py
@@ -0,0 +1,7 @@
+__title__ = "DataHorse"
+__description__ = "Do data science and data analysis in plain english"
+__version__ = "0.0.0"
+__author__ = "DeDolphins"
+__author_email__ = "info@datahorse.ai"
+__license__ = "MIT"
+__url__ = "https://datahorse.ai"
\ No newline at end of file
diff --git a/image.png b/image.png
new file mode 100644
index 0000000..76fc421
Binary files /dev/null and b/image.png differ
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..0d4a452
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,14 @@
+[tool:pytest]
+addopts =
+    -vv
+testpaths = tests
+
+[aliases]
+test = pytest
+
+[metadata]
+description_file = README.md
+license_files = LICENSE
+
+[bdist_wheel]
+universal = 0
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e758fe1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,90 @@
+import os
+import re
+import setuptools
+from typing import AnyStr, List
+
+
+def read_file(path_parts: List[str], encoding: str = "utf-8") -> str:
+    """
+    Read a text file from the project directory.
+
+    Args:
+        path_parts: Path components relative to the directory of this file.
+        encoding: Text encoding used to decode the file.
+
+    Returns:
+        Content of the file as a string.
+    """
+    project_root = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(project_root, *path_parts), "r", encoding=encoding) as file:
+        return file.read()
+
+
+version_contents = read_file(["datahorse", "__version__.py"])
+about = {}
+
+# Parse the metadata out of __version__.py with a regex instead of importing
+# the package.
+for key in [
+    "__author__",
+    "__author_email__",
+    "__description__",
+    "__license__",
+    "__title__",
+    "__url__",
+    "__version__",
+]:
+    key_match = re.search(f"{key} = ['\"]([^'\"]+)['\"]", version_contents)
+    if key_match:
+        about[key] = key_match.group(1)
+
+readme = read_file(["README.md"])
+
+# Include only pandas and groq
+required_packages = [
+    "pandas",
+    "groq",
+]
+
+extras = {
+    "test": [
+        "black",
+        "coverage",
+        "flake8",
+        "mock",
+        "pydocstyle",
+        "pytest",
+        "pytest-cov",
+        "tox",
+    ]
+}
+
+setuptools.setup(
+    name=about.get("__title__", "unknown"),
+    version=about.get("__version__", "0.0.0"),
+    description=about.get("__description__", "unknown"),
+    long_description=readme,
+    author=about.get("__author__", "unknown"),
+    author_email=about.get("__author_email__", "unknown"),
+    url=about.get("__url__", "unknown"),
+    # The package directory sits at the repository root, so search from there.
+    # (find_packages("datahorse") looked *inside* the package and found nothing.)
+    packages=setuptools.find_packages(exclude=["tests", "tests.*"]),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+    ],
+    license=about.get("__license__", "unknown"),
+    package_data={"": ["*.txt"]},
+    extras_require=extras,
+    install_requires=required_packages,
+    long_description_content_type="text/markdown",
+    python_requires=">=3.7.0",
+)