From 2291119f8bd9878175be5f5a79ea7dfe993dc852 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Mon, 28 Oct 2024 08:07:48 -0700 Subject: [PATCH 1/4] fixed a nested data structured in dataclass parser, updated the dataclass tutorial and the notebook --- SETUP.md | 3 + .../output_parsers/dataclass_parser.py | 6 +- .../components/output_parsers/outputs.py | 9 +- docs/source/apis/components/index.rst | 1 + docs/source/tutorials/base_data_class.rst | 38 +- docs/source/tutorials/output_parsers.rst | 2 + notebooks/adalflow_colab_template.ipynb | 2 +- .../adalflow_dataclasses.ipynb | 493 +++++++++++------- 8 files changed, 344 insertions(+), 210 deletions(-) create mode 100644 SETUP.md rename notebooks/{ => tutorials}/adalflow_dataclasses.ipynb (61%) diff --git a/SETUP.md b/SETUP.md new file mode 100644 index 00000000..887eff68 --- /dev/null +++ b/SETUP.md @@ -0,0 +1,3 @@ +# Create a kernel + +```poetry run python -m ipykernel install --user --name my-project-kernel``` diff --git a/adalflow/adalflow/components/output_parsers/dataclass_parser.py b/adalflow/adalflow/components/output_parsers/dataclass_parser.py index 057f97d9..fb20da79 100644 --- a/adalflow/adalflow/components/output_parsers/dataclass_parser.py +++ b/adalflow/adalflow/components/output_parsers/dataclass_parser.py @@ -132,10 +132,10 @@ def get_examples_str( def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" try: - output = self._output_processor(input) + output_dict = self._output_processor(input) if self._return_data_class: - return self._data_class(**output) - return output + return self._data_class.from_dict(output_dict) + return output_dict except Exception as e: log.error(f"Error at parsing output: {e}") raise ValueError(f"Error: {e}") diff --git a/adalflow/adalflow/components/output_parsers/outputs.py b/adalflow/adalflow/components/output_parsers/outputs.py index b38f63a4..1f4ff652 100644 --- 
a/adalflow/adalflow/components/output_parsers/outputs.py +++ b/adalflow/adalflow/components/output_parsers/outputs.py @@ -1,4 +1,11 @@ -"""The most commonly used output parsers for the Generator.""" +"""The most commonly used output parsers for the Generator. + +Includes: +- YamlOutputParser: YAML output parser using dataclass for schema extraction. +- JsonOutputParser: JSON output parser using dataclass for schema extraction. +- ListOutputParser: List output parser to parse list of objects from the string. +- BooleanOutputParser: Boolean output parser to parse boolean values from the string. +""" from dataclasses import is_dataclass from typing import Dict, Any, Optional, List diff --git a/docs/source/apis/components/index.rst b/docs/source/apis/components/index.rst index 893e7483..fce07dc1 100644 --- a/docs/source/apis/components/index.rst +++ b/docs/source/apis/components/index.rst @@ -49,6 +49,7 @@ Output Parsers .. autosummary:: components.output_parsers.outputs + components.output_parsers.dataclass_parser Agent ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tutorials/base_data_class.rst b/docs/source/tutorials/base_data_class.rst index da78f58e..52b4e926 100644 --- a/docs/source/tutorials/base_data_class.rst +++ b/docs/source/tutorials/base_data_class.rst @@ -1,4 +1,15 @@ .. _core-base_data_class_note: + + +.. raw:: html + +
+ + Try Quickstart in Colab + + +
+ DataClass ============ @@ -7,10 +18,10 @@ DataClass .. `Li Yin `_ -In `PyTorch`, ``Tensor`` is the data type used in ``Module`` and ``Optimizer`` across the library. -Tensor wraps a multi-dimensional matrix to better support its operations and computations. + In LLM applications, data constantly needs to interact with LLMs in the form of strings via prompt and be parsed back to structured data from LLMs' text prediction. :class:`DataClass` is designed to ease this data interaction with LLMs via prompt(input) and to parse the text prediction(output). +It is even more convenient to use together with :ref:`components-output_parser_note`. .. figure:: /_static/images/dataclass.png :align: center @@ -61,11 +72,13 @@ Here is how users typically use the ``dataclasses`` module: We also made the effort to provide more control: 1. **Keep the ordering of your data fields.** We provided :func:`required_field` with ``default_factory`` to mark the field as required even if it is after optional fields. We also has to do customization to preserve their ordering while being converted to dictionary, json and yaml string. -2. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses. -3. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries. +2. **Signal the output/input fields.** We allow you to use ``__output_fields__`` and ``__input_fields__`` to explicitly signal the output and input fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in the `__output_fields__`. +3. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses. +4. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries. +5.
**Easy to use with Output parser.** It works well with output parsers such as ``JsonOutputParser``, ``YamlOutputParser``, and ``DataClassParser``. You can refer to :ref:`components-output_parser_note` for more details. -Describing the Data Format +Describing the Data Format (Data Class) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: @@ -74,6 +87,10 @@ Describing the Data Format * - **Name** - **Description** + * - ``__input_fields__`` + - A list of fields that are input fields. + * - ``__output_fields__`` + - Used more often than ``__input_fields__``. A list of fields that are output fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in the `__output_fields__`. (3) Works well and only with :class:`DataClassParser`. * - ``to_schema(cls, exclude) -> Dict`` - Generate a JSON schema which is more detailed than the signature. * - ``to_schema_str(cls, exclude) -> str`` @@ -227,7 +244,7 @@ As you can see, it handles the nested dataclass `Question` and the required fiel .. note:: - ``Optional`` type hint will not affect the field's required status. You can use this to work with static type checkers such as `mypy` if you want to. + ``Optional`` type hint will not affect the field's required status. We recommend you not to use it in the `dataclasses` module especially when you are nesting many levels of dataclasses. It might end up confusing the LLMs. **Signature** @@ -600,7 +617,10 @@ You can simply do a bit customization to map the dataset's key to the field name If you are looking for data types we used to support each component or any other class like `Optimizer`, you can check out the :ref:`core.types` file. 
- +About __output_fields__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Though you can use `exclude` in the :class:`JsonOutputParser` to exclude some fields from the output, it is less readable and less convenient than +directly using `__output_fields__` in the data class to signal the output fields and directly work with :class:`DataClassParser`. .. admonition:: References :class: highlight @@ -616,7 +636,9 @@ You can simply do a bit customization to map the dataset's key to the field name - :class:`core.base_data_class.DataClassFormatType` - :func:`core.functional.custom_asdict` - :ref:`core.base_data_class` - + - :func:`core.base_data_class.required_field` + - :class:`components.output_parsers.outputs.JsonOutputParser` + - :class:`components.output_parsers.dataclass_parser.DataClassParser` .. Document .. ------------ diff --git a/docs/source/tutorials/output_parsers.rst b/docs/source/tutorials/output_parsers.rst index c619998b..bf7192ae 100644 --- a/docs/source/tutorials/output_parsers.rst +++ b/docs/source/tutorials/output_parsers.rst @@ -1,3 +1,5 @@ +.. _components-output_parser_note: + Parser ============= diff --git a/notebooks/adalflow_colab_template.ipynb b/notebooks/adalflow_colab_template.ipynb index 480d5b1a..384a3165 100644 --- a/notebooks/adalflow_colab_template.ipynb +++ b/notebooks/adalflow_colab_template.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# 🤗 Welcome to AdalFlow!\n", - "## The PyTorch library to auto-optimize any LLM task pipelines\n", + "## The library to build & auto-optimize any LLM task pipelines\n", "\n", "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help!
⭐ Star us on Github ⭐\n", "\n", diff --git a/notebooks/adalflow_dataclasses.ipynb b/notebooks/tutorials/adalflow_dataclasses.ipynb similarity index 61% rename from notebooks/adalflow_dataclasses.ipynb rename to notebooks/tutorials/adalflow_dataclasses.ipynb index 958ba903..aceacdf7 100644 --- a/notebooks/adalflow_dataclasses.ipynb +++ b/notebooks/tutorials/adalflow_dataclasses.ipynb @@ -18,7 +18,7 @@ }, "source": [ "# 🤗 Welcome to AdalFlow!\n", - "## The PyTorch library to auto-optimize any LLM task pipelines\n", + "## The library to build & auto-optimize any LLM task pipelines\n", "\n", "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! ⭐ Star us on Github ⭐\n", "\n", @@ -37,18 +37,18 @@ "\n", "This is a quick introduction of what AdalFlow is capable of. We will cover:\n", "\n", - "* How to use adalflow dataclass\n", - "* How to do nested dataclass with optional fields\n", + "* How to use `DataClass` with `DataClassParser`.\n", + "* How to do nested dataclass, we will test both one and two levels of nesting.\n", "\n", "**Next: Try our [auto-optimization](https://colab.research.google.com/drive/1n3mHUWekTEYHiBdYBTw43TKlPN41A9za?usp=sharing)**\n", "\n", "\n", "# Installation\n", "\n", - "1. Use `pip` to install the `adalflow` Python package. We will need `openai`, `groq`, and `faiss`(cpu version) from the extra packages.\n", + "1. Use `pip` to install the `adalflow` Python package. We will need `openai` and `groq`from the extra packages.\n", "\n", " ```bash\n", - " pip install adalflow[openai,groq,faiss-cpu]\n", + " pip install adalflow[openai,groq]\n", " ```\n", "2. 
Setup `openai` and `groq` API key in the environment variables" ] @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 1, "metadata": { "id": "ZaaevxNH9JMQ" }, @@ -73,7 +73,7 @@ "# Install adalflow with necessary dependencies\n", "from IPython.display import clear_output\n", "\n", - "!pip install -U adalflow[openai,groq,faiss-cpu]\n", + "!pip install -U adalflow[openai,groq]\n", "\n", "clear_output()" ] @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -115,6 +115,38 @@ "GROQ_API_KEY=\"PASTE-GROQ_API_KEY-HERE\"" ] }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API keys have been set.\n" + ] + } + ], + "source": [ + "# or more securely\n", + "\n", + "import os\n", + "\n", + "from getpass import getpass\n", + "\n", + "# Prompt user to enter their API keys securely\n", + "groq_api_key = getpass(\"Please enter your GROQ API key: \")\n", + "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n", + "\n", + "\n", + "# Set environment variables\n", + "os.environ['GROQ_API_KEY'] = groq_api_key\n", + "os.environ['OPENAI_API_KEY'] = openai_api_key\n", + "\n", + "print(\"API keys have been set.\")" + ] + }, { "cell_type": "markdown", "metadata": { @@ -126,14 +158,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "id": "wOAiKg899Z2u" }, "outputs": [], "source": [ "# Import required libraries\n", - "from IPython.display import clear_output\n", "from dataclasses import dataclass, field\n", "from typing import List, Dict\n", "import adalflow as adal\n", @@ -143,7 +174,27 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.4'" + ] + }, + "execution_count": 2, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "adal.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "id": "bTzgyp6S9bnH" }, @@ -164,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "YA4pAIek9ewc" }, @@ -194,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "id": "x4__jnbP9luN" }, @@ -228,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "id": "TVi3rGvs9nte" }, @@ -249,13 +300,14 @@ " response = qa(\"What is LLM?\")\n", " print(\"\\nResponse:\")\n", " print(response)\n", + " print(f\"BasicQAOutput: {response.data}\")\n", " print(f\"Explanation: {response.data.explanation}\")\n", " print(f\"Example: {response.data.example}\")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -293,9 +345,10 @@ ")\n", "\n", "Response:\n", - "GeneratorOutput(id=None, data=BasicQAOutput(explanation='LLM stands for Large Language Model, a type of artificial intelligence that is trained on vast amounts of text data to generate human-like language outputs.', example='For example, a chatbot powered by an LLM can comprehend and respond to user queries in a conversational manner.'), error=None, usage=CompletionUsage(completion_tokens=70, prompt_tokens=174, total_tokens=244), raw_response='```\\n{\\n \"explanation\": \"LLM stands for Large Language Model, a type of artificial intelligence that is trained on vast amounts of text data to generate human-like language outputs.\",\\n \"example\": \"For example, a chatbot powered by an LLM can comprehend and respond to user queries in a conversational manner.\"\\n}\\n```', metadata=None)\n", - "Explanation: LLM stands for Large Language Model, a type of artificial intelligence that is trained on vast amounts of text data to generate human-like language outputs.\n", - "Example: For 
example, a chatbot powered by an LLM can comprehend and respond to user queries in a conversational manner.\n" + "GeneratorOutput(id=None, data=BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy'), error=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=174, total_tokens=234), raw_response='```\\n{\\n \"explanation\": \"Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\",\\n \"example\": \"The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\"\\n}\\n```', metadata=None)\n", + "BasicQAOutput: BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy')\n", + "Explanation: Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\n", + "Example: The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\n" ] } ], @@ -314,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": { "id": "5Arp4-Dq9u49" }, @@ -347,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": { "id": "VLbRUzXg9yP0" }, @@ -362,18 +415,7 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "u4u98LMm9z_i" - }, - "outputs": [], - "source": [ - "from adalflow.core.functional import custom_asdict, dataclass_obj_from_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": { "id": "7MUcu0tk91l4" }, @@ 
-381,6 +423,8 @@ "source": [ "# 2. Nested DataClass example\n", "\n", + "# Have both MovieReview and Actor nested in DetailedMovieReview\n", + "\n", "@dataclass\n", "class DetailedMovieReview(adal.DataClass):\n", " basic_review: MovieReview\n", @@ -402,34 +446,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "ekr4v8Xg93en" - }, - "outputs": [], - "source": [ - "# 3. DataClass with optional fields\n", - "@dataclass\n", - "class MovieAnalysis(adal.DataClass):\n", - " review: DetailedMovieReview\n", - " box_office: float = field(\n", - " default=None,\n", - " metadata={\"desc\": \"Box office earnings in millions of dollars\"}\n", - " )\n", - " awards: Dict[str, int] = field(\n", - " default=None,\n", - " metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n", - " )\n", - "\n", - " __output_fields__ = [\"review\", \"box_office\", \"awards\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "jq84dbWB95TX" - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "# Example template for movie review\n", @@ -439,24 +457,22 @@ "{{output_format_str}}\n", "\n", "\n", - " Review this movie: {{movie_title}} \"\"\"\n" + " Review this movie: {{movie_title}} \"\"\"" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "QrwnCihM97Oh" - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "# Create the MovieReviewer component with MovieAnalysis data class\n", "class MovieReviewer(adal.Component):\n", - " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n", + " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict, data_class: adal.DataClass):\n", " super().__init__()\n", " self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n", " parser = adal.DataClassParser(\n", - " data_class=MovieAnalysis,\n", + " data_class=data_class,\n", " 
return_data_class=True\n", " )\n", " self.generator = adal.Generator(\n", @@ -473,82 +489,158 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" + ] + } + ], + "source": [ + "# test the data class with one level of nesting\n", + "\n", + "reviewer = MovieReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + " data_class=DetailedMovieReview\n", + ")\n", + "\n", + "response = reviewer(\"The Matrix\")\n", + "print(f\"DetailedMovieReview: {response.data}\")\n", + "print(f\"BasicReview: {response.data.basic_review}\")\n", + "print(f\"Cast: {response.data.cast}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk 
aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')]\n" + ] + } + ], + "source": [ + "# try use openai model\n", + "reviewer = MovieReviewer(\n", + " model_client=adal.OpenAIClient(),\n", + " model_kwargs={\"model\": \"gpt-4o\"},\n", + " data_class=DetailedMovieReview\n", + ")\n", + "response = reviewer(\"The Matrix\")\n", + "print(f\"DetailedMovieReview: {response.data}\")\n", + "print(f\"BasicReview: {response.data.basic_review}\")\n", + "print(f\"Cast: {response.data.cast}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see both models can handle one level of nested dataclass quite well. 
And the output ordering will follow the ordering specified in __output_fields__" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { - "id": "WhdiJPfq99qS" + "id": "ekr4v8Xg93en" }, "outputs": [], "source": [ - "# Use SongReviewer Class for QA\n", - "def run_movie_analysis_example():\n", - " reviewer = MovieReviewer(\n", - " model_client=GroqAPIClient(),\n", - " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", - " )\n", + "# 3. second level nested dataclass\n", "\n", - " # Get a movie review\n", - " analysis = reviewer.call(\"The Matrix\")\n", + "@dataclass\n", + "class MovieAnalysis(adal.DataClass):\n", + " review: DetailedMovieReview\n", + " box_office: float = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Box office earnings in millions of dollars\"}\n", + " )\n", + " awards: Dict[str, int] = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n", + " )\n", "\n", - " # Access nested data\n", - " print(f\"Movie Title: {analysis.data.review['basic_review']['title']}\")\n", - " print(f\"Rating: {analysis.data.review['basic_review']['rating']}\")\n", - " print(\"\\nPros:\")\n", - " for pro in analysis.data.review[\"basic_review\"][\"pros\"]:\n", - " print(f\"- {pro}\")\n", + " __output_fields__ = [\"review\", \"box_office\", \"awards\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True), 
box_office=463.5, awards={'Best Visual Effects': 4, 'Best Film Editing': 2, 'Best Sound': 1})\n", + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" + ] + } + ], + "source": [ + "# test the data class with two levels of nested dataclass\n", "\n", - " print(\"\\nCast:\")\n", - " for actor in analysis.data.review[\"cast\"]:\n", - " print(f\"- {actor['name']} as {actor['role']}\")\n", + "# gpt-3.5-turbo model\n", "\n", - " if analysis.data.box_office:\n", - " print(f\"\\nBox Office: ${analysis.data.box_office} million\")\n", + "analysis = MovieReviewer(\n", + " model_client=adal.OpenAIClient(),\n", + " model_kwargs={\"model\": \"gpt-3.5-turbo\"},\n", + " data_class=MovieAnalysis\n", + ")\n", "\n", - " if analysis.data.awards:\n", - " print(\"\\nAwards:\")\n", - " for category, count in analysis.data.awards.items():\n", - " print(f\"- {category}: {count}\")" + "response = analysis(\"The Matrix\")\n", + "print(f\"MovieAnalysis: {response.data}\")\n", + "print(f\"DetailedMovieReview: {response.data.review}\")\n", + "print(f\"BasicReview: {response.data.review.basic_review}\")\n", + "print(f\"Cast: {response.data.review.cast}\")" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": 
{ - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0oxGGjqC9_v1", - "outputId": "903c6747-2c8a-4264-902e-56c03f8a2dcc" - }, + "execution_count": 24, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Movie Title: The Matrix\n", - "Rating: 8.7\n", - "\n", - "Pros:\n", - "- Innovative special effects\n", - "- Philosophical themes and storyline\n", - "- Excellent acting performances\n", - "- Epic scope and cinematography\n", - "\n", - "Cast:\n", - "- Keanu Reeves as Neo\n", - "- Laurence Fishburne as Morpheus\n", - "- Carlson Young as Trinity\n", - "\n", - "Box Office: $463.5 million\n", - "\n", - "Awards:\n", - "- Academy Awards: 4\n", - "- Bram Stoker Awards: 1\n", - "- Empire Awards: 2\n" + "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True), box_office=463.5, awards={'Academy Awards': 4, 'MTV Movie Awards': 10, 'Saturn Awards': 7})\n", + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong 
performances from the cast'], cons=['Somewhat slow pacing in parts'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" ] } ], "source": [ - "run_movie_analysis_example()" + "# test the data class with two levels of nested dataclass\n", + "\n", + "analysis = MovieReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + " data_class=MovieAnalysis\n", + ")\n", + "\n", + "response = analysis(\"The Matrix\")\n", + "print(f\"MovieAnalysis: {response.data}\")\n", + "print(f\"DetailedMovieReview: {response.data.review}\")\n", + "print(f\"BasicReview: {response.data.review.basic_review}\")\n", + "print(f\"Cast: {response.data.review.cast}\")" ] }, { @@ -563,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "metadata": { "id": "7g9bUa0q-B6Y" }, @@ -603,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": { "id": "UGhMRZht-HiB" }, @@ -618,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 28, "metadata": { "id": "sfNWgPYN-JAj" }, @@ -649,13 +741,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 29, "metadata": { "id": "HG8rtCd8-K7t" }, "outputs": [], "source": [ - "# 3. DataClass with optional fields\n", + "# 3. 
two levels of nesting dataclass\n", + "\n", + "# all these fields as we use default, it is optional, so \n", + "# llm might not output that field if they dont have information\n", + "\n", "@dataclass\n", "class SongAnalysis(adal.DataClass):\n", " review: DetailedSongReview = field(\n", @@ -675,7 +771,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 30, "metadata": { "id": "v3mNeyz7-MpY" }, @@ -693,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 31, "metadata": { "id": "X2eifXOU-OrE" }, @@ -723,96 +819,99 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "2A0j3Ra1_rkH" - }, - "outputs": [], + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SongAnalysis: {'review': {'basic_review': {'title': 'Shape of You', 'album': '÷ (Divide)', 'ranking': 7, 'streaming': {'spotify': 4.5, 'youtube': 2.5}, 'pros': ['Catchy beat', 'Catchy melody', 'Funky rhythm', 'Great lyrics'], 'cons': ['Some may find the lyrics objectifying', 'Not typically my cup of tea']}, 'cast': [{'name': 'Ed Sheeran', 'role': 'Lead vocals, songwriting'}], 'genre': ['Pop', 'Dance', 'Electro'], 'recommend': True}, 'duration': 3.53}\n" + ] + } + ], "source": [ - "# Use SongReviewer Class for QA\n", - "def run_song_analysis_example():\n", - " reviewer = SongReviewer(\n", - " model_client=GroqAPIClient(),\n", - " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", - " )\n", - "\n", - " # Get a movie review\n", - " analysis = reviewer.call(\"A Thousand Years\")\n", - " print(analysis)\n", - " # Access nested data\n", - " print(f\"Song Title: {analysis.data['review']['basic_review']['title']}\")\n", - " print(f\"Album: {analysis.data['review']['basic_review']['album']}\")\n", - " print(f\"Ranking: {analysis.data['review']['basic_review']['ranking']}\")\n", - "\n", - " for platform, views in analysis.data['review']['basic_review']['streaming'].items():\n", 
- " print(f\"- {platform} - {views} million views\")\n", - " print(\"\\nPros:\")\n", - " for pro in analysis.data['review'][\"basic_review\"][\"pros\"]:\n", - " print(f\"- {pro}\")\n", + "analysis = SongReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + ")\n", "\n", - " print(\"\\nArtist's:\")\n", - " for actor in analysis.data['review'][\"cast\"]:\n", - " print(f\"- {actor['name']} as {actor['role']}\")\n", + "response = analysis(\"Shape of you\")\n", + "print(f\"SongAnalysis: {response.data}\")\n", "\n", - " if analysis.data['review']['genre']:\n", - " print(f\"\\nGenere: \")\n", - " for genre in analysis.data['review']['genre']:\n", - " print(f\" {genre} \")\n", - "\n", - " if analysis.data['duration']:\n", - " print(f\"\\nDuration: {analysis.data['duration']} minutes\")\n", - "\n", - " if analysis.data['awards']:\n", - " print(\"\\nAwards:\")\n", - " for category, count in analysis.data['awards'].items():\n", - " print(f\"- {category}: {count}\")" + "# this time as we set `return_data_class` to False in the parser, we get the output as dict" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eGzZYj0F_tf3", - "outputId": "b4e07bf9-a14f-4b75-b3bb-e161a4091c3a" - }, + "execution_count": 38, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "GeneratorOutput(id=None, data={'review': {'basic_review': {'title': 'A Thousand Years', 'album': 'The Twilight Saga: Breaking Dawn - Part 1', 'ranking': 34, 'streaming': {'Spotify': 1.4, 'YouTube': 2.8}, 'pros': ['Hauntingly beautiful vocals from Christina Perri', 'A timeless love song with a memorable melody', 'Great chemistry between the lead vocals and the instrumental accompaniment'], 'cons': ['Some listeners may find the lyrics a bit too sappy or clichéd', 'The song may become repetitive after a few listens']}, 'cast': [{'name': 'Christina 
Perri', 'role': 'Lead Vocals'}], 'genre': ['Pop', 'Rock', 'Ballad'], 'recommend': True}, 'duration': 4.43, 'awards': {'Teen Choice Awards': 1, 'MTV Video Music Awards': 1}}, error=None, usage=CompletionUsage(completion_tokens=239, prompt_tokens=590, total_tokens=829), raw_response='```\\n{\\n \"review\": {\\n \"basic_review\": {\\n \"title\": \"A Thousand Years\",\\n \"album\": \"The Twilight Saga: Breaking Dawn - Part 1\",\\n \"ranking\": 34,\\n \"streaming\": {\"Spotify\": 1.4, \"YouTube\": 2.8},\\n \"pros\": [\\n \"Hauntingly beautiful vocals from Christina Perri\",\\n \"A timeless love song with a memorable melody\",\\n \"Great chemistry between the lead vocals and the instrumental accompaniment\"\\n ],\\n \"cons\": [\\n \"Some listeners may find the lyrics a bit too sappy or clichéd\",\\n \"The song may become repetitive after a few listens\"\\n ]\\n },\\n \"cast\": [\\n {\"name\": \"Christina Perri\", \"role\": \"Lead Vocals\"}\\n ],\\n \"genre\": [\"Pop\", \"Rock\", \"Ballad\"],\\n \"recommend\": true\\n },\\n \"duration\": 4.43,\\n \"awards\": {\\n \"Teen Choice Awards\": 1,\\n \"MTV Video Music Awards\": 1\\n }\\n}\\n```', metadata=None)\n", - "Song Title: A Thousand Years\n", - "Album: The Twilight Saga: Breaking Dawn - Part 1\n", - "Ranking: 34\n", - "- Spotify - 1.4 million views\n", - "- YouTube - 2.8 million views\n", + "Song Title: Shape of You\n", + "Album: ÷ (Divide)\n", + "Ranking: 7\n", + "- spotify - 4.5 million views\n", + "- youtube - 2.5 million views\n", "\n", "Pros:\n", - "- Hauntingly beautiful vocals from Christina Perri\n", - "- A timeless love song with a memorable melody\n", - "- Great chemistry between the lead vocals and the instrumental accompaniment\n", + "- Catchy beat\n", + "- Catchy melody\n", + "- Funky rhythm\n", + "- Great lyrics\n", "\n", "Artist's:\n", - "- Christina Perri as Lead Vocals\n", + "- Ed Sheeran as Lead vocals, songwriting\n", "\n", "Genere: \n", " Pop \n", - " Rock \n", - " Ballad \n", + " Dance \n", + " 
Electro \n", "\n", - "Duration: 4.43 minutes\n", - "\n", - "Awards:\n", - "- Teen Choice Awards: 1\n", - "- MTV Video Music Awards: 1\n" + "Duration: 3.53 minutes\n" ] } ], "source": [ - "run_song_analysis_example()" + "# Access nested data\n", + "analysis = response.data\n", + "print(f\"Song Title: {analysis['review']['basic_review']['title']}\")\n", + "print(f\"Album: {analysis['review']['basic_review']['album']}\")\n", + "print(f\"Ranking: {analysis['review']['basic_review']['ranking']}\")\n", + "\n", + "for platform, views in analysis['review']['basic_review']['streaming'].items():\n", + " print(f\"- {platform} - {views} million views\")\n", + "print(\"\\nPros:\")\n", + "for pro in analysis['review'][\"basic_review\"][\"pros\"]:\n", + " print(f\"- {pro}\")\n", + "\n", + "print(\"\\nArtist's:\")\n", + "for actor in analysis['review'][\"cast\"]:\n", + " print(f\"- {actor['name']} as {actor['role']}\")\n", + "\n", + "if analysis['review']['genre']:\n", + " print(f\"\\nGenere: \")\n", + " for genre in analysis['review']['genre']:\n", + " print(f\" {genre} \")\n", + "\n", + "if analysis['duration']:\n", + " print(f\"\\nDuration: {analysis['duration']} minutes\")\n", + "\n", + "if hasattr(analysis, 'awards') and analysis['awards']:\n", + " print(\"\\nAwards:\")\n", + " for category, count in analysis['awards'].items():\n", + " print(f\"- {category}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODOs:\n", + "1. Add `JsonOutputParser` and `YamlOutputParser` to this notebook." 
] }, { @@ -838,9 +937,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "openc", + "display_name": "my-project-kernel", "language": "python", - "name": "python3" + "name": "my-project-kernel" }, "language_info": { "codemirror_mode": { @@ -852,7 +951,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.4" } }, "nbformat": 4, From 6280d87c4a97ccaa60264a022dcbeda0179308a4 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Mon, 28 Oct 2024 22:12:48 -0700 Subject: [PATCH 2/4] improve parser tutorial with data class parser, data class parser doc string, and add test cases, and link to the code --- adalflow/adalflow/__init__.py | 2 +- .../output_parsers/dataclass_parser.py | 45 +++- adalflow/pyproject.toml | 4 +- adalflow/tests/test_data_class_parser.py | 142 +++++++++++ adalflow/tests/test_output_parser.py | 2 + docs/source/tutorials/base_data_class.rst | 2 +- docs/source/tutorials/output_parsers.rst | 232 +++++++++++++++++- tutorials/parser_note.py | 57 +++++ 8 files changed, 468 insertions(+), 18 deletions(-) create mode 100644 adalflow/tests/test_data_class_parser.py diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py index 184dc514..8ccd7ba5 100644 --- a/adalflow/adalflow/__init__.py +++ b/adalflow/adalflow/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.4" +__version__ = "0.2.5" from adalflow.core.component import Component, fun_to_component from adalflow.core.container import Sequential diff --git a/adalflow/adalflow/components/output_parsers/dataclass_parser.py b/adalflow/adalflow/components/output_parsers/dataclass_parser.py index fb20da79..6d2e56dd 100644 --- a/adalflow/adalflow/components/output_parsers/dataclass_parser.py +++ b/adalflow/adalflow/components/output_parsers/dataclass_parser.py @@ -1,4 +1,4 @@ -"""DataClassParser will help users convert a dataclass to prompt""" +"""DataClassParser will help users interact with LLMs even better than JsonOutputParser 
and YamlOutputParser with DataClass.""" from dataclasses import is_dataclass from typing import Any, Literal, List, Optional @@ -43,9 +43,46 @@ class DataClassParser(Component): - __doc__ = ( - r"""This is similar to Dspy's signature but more controllable and flexible.""" - ) + __doc__ = r"""Made the structured output even simpler compared with JsonOutputParser and YamlOutputParser. + + 1. Understands __input_fields__ and __output_fields__ from the DataClass (no need to use include/exclude to decide fields). + 2. User can choose to save the `task_desc` in the DataClass and use it in the prompt. + + Example: + + .. code-block:: python + + @dataclass + class BasicQAOutput(adal.DataClass): + explanation: str = field( + metadata={"desc": "A brief explanation of the concept in one sentence."} + ) + example: str = field( + metadata={"desc": "An example of the concept in a sentence."} + ) + # Control output fields order + __output_fields__ = ["explanation", "example"] + + # Define the template using jinja2 syntax + qa_template = " + You are a helpful assistant. 
+ + {{output_format_str}} + + + {{input_str}} " + + parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True) + + # Set up the generator with model, template, and parser + self.generator = adal.Generator( + model_client=model_client, + model_kwargs=model_kwargs, + template=qa_template, + prompt_kwargs={"output_format_str": parser.get_output_format_str()}, + output_processors=parser, + ) + """ def __init__( self, diff --git a/adalflow/pyproject.toml b/adalflow/pyproject.toml index c2705364..49a6cbe8 100644 --- a/adalflow/pyproject.toml +++ b/adalflow/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "adalflow" -version = "0.2.4" -description = "The Library to Build and Auto-optimize Any LLM Task Pipeline" +version = "0.2.5" +description = "The Library to Build and Auto-optimize LLM Applications" authors = ["Li Yin "] readme = "README.md" repository = "https://github.com/SylphAI-Inc/AdalFlow" diff --git a/adalflow/tests/test_data_class_parser.py b/adalflow/tests/test_data_class_parser.py new file mode 100644 index 00000000..b87fde4c --- /dev/null +++ b/adalflow/tests/test_data_class_parser.py @@ -0,0 +1,142 @@ +import unittest +from dataclasses import dataclass, field +from typing import List +from adalflow.core.base_data_class import DataClass +from adalflow.components.output_parsers.dataclass_parser import DataClassParser + + +# Define a basic DataClass for testing +@dataclass +class BasicOutput(DataClass): + explanation: str = field( + metadata={"desc": "A brief explanation of the concept in one sentence."} + ) + example: str = field(metadata={"desc": "An example of the concept in a sentence."}) + __output_fields__ = ["explanation", "example"] + + +# Define a nested DataClass for testing +@dataclass +class NestedOutput(DataClass): + title: str + description: str + items: List[str] + __output_fields__ = ["title", "description", "items"] + + +class TestDataClassParser(unittest.TestCase): + + def setUp(self): + self.basic_data_class = 
BasicOutput + self.nested_data_class = NestedOutput + self.basic_parser = DataClassParser( + data_class=self.basic_data_class, return_data_class=True, format_type="json" + ) + self.nested_parser = DataClassParser( + data_class=self.nested_data_class, + return_data_class=True, + format_type="yaml", + ) + + def test_basic_data_class_json(self): + input_instance = BasicOutput( + explanation="This is a test.", example="Example sentence." + ) + input_str = self.basic_parser.get_input_str(input_instance) + self.assertIn("This is a test.", input_str) + self.assertIn("Example sentence.", input_str) + + output_format_str = self.basic_parser.get_output_format_str() + self.assertIn("explanation", output_format_str) + self.assertIn("example", output_format_str) + + output = self.basic_parser.call( + '{"explanation": "Test explanation", "example": "Test example."}' + ) + self.assertIsInstance(output, BasicOutput) + + def test_basic_data_class_yaml(self): + self.yaml_parser = DataClassParser( + data_class=self.basic_data_class, return_data_class=True, format_type="yaml" + ) + input_instance = BasicOutput( + explanation="This is a test.", example="Example sentence." 
+ ) + input_str = self.yaml_parser.get_input_str(input_instance) + self.assertIn("This is a test.", input_str) + + self.assertIn("Example sentence.", input_str) + + output_format_str = self.yaml_parser.get_output_format_str() + self.assertIn("explanation", output_format_str) + self.assertIn("example", output_format_str) + + output = self.yaml_parser.call( + """explanation: Test explanation +example: Test example.""" + ) + print(f"output: {output}") + self.assertIsInstance(output, BasicOutput) + + def test_nested_data_class_json(self): + input_instance = NestedOutput( + title="Title", description="Description", items=["Item 1", "Item 2"] + ) + input_str = self.nested_parser.get_input_str(input_instance) + self.assertIn("Title", input_str) + self.assertIn("Description", input_str) + self.assertIn("Item 1", input_str) + self.assertIn("Item 2", input_str) + + output_format_str = self.nested_parser.get_output_format_str() + self.assertIn("title", output_format_str) + self.assertIn("description", output_format_str) + self.assertIn("items", output_format_str) + + output = self.nested_parser.call( + """title: Nested Title +description: Nested description +items: + - Item 1 + - Item 2""" + ) + self.assertIsInstance(output, NestedOutput) + + def test_nested_data_class_yaml(self): + self.nested_parser._format_type = "yaml" + input_instance = NestedOutput( + title="Title", description="Description", items=["Item 1", "Item 2"] + ) + input_str = self.nested_parser.get_input_str(input_instance) + self.assertIn("Title", input_str) + self.assertIn("Description", input_str) + self.assertIn("Item 1", input_str) + self.assertIn("Item 2", input_str) + + output_format_str = self.nested_parser.get_output_format_str() + self.assertIn("title", output_format_str) + self.assertIn("description", output_format_str) + self.assertIn("items", output_format_str) + + output = self.nested_parser.call( + """title: Nested Title +description: Nested description +items: + - Item 1 + - Item 2""" + ) + 
self.assertIsInstance(output, NestedOutput) + + def test_invalid_data_class(self): + with self.assertRaises(ValueError): + DataClassParser(data_class=dict) # dict is not a dataclass + + def test_invalid_format_type(self): + with self.assertRaises(ValueError): + DataClassParser( + data_class=self.basic_data_class, format_type="xml" + ) # Invalid format type + + +if __name__ == "__main__": + unittest.main() diff --git a/adalflow/tests/test_output_parser.py b/adalflow/tests/test_output_parser.py index b9502f77..a2b529dc 100644 --- a/adalflow/tests/test_output_parser.py +++ b/adalflow/tests/test_output_parser.py @@ -13,6 +13,8 @@ class User(DataClass): id: int = field(default=1, metadata={"description": "User ID"}) name: str = field(default="John", metadata={"description": "User name"}) + __input_fields__ = ["id", "name"] + class TestOutputParsers(unittest.TestCase): diff --git a/docs/source/tutorials/base_data_class.rst b/docs/source/tutorials/base_data_class.rst index 52b4e926..081533eb 100644 --- a/docs/source/tutorials/base_data_class.rst +++ b/docs/source/tutorials/base_data_class.rst @@ -21,7 +21,7 @@ DataClass In LLM applications, data constantly needs to interact with LLMs in the form of strings via prompt and be parsed back to structured data from LLMs' text prediction. :class:`DataClass` is designed to ease this data interaction with LLMs via prompt(input) and to parse the text prediction(output). -It is even more convenient to use together with **:ref:`components-output_parser_note`**. +It is even more convenient to use together with :ref:`components-output_parser_note`. .. figure:: /_static/images/dataclass.png :align: center diff --git a/docs/source/tutorials/output_parsers.rst b/docs/source/tutorials/output_parsers.rst index bf7192ae..0df19e09 100644 --- a/docs/source/tutorials/output_parsers.rst +++ b/docs/source/tutorials/output_parsers.rst @@ -1,9 +1,26 @@ .. _components-output_parser_note: +.. 
raw:: html + + + Parser ============= -Parser is the `interpreter` of the LLM output. +Parser is the `interpreter` of the LLM output. We have three types of parsers: + +- **String Parsers**: they simply convert the string to the desired data type. They are located at :ref:`core.string_parser`. +- **Output Parsers**: they orchestrate the parsing and output formatting (in YAML, JSON, and more) process. They are located at :ref:`components.output_parsers.outputs`. :class:`JsonOutputParser` and :class:`YamlOutputParser` can work with :ref:`DataClass` for structured output. +- **DataClass Parser**: On top of `YamlOutputParser` and `JsonOutputParser`, :class:`DataClassParser` is the most compatible to work with :ref:`DataClass` for structured output. @@ -142,7 +159,44 @@ Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arg - ``data_class``: the ``DataClass`` type. - ``examples``: the examples of the data class instance if you want to show the examples in the prompt. -- ``exclude``: the fields to exclude from both the data format and the examples. +- ``exclude``: the fields to exclude from both the data format and the examples; this is a way to tell the ``format_instructions`` which fields of the data class are the output fields. + +DataClass Parser +~~~~~~~~~~~~~~~~~~~~ +To make things even easier for the developers, we created :class:`DataClassParser` which +understands `__input_fields__` and `__output_fields__` of the `DataClass`, and it is especially helpful when working on a training dataset where we will have both inputs and outputs. +Users do not have to use `exclude/include` fields to specify the output fields; it will automatically understand the output fields from the `DataClass` instance. + +Below is an overview of its key components and functionalities. + +..
list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Method + - Description + - Details + * - ``__init__(data_class: DataClass, return_data_class: bool = False, format_type: Literal["yaml", "json"] = "json")`` + - Initializes the DataClassParser + - Takes a DataClass type, whether to return the DataClass instance after parsing, and the output format type (JSON or YAML). + * - ``get_input_format_str() -> str`` + - Returns formatted instructions for input data + - Provides a string representation of the input fields defined in the DataClass. + * - ``get_output_format_str() -> str`` + - Returns formatted instructions for output data + - Generates a schema string for the output fields of the DataClass. + * - ``get_input_str(input: DataClass) -> str`` + - Formats the input data as a string + - Converts a DataClass instance to either JSON or YAML based on the specified format type. + * - ``get_task_desc_str() -> str`` + - Returns the task description string + - Retrieves the task description associated with the DataClass, useful for context in LLM prompts. + * - ``get_examples_str(examples: List[DataClass], include: Optional[IncludeType] = None, exclude: Optional[ExcludeType] = None) -> str`` + - Formats a list of example DataClass instances + - Generates a formatted string representation of examples, adhering to the specified ``include/exclude`` parameters. + * - ``call(input: str) -> Any`` + - Parses the output string to the desired format and returns parsed output + - Handles both JSON and YAML parsing, converting to the corresponding DataClass if specified. .. TODO: a summary table and a diagram @@ -150,7 +204,8 @@ Parser in Action ------------------ All of the parsers are quite straightforward to use. -**BooleanParser** +BooleanParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -183,7 +238,9 @@ The printout will be: Boolean parsers will not work for '1', '0', 'yes', 'no' as they are not the standard boolean values. 
-**IntParser** + +IntParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -212,7 +269,9 @@ The printout will be: ``IntParser`` will return the integer value of the first number in the string, even if it is a float. -**FloatParser** + +FloatParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -242,7 +301,9 @@ The printout will be: ``FloatParser`` will return the float value of the first number in the string, even if it is an integer. -**ListParser** + +ListParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -265,7 +326,9 @@ The output will be: ['key', 2] [{'key': 'value'}, {'key': 'value'}] -**JsonParser** + +JsonParser +~~~~~~~~~~~~~~~~~~ Even though it can work on lists, it is better to only use it for dictionaries. @@ -296,7 +359,9 @@ The output will be: ['key', 2] [{'key': 'value'}, {'key': 'value'}] -**YamlParser** + +YamlParser +~~~~~~~~~~~~~~~~~~ Though it works almost on all of the previous examples, it is better to use it for yaml formatted dictionaries. @@ -346,7 +411,9 @@ And we will demonstrate how to use ``JsonOutputParser`` and ``YamlOutputParser`` user_example = User(id=1, name="John") -**JsonOutputParser** + +JsonOutputParser +~~~~~~~~~~~~~~~~~~ Here is how to use ``JsonOutputParser``: @@ -418,7 +485,9 @@ The output will be: {'id': 2, 'name': 'Jane'} -**YamlOutputParser** + +YamlOutputParser +~~~~~~~~~~~~~~~~~~ The steps are totally the same as the ``JsonOutputParser``. @@ -498,6 +567,147 @@ The output will be: .. .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ .. .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ +DataclassParser in Action +-------------------------- + +First, let's create a new data class with both input and output fields. + +.. 
code-block:: python + + @dataclass + class SampleDataClass(DataClass): + description: str = field(metadata={"desc": "A sample description"}) + category: str = field(metadata={"desc": "Category of the sample"}) + value: int = field(metadata={"desc": "A sample integer value"}) + status: str = field(metadata={"desc": "Status of the sample"}) + + __input_fields__ = [ + "description", + "category", + ] # Define which fields are input fields + __output_fields__ = ["value", "status"] # Define which fields are output fields + + +Now, let's create a parser that will use the `SampleDataClass` to parse the output json string back to the data class instance. + +.. code-block:: python + + from adalflow.components.output_parsers import DataClassParser + + parser = DataClassParser(data_class=SampleDataClass, return_data_class=True, format_type="json") + +Let's view the structure of the parser using `print(parser)`. + +The output will be: + +.. code-block:: + + DataClassParser( + data_class=SampleDataClass, format_type=json, return_data_class=True, input_fields=['description', 'category'], output_fields=['value', 'status'] + (_output_processor): JsonParser() + (output_format_prompt): Prompt( + template: Your output should be formatted as a standard JSON instance with the following schema: + ``` + {{schema}} + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output. + -Follow the JSON formatting conventions., prompt_variables: ['schema'] + ) + ) + +You can get the output and input format strings using the following methods: + +.. code-block:: python + + print(parser.get_input_format_str()) + print(parser.get_output_format_str()) + +The output for the output format string will be: + +..
code-block:: + + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "value": " (int) (required)", + "status": " (str) (required)" + } + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output. + -Follow the JSON formatting conventions. + +The input format string will be: + +.. code-block:: + + { + "description": " (str) (required)", + "category": " (str) (required)" + } + +Convert a json string to a data class instance: + +.. code-block:: python + + user_input = '{"description": "Parsed description", "category": "Sample Category", "value": 100, "status": "active"}' + parsed_instance = parser.call(user_input) + + print(parsed_instance) + +The output will be: + +.. code-block:: python + + SampleDataClass(description='Parsed description', category='Sample Category', value=100, status='active') + +Try the examples string: + +.. code-block:: python + + samples = [ + SampleDataClass( + description="Sample description", + category="Sample category", + value=100, + status="active", + ), + SampleDataClass( + description="Another description", + category="Another category", + value=200, + status="inactive", + ), + ] + + examples_str = parser.get_examples_str(examples=samples) + print(examples_str) + +The output will be: + +.. code-block:: python + + examples_str: + { + "description": "Sample description", + "category": "Sample category", + "value": 100, + "status": "active" + } + __________ + { + "description": "Another description", + "category": "Another category", + "value": 200, + "status": "inactive" + } + __________ + + + .. 
admonition:: API References :class: highlight @@ -509,3 +719,5 @@ The output will be: - :class:`components.output_parsers.outputs.OutputParser` - :class:`components.output_parsers.outputs.BooleanOutputParser` - :class:`components.output_parsers.outputs.ListOutputParser` + - :class:`components.output_parsers.dataclass_parser.DataClassParser` + - :class:`core.base_data_class.DataClass` diff --git a/tutorials/parser_note.py b/tutorials/parser_note.py index fdc23fce..80c2c009 100644 --- a/tutorials/parser_note.py +++ b/tutorials/parser_note.py @@ -271,6 +271,62 @@ class User(DataClass): print(parsed_user) +def dataclass_parser(): + from dataclasses import dataclass, field + from adalflow.components.output_parsers import DataClassParser + from adalflow.core import DataClass + + @dataclass + class SampleDataClass(DataClass): + description: str = field(metadata={"description": "A sample description"}) + category: str = field(metadata={"description": "Category of the sample"}) + value: int = field(metadata={"description": "A sample integer value"}) + status: str = field(metadata={"description": "Status of the sample"}) + + __input_fields__ = [ + "description", + "category", + ] # Define which fields are input fields + __output_fields__ = ["value", "status"] # Define which fields are output fields + + # Initialize the DataClassParser with SampleDataClass + parser = DataClassParser( + data_class=SampleDataClass, return_data_class=True, format_type="json" + ) + print("DataClassParser instance created:\n", parser) + + # Get formatted instructions for the output format + output_format_str = parser.get_output_format_str() + print("\nOutput format string:\n", output_format_str) + + # Get formatted instructions for the input format + input_format_str = parser.get_input_format_str() + print("\nInput format string:\n", input_format_str) + + # Parse a sample JSON string + user_input = '{"description": "Parsed description", "category": "Sample Category", "value": 100, "status": 
"active"}' + parsed_instance = parser.call(user_input) + print("\nParsed DataClass instance:\n", parsed_instance) + + samples = [ + SampleDataClass( + description="Sample description", + category="Sample category", + value=100, + status="active", + ), + SampleDataClass( + description="Another description", + category="Another category", + value=200, + status="inactive", + ), + ] + + examples_str = parser.get_examples_str(examples=samples) + print(f"examples_str: {examples_str}") + + if __name__ == "__main__": examples_of_different_ways_to_parse_string() int_parser() @@ -281,3 +337,4 @@ class User(DataClass): yaml_parser() json_output_parser() yaml_output_parser() + dataclass_parser() From 23d5dcdf448011be01c32fa791a7abefd58ce48c Mon Sep 17 00:00:00 2001 From: Li Yin Date: Mon, 28 Oct 2024 22:25:00 -0700 Subject: [PATCH 3/4] add author --- adalflow/CHANGELOG.md | 5 +++++ adalflow/adalflow/__init__.py | 5 +++++ notebooks/tutorials/adalflow_dataclasses.ipynb | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/adalflow/CHANGELOG.md b/adalflow/CHANGELOG.md index 05bbbbd8..315814a6 100644 --- a/adalflow/CHANGELOG.md +++ b/adalflow/CHANGELOG.md @@ -1,3 +1,8 @@ +## [0.2.5] - 2024-10-28 + +### Fixed +- `DataClassParser` nested data class parsing where we have to use `from_dict(json_dict)` instead of `(**json_dict)` to parse the nested data class. 
+ ## [0.2.4] - 2024-10-27 ### Added diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py index 8ccd7ba5..af90187b 100644 --- a/adalflow/adalflow/__init__.py +++ b/adalflow/adalflow/__init__.py @@ -15,6 +15,8 @@ ) from adalflow.core.model_client import ModelClient from adalflow.core.embedder import Embedder + +# parser from adalflow.core.string_parser import ( YamlParser, JsonParser, @@ -30,7 +32,10 @@ ListOutputParser, ) from adalflow.components.output_parsers.dataclass_parser import DataClassParser + from adalflow.core.prompt_builder import Prompt + +# optimization from adalflow.optim import ( Optimizer, DemoOptimizer, diff --git a/notebooks/tutorials/adalflow_dataclasses.ipynb b/notebooks/tutorials/adalflow_dataclasses.ipynb index aceacdf7..5218f5e7 100644 --- a/notebooks/tutorials/adalflow_dataclasses.ipynb +++ b/notebooks/tutorials/adalflow_dataclasses.ipynb @@ -6,7 +6,7 @@ "id": "hGLYrUwBmvUD" }, "source": [ - "\n", + "\n", " \"Open\n", "\n" ] @@ -33,6 +33,10 @@ "\n", "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", "\n", + "# Author\n", + "\n", + "This notebook was created by community contributor [Ajith](https://github.com/ajithvcoder).\n", + "\n", "# Outline\n", "\n", "This is a quick introduction of what AdalFlow is capable of. 
We will cover:\n", From 1b2702baf11c20af5ce6f4e66991d9ea6ec6937d Mon Sep 17 00:00:00 2001 From: Li Yin Date: Mon, 28 Oct 2024 22:26:38 -0700 Subject: [PATCH 4/4] update the template --- notebooks/adalflow_colab_template.ipynb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/notebooks/adalflow_colab_template.ipynb b/notebooks/adalflow_colab_template.ipynb index 384a3165..191bbf08 100644 --- a/notebooks/adalflow_colab_template.ipynb +++ b/notebooks/adalflow_colab_template.ipynb @@ -20,6 +20,10 @@ "\n", "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", "\n", + "# Author\n", + "\n", + "This notebook was created by community contributor [Name](Replace_to_github_or_other_social_account).\n", + "\n", "# Outline\n", "\n", "This is a quick introduction of what AdalFlow is capable of. We will cover:\n",