Merge pull request #243 from SylphAI-Inc/main
 [release][v0.2.5][doc][dataclass parser] bug fix + notebook
Sylph-AI authored Oct 29, 2024
2 parents ac39baf + 93a8569 commit 733c4b6
Showing 14 changed files with 1,492 additions and 30 deletions.
3 changes: 3 additions & 0 deletions SETUP.md
@@ -0,0 +1,3 @@
# Create a kernel

```poetry run python -m ipykernel install --user --name my-project-kernel```
5 changes: 5 additions & 0 deletions adalflow/CHANGELOG.md
@@ -1,3 +1,8 @@
## [0.2.5] - 2024-10-28

### Fixed
- `DataClassParser` nested data class parsing: use `from_dict(json_dict)` instead of `(**json_dict)` so that nested data classes are reconstructed correctly.
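
  A minimal sketch of the fix's effect (the `Person` and `Address` classes below are illustrative only, not part of the library):

  ```python
  from dataclasses import dataclass, field

  from adalflow.core.base_data_class import DataClass


  @dataclass
  class Address(DataClass):
      city: str = field(metadata={"desc": "City name"})


  @dataclass
  class Person(DataClass):
      name: str = field(metadata={"desc": "Person name"})
      address: Address = field(metadata={"desc": "Nested address"})


  json_dict = {"name": "Ada", "address": {"city": "London"}}

  # Person(**json_dict) would leave `address` as a plain dict;
  # from_dict reconstructs the nested Address instance.
  person = Person.from_dict(json_dict)
  ```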

## [0.2.4] - 2024-10-27

### Added
7 changes: 6 additions & 1 deletion adalflow/adalflow/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.2.4"
__version__ = "0.2.5"

from adalflow.core.component import Component, fun_to_component
from adalflow.core.container import Sequential
@@ -15,6 +15,8 @@
)
from adalflow.core.model_client import ModelClient
from adalflow.core.embedder import Embedder

# parser
from adalflow.core.string_parser import (
YamlParser,
JsonParser,
@@ -30,7 +32,10 @@
ListOutputParser,
)
from adalflow.components.output_parsers.dataclass_parser import DataClassParser

from adalflow.core.prompt_builder import Prompt

# optimization
from adalflow.optim import (
Optimizer,
DemoOptimizer,
51 changes: 44 additions & 7 deletions adalflow/adalflow/components/output_parsers/dataclass_parser.py
@@ -1,4 +1,4 @@
"""DataClassParser will help users convert a dataclass to prompt"""
"""DataClassParser will help users interact with LLMs even better than JsonOutputParser and YamlOutputParser with DataClass."""

from dataclasses import is_dataclass
from typing import Any, Literal, List, Optional
@@ -43,9 +43,46 @@


class DataClassParser(Component):
__doc__ = (
r"""This is similar to Dspy's signature but more controllable and flexible."""
)
__doc__ = r"""Made the structured output even simpler compared with JsonOutputParser and YamlOutputParser.
1. Understands __input_fields__ and __output_fields__ from the DataClass (no need to use include/exclude to decide fields).
2. User can choose to save the `task_desc` in the DataClass and use it in the prompt.
Example:
.. code-block:: python
@dataclass
class BasicQAOutput(adal.DataClass):
explanation: str = field(
metadata={"desc": "A brief explanation of the concept in one sentence."}
)
example: str = field(
metadata={"desc": "An example of the concept in a sentence."}
)
# Control output fields order
__output_fields__ = ["explanation", "example"]
# Define the template using jinja2 syntax
qa_template = "<SYS>
You are a helpful assistant.
<OUTPUT_FORMAT>
{{output_format_str}}
</OUTPUT_FORMAT>
</SYS>
<USER> {{input_str}} </USER>"
parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True)
# Set up the generator with model, template, and parser
self.generator = adal.Generator(
model_client=model_client,
model_kwargs=model_kwargs,
template=qa_template,
prompt_kwargs={"output_format_str": parser.get_output_format_str()},
output_processors=parser,
)
"""

def __init__(
self,
@@ -132,10 +169,10 @@ def get_examples_str(
def call(self, input: str) -> Any:
r"""Parse the output string to the desired format and return the parsed output."""
try:
output = self._output_processor(input)
output_dict = self._output_processor(input)
if self._return_data_class:
return self._data_class(**output)
return output
return self._data_class.from_dict(output_dict)
return output_dict
except Exception as e:
log.error(f"Error at parsing output: {e}")
raise ValueError(f"Error: {e}")
9 changes: 8 additions & 1 deletion adalflow/adalflow/components/output_parsers/outputs.py
@@ -1,4 +1,11 @@
"""The most commonly used output parsers for the Generator."""
"""The most commonly used output parsers for the Generator.
Includes:
- YamlOutputParser: YAML output parser using dataclass for schema extraction.
- JsonOutputParser: JSON output parser using dataclass for schema extraction.
- ListOutputParser: List output parser to parse list of objects from the string.
- BooleanOutputParser: Boolean output parser to parse boolean values from the string.
"""

from dataclasses import is_dataclass
from typing import Dict, Any, Optional, List
4 changes: 2 additions & 2 deletions adalflow/pyproject.toml
@@ -1,8 +1,8 @@
[tool.poetry]
name = "adalflow"

version = "0.2.4"
description = "The Library to Build and Auto-optimize Any LLM Task Pipeline"
version = "0.2.5"
description = "The Library to Build and Auto-optimize LLM Applications"
authors = ["Li Yin <[email protected]>"]
readme = "README.md"
repository = "https://github.com/SylphAI-Inc/AdalFlow"
142 changes: 142 additions & 0 deletions adalflow/tests/test_data_class_parser.py
@@ -0,0 +1,142 @@
import unittest
from dataclasses import dataclass, field
from typing import List
from adalflow.core.base_data_class import DataClass
from adalflow.components.output_parsers.dataclass_parser import DataClassParser


# Define a basic DataClass for testing
@dataclass
class BasicOutput(DataClass):
explanation: str = field(
metadata={"desc": "A brief explanation of the concept in one sentence."}
)
example: str = field(metadata={"desc": "An example of the concept in a sentence."})
__output_fields__ = ["explanation", "example"]


# Define a nested DataClass for testing
@dataclass
class NestedOutput(DataClass):
title: str
description: str
items: List[str]
__output_fields__ = ["title", "description", "items"]


class TestDataClassParser(unittest.TestCase):

def setUp(self):
self.basic_data_class = BasicOutput
self.nested_data_class = NestedOutput
self.basic_parser = DataClassParser(
data_class=self.basic_data_class, return_data_class=True, format_type="json"
)
self.nested_parser = DataClassParser(
data_class=self.nested_data_class,
return_data_class=True,
format_type="yaml",
)

def test_basic_data_class_json(self):
input_instance = BasicOutput(
explanation="This is a test.", example="Example sentence."
)
input_str = self.basic_parser.get_input_str(input_instance)
self.assertIn("This is a test.", input_str)
self.assertIn("Example sentence.", input_str)

output_format_str = self.basic_parser.get_output_format_str()
self.assertIn("explanation", output_format_str)
self.assertIn("example", output_format_str)

output = self.basic_parser.call(
'{"explanation": "Test explanation", "example": "Test example."}'
)
self.assertIsInstance(output, BasicOutput)

def test_basic_data_class_yaml(self):
self.yaml_parser = DataClassParser(
data_class=self.basic_data_class, return_data_class=True, format_type="yaml"
)
input_instance = BasicOutput(
explanation="This is a test.", example="Example sentence."
)
input_str = self.yaml_parser.get_input_str(input_instance)
self.assertIn("This is a test.", input_str)

self.assertIn("Example sentence.", input_str)

output_format_str = self.yaml_parser.get_output_format_str()
self.assertIn("explanation", output_format_str)
self.assertIn("example", output_format_str)

output = self.yaml_parser.call(
"""explanation: Test explanation
example: Test example."""
)
print(f"output: {output}")
self.assertIsInstance(output, BasicOutput)

def test_nested_data_class_json(self):
input_instance = NestedOutput(
title="Title", description="Description", items=["Item 1", "Item 2"]
)
input_str = self.nested_parser.get_input_str(input_instance)
self.assertIn("Title", input_str)
self.assertIn("Description", input_str)
self.assertIn("Item 1", input_str)
self.assertIn("Item 2", input_str)

output_format_str = self.nested_parser.get_output_format_str()
self.assertIn("title", output_format_str)
self.assertIn("description", output_format_str)
self.assertIn("items", output_format_str)

output = self.nested_parser.call(
"""title: Nested Title
description: Nested description
items:
- Item 1
- Item 2"""
)
self.assertIsInstance(output, NestedOutput)

def test_nested_data_class_yaml(self):
self.nested_parser._format_type = "yaml"
input_instance = NestedOutput(
title="Title", description="Description", items=["Item 1", "Item 2"]
)
input_str = self.nested_parser.get_input_str(input_instance)
self.assertIn("Title", input_str)
self.assertIn("Description", input_str)
self.assertIn("Item 1", input_str)
self.assertIn("Item 2", input_str)

output_format_str = self.nested_parser.get_output_format_str()
self.assertIn("title", output_format_str)
self.assertIn("description", output_format_str)
self.assertIn("items", output_format_str)

output = self.nested_parser.call(
"""title: Nested Title
description: Nested description
items:
- Item 1
- Item 2"""
)
self.assertIsInstance(output, NestedOutput)

def test_invalid_data_class(self):
with self.assertRaises(ValueError):
DataClassParser(data_class=dict) # dict is not a dataclass

def test_invalid_format_type(self):
with self.assertRaises(ValueError):
DataClassParser(
data_class=self.basic_data_class, format_type="xml"
) # Invalid format type


if __name__ == "__main__":
unittest.main()
2 changes: 2 additions & 0 deletions adalflow/tests/test_output_parser.py
@@ -13,6 +13,8 @@ class User(DataClass):
id: int = field(default=1, metadata={"description": "User ID"})
name: str = field(default="John", metadata={"description": "User name"})

__input_fields__ = ["id", "name"]


class TestOutputParsers(unittest.TestCase):

1 change: 1 addition & 0 deletions docs/source/apis/components/index.rst
@@ -49,6 +49,7 @@ Output Parsers
.. autosummary::

components.output_parsers.outputs
components.output_parsers.dataclass_parser

Agent
~~~~~~~~~~~~~~~~~~~~
38 changes: 30 additions & 8 deletions docs/source/tutorials/base_data_class.rst
@@ -1,4 +1,15 @@
.. _core-base_data_class_note:


.. raw:: html

<div style="display: flex; justify-content: flex-start; align-items: center; margin-bottom: 20px;">
<a href="https://colab.research.google.com/github/SylphAI-Inc/AdalFlow/blob/main/notebooks/tutorials/adalflow_dataclasses.ipynb" target="_blank" style="margin-right: 10px;">
<img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg" style="vertical-align: middle;">
</a>

</div>

DataClass
============

@@ -7,10 +18,10 @@ DataClass
.. `Li Yin <https://github.com/liyin2015>`_
In `PyTorch`, ``Tensor`` is the data type used in ``Module`` and ``Optimizer`` across the library.
Tensor wraps a multi-dimensional matrix to better support its operations and computations.
In LLM applications, data constantly needs to be passed to LLMs as strings via prompts and parsed back into structured data from the LLMs' text predictions.
:class:`DataClass<core.base_data_class.DataClass>` is designed to ease this data interaction with LLMs via the prompt (input) and to parse the text prediction (output).
It is even more convenient to use together with :ref:`components-output_parser_note`.

.. figure:: /_static/images/dataclass.png
:align: center
Expand Down Expand Up @@ -61,11 +72,13 @@ Here is how users typically use the ``dataclasses`` module:
We also made the effort to provide more control:

1. **Keep the ordering of your data fields.** We provided :func:`required_field<core.base_data_class.required_field>` with ``default_factory`` to mark a field as required even if it comes after optional fields. We also had to customize the conversion to preserve field ordering when converting to dictionary, JSON, and YAML strings.
2. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses.
3. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries.
2. **Signal the output/input fields.** We allow you to use ``__output_fields__`` and ``__input_fields__`` to explicitly signal the output and input fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in the `__output_fields__`.
3. **Exclude some fields from the output.** All serialization methods support an `exclude` parameter to exclude some fields, even for nested dataclasses.
4. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries (see the sketch after this list).
5. **Easy to use with Output parser.** It works well with output parsers such as ``JsonOutputParser``, ``YamlOutputParser``, and ``DataClassParser``. You can refer to :ref:`components-output_parser_note` for more details.
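
A small sketch of points 3 and 4 above (the ``Question`` and ``Answer`` classes here are defined only for illustration):

.. code-block:: python

    from dataclasses import dataclass, field

    from adalflow.core.base_data_class import DataClass

    @dataclass
    class Question(DataClass):
        question: str = field(metadata={"desc": "The question text"})

    @dataclass
    class Answer(DataClass):
        answer: str = field(metadata={"desc": "The answer text"})
        question: Question = field(metadata={"desc": "The source question"})

    instance = Answer(answer="42", question=Question(question="What is 6 x 7?"))
    print(instance.to_dict())   # the nested Question becomes a nested dict
    print(instance.to_yaml())   # same structure rendered as YAML
    # an ``exclude`` argument can also be passed to drop fields from the output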


Describing the Data Format
Describing the Data Format (Data Class)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. list-table::
@@ -74,6 +87,10 @@ Describing the Data Format

* - **Name**
- **Description**
* - ``__input_fields__``
- A list of fields that are input fields.
* - ``__output_fields__``
- Used more often than ``__input_fields__``. A list of fields that are output fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in `__output_fields__`. (3) Works only with :class:`DataClassParser<core.base_data_class.DataClassParser>`.
* - ``to_schema(cls, exclude) -> Dict``
- Generate a JSON schema which is more detailed than the signature.
* - ``to_schema_str(cls, exclude) -> str``
Expand Down Expand Up @@ -227,7 +244,7 @@ As you can see, it handles the nested dataclass `Question` and the required fiel

.. note::

``Optional`` type hint will not affect the field's required status. You can use this to work with static type checkers such as `mypy` if you want to.
``Optional`` type hint will not affect the field's required status. We recommend not using it with the `dataclasses` module, especially when you are nesting many levels of dataclasses, as it might end up confusing the LLMs.

**Signature**

@@ -600,7 +617,10 @@ You can simply do a bit customization to map the dataset's key to the field name

If you are looking for data types we used to support each component or any other class like `Optimizer`, you can check out the :ref:`core.types<core-types>` file.


About __output_fields__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Though you can use `exclude` in the :class:`JsonOutputParser<components.output_parsers.outputs.JsonOutputParser>` to exclude some fields from the output, it is less readable and less convenient than
directly using `__output_fields__` in the data class to signal the output fields and working directly with :class:`DataClassParser<components.output_parsers.dataclass_parser.DataClassParser>`, as sketched below.
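
A brief sketch (the ``QAOutput`` class here is illustrative, not part of the library):

.. code-block:: python

    from dataclasses import dataclass, field

    import adalflow as adal

    @dataclass
    class QAOutput(adal.DataClass):
        explanation: str = field(metadata={"desc": "A one-sentence explanation."})
        example: str = field(metadata={"desc": "An example sentence."})

        # only these fields, in this order, appear in the output format
        __output_fields__ = ["explanation", "example"]

    parser = adal.DataClassParser(data_class=QAOutput, return_data_class=True)
    print(parser.get_output_format_str())  # format string derived from __output_fields__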

.. admonition:: References
:class: highlight
@@ -616,7 +636,9 @@ You can simply do a bit customization to map the dataset's key to the field name
- :class:`core.base_data_class.DataClassFormatType`
- :func:`core.functional.custom_asdict`
- :ref:`core.base_data_class<core-base_data_class>`

- :class:`core.base_data_class.required_field`
- :class:`components.output_parsers.outputs.JsonOutputParser`
- :class:`components.output_parsers.dataclass_parser.DataClassParser`

.. Document
.. ------------