
Commit

Merge pull request #32 from ucbepic/shreyashankar/dataset
 Add Dataset Class and Parsing Tools
shreyashankar authored Oct 1, 2024
2 parents e8e54cb + 9a82565 commit ab7e87a
Showing 21 changed files with 1,744 additions and 97 deletions.
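For orientation before the per-file diffs: parsing support is configured in two places, a per-dataset "parsing" list and an optional top-level "parsing_tools" list. The sketch below shows a config dict of the shape that the updated Pipeline._update_from_dict (docetl/api.py, below) reads. The dataset keys (type, source, path, parsing) come straight from that method; the fields inside the parsing_tools entry are assumptions, since the ParsingTool model itself is not part of this excerpt.

# Sketch only: dataset keys mirror _update_from_dict(); parsing_tools fields are assumed.
config = {
    "datasets": {
        "input": {
            "type": "file",
            "source": "local",  # key is read verbatim by _update_from_dict; the value here is illustrative
            "path": "input.json",
            "parsing": [
                {"name": "txt_to_string", "input_key": "text", "output_key": "content"}
            ],
        }
    },
    "parsing_tools": [
        # Hypothetical user-defined tool; the field names are an assumption.
        {"name": "my_parser", "function_code": "def my_parser(value):\n    return [value]"}
    ],
    # ...plus the existing "operations", "pipeline", and "default_model" keys.
}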
5 changes: 1 addition & 4 deletions Makefile
@@ -4,10 +4,7 @@ tests:
 	poetry run pytest
 
 tests-basic:
-	poetry run pytest tests/basic/test_basic_map.py
-	poetry run pytest tests/basic/test_basic_reduce_resolve.py
-	poetry run pytest tests/basic/test_basic_parallel_map.py
-	poetry run pytest tests/basic/test_basic_filter_split_gather.py
+	poetry run pytest tests/basic
 
 lint:
 	poetry run ruff check docetl/* --fix
32 changes: 28 additions & 4 deletions docetl/api.py
@@ -5,7 +5,7 @@
 The module provides a high-level API for defining, optimizing, and running document processing pipelines.
 
 Classes:
-    Dataset: Represents a dataset with a type and path.
+    Dataset: Represents a dataset with a type, path, and optional parsing tools.
     BaseOp: Base class for all operation types.
    MapOp: Represents a map operation in the pipeline.
    ResolveOp: Represents a resolve operation for entity resolution.
@@ -27,7 +27,13 @@
     from docetl.api import Pipeline, Dataset, MapOp, ReduceOp
 
     pipeline = Pipeline(
-        datasets={"input": Dataset(type="file", path="input.json")},
+        datasets={
+            "input": Dataset(
+                type="file",
+                path="input.json",
+                parsing=[{"name": "txt_to_string", "input_key": "text", "output_key": "content"}]
+            )
+        },
         operations=[
             MapOp(name="process", type="map", prompt="Process the document"),
             ReduceOp(name="summarize", type="reduce", reduce_key="content")
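To make the new parsing entry above concrete: each entry names a parsing tool, the record key it reads (input_key), and the key it writes (output_key). The implementation of txt_to_string is not part of this diff, so the exact behavior sketched below is an assumption.

# Hypothetical effect of the parsing step above on a single input record, assuming
# txt_to_string loads the referenced text file and stores its contents under output_key.
record_before = {"text": "docs/report.txt"}
record_after = {"text": "docs/report.txt", "content": "full text of report.txt ..."}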
@@ -44,7 +50,7 @@
 """
 
 import os
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 
 import yaml
 from rich import print
@@ -60,6 +66,7 @@
     ResolveOp,
     SplitOp,
     UnnestOp,
+    ParsingTool,
 )
 
 
@@ -103,6 +110,7 @@ def optimize(
             steps=self.steps,
             output=self.output,
             default_model=self.default_model,
+            parsing_tools=self.parsing_tools,
         )
         updated_pipeline._update_from_dict(optimized_config)
         return updated_pipeline
@@ -161,6 +169,11 @@ def _to_dict(self) -> Dict[str, Any]:
                 "output": self.output.dict(),
             },
             "default_model": self.default_model,
+            "parsing_tools": (
+                [tool.dict() for tool in self.parsing_tools]
+                if self.parsing_tools
+                else None
+            ),
         }
 
     def _update_from_dict(self, config: Dict[str, Any]):
@@ -171,7 +184,13 @@ def _update_from_dict(self, config: Dict[str, Any]):
             config (Dict[str, Any]): Dictionary representation of the Pipeline.
         """
         self.datasets = {
-            name: Dataset(**dataset) for name, dataset in config["datasets"].items()
+            name: Dataset(
+                type=dataset["type"],
+                source=dataset["source"],
+                path=dataset["path"],
+                parsing=dataset.get("parsing"),
+            )
+            for name, dataset in config["datasets"].items()
         }
         self.operations = []
         for op in config["operations"]:
@@ -197,3 +216,8 @@ def _update_from_dict(self, config: Dict[str, Any]):
         self.steps = [PipelineStep(**step) for step in config["pipeline"]["steps"]]
         self.output = PipelineOutput(**config["pipeline"]["output"])
         self.default_model = config.get("default_model")
+        self.parsing_tools = (
+            [ParsingTool(**tool) for tool in config.get("parsing_tools", [])]
+            if config.get("parsing_tools")
+            else []
+        )
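The two hunks above thread ParsingTool through serialization (_to_dict) and deserialization (_update_from_dict), but the model's own fields never appear in this excerpt. A hedged sketch of declaring a custom tool and referencing it from a dataset, assuming ParsingTool carries a name plus the tool's source code:

from docetl.api import Dataset, ParsingTool  # ParsingTool is imported into docetl.api by the hunk above

# Hypothetical custom tool; the field names (name, function_code) are assumptions.
ocr_tool = ParsingTool(
    name="ocr_page",
    function_code=(
        "def ocr_page(path):\n"
        "    # run OCR on the file at `path` and return the extracted text\n"
        "    return ['...extracted text...']\n"
    ),
)

dataset = Dataset(
    type="file",
    path="scans.json",
    # Datasets reference tools by name, mirroring the docstring example above.
    parsing=[{"name": "ocr_page", "input_key": "image_path", "output_key": "page_text"}],
)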
22 changes: 15 additions & 7 deletions docetl/builder.py
@@ -8,6 +8,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import yaml
+from docetl.dataset import Dataset, create_parsing_tool_map
 from rich.console import Console
 from rich.status import Status
 from rich.traceback import install
@@ -139,6 +140,11 @@ def __init__(
         self.samples_taken = defaultdict(dict)
         self.resume = resume
 
+        # create parsing tool map
+        self.parsing_tool_map = create_parsing_tool_map(
+            self.config.get("parsing_tools", None)
+        )
+
         home_dir = os.path.expanduser("~")
         cache_dir = os.path.join(home_dir, f".docetl/cache/{yaml_file_suffix}")
         os.makedirs(cache_dir, exist_ok=True)
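create_parsing_tool_map is imported from docetl.dataset, but its body is outside this excerpt. Judging only from how it is called here (with the raw parsing_tools config, possibly None) and consumed later as user_defined_parsing_tool_map, a plausible minimal shape is a name-to-tool index; treat the sketch below as an assumption, not the actual implementation.

from typing import Any, Dict, List, Optional

def create_parsing_tool_map(parsing_tools: Optional[List[Any]]) -> Dict[str, Any]:
    """Assumed behavior: index user-defined parsing tools by name for later lookup."""
    if not parsing_tools:
        return {}
    return {
        # Entries may arrive as plain config dicts or as ParsingTool objects.
        (tool["name"] if isinstance(tool, dict) else tool.name): tool
        for tool in parsing_tools
    }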
@@ -955,16 +961,18 @@ def _get_sample_data(
         if name_hash and name_hash in self.datasets:
             data = self.datasets[name_hash]
         else:
-            dataset = self.config["datasets"].get(dataset_name)
-            if dataset is None:
+            dataset_config = self.config["datasets"].get(dataset_name)
+            if dataset_config is None:
                 raise ValueError(
                     f"Dataset '{dataset_name}' not found in config or previous steps."
                 )
-            if dataset["type"] == "file":
-                with open(dataset["path"], "r") as f:
-                    data = json.load(f)
-            else:
-                raise ValueError(f"Unsupported dataset type: {dataset['type']}")
+            dataset = Dataset(
+                type=dataset_config["type"],
+                path_or_data=dataset_config["path"],
+                parsing=dataset_config.get("parsing", []),
+                user_defined_parsing_tool_map=self.parsing_tool_map,
+            )
+            data = dataset.load()
 
         if sample_size == float("inf"):
             return data
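The net effect of the hunk above: sampling no longer special-cases type == "file" with a bare json.load; every dataset, including ones with parsing configured, now goes through Dataset.load(). A standalone sketch using only the argument names visible in the call above (the file name and parsing entry are illustrative):

from docetl.dataset import Dataset, create_parsing_tool_map

tool_map = create_parsing_tool_map(None)  # no user-defined tools in this sketch
dataset = Dataset(
    type="file",
    path_or_data="input.json",
    parsing=[{"name": "txt_to_string", "input_key": "text", "output_key": "content"}],
    user_defined_parsing_tool_map=tool_map,
)
records = dataset.load()  # list of dicts, with parsing already applied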
(Diffs for the remaining changed files are not shown in this excerpt.)