docs: update documentation for custom parsers
shreyashankar committed Oct 1, 2024
1 parent 8f2724b commit efe78f5
Showing 12 changed files with 500 additions and 28 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -4,10 +4,7 @@ tests:
 	poetry run pytest
 
 tests-basic:
-	poetry run pytest tests/basic/test_basic_map.py
-	poetry run pytest tests/basic/test_basic_reduce_resolve.py
-	poetry run pytest tests/basic/test_basic_parallel_map.py
-	poetry run pytest tests/basic/test_basic_filter_split_gather.py
+	poetry run pytest tests/basic
 
 lint:
 	poetry run ruff check docetl/* --fix
2 changes: 1 addition & 1 deletion docetl/builder.py
@@ -968,8 +968,8 @@ def _get_sample_data(
             )
         dataset = Dataset(
             type=dataset_config["type"],
-            source=dataset_config["source"],
             path_or_data=dataset_config["path"],
+            source=dataset_config["source"],
             parsing=dataset_config.get("parsing", []),
             user_defined_parsing_tool_map=self.parsing_tool_map,
         )
2 changes: 1 addition & 1 deletion docetl/dataset.py
@@ -40,8 +40,8 @@ class Dataset:
     def __init__(
         self,
         type: str,
-        source: str,
         path_or_data: Union[str, List[Dict]],
+        source: str = "local",
         parsing: List[Dict[str, str]] = None,
         user_defined_parsing_tool_map: Dict[str, ParsingTool] = {},
     ):
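The net effect of the `Dataset` changes above is that `source` is now optional (defaulting to `"local"`) and follows `path_or_data` in the signature. A minimal sketch of how the updated constructor might be called, assuming the class is importable as `docetl.dataset.Dataset` and that the referenced JSON file exists (both illustrative assumptions):

```python
# Sketch only: file paths and record contents here are hypothetical.
from docetl.dataset import Dataset

# File-backed dataset; `source` can now be omitted because it defaults to "local".
file_ds = Dataset(
    type="file",
    path_or_data="audio_files/audio_paths.json",
    parsing=[
        {
            "input_key": "audio_path",
            "function": "whisper_speech_to_text",
            "output_key": "transcript",
        }
    ],
)

# In-memory dataset, mirroring the runner.py call site below: Dataset("memory", output_data).
memory_ds = Dataset("memory", [{"id": 1, "text": "hello world"}])
```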
61 changes: 52 additions & 9 deletions docetl/parsing_tools.py
@@ -72,19 +72,24 @@ def xlsx_to_string(
 
     def process_sheet(sheet):
         if col_order:
-            headers = col_order
+            headers = [
+                col for col in col_order if col in sheet.iter_cols(1, sheet.max_column)
+            ]
         else:
             headers = [cell.value for cell in sheet[1]]
 
         result = []
-        for row in sheet.iter_rows(min_row=2, values_only=True):
-            row_dict = dict(zip(headers, row))
-            if orientation == "col":
-                result.extend(
-                    [f"{header}: {value}" for header, value in row_dict.items()]
-                )
-                result.append("")  # Empty line between rows
-            else:  # row
+        if orientation == "col":
+            for col_idx, header in enumerate(headers, start=1):
+                column = sheet.cell(1, col_idx).column_letter
+                column_values = [cell.value for cell in sheet[column][1:]]
+                result.append(f"{header}: " + "\n".join(map(str, column_values)))
+                result.append("")  # Empty line between columns
+        else:  # row
+            for row in sheet.iter_rows(min_row=2, values_only=True):
+                row_dict = {
+                    header: value for header, value in zip(headers, row) if header
+                }
                 result.append(
                     " | ".join(
                         [f"{header}: {value}" for header, value in row_dict.items()]
@@ -129,10 +134,48 @@ def docx_to_string(filename: str) -> List[str]:
     return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]
 
 
+def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]:
+    """
+    Extract text from a PowerPoint presentation.
+
+    Args:
+        filename (str): Path to the pptx file.
+        slide_per_document (bool): If True, return each slide as a separate
+            document. If False, return the entire presentation as one document.
+
+    Returns:
+        List[str]: Extracted text from the presentation. If slide_per_document
+            is True, each string in the list represents a single slide.
+            Otherwise, the list contains a single string with all slides'
+            content.
+    """
+    from pptx import Presentation
+
+    prs = Presentation(filename)
+    result = []
+
+    for slide in prs.slides:
+        slide_content = []
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                slide_content.append(shape.text)
+
+        if slide_per_document:
+            result.append("\n".join(slide_content))
+        else:
+            result.extend(slide_content)
+
+    if not slide_per_document:
+        result = ["\n".join(result)]
+
+    return result
+
+
 # Define a dictionary mapping function names to their corresponding functions
 PARSING_TOOLS = {
     "whisper_speech_to_text": whisper_speech_to_text,
     "xlsx_to_string": xlsx_to_string,
     "txt_to_string": txt_to_string,
     "docx_to_string": docx_to_string,
+    "pptx_to_string": pptx_to_string,
 }
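A short sketch of calling the new parser directly; it assumes `python-pptx` is installed and that `deck.pptx` is a locally available file (both assumptions for illustration). The same function can also be resolved by name through the `PARSING_TOOLS` registry shown above:

```python
# Sketch only: "deck.pptx" is a hypothetical file; requires the python-pptx package.
from docetl.parsing_tools import PARSING_TOOLS, pptx_to_string

# One document per slide.
slides = pptx_to_string("deck.pptx", slide_per_document=True)
print(f"extracted {len(slides)} slides")

# Entire presentation as a single document (the default behavior).
whole_deck = pptx_to_string("deck.pptx")[0]

# Lookup by name through the registry defined above.
parser = PARSING_TOOLS["pptx_to_string"]
assert parser is pptx_to_string
```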
6 changes: 3 additions & 3 deletions docetl/runner.py
@@ -132,7 +132,7 @@ def run(self) -> float:
                 self.datasets[step["input"]].load() if "input" in step else None
             )
             output_data, step_cost = self.execute_step(step, input_data)
-            self.datasets[step_name] = Dataset("memory", "local", output_data)
+            self.datasets[step_name] = Dataset("memory", output_data)
             flush_cache(self.console)
             total_cost += step_cost
             self.console.log(
@@ -164,8 +164,8 @@ def load_datasets(self):
             if dataset_config["type"] == "file":
                 self.datasets[name] = Dataset(
                     "file",
-                    "local",
                     dataset_config["path"],
+                    source="local",
                     parsing=dataset_config.get("parsing", []),
                     user_defined_parsing_tool_map=self.parsing_tool_map,
                 )
@@ -281,7 +281,7 @@ def _load_from_checkpoint_if_exists(
         if os.path.exists(checkpoint_path):
             if f"{step_name}_{operation_name}" not in self.datasets:
                 self.datasets[f"{step_name}_{operation_name}"] = Dataset(
-                    "file", "local", checkpoint_path
+                    "file", checkpoint_path, "local"
                 )
             return self.datasets[f"{step_name}_{operation_name}"].load()
         return None
2 changes: 1 addition & 1 deletion docetl/schemas.py
@@ -21,8 +21,8 @@ class ParsingTool(BaseModel):
 
 class Dataset(BaseModel):
     type: str
-    source: str
     path: str
+    source: str = "local"
     parsing: Optional[List[Dict[str, str]]] = None
 
 
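On the schema side, the same reordering means a dataset config no longer has to spell out `source`. A small sketch of validating a config with the Pydantic model, assuming it is importable as `docetl.schemas.Dataset`:

```python
# Sketch only: validates a dataset config against the updated schema defaults.
from docetl.schemas import Dataset as DatasetConfig

cfg = DatasetConfig(type="file", path="user_logs.json")
print(cfg.source)   # -> "local" (new default)
print(cfg.parsing)  # -> None (parsing entries remain optional)
```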
30 changes: 28 additions & 2 deletions docs/concepts/pipelines.md
@@ -21,7 +21,7 @@ default_model: gpt-4o-mini
 
 ### Datasets
 
-Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list. Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:
+Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list (or a row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:
 
 ```yaml
 datasets:
@@ -30,9 +30,35 @@ datasets:
     path: "user_logs.json"
 ```
 
+#### Dynamic Data Loading
+
+DocETL supports dynamic data loading, allowing you to process various file types by specifying a key that points to a path or by using a custom parsing function. This feature is particularly useful for handling diverse data sources, such as audio files, PDFs, or any other non-standard format.
+
+To implement dynamic data loading, you can use parsing tools in your dataset configuration. Here's an example:
+
+```yaml
+datasets:
+  audio_transcripts:
+    type: file
+    source: local
+    path: "audio_files/audio_paths.json"
+    parsing_tools:
+      - input_key: audio_path
+        function: whisper_speech_to_text
+        output_key: transcript
+```
+
+In this example, the dataset configuration specifies a JSON file (audio_paths.json) that contains paths to audio files. The parsing_tools section defines how to process these files:
+
+- `input_key`: Specifies which key in the JSON contains the path to the audio file. In this example, each object in the dataset should have an "audio_path" key whose value is the path to an audio file (e.g., an mp3).
+- `function`: Names the parsing function to use (in this case, the built-in whisper_speech_to_text function for audio transcription).
+- `output_key`: Defines the key where the processed data (the transcript) will be stored. You can access this key in any pipeline prompt with the `{{ input.transcript }}` syntax.
+
+This approach allows DocETL to dynamically load and process various file types, extending its capabilities beyond standard JSON or CSV inputs. You can use built-in parsing tools or define custom ones to handle specific file formats or data processing needs. See the [Custom Parsing](../examples/custom-parsing.md) documentation for more details.
+
 !!! note
 
-    Currently, DocETL only supports JSON files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.
+    Currently, DocETL only supports JSON or CSV files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.

### Operators

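For the custom parsers this commit documents, the built-ins in docetl/parsing_tools.py above show the contract a parser follows: it receives the value stored under `input_key` (typically a file path) and returns a list of strings, one per output document. A hypothetical custom parser written to that pattern might look like the sketch below; registering it so a pipeline can reference it by name under `function` is covered in the Custom Parsing page linked above.

```python
from typing import List


def markdown_to_string(filename: str) -> List[str]:
    """Hypothetical custom parser: read a Markdown file and return its full
    text as a single document, mirroring txt_to_string/docx_to_string."""
    with open(filename, "r", encoding="utf-8") as f:
        return [f.read()]
```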