From c3416753f01c5b48cf7481397853aa7c04be75eb Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 18:32:04 -0700 Subject: [PATCH 01/11] feat: add custom dataset parsers --- docetl/api.py | 24 ++- docetl/builder.py | 23 ++- docetl/dataset.py | 219 ++++++++++++++++++++++ docetl/parsing_tools.py | 138 ++++++++++++++ docetl/runner.py | 44 +++-- docetl/schemas.py | 8 + poetry.lock | 210 ++++++++++++++++++++- pyproject.toml | 6 + tests/basic/sample_texts/one.txt | 11 ++ tests/basic/sample_texts/two.md | 33 ++++ tests/basic/test_pipeline_with_parsing.py | 121 ++++++++++++ tests/test_parsing_tools.py | 122 ++++++++++++ 12 files changed, 936 insertions(+), 23 deletions(-) create mode 100644 docetl/dataset.py create mode 100644 docetl/parsing_tools.py create mode 100644 tests/basic/sample_texts/one.txt create mode 100644 tests/basic/sample_texts/two.md create mode 100644 tests/basic/test_pipeline_with_parsing.py create mode 100644 tests/test_parsing_tools.py diff --git a/docetl/api.py b/docetl/api.py index 780a08f7..601d86d4 100644 --- a/docetl/api.py +++ b/docetl/api.py @@ -5,7 +5,7 @@ The module provides a high-level API for defining, optimizing, and running document processing pipelines. Classes: - Dataset: Represents a dataset with a type and path. + Dataset: Represents a dataset with a type, path, and optional parsing tools. BaseOp: Base class for all operation types. MapOp: Represents a map operation in the pipeline. ResolveOp: Represents a resolve operation for entity resolution. @@ -27,7 +27,13 @@ from docetl.api import Pipeline, Dataset, MapOp, ReduceOp pipeline = Pipeline( - datasets={"input": Dataset(type="file", path="input.json")}, + datasets={ + "input": Dataset( + type="file", + path="input.json", + parsing_tools=[{"name": "txt_to_string", "input_key": "text", "output_key": "content"}] + ) + }, operations=[ MapOp(name="process", type="map", prompt="Process the document"), ReduceOp(name="summarize", type="reduce", reduce_key="content") @@ -44,7 +50,7 @@ """ import os -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import yaml from rich import print @@ -60,6 +66,7 @@ ResolveOp, SplitOp, UnnestOp, + ParsingTool, ) @@ -103,6 +110,7 @@ def optimize( steps=self.steps, output=self.output, default_model=self.default_model, + parsing_tools=self.parsing_tools, ) updated_pipeline._update_from_dict(optimized_config) return updated_pipeline @@ -161,6 +169,11 @@ def _to_dict(self) -> Dict[str, Any]: "output": self.output.dict(), }, "default_model": self.default_model, + "parsing_tools": ( + [tool.dict() for tool in self.parsing_tools] + if self.parsing_tools + else None + ), } def _update_from_dict(self, config: Dict[str, Any]): @@ -197,3 +210,8 @@ def _update_from_dict(self, config: Dict[str, Any]): self.steps = [PipelineStep(**step) for step in config["pipeline"]["steps"]] self.output = PipelineOutput(**config["pipeline"]["output"]) self.default_model = config.get("default_model") + self.parsing_tools = ( + [ParsingTool(**tool) for tool in config.get("parsing_tools", [])] + if config.get("parsing_tools") + else [] + ) diff --git a/docetl/builder.py b/docetl/builder.py index cb577da4..f4a02201 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import yaml +from docetl.dataset import Dataset, create_parsing_tool_map from rich.console import Console from rich.status import Status from rich.traceback import install @@ -139,6 +140,11 @@ def __init__( self.samples_taken 
= defaultdict(dict) self.resume = resume + # create parsing tool map + self.parsing_tool_map = create_parsing_tool_map( + self.config.get("parsing_tools", None) + ) + home_dir = os.path.expanduser("~") cache_dir = os.path.join(home_dir, f".docetl/cache/{yaml_file_suffix}") os.makedirs(cache_dir, exist_ok=True) @@ -955,16 +961,19 @@ def _get_sample_data( if name_hash and name_hash in self.datasets: data = self.datasets[name_hash] else: - dataset = self.config["datasets"].get(dataset_name) - if dataset is None: + dataset_config = self.config["datasets"].get(dataset_name) + if dataset_config is None: raise ValueError( f"Dataset '{dataset_name}' not found in config or previous steps." ) - if dataset["type"] == "file": - with open(dataset["path"], "r") as f: - data = json.load(f) - else: - raise ValueError(f"Unsupported dataset type: {dataset['type']}") + dataset = Dataset( + type=dataset_config["type"], + source=dataset_config["source"], + path_or_data=dataset_config["path"], + parsing_tools=dataset_config.get("parsing_tools", []), + user_defined_parsing_tool_map=self.parsing_tool_map, + ) + data = dataset.load() if sample_size == float("inf"): return data diff --git a/docetl/dataset.py b/docetl/dataset.py new file mode 100644 index 00000000..1adf551d --- /dev/null +++ b/docetl/dataset.py @@ -0,0 +1,219 @@ +from typing import List, Dict, Union, Optional +import os +from pydantic import BaseModel + +from docetl.parsing_tools import PARSING_TOOLS +from docetl.schemas import ParsingTool + + +def create_parsing_tool_map( + parsing_tools: Optional[List[ParsingTool]], +) -> Dict[str, ParsingTool]: + if parsing_tools is None: + return {} + + return {tool.name: tool for tool in parsing_tools} + + +class Dataset: + def __init__( + self, + type: str, + source: str, + path_or_data: Union[str, List[Dict]], + parsing_tools: List[Dict[str, str]] = None, + user_defined_parsing_tool_map: Dict[str, ParsingTool] = {}, + ): + self.type = self._validate_type(type) + self.source = self._validate_source(source) + self.path_or_data = self._validate_path_or_data(path_or_data) + self.parsing_tools = self._validate_parsing_tools(parsing_tools) + self.user_defined_parsing_tool_map = user_defined_parsing_tool_map + + def _validate_type(self, type: str) -> str: + if type not in ["file", "memory"]: + raise ValueError("Type must be 'file' or 'memory'") + return type + + def _validate_source(self, source: str) -> str: + if source != "local": + raise ValueError("Source must be 'local'") + return source + + def _validate_path_or_data( + self, path_or_data: Union[str, List[Dict]] + ) -> Union[str, List[Dict]]: + if self.type == "file": + if not isinstance(path_or_data, str): + raise ValueError("For type 'file', path_or_data must be a string") + valid_extensions = (".json", ".csv") + if not path_or_data.lower().endswith(valid_extensions): + raise ValueError(f"Path must end with one of {valid_extensions}") + elif self.type == "memory": + if not isinstance(path_or_data, list): + raise ValueError( + "For type 'memory', path_or_data must be a list of dictionaries" + ) + return path_or_data + + def _validate_parsing_tools( + self, parsing_tools: Union[List[Dict[str, str]], None] + ) -> List[Dict[str, str]]: + if parsing_tools is None: + return [] + + for tool in parsing_tools: + if ( + not isinstance(tool, dict) + or "input_key" not in tool + or "function" not in tool + or "output_key" not in tool + ): + raise ValueError( + "Each parsing tool must be a dictionary with 'input_key', 'function', and 'output_key' keys" + ) + if ( + not 
isinstance(tool["input_key"], str) + or not isinstance(tool["function"], str) + or not isinstance(tool["output_key"], str) + ): + raise ValueError( + "'input_key', 'function', and 'output_key' in parsing tools must be strings" + ) + if "function_kwargs" in tool and not isinstance( + tool["function_kwargs"], dict + ): + raise ValueError("'function_kwargs', if present, must be a dictionary") + + return parsing_tools + + def __repr__(self): + return f"Dataset(type='{self.type}', source='{self.source}', path_or_data='{self.path_or_data}', parsing_tools={self.parsing_tools})" + + def load(self) -> List[Dict]: + """ + Load the dataset from the specified path or return the in-memory data. + + Returns: + List[Dict]: A list of dictionaries representing the dataset. + """ + if self.type == "memory": + return self._apply_parsing_tools(self.path_or_data) + + _, ext = os.path.splitext(self.path_or_data.lower()) + + if ext == ".json": + import json + + with open(self.path_or_data, "r") as f: + data = json.load(f) + elif ext == ".csv": + import csv + + with open(self.path_or_data, "r") as f: + reader = csv.DictReader(f) + data = list(reader) + else: + raise ValueError(f"Unsupported file extension: {ext}") + + return self._apply_parsing_tools(data) + + def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: + """ + Apply parsing tools to the data. + + Args: + data (List[Dict]): The data to apply parsing tools to. + + Returns: + List[Dict]: The data with parsing tools applied. + """ + for tool in self.parsing_tools: + input_key = tool["input_key"] + if tool["function"] in PARSING_TOOLS: + func = PARSING_TOOLS[tool["function"]] + elif ( + self.user_defined_parsing_tool_map + and tool["function"] in self.user_defined_parsing_tool_map + ): + func = eval( + self.user_defined_parsing_tool_map[tool["function"]].function_code + ) + else: + raise ValueError( + f"Parsing tool {tool['function']} not found. Please define it or use one of our existing parsing tools: {PARSING_TOOLS.keys()}" + ) + + output_key = tool["output_key"] + function_kwargs = tool.get("function_kwargs", {}) + for item in data: + if input_key in item: + item[output_key] = func(item[input_key], **function_kwargs) + else: + raise ValueError(f"Input key {input_key} not found in item: {item}") + + return data + + def sample(self, n: int, random: bool = True) -> List[Dict]: + """ + Sample n items from the dataset. + + Args: + n (int): Number of items to sample. + random (bool): If True, sample randomly. If False, take the first n items. + + Returns: + List[Dict]: A list of n sampled items. 
+ """ + if self.type == "memory": + import random as rd + + data = self.path_or_data + if n > len(data): + raise ValueError( + f"Sample size {n} is larger than dataset size {len(data)}" + ) + sampled_data = rd.sample(data, n) if random else data[:n] + return self._apply_parsing_tools(sampled_data) + + _, ext = os.path.splitext(self.path_or_data.lower()) + + if ext == ".json": + import json + import random as rd + + with open(self.path_or_data, "r") as f: + if random: + data = json.load(f) + if n > len(data): + raise ValueError( + f"Sample size {n} is larger than dataset size {len(data)}" + ) + sampled_data = rd.sample(data, n) + else: + sampled_data = [] + for i, line in enumerate(f): + if i >= n: + break + sampled_data.append(json.loads(line)) + + elif ext == ".csv": + import csv + import random as rd + + with open(self.path_or_data, "r") as f: + reader = csv.DictReader(f) + if random: + data = list(reader) + if n > len(data): + raise ValueError( + f"Sample size {n} is larger than dataset size {len(data)}" + ) + sampled_data = rd.sample(data, n) + else: + sampled_data = [next(reader) for _ in range(n)] + + else: + raise ValueError(f"Unsupported file extension: {ext}") + + return self._apply_parsing_tools(sampled_data) diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py new file mode 100644 index 00000000..c898a08c --- /dev/null +++ b/docetl/parsing_tools.py @@ -0,0 +1,138 @@ +import os +from typing import Optional, List +from litellm import transcription + + +def whisper_speech_to_text(filename: str) -> List[str]: + """ + Transcribe speech from an audio file to text using Whisper model via litellm. + If the file is larger than 25 MB, it's split into 10-minute chunks with 30-second overlap. + + Args: + filename (str): Path to the mp3 or mp4 file. + + Returns: + List[str]: Transcribed text. + """ + import os + + file_size = os.path.getsize(filename) + if file_size > 25 * 1024 * 1024: # 25 MB in bytes + from pydub import AudioSegment + + audio = AudioSegment.from_file(filename) + chunk_length = 10 * 60 * 1000 # 10 minutes in milliseconds + overlap = 30 * 1000 # 30 seconds in milliseconds + + chunks = [] + for i in range(0, len(audio), chunk_length - overlap): + chunk = audio[i : i + chunk_length] + chunks.append(chunk) + + transcriptions = [] + for i, chunk in enumerate(chunks): + temp_filename = f"temp_chunk_{i}_{os.path.basename(filename)}" + chunk.export(temp_filename, format="mp3") + + with open(temp_filename, "rb") as audio_file: + response = transcription(model="whisper-1", file=audio_file) + transcriptions.append(response.text) + + os.remove(temp_filename) + + return transcriptions + else: + with open(filename, "rb") as audio_file: + response = transcription(model="whisper-1", file=audio_file) + + return [response.text] + + +def xlsx_to_string( + filename: str, + orientation: str = "col", + col_order: Optional[List[str]] = None, + doc_per_sheet: bool = False, +) -> List[str]: + """ + Convert an Excel file to a string representation or a list of string representations. + + Args: + filename (str): Path to the xlsx file. + orientation (str): Either "row" or "col" for cell arrangement. + col_order (Optional[List[str]]): List of column names to specify the order. + doc_per_sheet (bool): If True, return a list of strings, one per sheet. + + Returns: + List[str]: String representation(s) of the Excel file content. 
+ """ + import openpyxl + + wb = openpyxl.load_workbook(filename) + + def process_sheet(sheet): + if col_order: + headers = col_order + else: + headers = [cell.value for cell in sheet[1]] + + result = [] + for row in sheet.iter_rows(min_row=2, values_only=True): + row_dict = dict(zip(headers, row)) + if orientation == "col": + result.extend( + [f"{header}: {value}" for header, value in row_dict.items()] + ) + result.append("") # Empty line between rows + else: # row + result.append( + " | ".join( + [f"{header}: {value}" for header, value in row_dict.items()] + ) + ) + + return "\n".join(result) + + if doc_per_sheet: + return [process_sheet(sheet) for sheet in wb.worksheets] + else: + return [process_sheet(wb.active)] + + +def txt_to_string(filename: str) -> List[str]: + """ + Read the content of a text file and return it as a list of strings (only one element). + + Args: + filename (str): Path to the txt or md file. + + Returns: + List[str]: Content of the file as a list of strings. + """ + with open(filename, "r", encoding="utf-8") as file: + return [file.read()] + + +def docx_to_string(filename: str) -> List[str]: + """ + Extract text from a Word document. + + Args: + filename (str): Path to the docx file. + + Returns: + List[str]: Extracted text from the document. + """ + from docx import Document + + doc = Document(filename) + return ["\n".join([paragraph.text for paragraph in doc.paragraphs])] + + +# Define a dictionary mapping function names to their corresponding functions +PARSING_TOOLS = { + "whisper_speech_to_text": whisper_speech_to_text, + "xlsx_to_string": xlsx_to_string, + "txt_to_string": txt_to_string, + "docx_to_string": docx_to_string, +} diff --git a/docetl/runner.py b/docetl/runner.py index d8abb998..716f0b2b 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -9,6 +9,7 @@ from docetl.operations import get_operation from docetl.operations.utils import flush_cache from docetl.utils import load_config +from docetl.dataset import Dataset, create_parsing_tool_map load_dotenv() @@ -47,6 +48,11 @@ def __init__(self, config: Dict, max_threads: int = None): "intermediate_dir" ) + # Create parsing tool map + self.parsing_tool_map = create_parsing_tool_map( + self.config.get("parsing_tools", None) + ) + # Check if output path is correctly formatted as JSON output_path = self.config.get("pipeline", {}).get("output", {}).get("path") if output_path: @@ -122,16 +128,20 @@ def run(self) -> float: total_cost = 0 for step in self.config["pipeline"]["steps"]: step_name = step["name"] - input_data = self.datasets[step["input"]] if "input" in step else None + input_data = ( + self.datasets[step["input"]].load() if "input" in step else None + ) output_data, step_cost = self.execute_step(step, input_data) - self.datasets[step_name] = output_data + self.datasets[step_name] = Dataset("memory", "local", output_data) flush_cache(self.console) total_cost += step_cost self.console.log( f"Step [cyan]{step_name}[/cyan] completed. Cost: [green]${step_cost:.2f}[/green]" ) - self.save_output(self.datasets[self.config["pipeline"]["steps"][-1]["name"]]) + self.save_output( + self.datasets[self.config["pipeline"]["steps"][-1]["name"]].load() + ) self.console.rule("[bold green]Execution Summary[/bold green]") self.console.print(f"[bold green]Total cost: [green]${total_cost:.2f}[/green]") self.console.print( @@ -144,7 +154,7 @@ def load_datasets(self): """ Load all datasets defined in the configuration. - This method reads datasets from files and stores them in the `datasets` attribute. 
+ This method creates Dataset objects for each dataset in the configuration. Raises: ValueError: If an unsupported dataset type is encountered. @@ -152,9 +162,13 @@ def load_datasets(self): self.console.rule("[cyan]Loading Datasets[/cyan]") for name, dataset_config in self.config["datasets"].items(): if dataset_config["type"] == "file": - with open(dataset_config["path"], "r") as file: - self.datasets[name] = json.load(file) - self.datasets[name] = self.datasets[name] + self.datasets[name] = Dataset( + "file", + "local", + dataset_config["path"], + parsing_tools=dataset_config.get("parsing_tools", []), + user_defined_parsing_tool_map=self.parsing_tool_map, + ) self.console.print(f"Loaded dataset: [bold]{name}[/bold]") else: raise ValueError(f"Unsupported dataset type: {dataset_config['type']}") @@ -222,7 +236,7 @@ def execute_step( # If sample is set, sample the input data if op_object.get("sample"): - input_data = input_data[: op_object["sample"]] + input_data = self.datasets[step["input"]].sample(op_object["sample"]) with self.console.status("[bold]Running Operation:[/bold]") as status: status.update(f"Type: [cyan]{op_object['type']}[/cyan]") @@ -238,8 +252,8 @@ def execute_step( self.status, ) if op_object["type"] == "equijoin": - left_data = self.datasets[op_object["left"]] - right_data = self.datasets[op_object["right"]] + left_data = self.datasets[op_object["left"]].load() + right_data = self.datasets[op_object["right"]].load() input_data, cost = operation_instance.execute(left_data, right_data) else: input_data, cost = operation_instance.execute(input_data) @@ -257,13 +271,19 @@ def execute_step( def _load_from_checkpoint_if_exists( self, step_name: str, operation_name: str ) -> Optional[List[Dict]]: + if self.intermediate_dir is None: + return None + checkpoint_path = os.path.join( self.intermediate_dir, step_name, f"{operation_name}.json" ) # check if checkpoint exists if os.path.exists(checkpoint_path): - with open(checkpoint_path, "r") as f: - return json.load(f) + if f"{step_name}_{operation_name}" not in self.datasets: + self.datasets[f"{step_name}_{operation_name}"] = Dataset( + "file", "local", checkpoint_path + ) + return self.datasets[f"{step_name}_{operation_name}"].load() return None def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict]): diff --git a/docetl/schemas.py b/docetl/schemas.py index 3d752a2a..f8086405 100644 --- a/docetl/schemas.py +++ b/docetl/schemas.py @@ -14,9 +14,16 @@ class Tool(BaseModel): function: ToolFunction +class ParsingTool(BaseModel): + name: str + function_code: str + + class Dataset(BaseModel): type: str + source: str path: str + parsing_tools: Optional[List[Dict[str, str]]] = None class BaseOp(BaseModel): @@ -194,4 +201,5 @@ class Pipeline(BaseModel): operations: List[OpType] steps: List[PipelineStep] output: PipelineOutput + parsing_tools: List[ParsingTool] = [] default_model: Optional[str] = None diff --git a/poetry.lock b/poetry.lock index f721bfb4..b33a9e91 100644 --- a/poetry.lock +++ b/poetry.lock @@ -451,6 +451,17 @@ idna = ["idna (>=3.6)"] trio = ["trio (>=0.23)"] wmi = ["wmi (>=1.5.1)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = true +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = 
"sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1023,6 +1034,160 @@ tokenizers = "*" extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"] proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"] +[[package]] +name = "lxml" +version = "5.3.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = true +python-versions = ">=3.6" +files = [ + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, + {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8"}, + {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03"}, + {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7"}, + {file = "lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80"}, + {file = "lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3"}, + {file = 
"lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b"}, + {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080"}, + {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec"}, + {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be"}, + {file = "lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9"}, + {file = "lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859"}, + {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a"}, 
+ {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c"}, + {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8"}, + {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d"}, + {file = "lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30"}, + {file = "lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f"}, + {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a"}, + {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367"}, + {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = 
"sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb"}, + {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b"}, + {file = "lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957"}, + {file = "lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d"}, + {file = "lxml-5.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8f0de2d390af441fe8b2c12626d103540b5d850d585b18fcada58d972b74a74e"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1afe0a8c353746e610bd9031a630a95bcfb1a720684c3f2b36c4710a0a96528f"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56b9861a71575f5795bde89256e7467ece3d339c9b43141dbdd54544566b3b94"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:9fb81d2824dff4f2e297a276297e9031f46d2682cafc484f49de182aa5e5df99"}, + {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2c226a06ecb8cdef28845ae976da407917542c5e6e75dcac7cc33eb04aaeb237"}, + {file = "lxml-5.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7d3d1ca42870cdb6d0d29939630dbe48fa511c203724820fc0fd507b2fb46577"}, + {file = "lxml-5.3.0-cp36-cp36m-win32.whl", hash = "sha256:094cb601ba9f55296774c2d57ad68730daa0b13dc260e1f941b4d13678239e70"}, + {file = "lxml-5.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:eafa2c8658f4e560b098fe9fc54539f86528651f61849b22111a9b107d18910c"}, + {file = "lxml-5.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cb83f8a875b3d9b458cada4f880fa498646874ba4011dc974e071a0a84a1b033"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25f1b69d41656b05885aa185f5fdf822cb01a586d1b32739633679699f220391"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23e0553b8055600b3bf4a00b255ec5c92e1e4aebf8c2c09334f8368e8bd174d6"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ada35dd21dc6c039259596b358caab6b13f4db4d4a7f8665764d616daf9cc1d"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:81b4e48da4c69313192d8c8d4311e5d818b8be1afe68ee20f6385d0e96fc9512"}, + {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:2bc9fd5ca4729af796f9f59cd8ff160fe06a474da40aca03fcc79655ddee1a8b"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07da23d7ee08577760f0a71d67a861019103e4812c87e2fab26b039054594cc5"}, + {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ea2e2f6f801696ad7de8aec061044d6c8c0dd4037608c7cab38a9a4d316bfb11"}, + {file = "lxml-5.3.0-cp37-cp37m-win32.whl", hash = "sha256:5c54afdcbb0182d06836cc3d1be921e540be3ebdf8b8a51ee3ef987537455f84"}, + {file = "lxml-5.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2901429da1e645ce548bf9171784c0f74f0718c3f6150ce166be39e4dd66c3e"}, + {file = "lxml-5.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c56a1d43b2f9ee4786e4658c7903f05da35b923fb53c11025712562d5cc02753"}, + {file = 
"lxml-5.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ee8c39582d2652dcd516d1b879451500f8db3fe3607ce45d7c5957ab2596040"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdf3a3059611f7585a78ee10399a15566356116a4288380921a4b598d807a22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:146173654d79eb1fc97498b4280c1d3e1e5d58c398fa530905c9ea50ea849b22"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0a7056921edbdd7560746f4221dca89bb7a3fe457d3d74267995253f46343f15"}, + {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9e4b47ac0f5e749cfc618efdf4726269441014ae1d5583e047b452a32e221920"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f914c03e6a31deb632e2daa881fe198461f4d06e57ac3d0e05bbcab8eae01945"}, + {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:213261f168c5e1d9b7535a67e68b1f59f92398dd17a56d934550837143f79c42"}, + {file = "lxml-5.3.0-cp38-cp38-win32.whl", hash = "sha256:218c1b2e17a710e363855594230f44060e2025b05c80d1f0661258142b2add2e"}, + {file = "lxml-5.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:315f9542011b2c4e1d280e4a20ddcca1761993dda3afc7a73b01235f8641e903"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1ffc23010330c2ab67fac02781df60998ca8fe759e8efde6f8b756a20599c5de"}, + {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2b3778cb38212f52fac9fe913017deea2fdf4eb1a4f8e4cfc6b009a13a6d3fcc"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0c7a688944891086ba192e21c5229dea54382f4836a209ff8d0a660fac06be"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:747a3d3e98e24597981ca0be0fd922aebd471fa99d0043a3842d00cdcad7ad6a"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86a6b24b19eaebc448dc56b87c4865527855145d851f9fc3891673ff97950540"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b11a5d918a6216e521c715b02749240fb07ae5a1fefd4b7bf12f833bc8b4fe70"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b87753c784d6acb8a25b05cb526c3406913c9d988d51f80adecc2b0775d6aa"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:109fa6fede314cc50eed29e6e56c540075e63d922455346f11e4d7a036d2b8cf"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:02ced472497b8362c8e902ade23e3300479f4f43e45f4105c85ef43b8db85229"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:6b038cc86b285e4f9fea2ba5ee76e89f21ed1ea898e287dc277a25884f3a7dfe"}, + {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:7437237c6a66b7ca341e868cda48be24b8701862757426852c9b3186de1da8a2"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7f41026c1d64043a36fda21d64c5026762d53a77043e73e94b71f0521939cc71"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:482c2f67761868f0108b1743098640fbb2a28a8e15bf3f47ada9fa59d9fe08c3"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1483fd3358963cc5c1c9b122c80606a3a79ee0875bcac0204149fa09d6ff2727"}, + {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash 
= "sha256:2dec2d1130a9cda5b904696cec33b2cfb451304ba9081eeda7f90f724097300a"}, + {file = "lxml-5.3.0-cp39-cp39-win32.whl", hash = "sha256:a0eabd0a81625049c5df745209dc7fcef6e2aea7793e5f003ba363610aa0a3ff"}, + {file = "lxml-5.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:89e043f1d9d341c52bf2af6d02e6adde62e0a46e6755d5eb60dc6e4f0b8aeca2"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83"}, + {file = "lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:94d6c3782907b5e40e21cadf94b13b0842ac421192f26b84c45f13f3c9d5dc27"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c300306673aa0f3ed5ed9372b21867690a17dba38c68c44b287437c362ce486b"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d9b952e07aed35fe2e1a7ad26e929595412db48535921c5013edc8aa4a35ce"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:01220dca0d066d1349bd6a1726856a78f7929f3878f7e2ee83c296c69495309e"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2d9b8d9177afaef80c53c0a9e30fa252ff3036fb1c6494d427c066a4ce6a282f"}, + {file = "lxml-5.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:20094fc3f21ea0a8669dc4c61ed7fa8263bd37d97d93b90f28fc613371e7a875"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ace2c2326a319a0bb8a8b0e5b570c764962e95818de9f259ce814ee666603f19"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92e67a0be1639c251d21e35fe74df6bcc40cba445c2cda7c4a967656733249e2"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5350b55f9fecddc51385463a4f67a5da829bc741e38cf689f38ec9023f54ab"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c1fefd7e3d00921c44dc9ca80a775af49698bbfd92ea84498e56acffd4c5469"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:71a8dd38fbd2f2319136d4ae855a7078c69c9a38ae06e0c17c73fd70fc6caad8"}, + {file = "lxml-5.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:97acf1e1fd66ab53dacd2c35b319d7e548380c2e9e8c54525c6e76d21b1ae3b1"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:68934b242c51eb02907c5b81d138cb977b2129a0a75a8f8b60b01cb8586c7b21"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b710bc2b8292966b23a6a0121f7a6c51d45d2347edcc75f016ac123b8054d3f2"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:18feb4b93302091b1541221196a2155aa296c363fd233814fa11e181adebc52f"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3eb44520c4724c2e1a57c0af33a379eee41792595023f367ba3952a2d96c2aab"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:609251a0ca4770e5a8768ff902aa02bf636339c5a93f9349b48eb1f606f7f3e9"}, + {file = "lxml-5.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:516f491c834eb320d6c843156440fe7fc0d50b33e44387fcec5b02f0bc118a4c"}, + {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml-html-clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.11)"] + [[package]] name = "markdown" version = "3.7" @@ -1550,6 +1715,20 @@ typing-extensions = ">=4.11,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "packaging" version = "24.1" @@ -1760,6 +1939,17 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pydub" +version = "0.25.1" +description = "Manipulate audio with an simple and easy high level interface" +optional = false +python-versions = "*" +files = [ + {file = "pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6"}, + {file = "pydub-0.25.1.tar.gz", hash = "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f"}, +] + [[package]] name = "pygments" version = "2.18.0" @@ -1867,6 +2057,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-docx" +version = "1.1.2" +description = "Create, read, and update Microsoft Word .docx files." 
+optional = true +python-versions = ">=3.7" +files = [ + {file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"}, + {file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +typing-extensions = ">=4.9.0" + [[package]] name = "python-dotenv" version = "1.0.1" @@ -2877,7 +3082,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +parsing = ["openpyxl", "python-docx"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "25275c6149c2285114bebb25b4e800cafa4f55be91a7e17a0e4901d9f1c10589" +content-hash = "73859cbdc653a343e81e2ce0e68c24cb9fbb8592411b9ad77ed59a762c48e645" diff --git a/pyproject.toml b/pyproject.toml index 21b9ba54..1e15bea5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,12 @@ pydantic = "^2.9.2" asteval = "^1.0.4" scikit-learn = "^1.5.2" numpy = "^1.26.4" +openpyxl = { version = "^3.1.5", optional = true } +python-docx = { version = "^1.1.2", optional = true } +pydub = { version = "^0.25.1", optional = true } + +[tool.poetry.extras] +parsing = ["python-docx", "openpyxl", "pydub"] [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" diff --git a/tests/basic/sample_texts/one.txt b/tests/basic/sample_texts/one.txt new file mode 100644 index 00000000..bddb3867 --- /dev/null +++ b/tests/basic/sample_texts/one.txt @@ -0,0 +1,11 @@ +Once upon a time, in a quaint village nestled among rolling hills, there lived a curious young girl named Lily. She had always dreamed of adventure beyond the confines of her small town. One day, while exploring the attic of her grandmother's old house, Lily discovered a dusty, leather-bound book with strange symbols etched on its cover. + +As she opened the book, a swirl of glittering mist escaped from its pages, enveloping her in a magical aura. Suddenly, Lily found herself transported to a fantastical world filled with talking animals, floating islands, and shimmering forests. + +Guided by a wise old owl named Hoot, Lily embarked on a quest to find the lost key of harmony, which would restore balance to this enchanted realm. Along her journey, she befriended a mischievous fox, outsmarted a grumpy troll, and solved riddles posed by ancient tree spirits. + +With each challenge she overcame, Lily grew braver and more confident. She learned that true magic lies not in spells or potions, but in the power of kindness, perseverance, and friendship. + +As Lily finally reached the crystal cave where the key of harmony was hidden, she realized that the real treasure was the incredible adventure she had experienced and the lifelong friends she had made along the way. + +With a bittersweet heart, Lily used the key to return home, knowing that her ordinary life would never be the same again. From that day forward, she approached each day with the wonder and courage of a true adventurer, always ready for the next exciting chapter in her story. 
diff --git a/tests/basic/sample_texts/two.md b/tests/basic/sample_texts/two.md new file mode 100644 index 00000000..dbb9e28d --- /dev/null +++ b/tests/basic/sample_texts/two.md @@ -0,0 +1,33 @@ +# The Enchanted Forest + +Once upon a time, in a land far beyond the reaches of our modern world, there lay a mysterious and enchanted forest. This forest, known as the Whispering Woods, was said to be alive with magic and wonder. + +## The Guardian of the Woods + +At the heart of the Whispering Woods lived an ancient tree spirit named Eldora. With bark as silver as moonlight and leaves that shimmered like emeralds, Eldora had watched over the forest for countless centuries. + +## The Lost Traveler + +One misty morning, a young traveler named Finn stumbled into the Whispering Woods. Lost and weary, he marveled at the ethereal beauty of the forest. + +### A Magical Encounter + +As Finn wandered deeper into the woods, he heard a soft, melodious voice carried on the breeze. It was Eldora, calling out to him: + +> "Welcome, young one. What brings you to our magical realm?" + +Finn, awestruck, replied, "I've lost my way, kind spirit. Can you help me find my path?" + +### The Quest Begins + +Eldora smiled, her leaves rustling gently. "To find your true path, you must first complete three tasks: + +1. Befriend the Moonlight Rabbits +2. Solve the Riddle of the Babbling Brook +3. Plant a seed of hope in the Glade of Dreams" + +And so, Finn's adventure in the Whispering Woods began, filled with magical creatures, enigmatic puzzles, and the promise of self-discovery. + +--- + +_To be continued..._ diff --git a/tests/basic/test_pipeline_with_parsing.py b/tests/basic/test_pipeline_with_parsing.py new file mode 100644 index 00000000..13c204d1 --- /dev/null +++ b/tests/basic/test_pipeline_with_parsing.py @@ -0,0 +1,121 @@ +import pytest +import json +import os +import tempfile +from docetl.runner import DSLRunner +from docetl.utils import load_config +import yaml + +# Sample configuration for the test +SAMPLE_CONFIG = """ +default_model: "gpt-4o-mini" + +operations: + - name: map_operation + type: map + prompt: | + Summarize the following text in one sentence: "{{ input.content }}" + output: + schema: + summary: string + model: "gpt-4o-mini" + +datasets: + sample_dataset: + type: file + source: local + path: "tests/sample_data.json" + parsing_tools: + - input_key: text_file_path + function: txt_to_string + output_key: content + +pipeline: + steps: + - name: summarize_text + input: sample_dataset + operations: + - map_operation + + output: + type: file + path: "tests/output.json" +""" + +SAMPLE_JSON_DATA = [ + {"id": 1, "text_file_path": "tests/basic/sample_texts/one.txt"}, + {"id": 2, "text_file_path": "tests/basic/sample_texts/two.md"}, +] + +# Read sample text content from files +with open("tests/basic/sample_texts/one.txt", "r") as f: + SAMPLE_TEXT_CONTENT_ONE = f.read() + +with open("tests/basic/sample_texts/two.md", "r") as f: + SAMPLE_TEXT_CONTENT_TWO = f.read() + + +@pytest.fixture +def config_file(): + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".yaml", delete=False + ) as temp_file: + temp_file.write(SAMPLE_CONFIG) + temp_file.flush() + yield temp_file.name + os.unlink(temp_file.name) + + +def test_pipeline_with_parsing(config_file): + # Update the config with the correct sample data path + config = load_config(config_file) + + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".json", delete=False + ) as sample_data_file: + json.dump(SAMPLE_JSON_DATA, sample_data_file) + 
sample_data_file.flush() + config["datasets"]["sample_dataset"]["path"] = sample_data_file.name + + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".json", delete=False + ) as output_file: + config["pipeline"]["output"]["path"] = output_file.name + + # Write the updated config back to the file + with open(config_file, "w") as f: + yaml.dump(config, f) + + # Create and run the DSLRunner + runner = DSLRunner.from_yaml(config_file) + total_cost = runner.run() + + # Check if the output file was created + assert os.path.exists(output_file.name), "Output file was not created" + + # Load and check the output + with open(output_file.name, "r") as f: + output_data = json.load(f) + + # Verify the output + assert len(output_data) == len( + SAMPLE_JSON_DATA + ), f"Expected {len(SAMPLE_JSON_DATA)} output items" + for item in output_data: + assert "summary" in item, "Summary was not generated" + assert isinstance(item["summary"], str), "Summary is not a string" + + # Check if the cost was calculated and is greater than 0 + assert total_cost > 0, "Total cost was not calculated or is 0" + + print(f"Pipeline executed successfully. Total cost: ${total_cost:.2f}") + + # Assert that each output has at least 50 characters + for item in output_data: + assert len(item["summary"]) >= 50, "Summary is not long enough" + + # Clean up the output file + os.unlink(output_file.name) + + # Clean up the sample data file + os.remove(sample_data_file.name) diff --git a/tests/test_parsing_tools.py b/tests/test_parsing_tools.py new file mode 100644 index 00000000..1954548e --- /dev/null +++ b/tests/test_parsing_tools.py @@ -0,0 +1,122 @@ +import pytest +import os +import tempfile +from docetl import parsing_tools + + +@pytest.fixture +def temp_audio_file(): + import requests + + url = "https://listenaminute.com/a/animals.mp3" + response = requests.get(url) + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + temp_file.write(response.content) + yield temp_file.name + return temp_file.name + + +@pytest.fixture +def temp_xlsx_file(): + import openpyxl + + wb = openpyxl.Workbook() + ws = wb.active + ws.append(["Name", "Age", "City"]) + ws.append(["Alice", 30, "New York"]) + ws.append(["Bob", 25, "London"]) + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as temp_file: + wb.save(temp_file.name) + yield temp_file.name + return temp_file.name + + +@pytest.fixture +def temp_txt_file(): + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txt", delete=False + ) as temp_file: + temp_file.write("This is a test text file.\nIt has multiple lines.") + yield temp_file.name + return temp_file.name + + +@pytest.fixture +def temp_docx_file(): + from docx import Document + + doc = Document() + doc.add_paragraph("This is a test Word document.") + doc.add_paragraph("It has multiple paragraphs.") + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_file: + doc.save(temp_file.name) + yield temp_file.name + return temp_file.name + + +def test_whisper_speech_to_text(temp_audio_file): + result = parsing_tools.whisper_speech_to_text(temp_audio_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert isinstance(result[0], str) + assert len(result[0]) > 0 # Ensure some text was transcribed + + +def test_xlsx_to_string(temp_xlsx_file): + result = parsing_tools.xlsx_to_string(temp_xlsx_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert "Name: Alice" in result[0] + assert "Age: 30" in result[0] + assert "City: New York" in result[0] + 
assert "Name: Bob" in result[0] + assert "Age: 25" in result[0] + assert "City: London" in result[0] + + +def test_xlsx_to_string_row_orientation(temp_xlsx_file): + result = parsing_tools.xlsx_to_string(temp_xlsx_file, orientation="row") + + assert isinstance(result, list) + assert len(result) == 1 + assert "Name: Alice | Age: 30 | City: New York" in result[0] + assert "Name: Bob | Age: 25 | City: London" in result[0] + + +def test_xlsx_to_string_doc_per_sheet(temp_xlsx_file): + result = parsing_tools.xlsx_to_string(temp_xlsx_file, doc_per_sheet=True) + + assert isinstance(result, list) + assert len(result) == 1 # Only one sheet in our test file + assert "Name: Alice" in result[0] + assert "Age: 30" in result[0] + assert "City: New York" in result[0] + + +def test_txt_to_string(temp_txt_file): + result = parsing_tools.txt_to_string(temp_txt_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert result[0] == "This is a test text file.\nIt has multiple lines." + + +def test_docx_to_string(temp_docx_file): + result = parsing_tools.docx_to_string(temp_docx_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert "This is a test Word document." in result[0] + assert "It has multiple paragraphs." in result[0] + + +# Clean up temporary files after all tests have passed +def pytest_sessionfinish(session, exitstatus): + if exitstatus == 0: + for fixture in [temp_audio_file, temp_xlsx_file, temp_txt_file, temp_docx_file]: + file_path = session.config.cache.get(fixture.__name__, None) + if file_path and os.path.exists(file_path): + os.remove(file_path) + os.unlink(file_path) From 871a2b358c8b2e2693c2b5bb4d509681fd0cc21c Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 18:44:46 -0700 Subject: [PATCH 02/11] chore: refactor parsing naming --- docetl/api.py | 10 ++++++++-- docetl/builder.py | 2 +- docetl/dataset.py | 22 ++++++++++++++++------ docetl/runner.py | 2 +- docetl/schemas.py | 2 +- 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/docetl/api.py b/docetl/api.py index 601d86d4..b16a970c 100644 --- a/docetl/api.py +++ b/docetl/api.py @@ -31,7 +31,7 @@ "input": Dataset( type="file", path="input.json", - parsing_tools=[{"name": "txt_to_string", "input_key": "text", "output_key": "content"}] + parsing=[{"name": "txt_to_string", "input_key": "text", "output_key": "content"}] ) }, operations=[ @@ -184,7 +184,13 @@ def _update_from_dict(self, config: Dict[str, Any]): config (Dict[str, Any]): Dictionary representation of the Pipeline. 
""" self.datasets = { - name: Dataset(**dataset) for name, dataset in config["datasets"].items() + name: Dataset( + type=dataset["type"], + source=dataset["source"], + path=dataset["path"], + parsing=dataset.get("parsing"), + ) + for name, dataset in config["datasets"].items() } self.operations = [] for op in config["operations"]: diff --git a/docetl/builder.py b/docetl/builder.py index f4a02201..cab0c54c 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -970,7 +970,7 @@ def _get_sample_data( type=dataset_config["type"], source=dataset_config["source"], path_or_data=dataset_config["path"], - parsing_tools=dataset_config.get("parsing_tools", []), + parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, ) data = dataset.load() diff --git a/docetl/dataset.py b/docetl/dataset.py index 1adf551d..a572e3f3 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -21,13 +21,13 @@ def __init__( type: str, source: str, path_or_data: Union[str, List[Dict]], - parsing_tools: List[Dict[str, str]] = None, + parsing: List[Dict[str, str]] = None, user_defined_parsing_tool_map: Dict[str, ParsingTool] = {}, ): self.type = self._validate_type(type) self.source = self._validate_source(source) self.path_or_data = self._validate_path_or_data(path_or_data) - self.parsing_tools = self._validate_parsing_tools(parsing_tools) + self.parsing = self._validate_parsing(parsing) self.user_defined_parsing_tool_map = user_defined_parsing_tool_map def _validate_type(self, type: str) -> str: @@ -56,7 +56,7 @@ def _validate_path_or_data( ) return path_or_data - def _validate_parsing_tools( + def _validate_parsing( self, parsing_tools: Union[List[Dict[str, str]], None] ) -> List[Dict[str, str]]: if parsing_tools is None: @@ -88,7 +88,7 @@ def _validate_parsing_tools( return parsing_tools def __repr__(self): - return f"Dataset(type='{self.type}', source='{self.source}', path_or_data='{self.path_or_data}', parsing_tools={self.parsing_tools})" + return f"Dataset(type='{self.type}', source='{self.source}', path_or_data='{self.path_or_data}', parsing={self.parsing})" def load(self) -> List[Dict]: """ @@ -128,7 +128,7 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: Returns: List[Dict]: The data with parsing tools applied. 
""" - for tool in self.parsing_tools: + for tool in self.parsing: input_key = tool["input_key"] if tool["function"] in PARSING_TOOLS: func = PARSING_TOOLS[tool["function"]] @@ -146,11 +146,21 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: output_key = tool["output_key"] function_kwargs = tool.get("function_kwargs", {}) + new_data = [] for item in data: if input_key in item: - item[output_key] = func(item[input_key], **function_kwargs) + result = func(item[input_key], **function_kwargs) + if isinstance(result, list): + for res in result: + new_item = item.copy() + new_item[output_key] = res + new_data.append(new_item) + else: + item[output_key] = result + new_data.append(item) else: raise ValueError(f"Input key {input_key} not found in item: {item}") + data = new_data return data diff --git a/docetl/runner.py b/docetl/runner.py index 716f0b2b..8dc5ef9f 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -166,7 +166,7 @@ def load_datasets(self): "file", "local", dataset_config["path"], - parsing_tools=dataset_config.get("parsing_tools", []), + parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, ) self.console.print(f"Loaded dataset: [bold]{name}[/bold]") diff --git a/docetl/schemas.py b/docetl/schemas.py index f8086405..49343022 100644 --- a/docetl/schemas.py +++ b/docetl/schemas.py @@ -23,7 +23,7 @@ class Dataset(BaseModel): type: str source: str path: str - parsing_tools: Optional[List[Dict[str, str]]] = None + parsing: Optional[List[Dict[str, str]]] = None class BaseOp(BaseModel): From e50cc74418d54ddb9253ad90863c9352f5f773e5 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 18:46:10 -0700 Subject: [PATCH 03/11] chore: update poetry lockfile --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b33a9e91..3e7dc7c9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1943,7 +1943,7 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" name = "pydub" version = "0.25.1" description = "Manipulate audio with an simple and easy high level interface" -optional = false +optional = true python-versions = "*" files = [ {file = "pydub-0.25.1-py2.py3-none-any.whl", hash = "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6"}, @@ -3083,9 +3083,9 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -parsing = ["openpyxl", "python-docx"] +parsing = ["openpyxl", "pydub", "python-docx"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "73859cbdc653a343e81e2ce0e68c24cb9fbb8592411b9ad77ed59a762c48e645" +content-hash = "80b030f8e672413a7c3d20d45f9bc376af822601a2095fc66ca21ff22395c411" From 8f2724b98c66accea20d34a92689b21ad801d162 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 18:48:57 -0700 Subject: [PATCH 04/11] chore: multithreading for parsing --- docetl/dataset.py | 121 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 109 insertions(+), 12 deletions(-) diff --git a/docetl/dataset.py b/docetl/dataset.py index a572e3f3..fe14ffda 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -4,11 +4,21 @@ from docetl.parsing_tools import PARSING_TOOLS from docetl.schemas import ParsingTool +from concurrent.futures import ThreadPoolExecutor, as_completed def create_parsing_tool_map( parsing_tools: Optional[List[ParsingTool]], ) -> Dict[str, ParsingTool]: + """ + Create a mapping of parsing tool names to their corresponding 
ParsingTool objects. + + Args: + parsing_tools (Optional[List[ParsingTool]]): A list of ParsingTool objects. + + Returns: + Dict[str, ParsingTool]: A dictionary mapping tool names to ParsingTool objects. + """ if parsing_tools is None: return {} @@ -16,6 +26,17 @@ def create_parsing_tool_map( class Dataset: + """ + A class representing a dataset with various loading and parsing capabilities. + + Attributes: + type (str): The type of the dataset ('file' or 'memory'). + source (str): The source of the dataset (currently only 'local' is supported). + path_or_data (Union[str, List[Dict]]): The file path or in-memory data. + parsing (List[Dict[str, str]]): A list of parsing tools to apply to the data. + user_defined_parsing_tool_map (Dict[str, ParsingTool]): A map of user-defined parsing tools. + """ + def __init__( self, type: str, @@ -24,6 +45,16 @@ def __init__( parsing: List[Dict[str, str]] = None, user_defined_parsing_tool_map: Dict[str, ParsingTool] = {}, ): + """ + Initialize a Dataset object. + + Args: + type (str): The type of the dataset ('file' or 'memory'). + source (str): The source of the dataset (currently only 'local' is supported). + path_or_data (Union[str, List[Dict]]): The file path or in-memory data. + parsing (List[Dict[str, str]], optional): A list of parsing tools to apply to the data. + user_defined_parsing_tool_map (Dict[str, ParsingTool], optional): A map of user-defined parsing tools. + """ self.type = self._validate_type(type) self.source = self._validate_source(source) self.path_or_data = self._validate_path_or_data(path_or_data) @@ -31,11 +62,35 @@ def __init__( self.user_defined_parsing_tool_map = user_defined_parsing_tool_map def _validate_type(self, type: str) -> str: + """ + Validate the dataset type. + + Args: + type (str): The type to validate. + + Returns: + str: The validated type. + + Raises: + ValueError: If the type is not 'file' or 'memory'. + """ if type not in ["file", "memory"]: raise ValueError("Type must be 'file' or 'memory'") return type def _validate_source(self, source: str) -> str: + """ + Validate the dataset source. + + Args: + source (str): The source to validate. + + Returns: + str: The validated source. + + Raises: + ValueError: If the source is not 'local'. + """ if source != "local": raise ValueError("Source must be 'local'") return source @@ -43,6 +98,18 @@ def _validate_source(self, source: str) -> str: def _validate_path_or_data( self, path_or_data: Union[str, List[Dict]] ) -> Union[str, List[Dict]]: + """ + Validate the path or data of the dataset. + + Args: + path_or_data (Union[str, List[Dict]]): The path or data to validate. + + Returns: + Union[str, List[Dict]]: The validated path or data. + + Raises: + ValueError: If the path or data is invalid for the given type. + """ if self.type == "file": if not isinstance(path_or_data, str): raise ValueError("For type 'file', path_or_data must be a string") @@ -59,6 +126,18 @@ def _validate_path_or_data( def _validate_parsing( self, parsing_tools: Union[List[Dict[str, str]], None] ) -> List[Dict[str, str]]: + """ + Validate the parsing tools. + + Args: + parsing_tools (Union[List[Dict[str, str]], None]): The parsing tools to validate. + + Returns: + List[Dict[str, str]]: The validated parsing tools. + + Raises: + ValueError: If any parsing tool is invalid. + """ if parsing_tools is None: return [] @@ -88,6 +167,12 @@ def _validate_parsing( return parsing_tools def __repr__(self): + """ + Return a string representation of the Dataset object. 
+ + Returns: + str: A string representation of the Dataset object. + """ return f"Dataset(type='{self.type}', source='{self.source}', path_or_data='{self.path_or_data}', parsing={self.parsing})" def load(self) -> List[Dict]: @@ -96,6 +181,9 @@ def load(self) -> List[Dict]: Returns: List[Dict]: A list of dictionaries representing the dataset. + + Raises: + ValueError: If the file extension is unsupported. """ if self.type == "memory": return self._apply_parsing_tools(self.path_or_data) @@ -127,6 +215,9 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: Returns: List[Dict]: The data with parsing tools applied. + + Raises: + ValueError: If a parsing tool is not found or if an input key is missing from an item. """ for tool in self.parsing: input_key = tool["input_key"] @@ -147,19 +238,22 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: output_key = tool["output_key"] function_kwargs = tool.get("function_kwargs", {}) new_data = [] - for item in data: - if input_key in item: - result = func(item[input_key], **function_kwargs) - if isinstance(result, list): - for res in result: - new_item = item.copy() - new_item[output_key] = res - new_data.append(new_item) - else: - item[output_key] = result - new_data.append(item) - else: + + def process_item(item): + if input_key not in item: raise ValueError(f"Input key {input_key} not found in item: {item}") + result = func(item[input_key], **function_kwargs) + if isinstance(result, list): + return [item.copy() | {output_key: res} for res in result] + else: + return [item | {output_key: result}] + + with ThreadPoolExecutor() as executor: + futures = [executor.submit(process_item, item) for item in data] + new_data = [] + for future in as_completed(futures): + new_data.extend(future.result()) + data = new_data return data @@ -174,6 +268,9 @@ def sample(self, n: int, random: bool = True) -> List[Dict]: Returns: List[Dict]: A list of n sampled items. + + Raises: + ValueError: If the sample size is larger than the dataset size or if the file extension is unsupported. 
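+
+        Example (illustrative; assumes ``data.json`` exists locally and holds a
+        list of dictionaries):
+
+            >>> dataset = Dataset(type="file", source="local", path_or_data="data.json")
+            >>> preview = dataset.sample(5)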
""" if self.type == "memory": import random as rd From efe78f55bfdf0e6fd8733acfc62c42c8c35ed2ef Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 21:25:56 -0700 Subject: [PATCH 05/11] docs: update documentation for custom parsers --- Makefile | 5 +- docetl/builder.py | 2 +- docetl/dataset.py | 2 +- docetl/parsing_tools.py | 61 +++++++-- docetl/runner.py | 6 +- docetl/schemas.py | 2 +- docs/concepts/pipelines.md | 30 +++- docs/examples/custom-parsing.md | 235 ++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + poetry.lock | 129 +++++++++++++++++- pyproject.toml | 3 +- tests/test_parsing_tools.py | 52 ++++++- 12 files changed, 500 insertions(+), 28 deletions(-) create mode 100644 docs/examples/custom-parsing.md diff --git a/Makefile b/Makefile index 4d2de654..d3522e1f 100644 --- a/Makefile +++ b/Makefile @@ -4,10 +4,7 @@ tests: poetry run pytest tests-basic: - poetry run pytest tests/basic/test_basic_map.py - poetry run pytest tests/basic/test_basic_reduce_resolve.py - poetry run pytest tests/basic/test_basic_parallel_map.py - poetry run pytest tests/basic/test_basic_filter_split_gather.py + poetry run pytest tests/basic lint: poetry run ruff check docetl/* --fix diff --git a/docetl/builder.py b/docetl/builder.py index cab0c54c..23eb99a3 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -968,8 +968,8 @@ def _get_sample_data( ) dataset = Dataset( type=dataset_config["type"], - source=dataset_config["source"], path_or_data=dataset_config["path"], + source=dataset_config["source"], parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, ) diff --git a/docetl/dataset.py b/docetl/dataset.py index fe14ffda..bcd83ca7 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -40,8 +40,8 @@ class Dataset: def __init__( self, type: str, - source: str, path_or_data: Union[str, List[Dict]], + source: str = "local", parsing: List[Dict[str, str]] = None, user_defined_parsing_tool_map: Dict[str, ParsingTool] = {}, ): diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py index c898a08c..368d6c4b 100644 --- a/docetl/parsing_tools.py +++ b/docetl/parsing_tools.py @@ -72,19 +72,24 @@ def xlsx_to_string( def process_sheet(sheet): if col_order: - headers = col_order + headers = [ + col for col in col_order if col in sheet.iter_cols(1, sheet.max_column) + ] else: headers = [cell.value for cell in sheet[1]] result = [] - for row in sheet.iter_rows(min_row=2, values_only=True): - row_dict = dict(zip(headers, row)) - if orientation == "col": - result.extend( - [f"{header}: {value}" for header, value in row_dict.items()] - ) - result.append("") # Empty line between rows - else: # row + if orientation == "col": + for col_idx, header in enumerate(headers, start=1): + column = sheet.cell(1, col_idx).column_letter + column_values = [cell.value for cell in sheet[column][1:]] + result.append(f"{header}: " + "\n".join(map(str, column_values))) + result.append("") # Empty line between columns + else: # row + for row in sheet.iter_rows(min_row=2, values_only=True): + row_dict = { + header: value for header, value in zip(headers, row) if header + } result.append( " | ".join( [f"{header}: {value}" for header, value in row_dict.items()] @@ -129,10 +134,48 @@ def docx_to_string(filename: str) -> List[str]: return ["\n".join([paragraph.text for paragraph in doc.paragraphs])] +def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]: + """ + Extract text from a PowerPoint presentation. + + Args: + filename (str): Path to the pptx file. 
+ slide_per_document (bool): If True, return each slide as a separate + document. If False, return the entire presentation as one document. + + Returns: + List[str]: Extracted text from the presentation. If slide_per_document + is True, each string in the list represents a single slide. + Otherwise, the list contains a single string with all slides' + content. + """ + from pptx import Presentation + + prs = Presentation(filename) + result = [] + + for slide in prs.slides: + slide_content = [] + for shape in slide.shapes: + if hasattr(shape, "text"): + slide_content.append(shape.text) + + if slide_per_document: + result.append("\n".join(slide_content)) + else: + result.extend(slide_content) + + if not slide_per_document: + result = ["\n".join(result)] + + return result + + # Define a dictionary mapping function names to their corresponding functions PARSING_TOOLS = { "whisper_speech_to_text": whisper_speech_to_text, "xlsx_to_string": xlsx_to_string, "txt_to_string": txt_to_string, "docx_to_string": docx_to_string, + "pptx_to_string": pptx_to_string, } diff --git a/docetl/runner.py b/docetl/runner.py index 8dc5ef9f..e566fac3 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -132,7 +132,7 @@ def run(self) -> float: self.datasets[step["input"]].load() if "input" in step else None ) output_data, step_cost = self.execute_step(step, input_data) - self.datasets[step_name] = Dataset("memory", "local", output_data) + self.datasets[step_name] = Dataset("memory", output_data) flush_cache(self.console) total_cost += step_cost self.console.log( @@ -164,8 +164,8 @@ def load_datasets(self): if dataset_config["type"] == "file": self.datasets[name] = Dataset( "file", - "local", dataset_config["path"], + source="local", parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, ) @@ -281,7 +281,7 @@ def _load_from_checkpoint_if_exists( if os.path.exists(checkpoint_path): if f"{step_name}_{operation_name}" not in self.datasets: self.datasets[f"{step_name}_{operation_name}"] = Dataset( - "file", "local", checkpoint_path + "file", checkpoint_path, "local" ) return self.datasets[f"{step_name}_{operation_name}"].load() return None diff --git a/docetl/schemas.py b/docetl/schemas.py index 49343022..4c9c63ce 100644 --- a/docetl/schemas.py +++ b/docetl/schemas.py @@ -21,8 +21,8 @@ class ParsingTool(BaseModel): class Dataset(BaseModel): type: str - source: str path: str + source: str = "local" parsing: Optional[List[Dict[str, str]]] = None diff --git a/docs/concepts/pipelines.md b/docs/concepts/pipelines.md index fa41588b..ed0e4cae 100644 --- a/docs/concepts/pipelines.md +++ b/docs/concepts/pipelines.md @@ -21,7 +21,7 @@ default_model: gpt-4o-mini ### Datasets -Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list. Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example: +Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list (or row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example: ```yaml datasets: @@ -30,9 +30,35 @@ datasets: path: "user_logs.json" ``` +#### Dynamic Data Loading + +DocETL supports dynamic data loading, allowing you to process various file types by specifying a key that points to a path or using a custom parsing function. 
This feature is particularly useful for handling diverse data sources, such as audio files, PDFs, or any other non-standard format. + +To implement dynamic data loading, you can use parsing tools in your dataset configuration. Here's an example: + +```yaml +datasets: + audio_transcripts: + type: file + source: local + path: "audio_files/audio_paths.json" + parsing_tools: + - input_key: audio_path + function: whisper_speech_to_text + output_key: transcript +``` + +In this example, the dataset configuration specifies a JSON file (audio_paths.json) that contains paths to audio files. The parsing_tools section defines how to process these files: + +- `input_key`: Specifies which key in the JSON contains the path to the audio file. In this example, each object in the dataset should have a "audio_path" key, that represents a path to an audio file or mp3. +- `function`: Names the parsing function to use (in this case, the built-in whisper_speech_to_text function for audio transcription). +- `output_key`: Defines the key where the processed data (transcript) will be stored. You can access this in the pipeline in any prompts with the `{{ input.transcipt }}` syntax. + +This approach allows DocETL to dynamically load and process various file types, extending its capabilities beyond standard JSON or CSV inputs. You can use built-in parsing tools or define custom ones to handle specific file formats or data processing needs. See the [Custom Parsing](../examples/custom-parsing.md) documentation for more details. + !!! note - Currently, DocETL only supports JSON files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL. + Currently, DocETL only supports JSON files or CSV files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL. ### Operators diff --git a/docs/examples/custom-parsing.md b/docs/examples/custom-parsing.md new file mode 100644 index 00000000..499af153 --- /dev/null +++ b/docs/examples/custom-parsing.md @@ -0,0 +1,235 @@ +# Custom Parsing in DocETL + +DocETL provides some custom parsing capabilities that allow you to preprocess your data before it enters the main pipeline. This guide will walk you through creating a pipeline with custom parsing tools using a concrete example. + +## Example Scenario + +Imagine you have: + +- A folder called "sales_data" containing JSON files with paths to Excel spreadsheets of monthly sales reports. +- A folder called "receipts" with JSON files containing paths to scanned receipts in PDF format that you want to process using OCR. + +## Setting Up Custom Parsing + +Let's walk through setting up a pipeline with custom parsing for this scenario: + +### 1. 
Create a Configuration File + +First, create a configuration file (`config.yaml`) that defines your dataset, parsing tools, and pipeline: + +```yaml +default_model: "gpt-4o-mini" + +parsing_tools: + - name: ocr_parser + function_code: | + import pytesseract + from pdf2image import convert_from_path + def ocr_parser(filename: str) -> List[str]: + images = convert_from_path(filename) + text = "" + for image in images: + text += pytesseract.image_to_string(image) + return [text] # Return as a list with one element + +operations: + - name: summarize_sales + type: map + prompt: | + Summarize the following sales data: + {{ input.sales_data }} + output: + schema: + summary: string + model: "gpt-4o-mini" + - name: extract_receipt_info + type: map + prompt: | + Extract the total amount and date from the following receipt text: + {{ input.receipt_text }} + output: + schema: + total_amount: float + date: string + model: "gpt-4o-mini" + +datasets: + sales_reports: + type: file + source: local + path: "sales_data/sales_paths.json" + parsing_tools: + - input_key: excel_path + function: xlsx_to_string + output_key: sales_data + function_kwargs: + orientation: "col" + + receipts: + type: file + source: local + path: "receipts/receipt_paths.json" + parsing_tools: + - input_key: pdf_path + function: ocr_parser + output_key: receipt_text + +pipeline: + steps: + - name: process_sales + input: sales_reports + operations: + - summarize_sales + - name: process_receipts + input: receipts + operations: + - extract_receipt_info + +output: + type: file + path: "output.json" +``` + +### 2. Configuration Breakdown + +In this configuration: + +- We define a custom parsing tool `ocr_parser` for PDF files. +- We use the built-in `xlsx_to_string` parsing tool for Excel files. +- We create two datasets: `sales_reports` for Excel files and `receipts` for PDF files. +- We apply the parsing tools to their respective datasets. +- We define map operations to process the parsed data. + +### 3. Prepare Required Files + +Ensure you have the necessary input files: + +#### JSON file for Excel paths (`sales_data/sales_paths.json`): + +```json +[ + { "id": 1, "excel_path": "sales_data/january_sales.xlsx" }, + { "id": 2, "excel_path": "sales_data/february_sales.xlsx" } +] +``` + +#### JSON file for PDF paths (`receipts/receipt_paths.json`): + +```json +[ + { "id": 1, "pdf_path": "receipts/receipt001.pdf" }, + { "id": 2, "pdf_path": "receipts/receipt002.pdf" } +] +``` + + +#### Parsing Process + +Let's examine how the input files would be parsed using the logic defined in `parsing_tools.py`: + +1. For the Excel files (`sales_data/january_sales.xlsx` and `sales_data/february_sales.xlsx`): + - The `xlsx_to_string` function is used. + - By default, it processes the active sheet of each Excel file. + - The function returns a list containing a single string for each file. + - The string representation includes column headers followed by their respective values. + - For example, if the Excel file has columns "Date", "Product", and "Amount", the output might look like: + + Date: + 2023-01-01 + 2023-01-02 + ... + + Product: + Widget A + Widget B + ... + + Amount: + 100 + 150 + ... + +2. For the PDF files (`receipts/receipt001.pdf` and `receipts/receipt002.pdf`): + - The custom `ocr_parser` function is used. + - It converts each page of the PDF to an image using `pdf2image`. + - Then, it applies OCR to each image using `pytesseract`. + - The function combines the text from all pages and returns it as a list with a single string element. 
+ - The output might look like: + + RECEIPT + Store: Example Store + Date: 2023-05-15 + Items: + 1. Product A - $10.99 + 2. Product B - $15.50 + Total: $26.49 + +These parsed strings are then passed to the respective operations (`summarize_sales` and `extract_receipt_info`) for further processing in the pipeline. + + +### 4. Run the Pipeline + +Execute the pipeline using the DocETL CLI: + +```bash +docetl run config.yaml +``` + +### 5. Check the Output + +After running the pipeline, you'll find the output in `output.json`. It will contain summaries of the sales data and extracted information from the receipts. + +## Understanding the Parsing Tools + +In this example, we used two parsing tools: + +1. **xlsx_to_string**: A built-in parsing tool provided by DocETL. It reads Excel files and converts them to a string representation. + +2. **ocr_parser**: A custom parsing tool we defined for OCR processing of PDF files. *Note that it returns a list containing a single string, which is the format expected by DocETL for parsing tools.* + +## Built-in Parsing Tools + +DocETL provides several built-in parsing tools to handle common file formats and data processing tasks. These tools can be used directly in your configuration by specifying their names in the `function` field of your parsing tools configuration. Here's an overview of the available built-in parsing tools: + +::: docetl.parsing_tools.xlsx_to_string + options: + heading_level: 3 + +::: docetl.parsing_tools.txt_to_string + options: + heading_level: 3 + +::: docetl.parsing_tools.docx_to_string + options: + heading_level: 3 + +::: docetl.parsing_tools.whisper_speech_to_text + options: + heading_level: 3 + +::: docetl.parsing_tools.pptx_to_string + options: + heading_level: 3 + + +### Using Function Arguments with Parsing Tools + +When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions using the function_kwargs field. This allows you to customize the behavior of the parsing tools without modifying their implementation. + +For example, when using the xlsx_to_string parsing tool, you can specify options like the orientation of the data, the order of columns, or whether to process each sheet separately. 
Here's an example of how to use function_kwargs in your configuration: + +```yaml +datasets: + my_sales: + type: file + source: local + path: "sales_data/sales_paths.json" + parsing_tools: + - name: excel_parser + function: xlsx_to_string + function_kwargs: + orientation: row + col_order: ["Date", "Product", "Quantity", "Price"] + doc_per_sheet: true +``` + diff --git a/mkdocs.yml b/mkdocs.yml index 41c60bff..c896a820 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,6 +49,7 @@ nav: - Reporting on Themes from Presidential Debates: examples/presidential-debate-themes.md - Mining Product Reviews for Polarizing Features: examples/mining-product-reviews.md - Medical Document Classification with Ollama: examples/ollama.md + - Datasets With Custom Parsing: examples/custom-parsing.md # - Annotating Legal Documents: examples/annotating-legal-documents.md # - Characterizing Troll Behavior on Wikipedia: examples/characterizing-troll-behavior.md - API Reference: diff --git a/poetry.lock b/poetry.lock index 3e7dc7c9..e82c00af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1038,7 +1038,7 @@ proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", " name = "lxml" version = "5.3.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, @@ -1766,6 +1766,103 @@ files = [ {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, ] +[[package]] +name = "pillow" +version = "10.4.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"}, + {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"}, + {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"}, + {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = 
"sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = 
"pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = 
"sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] 
+typing = ["typing-extensions"] +xmp = ["defusedxml"] + [[package]] name = "platformdirs" version = "4.3.6" @@ -2086,6 +2183,23 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-pptx" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." +optional = false +python-versions = ">=3.8" +files = [ + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" +XlsxWriter = ">=0.5.7" + [[package]] name = "pytkdocs" version = "0.16.2" @@ -2958,6 +3072,17 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." +optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + [[package]] name = "yarl" version = "1.13.1" @@ -3088,4 +3213,4 @@ parsing = ["openpyxl", "pydub", "python-docx"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "80b030f8e672413a7c3d20d45f9bc376af822601a2095fc66ca21ff22395c411" +content-hash = "b9e0e3a033e5d8429c687a88b98d4760c23eb506e8660f0d9566cdfa6ac24bff" diff --git a/pyproject.toml b/pyproject.toml index 1e15bea5..4b9f29ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,10 @@ numpy = "^1.26.4" openpyxl = { version = "^3.1.5", optional = true } python-docx = { version = "^1.1.2", optional = true } pydub = { version = "^0.25.1", optional = true } +python-pptx = { version = "^1.0.2", optional = true } [tool.poetry.extras] -parsing = ["python-docx", "openpyxl", "pydub"] +parsing = ["python-docx", "openpyxl", "pydub", "python-pptx"] [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" diff --git a/tests/test_parsing_tools.py b/tests/test_parsing_tools.py index 1954548e..a4d5e04b 100644 --- a/tests/test_parsing_tools.py +++ b/tests/test_parsing_tools.py @@ -54,6 +54,25 @@ def temp_docx_file(): return temp_file.name +@pytest.fixture +def temp_pptx_file(): + from pptx import Presentation + + prs = Presentation() + slide1 = prs.slides.add_slide(prs.slide_layouts[0]) + slide1.shapes.title.text = "Test Presentation" + slide1.placeholders[1].text = "This is the first slide" + + slide2 = prs.slides.add_slide(prs.slide_layouts[1]) + slide2.shapes.title.text = "Second Slide" + slide2.placeholders[1].text = "This is the second slide" + + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file: + prs.save(temp_file.name) + yield temp_file.name + return temp_file.name + + def test_whisper_speech_to_text(temp_audio_file): result = parsing_tools.whisper_speech_to_text(temp_audio_file) @@ -71,9 +90,6 @@ def test_xlsx_to_string(temp_xlsx_file): assert "Name: Alice" in result[0] assert "Age: 30" in result[0] assert "City: New York" in result[0] - assert "Name: Bob" in result[0] - assert "Age: 25" in result[0] - assert "City: London" in result[0] def test_xlsx_to_string_row_orientation(temp_xlsx_file): @@ -112,10 +128,38 @@ def test_docx_to_string(temp_docx_file): assert "It has multiple paragraphs." 
in result[0] +def test_pptx_to_string(temp_pptx_file): + result = parsing_tools.pptx_to_string(temp_pptx_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert "Test Presentation" in result[0] + assert "This is the first slide" in result[0] + assert "Second Slide" in result[0] + assert "This is the second slide" in result[0] + + +def test_pptx_to_string_slide_per_document(temp_pptx_file): + result = parsing_tools.pptx_to_string(temp_pptx_file, slide_per_document=True) + + assert isinstance(result, list) + assert len(result) == 2 + assert "Test Presentation" in result[0] + assert "This is the first slide" in result[0] + assert "Second Slide" in result[1] + assert "This is the second slide" in result[1] + + # Clean up temporary files after all tests have passed def pytest_sessionfinish(session, exitstatus): if exitstatus == 0: - for fixture in [temp_audio_file, temp_xlsx_file, temp_txt_file, temp_docx_file]: + for fixture in [ + temp_audio_file, + temp_xlsx_file, + temp_txt_file, + temp_docx_file, + temp_pptx_file, + ]: file_path = session.config.cache.get(fixture.__name__, None) if file_path and os.path.exists(file_path): os.remove(file_path) From 6b1fdb5c3ca4e036e14f7965f20795ce4bb18387 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 21:27:36 -0700 Subject: [PATCH 06/11] chore: update poetry lockfile --- poetry.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index e82c00af..547bddf0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1038,7 +1038,7 @@ proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", " name = "lxml" version = "5.3.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, @@ -1770,7 +1770,7 @@ files = [ name = "pillow" version = "10.4.0" description = "Python Imaging Library (Fork)" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, @@ -2187,7 +2187,7 @@ cli = ["click (>=5.0)"] name = "python-pptx" version = "1.0.2" description = "Create, read, and update PowerPoint 2007+ (.pptx) files." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, @@ -3076,7 +3076,7 @@ watchmedo = ["PyYAML (>=3.10)"] name = "xlsxwriter" version = "3.2.0" description = "A Python module for creating Excel XLSX files." 
-optional = false +optional = true python-versions = ">=3.6" files = [ {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, @@ -3208,9 +3208,9 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -parsing = ["openpyxl", "pydub", "python-docx"] +parsing = ["openpyxl", "pydub", "python-docx", "python-pptx"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "b9e0e3a033e5d8429c687a88b98d4760c23eb506e8660f0d9566cdfa6ac24bff" +content-hash = "8a6c32c6ced43a22fb24cc0449bee9bf1079c703dc0467d655a99ea53f60f00c" From cc9c0a2e14a67a9dbfad129535c50ccfa36b92f4 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 21:47:44 -0700 Subject: [PATCH 07/11] refactor: small changes to address PR comments --- docetl/dataset.py | 40 ++++++++++++++++++++++++++++------------ docetl/parsing_tools.py | 13 +++++++------ 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/docetl/dataset.py b/docetl/dataset.py index bcd83ca7..00717e7b 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Union, Optional +from typing import List, Dict, Union, Optional, Any, Callable import os from pydantic import BaseModel @@ -7,6 +7,22 @@ from concurrent.futures import ThreadPoolExecutor, as_completed +def process_item( + item: Dict[str, Any], + input_key: str, + output_key: str, + func: Callable, + **function_kwargs: Dict[str, Any], +): + if input_key not in item: + raise ValueError(f"Input key {input_key} not found in item: {item}") + result = func(item[input_key], **function_kwargs) + if isinstance(result, list): + return [item.copy() | {output_key: res} for res in result] + else: + return [item | {output_key: result}] + + def create_parsing_tool_map( parsing_tools: Optional[List[ParsingTool]], ) -> Dict[str, ParsingTool]: @@ -239,18 +255,18 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]: function_kwargs = tool.get("function_kwargs", {}) new_data = [] - def process_item(item): - if input_key not in item: - raise ValueError(f"Input key {input_key} not found in item: {item}") - result = func(item[input_key], **function_kwargs) - if isinstance(result, list): - return [item.copy() | {output_key: res} for res in result] - else: - return [item | {output_key: result}] - with ThreadPoolExecutor() as executor: - futures = [executor.submit(process_item, item) for item in data] - new_data = [] + futures = [ + executor.submit( + process_item, + item, + input_key, + output_key, + func, + **function_kwargs, + ) + for item in data + ] for future in as_completed(futures): new_data.extend(future.result()) diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py index 368d6c4b..9e41d293 100644 --- a/docetl/parsing_tools.py +++ b/docetl/parsing_tools.py @@ -1,5 +1,6 @@ import os from typing import Optional, List +import io from litellm import transcription @@ -30,16 +31,16 @@ def whisper_speech_to_text(filename: str) -> List[str]: chunks.append(chunk) transcriptions = [] + for i, chunk in enumerate(chunks): - temp_filename = f"temp_chunk_{i}_{os.path.basename(filename)}" - chunk.export(temp_filename, format="mp3") + buffer = io.BytesIO() + buffer.name = f"temp_chunk_{i}_{os.path.basename(filename)}" + chunk.export(buffer, format="mp3") + buffer.seek(0) # Reset buffer position to the beginning - with open(temp_filename, "rb") as audio_file: - response = transcription(model="whisper-1", 
file=audio_file) + response = transcription(model="whisper-1", file=buffer) transcriptions.append(response.text) - os.remove(temp_filename) - return transcriptions else: with open(filename, "rb") as audio_file: From 674c64bf0f2304952f4888c081f266cd9b94f9d0 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 21:57:03 -0700 Subject: [PATCH 08/11] test: reduce the character minimum for parsing test --- docetl/dataset.py | 1 - tests/basic/test_pipeline_with_parsing.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docetl/dataset.py b/docetl/dataset.py index 00717e7b..6649a452 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -1,6 +1,5 @@ from typing import List, Dict, Union, Optional, Any, Callable import os -from pydantic import BaseModel from docetl.parsing_tools import PARSING_TOOLS from docetl.schemas import ParsingTool diff --git a/tests/basic/test_pipeline_with_parsing.py b/tests/basic/test_pipeline_with_parsing.py index 13c204d1..145af2b8 100644 --- a/tests/basic/test_pipeline_with_parsing.py +++ b/tests/basic/test_pipeline_with_parsing.py @@ -110,9 +110,9 @@ def test_pipeline_with_parsing(config_file): print(f"Pipeline executed successfully. Total cost: ${total_cost:.2f}") - # Assert that each output has at least 50 characters + # Assert that each output has at least 40 characters for item in output_data: - assert len(item["summary"]) >= 50, "Summary is not long enough" + assert len(item["summary"]) >= 40, "Summary is not long enough" # Clean up the output file os.unlink(output_file.name) From 9846f013245cd14fad76edfa38d5dc0dce080fb5 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 22:14:39 -0700 Subject: [PATCH 09/11] docs: update documentation for custom parsers --- docetl/schemas.py | 169 ++++++++++++++++++++++ docs/api-reference/python.md | 30 +++- docs/examples/custom-parsing.md | 4 +- docs/python-api.md | 15 +- tests/basic/test_pipeline_with_parsing.py | 2 +- 5 files changed, 203 insertions(+), 17 deletions(-) diff --git a/docetl/schemas.py b/docetl/schemas.py index 4c9c63ce..4ca8dccd 100644 --- a/docetl/schemas.py +++ b/docetl/schemas.py @@ -15,11 +15,63 @@ class Tool(BaseModel): class ParsingTool(BaseModel): + """ + Represents a parsing tool used for custom data parsing in the pipeline. + + Attributes: + name (str): The name of the parsing tool. This should be unique within the pipeline configuration. + function_code (str): The Python code defining the parsing function. This code will be executed + to parse the input data according to the specified logic. It should return a list of strings, where each string is its own document. + + Example: + ```yaml + parsing_tools: + - name: ocr_parser + function_code: | + import pytesseract + from pdf2image import convert_from_path + def ocr_parser(filename: str) -> List[str]: + images = convert_from_path(filename) + text = "" + for image in images: + text += pytesseract.image_to_string(image) + return [text] + ``` + """ + name: str function_code: str class Dataset(BaseModel): + """ + Represents a dataset configuration in the pipeline. + + Attributes: + type (str): The type of the dataset. Must be either 'file' or 'memory'. + path (str): The path to the dataset file or the in-memory data, depending on the type. + source (str): The source of the dataset. Currently, only 'local' is supported. Defaults to 'local'. + parsing (Optional[List[Dict[str, str]]]): A list of parsing tools to apply to the data. 
Each parsing tool + is represented by a dictionary with 'input_key', 'function', and + 'output_key' keys. Defaults to None. + + Example: + ```yaml + datasets: + my_dataset: + type: file + path: input.json + parsing: + - input_key: file_path + function: txt_to_string + output_key: content + ``` + + Note: + The parsing tools are applied in the order they are listed. Each parsing tool takes the output + of the previous tool as its input, allowing for chained processing of the data. + """ + type: str path: str source: str = "local" @@ -184,18 +236,135 @@ class UnnestOp(BaseOp): class PipelineStep(BaseModel): + """ + Represents a step in the pipeline. + + Attributes: + name (str): The name of the step. + operations (List[Union[Dict[str, Any], str]]): A list of operations to be applied in this step. + Each operation can be either a string (the name of the operation) or a dictionary + (for more complex configurations). + input (Optional[str]): The input for this step. It can be either the name of a dataset + or the name of a previous step. If not provided, the step will use the output + of the previous step as its input. + + Example: + ```python + # Simple step with a single operation + process_step = PipelineStep( + name="process_step", + input="my_dataset", + operations=["process"] + ) + + # Step with multiple operations + summarize_step = PipelineStep( + name="summarize_step", + input="process_step", + operations=["summarize"] + ) + + # Step with a more complex operation configuration + custom_step = PipelineStep( + name="custom_step", + input="previous_step", + operations=[ + { + "custom_operation": { + "model": "gpt-4", + "prompt": "Perform a custom analysis on the following text:" + } + } + ] + ) + ``` + + These examples show different ways to configure pipeline steps, from simple + single-operation steps to more complex configurations with custom parameters. + """ + name: str operations: List[Union[Dict[str, Any], str]] input: Optional[str] = None class PipelineOutput(BaseModel): + """ + Represents the output configuration for a pipeline. + + Attributes: + type (str): The type of output. This could be 'file', 'database', etc. + path (str): The path where the output will be stored. This could be a file path, + database connection string, etc., depending on the type. + intermediate_dir (Optional[str]): The directory to store intermediate results, + if applicable. Defaults to None. + + Example: + ```python + output = PipelineOutput( + type="file", + path="/path/to/output.json", + intermediate_dir="/path/to/intermediate/results" + ) + ``` + """ + type: str path: str intermediate_dir: Optional[str] = None class Pipeline(BaseModel): + """ + Represents a complete document processing pipeline. + + Attributes: + name (str): The name of the pipeline. + datasets (Dict[str, Dataset]): A dictionary of datasets used in the pipeline, + where keys are dataset names and values are Dataset objects. + operations (List[OpType]): A list of operations to be performed in the pipeline. + steps (List[PipelineStep]): A list of steps that make up the pipeline. + output (PipelineOutput): The output configuration for the pipeline. + parsing_tools (List[ParsingTool]): A list of parsing tools used in the pipeline. + Defaults to an empty list. + default_model (Optional[str]): The default language model to use for operations + that require one. Defaults to None. 
+ + Example: + ```python + pipeline = Pipeline( + name="document_processing_pipeline", + datasets={ + "input_data": Dataset(type="file", path="/path/to/input.json") + }, + operations=[ + MapOp( + name="process", + type="map", + prompt="Determine what type of document this is: {{ input.content }}", + output={"schema": {"document_type": "string"}} + ), + ReduceOp( + name="summarize", + type="reduce", + reduce_key="document_type", + prompt="Summarize the processed contents: {% for item in inputs %}{{ item.content }} {% endfor %}", + output={"schema": {"summary": "string"}} + ) + ], + steps=[ + PipelineStep(name="process_step", input="input_data", operations=["process"]), + PipelineStep(name="summarize_step", input="process_step", operations=["summarize"]) + ], + output=PipelineOutput(type="file", path="/path/to/output.json"), + default_model="gpt-4o-mini" + ) + ``` + + This example shows a complete pipeline configuration with datasets, operations, + steps, and output settings. + """ + name: str datasets: Dict[str, Dataset] operations: List[OpType] diff --git a/docs/api-reference/python.md b/docs/api-reference/python.md index a756b0db..9ec82bd0 100644 --- a/docs/api-reference/python.md +++ b/docs/api-reference/python.md @@ -1,13 +1,6 @@ # Python API -::: docetl.schemas.Dataset - options: - show_root_heading: true - heading_level: 3 - show_if_no_docstring: false - docstring_options: - ignore_init_summary: false - trim_doctest_flags: true +## Operations ::: docetl.schemas.BaseOp options: @@ -99,6 +92,27 @@ ignore_init_summary: false trim_doctest_flags: true +## Dataset and Pipeline + +::: docetl.schemas.Dataset + options: + show_root_heading: true + heading_level: 3 + show_if_no_docstring: false + docstring_options: + ignore_init_summary: false + trim_doctest_flags: true + +::: docetl.schemas.ParsingTool + options: + show_root_heading: true + heading_level: 3 + show_if_no_docstring: false + docstring_options: + ignore_init_summary: false + trim_doctest_flags: true + + ::: docetl.schemas.PipelineStep options: show_root_heading: true diff --git a/docs/examples/custom-parsing.md b/docs/examples/custom-parsing.md index 499af153..11b9bb51 100644 --- a/docs/examples/custom-parsing.md +++ b/docs/examples/custom-parsing.md @@ -58,7 +58,7 @@ datasets: type: file source: local path: "sales_data/sales_paths.json" - parsing_tools: + parsing: - input_key: excel_path function: xlsx_to_string output_key: sales_data @@ -69,7 +69,7 @@ datasets: type: file source: local path: "receipts/receipt_paths.json" - parsing_tools: + parsing: - input_key: pdf_path function: ocr_parser output_key: receipt_text diff --git a/docs/python-api.md b/docs/python-api.md index 386346e4..3aa6174b 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -21,29 +21,32 @@ from docetl.api import Pipeline, Dataset, MapOp, ReduceOp, PipelineStep, Pipelin # Define datasets datasets = { - "input": Dataset(type="file", path="input.json") + "my_dataset": Dataset(type="file", path="input.json", parsing=[{"input_key": "file_path", "function": "txt_to_string", "output_key": "content"}]), } +# Note that the parsing is applied to the `file_path` key in each item of the dataset, +# and the result is stored in the `content` key. 
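+# The built-in parsing functions registered in docetl.parsing_tools are
+# txt_to_string, xlsx_to_string, docx_to_string, pptx_to_string, and
+# whisper_speech_to_text; most of them rely on the optional `parsing` extra
+# (openpyxl, python-docx, python-pptx, pydub).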
+ # Define operations operations = [ MapOp( name="process", type="map", - prompt="Process the document", - output={"schema": {"processed_content": "string"}} + prompt="Determine what type of document this is: {{ input.content }}", + output={"schema": {"document_type": "string"}} ), ReduceOp( name="summarize", type="reduce", - reduce_key="processed_content", - prompt="Summarize the processed content", + reduce_key="document_type", + prompt="Summarize the processed contents: {% for item in inputs %}{{ item.content }} {% endfor %}", output={"schema": {"summary": "string"}} ) ] # Define pipeline steps steps = [ - PipelineStep(name="process_step", input="input", operations=["process"]), + PipelineStep(name="process_step", input="my_dataset", operations=["process"]), PipelineStep(name="summarize_step", input="process_step", operations=["summarize"]) ] diff --git a/tests/basic/test_pipeline_with_parsing.py b/tests/basic/test_pipeline_with_parsing.py index 145af2b8..7603e7fd 100644 --- a/tests/basic/test_pipeline_with_parsing.py +++ b/tests/basic/test_pipeline_with_parsing.py @@ -25,7 +25,7 @@ type: file source: local path: "tests/sample_data.json" - parsing_tools: + parsing: - input_key: text_file_path function: txt_to_string output_key: content From ca32fa73f63194c21fd85177550bdd331edf18e6 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 22:19:24 -0700 Subject: [PATCH 10/11] fix: remove check for function calling for ollama models --- docetl/operations/utils.py | 48 -------------------------------------- docs/examples/ollama.md | 4 ---- docs/tutorial.md | 2 +- 3 files changed, 1 insertion(+), 53 deletions(-) diff --git a/docetl/operations/utils.py b/docetl/operations/utils.py index ccf359bd..88038b2c 100644 --- a/docetl/operations/utils.py +++ b/docetl/operations/utils.py @@ -156,49 +156,6 @@ def clear_cache(console: Console = Console()): console.log(f"[bold red]Error clearing cache: {str(e)}[/bold red]") -def create_dynamic_model(schema: Dict[str, Any], model_name: str = "DynamicModel"): - fields = {} - - def process_schema(s: Dict[str, Any], prefix: str = "") -> None: - for key, value in s.items(): - field_name = f"{prefix}__{key}" if prefix else key - if isinstance(value, dict): - process_schema(value, field_name) - else: - fields[field_name] = parse_type(value, field_name) - - def parse_type(type_str: str, field_name: str) -> tuple: - type_str = type_str.strip().lower() - if type_str in ["str", "text", "string", "varchar"]: - return (str, ...) - elif type_str in ["int", "integer"]: - return (int, ...) - elif type_str in ["float", "decimal", "number"]: - return (float, ...) - elif type_str in ["bool", "boolean"]: - return (bool, ...) - elif type_str.startswith("list["): - inner_type = type_str[5:-1].strip() - item_type = parse_type(inner_type, f"{field_name}_item")[0] - return (List[item_type], ...) - elif type_str == "list": - return (List[Any], ...) - elif type_str.startswith("{") and type_str.endswith("}"): - subfields = {} - for item in type_str[1:-1].split(","): - sub_key, sub_type = item.strip().split(":") - subfields[sub_key.strip()] = parse_type( - sub_type.strip(), f"{field_name}_{sub_key}" - ) - SubModel = create_model(f"{model_name}_{field_name}", **subfields) - return (SubModel, ...) - else: - return (Any, ...) - - process_schema(schema) - return create_model(model_name, **fields) - - def convert_val(value: Any) -> Dict[str, Any]: """ Convert a string representation of a type to a dictionary representation. 
@@ -432,11 +389,6 @@ def call_llm( Raises: TimeoutError: If the call times out after retrying. """ - if not litellm.supports_function_calling(model): - raise ValueError( - f"Model {model} does not support function calling (which we use for structured outputs). Please use a different model." - ) - key = cache_key(model, op_type, messages, output_schema, scratchpad) max_retries = max_retries_per_timeout diff --git a/docs/examples/ollama.md b/docs/examples/ollama.md index fb9ad908..40f336c8 100644 --- a/docs/examples/ollama.md +++ b/docs/examples/ollama.md @@ -18,10 +18,6 @@ export OLLAMA_API_BASE=http://localhost:11434/ For more information on the Ollama REST API, refer to the [Ollama documentation](https://github.com/ollama/ollama?tab=readme-ov-file#rest-api). -!!! tip "Use Ollama Chat Models for Structured Outputs" - - Use [ollama-chat](https://github.com/BerriAI/litellm/issues/5048) instead of regular Ollama models to improve structured output performance. Llama 3.1 models support tool calling (which is how we get structured outputs). - ## Pipeline Configuration Let's create a pipeline that classifies medical documents into categories such as "Cardiology", "Neurology", "Oncology", etc. diff --git a/docs/tutorial.md b/docs/tutorial.md index 5c907c95..a2b88eca 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -30,7 +30,7 @@ DocETL uses [LiteLLM](https://github.com/BerriAI/litellm) under the hood, which If you choose to use a different provider, be aware that you may encounter unexpected behavior or reduced functionality, especially with operations that depend on structured outputs. We use tool calling to extract structured outputs from the LLM's response, so make sure your provider supports tool calling. - If using Ollama (e.g., llama 3.1), make sure your output schemas are not too complex, since these models are not as good as OpenAI for structured outputs! Use [parallel map operations](operators/parallel-map.md) to reduce the number of output attributes per prompt. Also, use [ollama-chat](https://github.com/BerriAI/litellm/issues/5048) instead of regular Ollama models to improve structured output performance. + If using Ollama (e.g., llama 3.1), make sure your output schemas are not too complex, since these models are not as good as OpenAI for structured outputs! Use [parallel map operations](operators/parallel-map.md) to reduce the number of output attributes per prompt. ## Preparing the Data From 9a8256541c66a9b2993f707d9ec719645315100d Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 30 Sep 2024 22:24:25 -0700 Subject: [PATCH 11/11] fix: remove source param from optimizer --- docetl/builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docetl/builder.py b/docetl/builder.py index 23eb99a3..380faa11 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -969,7 +969,6 @@ def _get_sample_data( dataset = Dataset( type=dataset_config["type"], path_or_data=dataset_config["path"], - source=dataset_config["source"], parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, )
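With the upfront `litellm.supports_function_calling` guard removed from `call_llm` in PATCH 10/11, pipelines that target Ollama-served models are no longer rejected before they run. If a user still wants to verify tool-calling support for a given model before setting it as `default_model`, a minimal standalone sketch (the model names below are placeholders, not part of this patch) might look like:

```python
# Minimal sketch: manually checking tool-calling support with litellm,
# mirroring the guard that PATCH 10/11 removes from call_llm.
# The model names below are placeholders, not part of this patch.
import litellm

for model in ["gpt-4o-mini", "ollama/llama3.1"]:
    try:
        supported = litellm.supports_function_calling(model)
    except Exception:
        # litellm may not recognize every locally served model name.
        supported = False
    print(f"{model}: function calling supported = {supported}")
```

Since DocETL relies on tool calling to extract structured outputs, this kind of check is now left to the user rather than enforced at call time.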