From efe78f55bfdf0e6fd8733acfc62c42c8c35ed2ef Mon Sep 17 00:00:00 2001
From: Shreya Shankar
Date: Mon, 30 Sep 2024 21:25:56 -0700
Subject: [PATCH] docs: update documentation for custom parsers

---
 Makefile                        |   5 +-
 docetl/builder.py               |   2 +-
 docetl/dataset.py               |   2 +-
 docetl/parsing_tools.py         |  61 +++++++--
 docetl/runner.py                |   6 +-
 docetl/schemas.py               |   2 +-
 docs/concepts/pipelines.md      |  30 +++-
 docs/examples/custom-parsing.md | 235 ++++++++++++++++++++++++++++++++
 mkdocs.yml                      |   1 +
 poetry.lock                     | 129 +++++++++++++++++-
 pyproject.toml                  |   3 +-
 tests/test_parsing_tools.py     |  52 ++++++-
 12 files changed, 500 insertions(+), 28 deletions(-)
 create mode 100644 docs/examples/custom-parsing.md

diff --git a/Makefile b/Makefile
index 4d2de654..d3522e1f 100644
--- a/Makefile
+++ b/Makefile
@@ -4,10 +4,7 @@ tests:
 	poetry run pytest
 
 tests-basic:
-	poetry run pytest tests/basic/test_basic_map.py
-	poetry run pytest tests/basic/test_basic_reduce_resolve.py
-	poetry run pytest tests/basic/test_basic_parallel_map.py
-	poetry run pytest tests/basic/test_basic_filter_split_gather.py
+	poetry run pytest tests/basic
 
 lint:
 	poetry run ruff check docetl/* --fix
diff --git a/docetl/builder.py b/docetl/builder.py
index cab0c54c..23eb99a3 100644
--- a/docetl/builder.py
+++ b/docetl/builder.py
@@ -968,8 +968,8 @@ def _get_sample_data(
         )
         dataset = Dataset(
             type=dataset_config["type"],
-            source=dataset_config["source"],
             path_or_data=dataset_config["path"],
+            source=dataset_config["source"],
             parsing=dataset_config.get("parsing", []),
             user_defined_parsing_tool_map=self.parsing_tool_map,
         )
diff --git a/docetl/dataset.py b/docetl/dataset.py
index fe14ffda..bcd83ca7 100644
--- a/docetl/dataset.py
+++ b/docetl/dataset.py
@@ -40,8 +40,8 @@ class Dataset:
     def __init__(
         self,
         type: str,
-        source: str,
         path_or_data: Union[str, List[Dict]],
+        source: str = "local",
         parsing: List[Dict[str, str]] = None,
         user_defined_parsing_tool_map: Dict[str, ParsingTool] = {},
     ):
diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
index c898a08c..368d6c4b 100644
--- a/docetl/parsing_tools.py
+++ b/docetl/parsing_tools.py
@@ -72,19 +72,24 @@ def xlsx_to_string(
 
     def process_sheet(sheet):
         if col_order:
-            headers = col_order
+            # Keep only the requested columns that actually appear in the
+            # sheet's header row (the first row).
+            first_row = [cell.value for cell in sheet[1]]
+            headers = [col for col in col_order if col in first_row]
         else:
             headers = [cell.value for cell in sheet[1]]
 
         result = []
-        for row in sheet.iter_rows(min_row=2, values_only=True):
-            row_dict = dict(zip(headers, row))
-            if orientation == "col":
-                result.extend(
-                    [f"{header}: {value}" for header, value in row_dict.items()]
-                )
-                result.append("")  # Empty line between rows
-            else:  # row
+        if orientation == "col":
+            header_row = [cell.value for cell in sheet[1]]
+            for header in headers:
+                # Look up the header's actual column so filtering or
+                # reordering via col_order still reads the right cells.
+                col_idx = header_row.index(header) + 1
+                column = sheet.cell(row=1, column=col_idx).column_letter
+                column_values = [cell.value for cell in sheet[column][1:]]
+                result.append(f"{header}: " + "\n".join(map(str, column_values)))
+                result.append("")  # Empty line between columns
+        else:  # row
+            for row in sheet.iter_rows(min_row=2, values_only=True):
+                row_dict = {
+                    header: value for header, value in zip(headers, row) if header
+                }
                 result.append(
                     " | ".join(
                         [f"{header}: {value}" for header, value in row_dict.items()]
                     )
 
@@ -129,10 +134,48 @@ def docx_to_string(filename: str) -> List[str]:
     return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]
 
 
+def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]:
+    """
+    Extract text from a PowerPoint presentation.
+
+    Args:
+        filename (str): Path to the pptx file.
+ slide_per_document (bool): If True, return each slide as a separate + document. If False, return the entire presentation as one document. + + Returns: + List[str]: Extracted text from the presentation. If slide_per_document + is True, each string in the list represents a single slide. + Otherwise, the list contains a single string with all slides' + content. + """ + from pptx import Presentation + + prs = Presentation(filename) + result = [] + + for slide in prs.slides: + slide_content = [] + for shape in slide.shapes: + if hasattr(shape, "text"): + slide_content.append(shape.text) + + if slide_per_document: + result.append("\n".join(slide_content)) + else: + result.extend(slide_content) + + if not slide_per_document: + result = ["\n".join(result)] + + return result + + # Define a dictionary mapping function names to their corresponding functions PARSING_TOOLS = { "whisper_speech_to_text": whisper_speech_to_text, "xlsx_to_string": xlsx_to_string, "txt_to_string": txt_to_string, "docx_to_string": docx_to_string, + "pptx_to_string": pptx_to_string, } diff --git a/docetl/runner.py b/docetl/runner.py index 8dc5ef9f..e566fac3 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -132,7 +132,7 @@ def run(self) -> float: self.datasets[step["input"]].load() if "input" in step else None ) output_data, step_cost = self.execute_step(step, input_data) - self.datasets[step_name] = Dataset("memory", "local", output_data) + self.datasets[step_name] = Dataset("memory", output_data) flush_cache(self.console) total_cost += step_cost self.console.log( @@ -164,8 +164,8 @@ def load_datasets(self): if dataset_config["type"] == "file": self.datasets[name] = Dataset( "file", - "local", dataset_config["path"], + source="local", parsing=dataset_config.get("parsing", []), user_defined_parsing_tool_map=self.parsing_tool_map, ) @@ -281,7 +281,7 @@ def _load_from_checkpoint_if_exists( if os.path.exists(checkpoint_path): if f"{step_name}_{operation_name}" not in self.datasets: self.datasets[f"{step_name}_{operation_name}"] = Dataset( - "file", "local", checkpoint_path + "file", checkpoint_path, "local" ) return self.datasets[f"{step_name}_{operation_name}"].load() return None diff --git a/docetl/schemas.py b/docetl/schemas.py index 49343022..4c9c63ce 100644 --- a/docetl/schemas.py +++ b/docetl/schemas.py @@ -21,8 +21,8 @@ class ParsingTool(BaseModel): class Dataset(BaseModel): type: str - source: str path: str + source: str = "local" parsing: Optional[List[Dict[str, str]]] = None diff --git a/docs/concepts/pipelines.md b/docs/concepts/pipelines.md index fa41588b..ed0e4cae 100644 --- a/docs/concepts/pipelines.md +++ b/docs/concepts/pipelines.md @@ -21,7 +21,7 @@ default_model: gpt-4o-mini ### Datasets -Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list. Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example: +Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list (or row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example: ```yaml datasets: @@ -30,9 +30,35 @@ datasets: path: "user_logs.json" ``` +#### Dynamic Data Loading + +DocETL supports dynamic data loading, allowing you to process various file types by specifying a key that points to a path or using a custom parsing function. 
+This feature is particularly useful for handling diverse data sources, such as audio files, PDFs, or any other non-standard format.
+
+To implement dynamic data loading, you can use parsing tools in your dataset configuration. Here's an example:
+
+```yaml
+datasets:
+  audio_transcripts:
+    type: file
+    source: local
+    path: "audio_files/audio_paths.json"
+    parsing:
+      - input_key: audio_path
+        function: whisper_speech_to_text
+        output_key: transcript
+```
+
+In this example, the dataset configuration specifies a JSON file (`audio_paths.json`) that contains paths to audio files. The `parsing` section defines how to process these files:
+
+- `input_key`: Specifies which key in the JSON contains the path to the audio file. Here, each object in the dataset should have an "audio_path" key that holds the path to an audio file.
+- `function`: Names the parsing function to use (in this case, the built-in `whisper_speech_to_text` function for audio transcription).
+- `output_key`: Defines the key where the processed data (the transcript) will be stored. You can access this key in any pipeline prompt with the `{{ input.transcript }}` syntax.
+
+This approach allows DocETL to dynamically load and process various file types, extending its capabilities beyond standard JSON or CSV inputs. You can use built-in parsing tools or define custom ones to handle specific file formats or data processing needs, as sketched below. See the [Custom Parsing](../examples/custom-parsing.md) documentation for more details.
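+
+For illustration, here is a minimal sketch of a custom parsing tool defined inline in the configuration. The `uppercase_txt_parser` function is a hypothetical example rather than a built-in:
+
+```yaml
+parsing_tools:
+  - name: uppercase_txt_parser
+    function_code: |
+      from typing import List
+
+      def uppercase_txt_parser(filename: str) -> List[str]:
+          # Hypothetical example: read the file at the given path and
+          # return its upper-cased contents as a single-document list.
+          with open(filename) as f:
+              return [f.read().upper()]
+```
+
+Like the built-in tools, a custom function receives the value stored under `input_key` (here, a file path) and returns a list of strings, one string per output document.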
+
 !!! note
 
-    Currently, DocETL only supports JSON files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.
+    Currently, DocETL only supports JSON and CSV files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.
 
 ### Operators
diff --git a/docs/examples/custom-parsing.md b/docs/examples/custom-parsing.md
new file mode 100644
index 00000000..499af153
--- /dev/null
+++ b/docs/examples/custom-parsing.md
@@ -0,0 +1,235 @@
+# Custom Parsing in DocETL
+
+DocETL provides custom parsing capabilities that let you preprocess your data before it enters the main pipeline. This guide walks through building a pipeline with custom parsing tools, using a concrete example.
+
+## Example Scenario
+
+Imagine you have:
+
+- A folder called "sales_data" containing JSON files with paths to Excel spreadsheets of monthly sales reports.
+- A folder called "receipts" with JSON files containing paths to scanned receipts in PDF format that you want to process using OCR.
+
+## Setting Up Custom Parsing
+
+Let's walk through setting up a pipeline with custom parsing for this scenario:
+
+### 1. Create a Configuration File
+
+First, create a configuration file (`config.yaml`) that defines your dataset, parsing tools, and pipeline:
+
+```yaml
+default_model: "gpt-4o-mini"
+
+parsing_tools:
+  - name: ocr_parser
+    function_code: |
+      from typing import List
+      import pytesseract
+      from pdf2image import convert_from_path
+
+      def ocr_parser(filename: str) -> List[str]:
+          images = convert_from_path(filename)
+          text = ""
+          for image in images:
+              text += pytesseract.image_to_string(image)
+          return [text]  # Return as a list with one element
+
+operations:
+  - name: summarize_sales
+    type: map
+    prompt: |
+      Summarize the following sales data:
+      {{ input.sales_data }}
+    output:
+      schema:
+        summary: string
+    model: "gpt-4o-mini"
+  - name: extract_receipt_info
+    type: map
+    prompt: |
+      Extract the total amount and date from the following receipt text:
+      {{ input.receipt_text }}
+    output:
+      schema:
+        total_amount: float
+        date: string
+    model: "gpt-4o-mini"
+
+datasets:
+  sales_reports:
+    type: file
+    source: local
+    path: "sales_data/sales_paths.json"
+    parsing:
+      - input_key: excel_path
+        function: xlsx_to_string
+        output_key: sales_data
+        function_kwargs:
+          orientation: "col"
+
+  receipts:
+    type: file
+    source: local
+    path: "receipts/receipt_paths.json"
+    parsing:
+      - input_key: pdf_path
+        function: ocr_parser
+        output_key: receipt_text
+
+pipeline:
+  steps:
+    - name: process_sales
+      input: sales_reports
+      operations:
+        - summarize_sales
+    - name: process_receipts
+      input: receipts
+      operations:
+        - extract_receipt_info
+
+output:
+  type: file
+  path: "output.json"
+```
+
+### 2. Configuration Breakdown
+
+In this configuration:
+
+- We define a custom parsing tool `ocr_parser` for PDF files.
+- We use the built-in `xlsx_to_string` parsing tool for Excel files.
+- We create two datasets: `sales_reports` for Excel files and `receipts` for PDF files.
+- We apply the parsing tools to their respective datasets via each dataset's `parsing` section.
+- We define map operations to process the parsed data.
+
+### 3. Prepare Required Files
+
+Ensure you have the necessary input files:
+
+#### JSON file for Excel paths (`sales_data/sales_paths.json`):
+
+```json
+[
+  { "id": 1, "excel_path": "sales_data/january_sales.xlsx" },
+  { "id": 2, "excel_path": "sales_data/february_sales.xlsx" }
+]
+```
+
+#### JSON file for PDF paths (`receipts/receipt_paths.json`):
+
+```json
+[
+  { "id": 1, "pdf_path": "receipts/receipt001.pdf" },
+  { "id": 2, "pdf_path": "receipts/receipt002.pdf" }
+]
+```
+
+#### Parsing Process
+
+Let's examine how the input files would be parsed using the logic defined in `parsing_tools.py`:
+
+1. For the Excel files (`sales_data/january_sales.xlsx` and `sales_data/february_sales.xlsx`):
+
+   - The `xlsx_to_string` function is used.
+   - By default, it processes the active sheet of each Excel file.
+   - The function returns a list containing a single string for each file.
+   - The string representation includes column headers followed by their respective values.
+   - For example, if the Excel file has columns "Date", "Product", and "Amount", the output might look like:
+
+     Date:
+     2023-01-01
+     2023-01-02
+     ...
+
+     Product:
+     Widget A
+     Widget B
+     ...
+
+     Amount:
+     100
+     150
+     ...
+
+2. For the PDF files (`receipts/receipt001.pdf` and `receipts/receipt002.pdf`):
+
+   - The custom `ocr_parser` function is used.
+   - It converts each page of the PDF to an image using `pdf2image`.
+   - Then, it applies OCR to each image using `pytesseract`.
+   - The function combines the text from all pages and returns it as a list with a single string element.
+   - The output might look like:
+
+     RECEIPT
+     Store: Example Store
+     Date: 2023-05-15
+     Items:
+     1. Product A - $10.99
+     2. Product B - $15.50
+     Total: $26.49
+
+These parsed strings are then passed to the respective operations (`summarize_sales` and `extract_receipt_info`) for further processing in the pipeline. If you want to preview a parser's output before running the full pipeline, you can call it directly, as shown below.
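+
+For instance, the short Python sketch below mirrors what DocETL does when it applies `xlsx_to_string` to each `excel_path` (this assumes the sample spreadsheet above exists and the `parsing` extra is installed):
+
+```python
+from docetl import parsing_tools
+
+# Parse one of the sales spreadsheets the same way the dataset's
+# parsing configuration would: column-oriented, one document per file.
+docs = parsing_tools.xlsx_to_string(
+    "sales_data/january_sales.xlsx", orientation="col"
+)
+print(docs[0])  # Date: ... Product: ... Amount: ...
+```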
+
+### 4. Run the Pipeline
+
+Execute the pipeline using the DocETL CLI:
+
+```bash
+docetl run config.yaml
+```
+
+### 5. Check the Output
+
+After running the pipeline, you'll find the output in `output.json`. It will contain summaries of the sales data and the information extracted from the receipts.
+
+## Understanding the Parsing Tools
+
+In this example, we used two parsing tools:
+
+1. **xlsx_to_string**: A built-in parsing tool provided by DocETL. It reads Excel files and converts them to a string representation.
+
+2. **ocr_parser**: A custom parsing tool we defined for OCR processing of PDF files. Note that it returns a list containing a single string, which is the format DocETL expects parsing tools to produce. A sketch of what a parsed item looks like follows this list.
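+
+As a sketch of what the pipeline sees after parsing, each item in the `receipts` dataset carries its original keys plus the new `output_key`; the values shown here are made up for the example:
+
+```json
+{
+  "id": 1,
+  "pdf_path": "receipts/receipt001.pdf",
+  "receipt_text": "RECEIPT\nStore: Example Store\nDate: 2023-05-15\n..."
+}
+```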
+
+## Built-in Parsing Tools
+
+DocETL provides several built-in parsing tools to handle common file formats and data processing tasks. These tools can be used directly in your configuration by specifying their names in the `function` field of your parsing configuration. Here's an overview of the available built-in parsing tools:
+
+::: docetl.parsing_tools.xlsx_to_string
+    options:
+        heading_level: 3
+
+::: docetl.parsing_tools.txt_to_string
+    options:
+        heading_level: 3
+
+::: docetl.parsing_tools.docx_to_string
+    options:
+        heading_level: 3
+
+::: docetl.parsing_tools.whisper_speech_to_text
+    options:
+        heading_level: 3
+
+::: docetl.parsing_tools.pptx_to_string
+    options:
+        heading_level: 3
+
+### Using Function Arguments with Parsing Tools
+
+When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions with the `function_kwargs` field. This lets you customize the behavior of a parsing tool without modifying its implementation.
+
+For example, when using the `xlsx_to_string` parsing tool, you can specify the orientation of the data, the order of columns, or whether to process each sheet as its own document. Here's an example of how to use `function_kwargs` in your configuration:
+
+```yaml
+datasets:
+  my_sales:
+    type: file
+    source: local
+    path: "sales_data/sales_paths.json"
+    parsing:
+      - input_key: excel_path
+        function: xlsx_to_string
+        output_key: sales_data
+        function_kwargs:
+          orientation: row
+          col_order: ["Date", "Product", "Quantity", "Price"]
+          doc_per_sheet: true
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index 41c60bff..c896a820 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -49,6 +49,7 @@ nav:
       - Reporting on Themes from Presidential Debates: examples/presidential-debate-themes.md
       - Mining Product Reviews for Polarizing Features: examples/mining-product-reviews.md
       - Medical Document Classification with Ollama: examples/ollama.md
+      - Datasets With Custom Parsing: examples/custom-parsing.md
       # - Annotating Legal Documents: examples/annotating-legal-documents.md
       # - Characterizing Troll Behavior on Wikipedia: examples/characterizing-troll-behavior.md
   - API Reference:
diff --git a/poetry.lock b/poetry.lock
index 3e7dc7c9..e82c00af 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1038,7 +1038,7 @@ proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "
 name = "lxml"
 version = "5.3.0"
 description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
-optional = true
+optional = false
 python-versions = ">=3.6"
 files = [
     {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"},
@@ -1766,6 +1766,103 @@ files = [
     {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"},
 ]
 
+[[package]]
+name = "pillow"
+version = "10.4.0"
+description = "Python Imaging Library (Fork)"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"},
+    {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"},
+    {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"},
+    {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"},
+    {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"},
+    {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"},
+    {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"},
+    {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = 
"sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = 
"pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = 
"sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] 
+typing = ["typing-extensions"] +xmp = ["defusedxml"] + [[package]] name = "platformdirs" version = "4.3.6" @@ -2086,6 +2183,23 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "python-pptx" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." +optional = false +python-versions = ">=3.8" +files = [ + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" +XlsxWriter = ">=0.5.7" + [[package]] name = "pytkdocs" version = "0.16.2" @@ -2958,6 +3072,17 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." +optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + [[package]] name = "yarl" version = "1.13.1" @@ -3088,4 +3213,4 @@ parsing = ["openpyxl", "pydub", "python-docx"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "80b030f8e672413a7c3d20d45f9bc376af822601a2095fc66ca21ff22395c411" +content-hash = "b9e0e3a033e5d8429c687a88b98d4760c23eb506e8660f0d9566cdfa6ac24bff" diff --git a/pyproject.toml b/pyproject.toml index 1e15bea5..4b9f29ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,10 @@ numpy = "^1.26.4" openpyxl = { version = "^3.1.5", optional = true } python-docx = { version = "^1.1.2", optional = true } pydub = { version = "^0.25.1", optional = true } +python-pptx = { version = "^1.0.2", optional = true } [tool.poetry.extras] -parsing = ["python-docx", "openpyxl", "pydub"] +parsing = ["python-docx", "openpyxl", "pydub", "python-pptx"] [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" diff --git a/tests/test_parsing_tools.py b/tests/test_parsing_tools.py index 1954548e..a4d5e04b 100644 --- a/tests/test_parsing_tools.py +++ b/tests/test_parsing_tools.py @@ -54,6 +54,25 @@ def temp_docx_file(): return temp_file.name +@pytest.fixture +def temp_pptx_file(): + from pptx import Presentation + + prs = Presentation() + slide1 = prs.slides.add_slide(prs.slide_layouts[0]) + slide1.shapes.title.text = "Test Presentation" + slide1.placeholders[1].text = "This is the first slide" + + slide2 = prs.slides.add_slide(prs.slide_layouts[1]) + slide2.shapes.title.text = "Second Slide" + slide2.placeholders[1].text = "This is the second slide" + + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file: + prs.save(temp_file.name) + yield temp_file.name + return temp_file.name + + def test_whisper_speech_to_text(temp_audio_file): result = parsing_tools.whisper_speech_to_text(temp_audio_file) @@ -71,9 +90,6 @@ def test_xlsx_to_string(temp_xlsx_file): assert "Name: Alice" in result[0] assert "Age: 30" in result[0] assert "City: New York" in result[0] - assert "Name: Bob" in result[0] - assert "Age: 25" in result[0] - assert "City: London" in result[0] def test_xlsx_to_string_row_orientation(temp_xlsx_file): @@ -112,10 +128,38 @@ def test_docx_to_string(temp_docx_file): assert "It has multiple paragraphs." 
in result[0] +def test_pptx_to_string(temp_pptx_file): + result = parsing_tools.pptx_to_string(temp_pptx_file) + + assert isinstance(result, list) + assert len(result) == 1 + assert "Test Presentation" in result[0] + assert "This is the first slide" in result[0] + assert "Second Slide" in result[0] + assert "This is the second slide" in result[0] + + +def test_pptx_to_string_slide_per_document(temp_pptx_file): + result = parsing_tools.pptx_to_string(temp_pptx_file, slide_per_document=True) + + assert isinstance(result, list) + assert len(result) == 2 + assert "Test Presentation" in result[0] + assert "This is the first slide" in result[0] + assert "Second Slide" in result[1] + assert "This is the second slide" in result[1] + + # Clean up temporary files after all tests have passed def pytest_sessionfinish(session, exitstatus): if exitstatus == 0: - for fixture in [temp_audio_file, temp_xlsx_file, temp_txt_file, temp_docx_file]: + for fixture in [ + temp_audio_file, + temp_xlsx_file, + temp_txt_file, + temp_docx_file, + temp_pptx_file, + ]: file_path = session.config.cache.get(fixture.__name__, None) if file_path and os.path.exists(file_path): os.remove(file_path)