docs: update documentation for custom parsers
shreyashankar committed Oct 1, 2024
1 parent 8f2724b commit efe78f5
Showing 12 changed files with 500 additions and 28 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -4,10 +4,7 @@ tests:
 	poetry run pytest
 
 tests-basic:
-	poetry run pytest tests/basic/test_basic_map.py
-	poetry run pytest tests/basic/test_basic_reduce_resolve.py
-	poetry run pytest tests/basic/test_basic_parallel_map.py
-	poetry run pytest tests/basic/test_basic_filter_split_gather.py
+	poetry run pytest tests/basic
 
 lint:
 	poetry run ruff check docetl/* --fix
2 changes: 1 addition & 1 deletion docetl/builder.py
@@ -968,8 +968,8 @@ def _get_sample_data(
             )
         dataset = Dataset(
             type=dataset_config["type"],
-            source=dataset_config["source"],
             path_or_data=dataset_config["path"],
+            source=dataset_config["source"],
             parsing=dataset_config.get("parsing", []),
             user_defined_parsing_tool_map=self.parsing_tool_map,
         )
2 changes: 1 addition & 1 deletion docetl/dataset.py
@@ -40,8 +40,8 @@ class Dataset:
     def __init__(
         self,
         type: str,
-        source: str,
         path_or_data: Union[str, List[Dict]],
+        source: str = "local",
         parsing: List[Dict[str, str]] = None,
         user_defined_parsing_tool_map: Dict[str, ParsingTool] = {},
     ):
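The net effect of the `Dataset` changes above is that `source` is now optional (defaulting to `"local"`) and follows `path_or_data` in the signature. A minimal sketch of how the updated constructor might be called, assuming the class is importable as `docetl.dataset.Dataset` and that the referenced JSON file exists (both illustrative assumptions):

```python
# Sketch only: file paths and record contents here are hypothetical.
from docetl.dataset import Dataset

# File-backed dataset; `source` can now be omitted because it defaults to "local".
file_ds = Dataset(
    type="file",
    path_or_data="audio_files/audio_paths.json",
    parsing=[
        {
            "input_key": "audio_path",
            "function": "whisper_speech_to_text",
            "output_key": "transcript",
        }
    ],
)

# In-memory dataset, mirroring the runner.py call site below: Dataset("memory", output_data).
memory_ds = Dataset("memory", [{"id": 1, "text": "hello world"}])
```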
61 changes: 52 additions & 9 deletions docetl/parsing_tools.py
@@ -72,19 +72,24 @@ def xlsx_to_string(
 
     def process_sheet(sheet):
         if col_order:
-            headers = col_order
+            headers = [
+                col for col in col_order if col in sheet.iter_cols(1, sheet.max_column)
+            ]
         else:
             headers = [cell.value for cell in sheet[1]]
 
         result = []
-        for row in sheet.iter_rows(min_row=2, values_only=True):
-            row_dict = dict(zip(headers, row))
-            if orientation == "col":
-                result.extend(
-                    [f"{header}: {value}" for header, value in row_dict.items()]
-                )
-                result.append("")  # Empty line between rows
-            else:  # row
+        if orientation == "col":
+            for col_idx, header in enumerate(headers, start=1):
+                column = sheet.cell(1, col_idx).column_letter
+                column_values = [cell.value for cell in sheet[column][1:]]
+                result.append(f"{header}: " + "\n".join(map(str, column_values)))
+                result.append("")  # Empty line between columns
+        else:  # row
+            for row in sheet.iter_rows(min_row=2, values_only=True):
+                row_dict = {
+                    header: value for header, value in zip(headers, row) if header
+                }
                 result.append(
                     " | ".join(
                         [f"{header}: {value}" for header, value in row_dict.items()]
@@ -129,10 +134,48 @@ def docx_to_string(filename: str) -> List[str]:
     return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]
 
 
+def pptx_to_string(filename: str, slide_per_document: bool = False) -> List[str]:
+    """
+    Extract text from a PowerPoint presentation.
+
+    Args:
+        filename (str): Path to the pptx file.
+        slide_per_document (bool): If True, return each slide as a separate
+            document. If False, return the entire presentation as one document.
+
+    Returns:
+        List[str]: Extracted text from the presentation. If slide_per_document
+            is True, each string in the list represents a single slide.
+            Otherwise, the list contains a single string with all slides'
+            content.
+    """
+    from pptx import Presentation
+
+    prs = Presentation(filename)
+    result = []
+
+    for slide in prs.slides:
+        slide_content = []
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                slide_content.append(shape.text)
+
+        if slide_per_document:
+            result.append("\n".join(slide_content))
+        else:
+            result.extend(slide_content)
+
+    if not slide_per_document:
+        result = ["\n".join(result)]
+
+    return result
+
+
 # Define a dictionary mapping function names to their corresponding functions
 PARSING_TOOLS = {
     "whisper_speech_to_text": whisper_speech_to_text,
     "xlsx_to_string": xlsx_to_string,
     "txt_to_string": txt_to_string,
     "docx_to_string": docx_to_string,
+    "pptx_to_string": pptx_to_string,
 }
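A short sketch of calling the new parser directly; it assumes `python-pptx` is installed and that `deck.pptx` is a locally available file (both assumptions for illustration). The same function can also be resolved by name through the `PARSING_TOOLS` registry shown above:

```python
# Sketch only: "deck.pptx" is a hypothetical file; requires the python-pptx package.
from docetl.parsing_tools import PARSING_TOOLS, pptx_to_string

# One document per slide.
slides = pptx_to_string("deck.pptx", slide_per_document=True)
print(f"extracted {len(slides)} slides")

# Entire presentation as a single document (the default behavior).
whole_deck = pptx_to_string("deck.pptx")[0]

# Lookup by name through the registry defined above.
parser = PARSING_TOOLS["pptx_to_string"]
assert parser is pptx_to_string
```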
6 changes: 3 additions & 3 deletions docetl/runner.py
@@ -132,7 +132,7 @@ def run(self) -> float:
                 self.datasets[step["input"]].load() if "input" in step else None
             )
             output_data, step_cost = self.execute_step(step, input_data)
-            self.datasets[step_name] = Dataset("memory", "local", output_data)
+            self.datasets[step_name] = Dataset("memory", output_data)
             flush_cache(self.console)
             total_cost += step_cost
             self.console.log(
@@ -164,8 +164,8 @@ def load_datasets(self):
             if dataset_config["type"] == "file":
                 self.datasets[name] = Dataset(
                     "file",
-                    "local",
                     dataset_config["path"],
+                    source="local",
                     parsing=dataset_config.get("parsing", []),
                     user_defined_parsing_tool_map=self.parsing_tool_map,
                 )
@@ -281,7 +281,7 @@ def _load_from_checkpoint_if_exists(
         if os.path.exists(checkpoint_path):
             if f"{step_name}_{operation_name}" not in self.datasets:
                 self.datasets[f"{step_name}_{operation_name}"] = Dataset(
-                    "file", "local", checkpoint_path
+                    "file", checkpoint_path, "local"
                 )
             return self.datasets[f"{step_name}_{operation_name}"].load()
         return None
2 changes: 1 addition & 1 deletion docetl/schemas.py
@@ -21,8 +21,8 @@ class ParsingTool(BaseModel):
 
 class Dataset(BaseModel):
     type: str
-    source: str
     path: str
+    source: str = "local"
     parsing: Optional[List[Dict[str, str]]] = None
 
 
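On the schema side, the same reordering means a dataset config no longer has to spell out `source`. A small sketch of validating a config with the Pydantic model, assuming it is importable as `docetl.schemas.Dataset`:

```python
# Sketch only: validates a dataset config against the updated schema defaults.
from docetl.schemas import Dataset as DatasetConfig

cfg = DatasetConfig(type="file", path="user_logs.json")
print(cfg.source)   # -> "local" (new default)
print(cfg.parsing)  # -> None (parsing entries remain optional)
```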
30 changes: 28 additions & 2 deletions docs/concepts/pipelines.md
@@ -21,7 +21,7 @@ default_model: gpt-4o-mini
 
 ### Datasets
 
-Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list. Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:
+Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list (or a row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:
 
 ```yaml
 datasets:
@@ -30,9 +30,35 @@ datasets:
     path: "user_logs.json"
 ```
 
+#### Dynamic Data Loading
+
+DocETL supports dynamic data loading, allowing you to process various file types by specifying a key that points to a path or by using a custom parsing function. This feature is particularly useful for handling diverse data sources, such as audio files, PDFs, or any other non-standard format.
+
+To implement dynamic data loading, you can use parsing tools in your dataset configuration. Here's an example:
+
+```yaml
+datasets:
+  audio_transcripts:
+    type: file
+    source: local
+    path: "audio_files/audio_paths.json"
+    parsing_tools:
+      - input_key: audio_path
+        function: whisper_speech_to_text
+        output_key: transcript
+```
+
+In this example, the dataset configuration specifies a JSON file (audio_paths.json) that contains paths to audio files. The parsing_tools section defines how to process these files:
+
+- `input_key`: Specifies which key in the JSON contains the path to the audio file. In this example, each object in the dataset should have an "audio_path" key whose value is the path to an audio file (e.g., an mp3).
+- `function`: Names the parsing function to use (in this case, the built-in whisper_speech_to_text function for audio transcription).
+- `output_key`: Defines the key where the processed data (the transcript) will be stored. You can access this key in any pipeline prompt with the `{{ input.transcript }}` syntax.
+
+This approach allows DocETL to dynamically load and process various file types, extending its capabilities beyond standard JSON or CSV inputs. You can use built-in parsing tools or define custom ones to handle specific file formats or data processing needs. See the [Custom Parsing](../examples/custom-parsing.md) documentation for more details.
+
 !!! note
 
-    Currently, DocETL only supports JSON files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.
+    Currently, DocETL only supports JSON or CSV files as input datasets. If you're interested in support for other data types or cloud-based datasets, please reach out to us or join our open-source community and contribute! We welcome new ideas and contributions to expand the capabilities of DocETL.

### Operators

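For the custom parsers this commit documents, the built-ins in docetl/parsing_tools.py above show the contract a parser follows: it receives the value stored under `input_key` (typically a file path) and returns a list of strings, one per output document. A hypothetical custom parser written to that pattern might look like the sketch below; registering it so a pipeline can reference it by name under `function` is covered in the Custom Parsing page linked above.

```python
from typing import List


def markdown_to_string(filename: str) -> List[str]:
    """Hypothetical custom parser: read a Markdown file and return its full
    text as a single document, mirroring txt_to_string/docx_to_string."""
    with open(filename, "r", encoding="utf-8") as f:
        return [f.read()]
```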