Merge pull request #113 from ucbepic/shreya/api
shreyashankar authored Oct 17, 2024
2 parents cca416e + ead12e8 commit 158d72c
Showing 5 changed files with 31 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# DocETL: Powering Complex Document Processing Pipelines

[Website (Includes Demo)](https://docetl.com) | [Documentation](https://ucbepic.github.io/docetl) | [Discord](https://discord.gg/fHp7B2X3xx) | [NotebookLM Podcast](https://notebooklm.google.com/notebook/ef73248b-5a43-49cd-9976-432d20f9fa4f/audio?pli=1) (thanks Shabie from our Discord community!) | Paper (coming soon!)
[Website (Includes Demo)](https://docetl.org) | [Documentation](https://ucbepic.github.io/docetl) | [Discord](https://discord.gg/fHp7B2X3xx) | [Paper](https://arxiv.org/abs/2410.12189)

![DocETL Figure](docs/assets/readmefig.png)

16 changes: 10 additions & 6 deletions docetl/api.py
@@ -64,17 +64,17 @@
    FilterOp,
    GatherOp,
    MapOp,
    ReduceOp,
    ResolveOp,
    SplitOp,
    UnnestOp,
    ClusterOp,
    SampleOp,
    OpType,
    ParallelMapOp,
    ParsingTool,
)
from docetl.schemas import (
    PipelineOutput,
    PipelineStep,
    ReduceOp,
    ResolveOp,
    SplitOp,
    UnnestOp,
)


@@ -322,6 +322,10 @@ def _update_from_dict(self, config: Dict[str, Any]):
                self.operations.append(GatherOp(**op, type=op_type))
            elif op_type == "unnest":
                self.operations.append(UnnestOp(**op, type=op_type))
            elif op_type == "cluster":
                self.operations.append(ClusterOp(**op, type=op_type))
            elif op_type == "sample":
                self.operations.append(SampleOp(**op, type=op_type))
        self.steps = [PipelineStep(**step) for step in config["pipeline"]["steps"]]
        self.output = PipelineOutput(**config["pipeline"]["output"])
        self.default_model = config.get("default_model")
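For context, a minimal sketch of what the new dispatch branches accept. The config dicts below are hypothetical and only illustrate the shape the `cluster` and `sample` branches expect; key names follow the `ClusterOp`/`SampleOp` schemas in this PR, and the prompt text, sample method, and counts are illustrative assumptions.

```python
# Hypothetical operation configs, mirroring the dispatch added above.
from docetl.schemas import ClusterOp, SampleOp

ops_config = [
    {
        "name": "group_docs",
        "type": "cluster",
        "embedding_keys": ["text"],
        "summary_prompt": "Summarize the documents in this cluster.",
        "summary_schema": {"summary": "string"},
    },
    # "uniform" and 100 are placeholder values for the sample operation.
    {"name": "take_subset", "type": "sample", "method": "uniform", "samples": 100},
]

operations = []
for op in ops_config:
    op_type = op.pop("type")  # _update_from_dict reads the type key the same way
    if op_type == "cluster":
        operations.append(ClusterOp(**op, type=op_type))
    elif op_type == "sample":
        operations.append(SampleOp(**op, type=op_type))

print(operations)
```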
20 changes: 19 additions & 1 deletion docetl/schemas.py
Expand Up @@ -78,7 +78,7 @@ class Dataset(BaseModel):
    parsing: Optional[List[Dict[str, str]]] = None


class BaseOp(BaseModel):
class BaseOp(BaseModel, extra="allow"):
    name: str
    type: str

@@ -222,6 +222,22 @@ class UnnestOp(BaseOp):
    depth: Optional[int] = None


class ClusterOp(BaseOp):
    type: str = "cluster"
    embedding_keys: List[str]
    summary_prompt: str
    summary_schema: Dict[str, Any]
    output_key: Optional[str] = "clusters"


class SampleOp(BaseOp):
    type: str = "sample"
    method: str
    samples: Union[int, float, List[Dict[str, Any]]]
    method_kwargs: Optional[Dict[str, Any]] = None
    random_state: Optional[int] = None


OpType = Union[
    MapOp,
    ResolveOp,
@@ -232,6 +248,8 @@ class UnnestOp(BaseOp):
    SplitOp,
    GatherOp,
    UnnestOp,
    ClusterOp,
    SampleOp,
]
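A quick sketch of what `extra="allow"` on `BaseOp` changes: per-operation settings that are not declared fields now pass Pydantic validation and survive serialization instead of being rejected. The `model` key below is an illustrative example, not part of the declared schema.

```python
# Assumes docetl is installed from this commit.
# "model" is not a declared ClusterOp field; with extra="allow" on BaseOp,
# Pydantic keeps it as an extra attribute rather than raising a ValidationError.
from docetl.schemas import ClusterOp

op = ClusterOp(
    name="group_docs",
    embedding_keys=["text"],
    summary_prompt="Summarize the documents in this cluster.",
    summary_schema={"summary": "string"},
    model="gpt-4o-mini",  # illustrative extra field, not part of the schema
)
print(op.model_dump()["model"])  # extra fields are kept in the dumped config
```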


2 changes: 0 additions & 2 deletions docs/operators/cluster.md
@@ -181,8 +181,6 @@ and a description, and groups them into a tree of categories.
| `output_key` | The name of the output key where the cluster path will be inserted in the items. | "clusters" |
| `model` | The language model to use | Falls back to `default_model` |
| `embedding_model` | The embedding model to use | "text-embedding-3-small" |
| `tools` | List of tool definitions for LLM use | None |
| `timeout` | Timeout for each LLM call in seconds | 120 |
| `max_retries_per_timeout` | Maximum number of retries per timeout | 2 |
| `validate` | List of Python expressions to validate the output | None |
| `sample` | Number of items to sample for this operation | None |
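For reference, a minimal parsed form of a cluster operation using the documented options above, written as a Python dict (i.e. what a YAML pipeline entry would load into). The operation name, keys, and prompt text are placeholders, not taken from the docs.

```python
# Hypothetical cluster operation config; keys follow the options table above
# and the ClusterOp schema from this PR, values are placeholders.
cluster_op = {
    "name": "categorize_concepts",
    "type": "cluster",
    "embedding_keys": ["concept", "description"],
    "summary_prompt": "Provide a short category name for these related items.",
    "summary_schema": {"category": "string"},
    "output_key": "clusters",                     # default per the table above
    "embedding_model": "text-embedding-3-small",  # default per the table above
    "max_retries_per_timeout": 2,                 # default per the table above
}
```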
1 change: 1 addition & 0 deletions tests/test_runner_caching.py
@@ -58,6 +58,7 @@ def create_pipeline(input_file, output_file, intermediate_dir, operation_prompt)
)


@pytest.mark.flaky(reruns=3)
def test_pipeline_rerun_on_operation_change(
temp_input_file, temp_output_file, temp_intermediate_dir
):
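The added marker presumably relies on the pytest-rerunfailures plugin; a minimal, standalone sketch of the behavior is below. The test body is illustrative and not from the DocETL suite.

```python
# Requires the pytest-rerunfailures plugin. A failure triggers up to three
# reruns before the test is reported as failed, smoothing over
# nondeterministic failures like the illustrative one below.
import random

import pytest


@pytest.mark.flaky(reruns=3)
def test_occasionally_fails():
    assert random.random() < 0.9  # illustrative nondeterminism
```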
