Skip to content

Commit

Permalink
Merge pull request #12 from ucbepic/shreyashankar/blockingerr
Browse files: browse the repository at this point in the history
quality of life: show error when trying to execute resolve without blocking
  • Loading branch information
shreyashankar authored Sep 24, 2024
2 parents 4f1dea4 + 733c30a commit c17c6ae
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ make tests-basic

That's it! You've successfully installed DocETL and are ready to start processing documents.

For more detailed information on usage and configuration, please refer to our [documentation](https://shreyashankar.github.io/docetl).
For more detailed information on usage and configuration, please refer to our [documentation](https://ucbepic.github.io/docetl).
17 changes: 17 additions & 0 deletions docetl/operations/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
validate_output,
gen_embedding,
)
from rich.prompt import Confirm


def compare_pair(
Expand Down Expand Up @@ -195,6 +196,22 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
blocking_keys = self.config.get("blocking_keys", [])
blocking_threshold = self.config.get("blocking_threshold")
blocking_conditions = self.config.get("blocking_conditions", [])

if not blocking_threshold and not blocking_conditions:
# Prompt the user for confirmation
if self.status:
self.status.stop()
if not Confirm.ask(
f"[yellow]Warning: No blocking keys or conditions specified. "
f"This may result in a large number of comparisons. "
f"We recommend specifying at least one blocking key or condition, or using the optimizer to automatically come up with these. "
f"Do you want to continue without blocking?[/yellow]",
):
raise ValueError("Operation cancelled by user.")

if self.status:
self.status.start()

input_schema = self.config.get("input", {}).get("schema", {})
if not blocking_keys:
# Set them to all keys in the input data
Expand Down
6 changes: 5 additions & 1 deletion docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,11 @@ def execute_step(

operation_class = get_operation(op_object["type"])
operation_instance = operation_class(
op_object, self.default_model, self.max_threads, self.console
op_object,
self.default_model,
self.max_threads,
self.console,
self.status,
)
if op_object["type"] == "equijoin":
left_data = self.datasets[op_object["left"]]
Expand Down

0 comments on commit c17c6ae

Please sign in to comment.