diff --git a/README.md b/README.md index 8d603a54..ce9d70be 100644 --- a/README.md +++ b/README.md @@ -65,4 +65,4 @@ make tests-basic That's it! You've successfully installed DocETL and are ready to start processing documents. -For more detailed information on usage and configuration, please refer to our [documentation](https://shreyashankar.github.io/docetl). +For more detailed information on usage and configuration, please refer to our [documentation](https://ucbepic.github.io/docetl). diff --git a/docetl/operations/resolve.py b/docetl/operations/resolve.py index 9903d271..0daa78b7 100644 --- a/docetl/operations/resolve.py +++ b/docetl/operations/resolve.py @@ -23,6 +23,7 @@ validate_output, gen_embedding, ) +from rich.prompt import Confirm def compare_pair( @@ -195,6 +196,22 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]: blocking_keys = self.config.get("blocking_keys", []) blocking_threshold = self.config.get("blocking_threshold") blocking_conditions = self.config.get("blocking_conditions", []) + + if not blocking_threshold and not blocking_conditions: + # Prompt the user for confirmation + if self.status: + self.status.stop() + if not Confirm.ask( + f"[yellow]Warning: No blocking keys or conditions specified. " + f"This may result in a large number of comparisons. " + f"We recommend specifying at least one blocking key or condition, or using the optimizer to automatically come up with these. " + f"Do you want to continue without blocking?[/yellow]", + ): + raise ValueError("Operation cancelled by user.") + + if self.status: + self.status.start() + input_schema = self.config.get("input", {}).get("schema", {}) if not blocking_keys: # Set them to all keys in the input data diff --git a/docetl/runner.py b/docetl/runner.py index 2c6ceb73..dc783f1c 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -229,7 +229,11 @@ def execute_step( operation_class = get_operation(op_object["type"]) operation_instance = operation_class( - op_object, self.default_model, self.max_threads, self.console + op_object, + self.default_model, + self.max_threads, + self.console, + self.status, ) if op_object["type"] == "equijoin": left_data = self.datasets[op_object["left"]]