Fix some bugs and documentation
shreyashankar committed Sep 15, 2024
1 parent 56ef62e commit 57e7c67
Showing 10 changed files with 1,040 additions and 942 deletions.
9 changes: 8 additions & 1 deletion docetl/builder.py
@@ -1275,7 +1275,14 @@ def _optimize_resolve(
             List[Dict[str, Any]]: The optimized operation configuration.
         """
         optimized_config, cost = JoinOptimizer(
-            self.config, op_config, self.console, self.llm_client, self.max_threads
+            self.config,
+            op_config,
+            self.console,
+            self.llm_client,
+            self.max_threads,
+            target_recall=self.config.get("optimizer_config", {})
+            .get("resolve", {})
+            .get("target_recall", 0.95),
         ).optimize_resolve(input_data)

         if optimized_config.get("empty", False):
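The chained `dict.get` calls added above fall back to the default `target_recall` of 0.95 whenever `optimizer_config` or its `resolve` section is missing; a minimal sketch of the pattern:

```python
# Chained .get() with dict defaults: each level falls back to {} so the
# final .get can supply the default without raising KeyError.
config = {"optimizer_config": {"resolve": {"target_recall": 0.9}}}

target_recall = (
    config.get("optimizer_config", {})
    .get("resolve", {})
    .get("target_recall", 0.95)
)
print(target_recall)  # 0.9

# With no optimizer_config at all, the default kicks in:
print({}.get("optimizer_config", {}).get("resolve", {}).get("target_recall", 0.95))  # 0.95
```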
5 changes: 2 additions & 3 deletions docetl/operations/resolve.py
@@ -293,9 +293,8 @@ def meets_blocking_conditions(pair):
             else float("inf")
         )
         if remaining_comparisons > 0 and blocking_threshold is not None:
-            # Compute cosine similarity for all pairs at once
-            all_embeddings = np.array([embeddings[i] for i in range(len(input_data))])
-            similarity_matrix = cosine_similarity(all_embeddings)
+            # Compute cosine similarity for all pairs efficiently
+            similarity_matrix = cosine_similarity(embeddings)

             cosine_pairs = []
             for i, j in all_pairs:
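The simplification above works because pairwise cosine similarity can be computed over the whole embedding matrix in one shot, with no intermediate copy. A numpy-only sketch of the equivalent computation, using hypothetical toy embeddings:

```python
import numpy as np

# Toy 2-D embeddings (hypothetical); real embeddings come from the embedding model.
embeddings = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]

X = np.asarray(embeddings, dtype=float)
# Normalize each row to unit length; the similarity matrix is then one matmul.
unit = X / np.linalg.norm(X, axis=1, keepdims=True)
similarity_matrix = unit @ unit.T  # shape (n, n); entry [i, j] = cos(x_i, x_j)

print(round(similarity_matrix[0, 2], 4))  # 0.7071 (vectors 45 degrees apart)
```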
8 changes: 5 additions & 3 deletions docetl/operations/utils.py
@@ -101,7 +101,7 @@ def gen_embedding(model: str, input: List[str]) -> List[float]:
     if not isinstance(input[0], str):
         input = [json.dumps(item) for item in input]

-    input = [item if not item else "None" for item in input]
+    input = [item if item else "None" for item in input]

     result = embedding(model=model, input=input)
     # Cache the result
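The one-character fix above matters: the old conditional replaced every non-empty string with "None" and left empty strings alone, the opposite of the intent. A quick illustration:

```python
items = ["aspirin", "", "ibuprofen"]

# Old (buggy): `not item` is True only for empty strings, so real values
# were replaced with "None" while empty strings passed through.
buggy = [item if not item else "None" for item in items]
print(buggy)  # ['None', '', 'None']

# Fixed: keep truthy strings, substitute "None" for empty ones.
fixed = [item if item else "None" for item in items]
print(fixed)  # ['aspirin', 'None', 'ibuprofen']
```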
@@ -656,7 +656,7 @@ def call_llm_with_gleaning(
         ]
     )

-    for _ in range(num_gleaning_rounds):
+    for rnd in range(num_gleaning_rounds):
         cost += completion_cost(response)

         # Prepare validator prompt
@@ -693,7 +693,9 @@ def call_llm_with_gleaning(
             if suggestion["should_refine"] == False:
                 break

-            console.log(f"Validator improvements: {suggestion['improvements']}")
+            console.log(
+                f"Validator improvements (gleaning round {rnd + 1}): {suggestion['improvements']}"
+            )

             # Prompt for improvement
             improvement_prompt = f"""Based on the validation feedback:
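The renamed loop variable feeds the round number into the log line. For context, a hedged sketch of the overall gleaning pattern — validate, log, refine, repeat — with hypothetical `validate` and `refine` callables standing in for the real LLM calls in `call_llm_with_gleaning`:

```python
def glean(response, num_rounds, validate, refine, log=print):
    """Iteratively refine a response until the validator is satisfied.

    `validate` and `refine` are hypothetical stand-ins for LLM calls;
    only the control flow mirrors the source.
    """
    for rnd in range(num_rounds):
        suggestion = validate(response)
        if not suggestion["should_refine"]:
            break
        log(f"Validator improvements (gleaning round {rnd + 1}): "
            f"{suggestion['improvements']}")
        response = refine(response, suggestion["improvements"])
    return response

# Toy run: the validator asks for one refinement, then stops.
calls = []
result = glean(
    "draft",
    num_rounds=3,
    validate=lambda r: {"should_refine": r == "draft", "improvements": "add detail"},
    refine=lambda r, imp: "draft+detail",
    log=calls.append,
)
print(result)      # draft+detail
print(len(calls))  # 1
```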
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/config_generators.py
@@ -413,7 +413,7 @@ def _generate_chunk_sizes(
         split_key: str,
         input_data_sample: List[Dict[str, Any]],
         token_limit: int,
-        num_chunks: int = 4,
+        num_chunks: int = 8,
     ) -> List[int]:
         # Get the average document length
         avg_doc_length = sum(
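Raising `num_chunks` from 4 to 8 widens the optimizer's search over candidate chunk sizes. A hypothetical illustration of why more candidates yield a finer-grained search — the actual sizing heuristic lives in `_generate_chunk_sizes` and also accounts for average document length, which this sketch omits:

```python
def candidate_chunk_sizes(token_limit: int, num_chunks: int = 8) -> list[int]:
    # Hypothetical: evenly spaced candidate sizes up to the token limit.
    return [max(1, token_limit * (i + 1) // num_chunks) for i in range(num_chunks)]

print(candidate_chunk_sizes(1000, 4))  # [250, 500, 750, 1000]
print(candidate_chunk_sizes(1000, 8))  # [125, 250, 375, 500, 625, 750, 875, 1000]
```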
87 changes: 81 additions & 6 deletions docs/execution/optimizing-pipelines.md
@@ -206,15 +206,90 @@ This optimized pipeline now includes improved prompts, a resolve operation, and

We're continually improving the optimizer. Your feedback on its performance and usability is invaluable. Please share your experiences and suggestions!

## Advanced: Customizing Optimization

You can customize the optimization process for specific operations using the `optimizer_config` in your pipeline.

### Global Configuration

The following options can be applied globally to all operations in your pipeline during optimization:

- `num_retries`: The number of times to retry optimizing if the LLM agent fails. Default is 1.

- `sample_sizes`: Override the default sample sizes for each operator type. Specify as a dictionary with operator types as keys and integer sample sizes as values.

Default sample sizes:

```python
SAMPLE_SIZE_MAP = {
    "reduce": 40,
    "map": 5,
    "resolve": 100,
    "equijoin": 100,
    "filter": 5,
}
```

### Equijoin Configuration

- `target_recall`: Change the default target recall (default is 0.95).

### Resolve Configuration

- `target_recall`: Specify the target recall for the resolve operation.
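For example, to tighten the recall target for resolve (and equijoin) optimization, a pipeline might include the following — the values here are illustrative:

```yaml
optimizer_config:
  resolve:
    target_recall: 0.9  # accept blocking plans that keep >= 90% of true matches
  equijoin:
    target_recall: 0.9
```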

### Reduce Configuration

- `synthesize_resolve`: Set to `false` if you don't want a resolve operation synthesized, i.e., to turn off this rewrite rule.

### Map Configuration

- `force_chunking_plan`: Set to `true` if you want the optimizer to force a plan that breaks the input documents into chunks.

### Example Configuration

Here's an example of how to use the `optimizer_config` in your pipeline:

```yaml
optimizer_config:
  num_retries: 2
  sample_sizes:
    map: 10
    reduce: 50
  reduce:
    synthesize_resolve: false
  map:
    force_chunking_plan: true

operations:
  - name: extract_medications
    type: map
    optimize: true
    # ... other configuration ...
  - name: summarize_prescriptions
    type: reduce
    optimize: true
    # ... other configuration ...

# ... rest of the pipeline configuration ...
```

This configuration will:

1. Retry optimization up to 2 times for each operation if the LLM agent fails.
2. Use custom sample sizes for map (10) and reduce (50) operations.
3. Prevent the synthesis of resolve operations for reduce operations.
4. Force a chunking plan for map operations.

## Optimizer API

::: docetl.cli.build
    handler: python
    options:
        members:
            - build
        show_root_full_path: true
        show_root_toc_entry: true
        show_root_heading: true
        show_source: false
        show_name: true


1,862 changes: 937 additions & 925 deletions workloads/medical/extracted_medical_info.json

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions workloads/medical/full.yaml
@@ -29,6 +29,9 @@ operations:
       schema:
         medication: str
     embedding_model: "text-embedding-3-small"
+    blocking_threshold: 0.8
+    # blocking_keys:
+    #   - medication
     comparison_prompt: |
       Compare the following two medication entries:
       Entry 1: {{ input1.medication }}
