Commit

docs: add a figure for readme
shreyashankar committed Sep 21, 2024
1 parent 9a93ff7 commit e767ec2
Showing 11 changed files with 61 additions and 61 deletions.
15 changes: 5 additions & 10 deletions README.md
@@ -1,9 +1,11 @@
# DocETL: A System for Complex LLM-Powered Document Processing

DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers a low-code, declarative YAML interface to define LLM-powered operations on complex data.
# DocETL: Powering Complex Document Processing Pipelines

[Website (Includes Demo)](https://docetl.com) | [Documentation](https://ucbepic.github.io/docetl) | [Discord](https://discord.gg/fHp7B2X3xx) | Paper (coming soon!)

![DocETL Figure](docs/assets/readmefig.png)

DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers a low-code, declarative YAML interface to define LLM-powered operations on complex data.

## When to Use DocETL

DocETL is the ideal choice when you're looking to maximize correctness and output quality for complex tasks over a collection of documents or unstructured datasets. You should consider using DocETL if:
@@ -14,13 +16,6 @@ DocETL is the ideal choice when you're looking to maximize correctness and outpu
- You're working with long documents that don't fit into a single prompt or are too lengthy for effective LLM reasoning
- You have validation criteria and want tasks to automatically retry when the validation fails

## Features

- **Rich Suite of Operators**: Tailored for complex data processing, including specialized operators like "resolve" for entity resolution and "gather" for maintaining context when splitting documents.
- **Low-Code Interface**: Define your pipeline and prompts easily using YAML. You have 100% control over the prompts.
- **Flexible Processing**: Handle various document types and processing tasks across domains like law, medicine, and social sciences.
- **Optional Optimization**: Improve pipeline accuracy with agent-based rewriting and assessment if desired.

## Installation

See the documentation for installing from PyPI.
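The README above describes a low-code, declarative YAML interface for LLM-powered operations. As a rough, hypothetical sketch of what such a definition boils down to once parsed, the Python dict below is illustrative only; the field names are made up and are not necessarily DocETL's exact schema.

```python
# Illustrative only: a pipeline definition as the Python dict a YAML file might parse into.
# Field names here are hypothetical, not DocETL's exact schema.
pipeline_config = {
    "datasets": {"reports": {"type": "file", "path": "reports.json"}},
    "operations": [
        {
            "name": "extract_findings",
            "type": "map",
            "prompt": "Summarize the key findings in {{ input.text }}.",
            "output": {"schema": {"findings": "string"}},
        }
    ],
}
```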
1 change: 0 additions & 1 deletion docetl/cli.py
@@ -44,7 +44,6 @@ def build(
resume=resume,
)
optimizer.optimize()
typer.echo("Optimization complete. Check the optimized configuration.")


@app.command()
12 changes: 8 additions & 4 deletions docetl/operations/reduce.py
@@ -570,9 +570,10 @@ def _incremental_reduce(
for i in range(0, len(group_list), fold_batch_size):
# Log the current iteration and total number of folds
current_fold = i // fold_batch_size + 1
self.console.log(
f"Processing fold {current_fold} of {num_folds} for group with key {key}"
)
if self.config.get("verbose", False):
self.console.log(
f"Processing fold {current_fold} of {num_folds} for group with key {key}"
)
batch = group_list[i : i + fold_batch_size]

folded_output, fold_cost = self._increment_fold(
@@ -586,7 +587,10 @@ def _incremental_reduce(
# Pop off updated_scratchpad
if "updated_scratchpad" in folded_output:
scratchpad = folded_output["updated_scratchpad"]
self.console.log(f"Updated notes: {scratchpad}")
if self.config.get("verbose", False):
self.console.log(
f"Updated scratchpad for fold {current_fold}: {scratchpad}"
)
del folded_output["updated_scratchpad"]

current_output = folded_output
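The reduce.py change above gates per-fold progress logging behind a `verbose` flag. A minimal sketch of that pattern (not the actual DocETL implementation), assuming a Rich console and a plain dict config:

```python
from typing import Callable, Dict, List

from rich.console import Console

console = Console()

def incremental_reduce(
    key: str,
    group_list: List[Dict],
    fold_batch_size: int,
    fold_fn: Callable[[Dict, List[Dict]], Dict],
    config: Dict,
) -> Dict:
    """Fold a group batch by batch into a running output, logging progress only if verbose."""
    num_folds = (len(group_list) + fold_batch_size - 1) // fold_batch_size
    current_output: Dict = {}
    for i in range(0, len(group_list), fold_batch_size):
        current_fold = i // fold_batch_size + 1
        if config.get("verbose", False):
            console.log(
                f"Processing fold {current_fold} of {num_folds} for group with key {key}"
            )
        batch = group_list[i : i + fold_batch_size]
        current_output = fold_fn(current_output, batch)  # stand-in for the LLM fold call
    return current_output
```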
6 changes: 3 additions & 3 deletions docetl/operations/utils.py
@@ -693,9 +693,9 @@ def call_llm_with_gleaning(
if suggestion["should_refine"] == False:
break

console.log(
f"Validator improvements (gleaning round {rnd + 1}): {suggestion['improvements']}"
)
# console.log(
# f"Validator improvements (gleaning round {rnd + 1}): {suggestion['improvements']}"
# )

# Prompt for improvement
improvement_prompt = f"""Based on the validation feedback:
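The utils.py hunk touches the gleaning loop, which repeatedly asks a validator whether the current answer should be refined. A self-contained sketch of that loop shape follows; `glean_answer`, `call_model`, and `validate` are hypothetical stand-ins, not the real function or its signature.

```python
from typing import Callable, Dict, List

def glean_answer(
    prompt: str,
    call_model: Callable[[List[Dict]], str],  # hypothetical stand-in for the LLM call
    validate: Callable[[str], Dict],          # hypothetical validator: {"should_refine": bool, "improvements": str}
    num_gleaning_rounds: int = 3,
) -> str:
    """Refine an answer until the validator stops asking for improvements."""
    messages = [{"role": "user", "content": prompt}]
    output = call_model(messages)
    for _ in range(num_gleaning_rounds):
        suggestion = validate(output)
        if not suggestion["should_refine"]:
            break
        improvement_prompt = (
            f"Based on the validation feedback:\n{suggestion['improvements']}\n"
            "Please improve your previous response."
        )
        messages.append({"role": "assistant", "content": output})
        messages.append({"role": "user", "content": improvement_prompt})
        output = call_model(messages)
    return output
```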
14 changes: 9 additions & 5 deletions docetl/optimizers/join_optimizer.py
@@ -238,22 +238,24 @@ def synthesize_compare_prompt(
"role": "user",
"content": f"""
Create a comparison prompt for entity resolution: The prompt should:
1. Be tailored to the specific domain and type of data being compared, based on the context provided.
1. Be tailored to the specific domain and type of data being compared ({reduce_key}), based on the context provided.
2. Instruct to compare two entities, referred to as input1 and input2.
3. Specifically mention comparing each reduce key in input1 and input2 (e.g., input1.{{key}} and input2.{{key}} for each key in {reduce_key}).
3. Specifically mention comparing each reduce key in input1 and input2 (e.g., input1.{{key}} and input2.{{key}} for each key in {reduce_key}). You can reference other fields in the input as well, as long as they are short.
4. Include instructions to consider relevant attributes or characteristics for comparison.
5. Ask to respond with "True" if the entities are likely the same, or "False" if they are likely different.
Example structure:
```
Compare the following two [entity type]:
Compare the following two {reduce_key} from [entity or document type]:
[Entity 1]:
{{{{ input1.key1 }}}}
{{{{ input1.optional_key2 }}}}
[Entity 2]:
{{{{ input2.key1 }}}}
{{{{ input2.optional_key2 }}}}
Are these [entities] likely referring to the same [entity type]? Consider [list relevant attributes or characteristics to compare]. Respond with "True" if they are likely the same [entity type], or "False" if they are likely different [entity types].
```
@@ -324,7 +326,9 @@ def synthesize_resolution_prompt(
{{% for key in inputs %}}
Entry {{{{ loop.index }}}}:
{{{{ key | tojson }}}}
{{ % for key in reduce_key %}}
{{{{ key }}}}: {{{{ key[reduce_key] }}}}
{{% endfor %}}
{{% endfor %}}
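The synthesized comparison prompts above are Jinja templates over `input1` and `input2`. A small sketch of how such a prompt might be rendered with Jinja2, using hypothetical field names and made-up data:

```python
from jinja2 import Template

# Hypothetical comparison prompt of the shape synthesized above; field names are made up.
compare_prompt = Template(
    "Compare the following two patient records from medical intake forms:\n"
    "[Record 1]:\n{{ input1.patient_name }}\n{{ input1.date_of_birth }}\n"
    "[Record 2]:\n{{ input2.patient_name }}\n{{ input2.date_of_birth }}\n"
    'Are these records likely referring to the same patient? Respond with "True" or "False".'
)

rendered = compare_prompt.render(
    input1={"patient_name": "Jane Doe", "date_of_birth": "1985-03-02"},
    input2={"patient_name": "Jane A. Doe", "date_of_birth": "1985-03-02"},
)
print(rendered)
```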
6 changes: 3 additions & 3 deletions docetl/optimizers/reduce_optimizer.py
@@ -1021,12 +1021,12 @@ def _generate_validator_prompt(
{json.dumps(sample_output, indent=2)}
Create a custom validator prompt that will assess how well the reduce operation performed its intended task. The prompt should ask specific 2-3 questions about the quality of the output, such as:
1. Does the output accurately reflect the aggregation method specified in the task? For example, if summing numeric values, are the totals correct?
1. Does the output accurately reflect the aggregation method specified in the task? For example, if finding anomalies, are the identified anomalies actually anomalies?
2. Are there any missing fields, unexpected null values, or data type mismatches in the output compared to the expected schema?
3. Does the output maintain the key information from the input while appropriately condensing or summarizing it? For instance, in a text summarization task, are the main points preserved?
4. How well does the output adhere to any specific formatting requirements mentioned in the original prompt, such as character limits for summaries or specific data types for aggregated values?
Note that the output may reflect more than just the input provided, since we only provide a one-item sample input. Provide your response as a single string containing the custom validator prompt. The prompt should be tailored to the task and avoid generic criteria.
Note that the output may reflect more than just the input provided, since we only provide a one-item sample input. Provide your response as a single string containing the custom validator prompt. The prompt should be tailored to the task and avoid generic criteria. The prompt should not reference a specific value in the sample input, but rather a general property.
"""

parameters = {
@@ -1622,7 +1622,7 @@ def _evaluate_reduce_plans(
f"\n[green]Selected best plan with score: {best_score:.2f} and batch size: {best_plan['fold_batch_size']}[/green]"
)

if op_config.get("synthesize_merge", True):
if op_config.get("synthesize_merge", False):
# Create a new plan with merge prompt and updated parameters
merged_plan = best_plan.copy()

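The reduce optimizer hunk reports the best plan's score and fold batch size after evaluating candidates. As an illustrative, heavily simplified sketch of that selection step (not the real evaluation logic), assuming each candidate plan carries a numeric `score` and a `fold_batch_size`:

```python
from typing import Dict, List

def select_best_plan(plans: List[Dict]) -> Dict:
    """Pick the candidate reduce plan with the highest evaluation score."""
    best_plan = max(plans, key=lambda p: p["score"])
    print(
        f"Selected best plan with score: {best_plan['score']:.2f} "
        f"and batch size: {best_plan['fold_batch_size']}"
    )
    return best_plan
```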
45 changes: 22 additions & 23 deletions docetl/runner.py
@@ -74,7 +74,8 @@ def syntax_check(self):
Raises:
ValueError: If any operation fails the syntax check.
"""
self.console.log(
self.console.rule("[yellow]Syntax Check[/yellow]")
self.console.print(
"[yellow]Performing syntax check on all operations...[/yellow]"
)

@@ -95,7 +96,7 @@ def syntax_check(self):
f"Syntax check failed for operation '{operation}': {str(e)}"
)

self.console.log("[green]Syntax check passed for all operations.[/green]")
self.console.print("[green]Syntax check passed for all operations.[/green]")

def find_operation(self, op_name: str) -> Dict:
for operation_config in self.config["operations"]:
@@ -113,6 +114,7 @@ def run(self) -> float:
Returns:
float: The total cost of executing the pipeline.
"""
self.console.rule("[bold blue]Pipeline Execution[/bold blue]")
start_time = time.time()
self.load_datasets()
total_cost = 0
@@ -123,23 +125,19 @@
) as progress:
for step in self.config["pipeline"]["steps"]:
step_name = step["name"]
step_task = progress.add_task(
f"Running step [cyan]{step_name}[/cyan]...", total=1
)
input_data = self.datasets[step["input"]] if "input" in step else None
output_data, step_cost = self.execute_step(step, input_data, progress)
self.datasets[step_name] = output_data
flush_cache(self.console)
total_cost += step_cost
progress.update(
step_task,
advance=1,
description=f"Step [cyan]{step_name}[/cyan] completed. Cost: [green]${step_cost:.2f}[/green]",
self.console.log(
f"Step [cyan]{step_name}[/cyan] completed. Cost: [green]${step_cost:.2f}[/green]"
)

self.save_output(self.datasets[self.config["pipeline"]["steps"][-1]["name"]])
self.console.log(f"[bold green]Total cost: [green]${total_cost:.2f}[/green]")
self.console.log(
self.console.rule("[bold green]Execution Summary[/bold green]")
self.console.print(f"[bold green]Total cost: [green]${total_cost:.2f}[/green]")
self.console.print(
f"[bold green]Total time: [green]{time.time() - start_time:.2f} seconds[/green]"
)
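The runner changes above replace per-step progress bars with Rich rules, prints, and logs. A minimal sketch of those three calls, using a hypothetical step name and cost:

```python
from rich.console import Console

console = Console()

# Section banner, plain status line, and timestamped log line, mirroring the calls above.
console.rule("[bold blue]Pipeline Execution[/bold blue]")
console.print("[yellow]Performing syntax check on all operations...[/yellow]")
console.log("Step [cyan]extract_findings[/cyan] completed. Cost: [green]$0.12[/green]")
```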

@@ -154,11 +152,13 @@ def load_datasets(self):
Raises:
ValueError: If an unsupported dataset type is encountered.
"""
self.console.rule("[cyan]Loading Datasets[/cyan]")
for name, dataset_config in self.config["datasets"].items():
if dataset_config["type"] == "file":
with open(dataset_config["path"], "r") as file:
self.datasets[name] = json.load(file)
self.datasets[name] = self.datasets[name]
self.console.print(f"Loaded dataset: [bold]{name}[/bold]")
else:
raise ValueError(f"Unsupported dataset type: {dataset_config['type']}")

@@ -172,11 +172,12 @@ def save_output(self, data: List[Dict]):
Raises:
ValueError: If an unsupported output type is specified in the configuration.
"""
self.console.rule("[cyan]Saving Output[/cyan]")
output_config = self.config["pipeline"]["output"]
if output_config["type"] == "file":
with open(output_config["path"], "w") as file:
json.dump(data, file, indent=2)
self.console.log(
self.console.print(
f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
)
else:
@@ -199,6 +200,7 @@ def execute_step(
Returns:
Tuple[List[Dict], float]: A tuple containing the output data and the total cost of the step.
"""
self.console.rule(f"[bold blue]Executing Step: {step['name']}[/bold blue]")
total_cost = 0
for operation in step["operations"]:
if isinstance(operation, dict):
@@ -215,12 +217,11 @@ def execute_step(
if op_object.get("sample"):
input_data = input_data[: op_object["sample"]]

op_task = progress.add_task(
f"Running operation [cyan]{operation_name}[/cyan]...", total=1
self.console.print("[bold]Running Operation:[/bold]")
self.console.print(f" Type: [cyan]{op_object['type']}[/cyan]")
self.console.print(
f" Name: [cyan]{op_object.get('name', 'Unnamed')}[/cyan]"
)
self.console.log("[bold]Running Operation:[/bold]")
self.console.log(f" Type: [cyan]{op_object['type']}[/cyan]")
self.console.log(f" Name: [cyan]{op_object.get('name', 'Unnamed')}[/cyan]")

operation_class = get_operation(op_object["type"])
operation_instance = operation_class(
@@ -233,10 +234,8 @@ def execute_step(
else:
input_data, cost = operation_instance.execute(input_data)
total_cost += cost
progress.update(
op_task,
advance=1,
description=f"Operation [cyan]{operation_name}[/cyan] completed. Cost: [green]${cost:.2f}[/green]",
self.console.log(
f"\tOperation [cyan]{operation_name}[/cyan] completed. Cost: [green]${cost:.2f}[/green]"
)

# Checkpoint after each operation
@@ -268,8 +267,8 @@ def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict]
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
with open(checkpoint_path, "w") as f:
json.dump(data, f)
self.console.log(
f"[green]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/green]"
self.console.print(
f"[green]✓ [italic]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/italic][/green]"
)


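The final runner hunk saves intermediate results after each operation so a later run can pick them up. A simplified, standalone version of that checkpointing step (not the exact method or signature) might look like:

```python
import json
import os
from typing import Dict, List

def save_checkpoint(intermediate_dir: str, step_name: str,
                    operation_name: str, data: List[Dict]) -> str:
    """Write intermediate results for one operation under a per-step directory."""
    checkpoint_path = os.path.join(intermediate_dir, step_name, f"{operation_name}.json")
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    with open(checkpoint_path, "w") as f:
        json.dump(data, f)
    return checkpoint_path
```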
Binary file added docs/assets/readmefig.png
10 changes: 6 additions & 4 deletions docs/index.md
@@ -1,4 +1,6 @@
# DocETL: A System for Complex Document Processing
# 📜 DocETL: A System for Complex Document Processing

![DocETL Figure](assets/readmefig.png)

DocETL is a tool for creating and executing LLM-powered data processing pipelines. It offers a low-code, declarative YAML interface to define complex data operations on complex data.

@@ -11,22 +13,22 @@ DocETL is a tool for creating and executing LLM-powered data processing pipeline
- You're working with long documents that don't fit into a single prompt or are too lengthy for effective LLM reasoning
- You have validation criteria and want tasks to automatically retry when the validation fails

## Features
## 🚀 Features

- **Rich Suite of Operators**: Tailored for complex data processing, including specialized operators like "resolve" for entity resolution and "gather" for maintaining context when splitting documents.
- **Low-Code Interface**: Define your pipeline and prompts easily using YAML. You have 100% control over the prompts.
- **Flexible Processing**: Handle various document types and processing tasks across domains like law, medicine, and social sciences.
- **Accuracy Optimization**: Our optimizer leverages LLM agents to experiment with different logically-equivalent rewrites of your pipeline and automatically selects the most accurate version. This includes finding limits of how many documents to process in a single reduce operation before the accuracy plateaus.

## Getting Started
## Getting Started

To get started with DocETL:

1. Install the package (see [installation](installation.md) for detailed instructions)
2. Define your pipeline in a YAML file
3. Run your pipeline using the DocETL command-line interface

## Project Origin
## 🏛️ Project Origin

DocETL was created by members of the EPIC Data Lab and Data Systems and Foundations group at UC Berkeley. The EPIC (Effective Programming, Interaction, and Computation with Data) Lab focuses on developing low-code and no-code interfaces for data work, powered by next-generation predictive programming techniques. DocETL is one of the projects that emerged from our research efforts to streamline complex document processing tasks.

Expand Down
12 changes: 4 additions & 8 deletions docs/installation.md
@@ -2,15 +2,11 @@

DocETL can be easily installed using pip, Python's package installer, or from source. Follow these steps to get DocETL up and running on your system:

## Prerequisites
## 🛠️ Prerequisites

Before installing DocETL, ensure you have Python 3.10 or later installed on your system. You can check your Python version by running:

```bash
python --version
```

## Installation via pip
## 📦 Installation via pip

1. Install DocETL using pip:

@@ -24,7 +20,7 @@ This command will install DocETL along with its dependencies as specified in the
```bash
docetl version
```

## Installation from Source
## 🔧 Installation from Source

To install DocETL from source, follow these steps:

@@ -65,7 +61,7 @@ Alternatively, you can set the OPENAI_API_KEY environment variable in your shell
make tests-basic
```

## Troubleshooting
## 🚨 Troubleshooting

If you encounter any issues during installation, please ensure that:

1 change: 1 addition & 0 deletions docs/operators/reduce.md
@@ -59,6 +59,7 @@ This Reduce operation processes customer feedback grouped by department:
| `fold_prompt` | A prompt template for incremental folding | None |
| `fold_batch_size` | Number of items to process in each fold operation | None |
| `value_sampling` | A dictionary specifying the sampling strategy for large groups | None |
| `verbose` | If true, enables detailed logging of the reduce operation | false |

## Advanced Features

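The reduce operator docs gain a `verbose` parameter. For illustration, a hypothetical reduce operation configuration (shown here as the Python dict a YAML block parses into) that enables it alongside the folding parameters from the table above:

```python
# Hypothetical configuration; keys mirror the parameter table above, values are made up.
reduce_op_config = {
    "name": "summarize_feedback",
    "type": "reduce",
    "reduce_key": "department",
    "fold_batch_size": 50,
    "verbose": True,  # enables detailed per-fold logging, per the new parameter
}
```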
