diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..18a7c879 --- /dev/null +++ b/404.html @@ -0,0 +1,600 @@ + + + + + + + + + + + + + + + + + + + docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/custom-operators/index.html b/advanced/custom-operators/index.html new file mode 100644 index 00000000..8d60e107 --- /dev/null +++ b/advanced/custom-operators/index.html @@ -0,0 +1,636 @@ + + + + + + + + + + + + + + + + + + + + + Custom operators - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Custom operators

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/extending-agents/index.html b/advanced/extending-agents/index.html new file mode 100644 index 00000000..22e2d5e8 --- /dev/null +++ b/advanced/extending-agents/index.html @@ -0,0 +1,636 @@ + + + + + + + + + + + + + + + + + + + + + Extending agents - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Extending agents

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/performance-tuning/index.html b/advanced/performance-tuning/index.html new file mode 100644 index 00000000..b590736a --- /dev/null +++ b/advanced/performance-tuning/index.html @@ -0,0 +1,636 @@ + + + + + + + + + + + + + + + + + + + + + Performance tuning - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Performance tuning

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/docetl/index.html b/api-reference/docetl/index.html new file mode 100644 index 00000000..3528f70f --- /dev/null +++ b/api-reference/docetl/index.html @@ -0,0 +1,6190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + docetl - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

docetl

+ +
+ + + +

+ docetl.DSLRunner + + +

+ + +
+ + +

A class for executing Domain-Specific Language (DSL) configurations.

+

This class is responsible for loading, validating, and executing DSL configurations +defined in YAML files. It manages datasets, executes pipeline steps, and tracks +the cost of operations.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict + +
+

The loaded configuration from the YAML file.

+
+
default_model + str + +
+

The default language model to use for operations.

+
+
max_threads + int + +
+

Maximum number of threads for parallel processing.

+
+
console + Console + +
+

Rich console for output formatting.

+
+
datasets + Dict + +
+

Storage for loaded datasets.

+
+
+ +
+ Source code in docetl/runner.py +
 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
class DSLRunner:
+    """
+    A class for executing Domain-Specific Language (DSL) configurations.
+
+    This class is responsible for loading, validating, and executing DSL configurations
+    defined in YAML files. It manages datasets, executes pipeline steps, and tracks
+    the cost of operations.
+
+    Attributes:
+        config (Dict): The loaded configuration from the YAML file.
+        default_model (str): The default language model to use for operations.
+        max_threads (int): Maximum number of threads for parallel processing.
+        console (Console): Rich console for output formatting.
+        datasets (Dict): Storage for loaded datasets.
+    """
+
+    def __init__(self, yaml_file: str, max_threads: int = None):
+        """
+        Initialize the DSLRunner with a YAML configuration file.
+
+        Args:
+            yaml_file (str): Path to the YAML configuration file.
+            max_threads (int, optional): Maximum number of threads to use. Defaults to None.
+        """
+        self.config = load_config(yaml_file)
+        self.default_model = self.config.get("default_model", "gpt-4o-mini")
+        self.max_threads = max_threads or (os.cpu_count() or 1) * 4
+        self.console = Console()
+        self.status = None
+        self.datasets = {}
+        self.syntax_check()
+
+    def syntax_check(self):
+        """
+        Perform a syntax check on all operations defined in the configuration.
+
+        This method validates each operation by attempting to instantiate it.
+        If any operation fails to instantiate, a ValueError is raised.
+
+        Raises:
+            ValueError: If any operation fails the syntax check.
+        """
+        for operation_config in self.config["operations"]:
+            operation = operation_config["name"]
+            operation_type = operation_config["type"]
+
+            try:
+                operation_class = get_operation(operation_type)
+                operation_class(
+                    operation_config,
+                    self.default_model,
+                    self.max_threads,
+                    self.console,
+                )
+            except Exception as e:
+                raise ValueError(
+                    f"Syntax check failed for operation '{operation}': {str(e)}"
+                )
+
+        self.console.log("[green]Syntax check passed for all operations.[/green]")
+
+    def find_operation(self, op_name: str) -> Dict:
+        for operation_config in self.config["operations"]:
+            if operation_config["name"] == op_name:
+                return operation_config
+        raise ValueError(f"Operation '{op_name}' not found in configuration.")
+
+    def run(self) -> float:
+        """
+        Execute the entire pipeline defined in the configuration.
+
+        This method loads datasets, executes each step in the pipeline, saves the output,
+        and returns the total cost of execution.
+
+        Returns:
+            float: The total cost of executing the pipeline.
+        """
+        start_time = time.time()
+        self.load_datasets()
+        total_cost = 0
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            console=self.console,
+        ) as progress:
+            for step in self.config["pipeline"]["steps"]:
+                step_name = step["name"]
+                step_task = progress.add_task(
+                    f"Running step [cyan]{step_name}[/cyan]...", total=1
+                )
+                input_data = self.datasets[step["input"]] if "input" in step else None
+                output_data, step_cost = self.execute_step(step, input_data, progress)
+                self.datasets[step_name] = output_data
+                flush_cache(self.console)
+                total_cost += step_cost
+                progress.update(
+                    step_task,
+                    advance=1,
+                    description=f"Step [cyan]{step_name}[/cyan] completed. Cost: [green]${step_cost:.2f}[/green]",
+                )
+
+        self.save_output(self.datasets[self.config["pipeline"]["steps"][-1]["name"]])
+        self.console.log(f"[bold green]Total cost: [green]${total_cost:.2f}[/green]")
+        self.console.log(
+            f"[bold green]Total time: [green]{time.time() - start_time:.2f} seconds[/green]"
+        )
+
+        return total_cost
+
+    def load_datasets(self):
+        """
+        Load all datasets defined in the configuration.
+
+        This method reads datasets from files and stores them in the `datasets` attribute.
+
+        Raises:
+            ValueError: If an unsupported dataset type is encountered.
+        """
+        for name, dataset_config in self.config["datasets"].items():
+            if dataset_config["type"] == "file":
+                with open(dataset_config["path"], "r") as file:
+                    self.datasets[name] = json.load(file)
+                    self.datasets[name] = self.datasets[name]
+            else:
+                raise ValueError(f"Unsupported dataset type: {dataset_config['type']}")
+
+    def save_output(self, data: List[Dict]):
+        """
+        Save the final output of the pipeline.
+
+        Args:
+            data (List[Dict]): The data to be saved.
+
+        Raises:
+            ValueError: If an unsupported output type is specified in the configuration.
+        """
+        output_config = self.config["pipeline"]["output"]
+        if output_config["type"] == "file":
+            with open(output_config["path"], "w") as file:
+                json.dump(data, file, indent=2)
+            self.console.log(
+                f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
+            )
+        else:
+            raise ValueError(f"Unsupported output type: {output_config['type']}")
+
+    def execute_step(
+        self, step: Dict, input_data: Optional[List[Dict]], progress: Progress
+    ) -> Tuple[List[Dict], float]:
+        """
+        Execute a single step in the pipeline.
+
+        This method runs all operations defined for a step, updating the progress
+        and calculating the cost.
+
+        Args:
+            step (Dict): The step configuration.
+            input_data (Optional[List[Dict]]): Input data for the step.
+            progress (Progress): Progress tracker for rich output.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the output data and the total cost of the step.
+        """
+        total_cost = 0
+        for operation in step["operations"]:
+            if isinstance(operation, dict):
+                operation_name = list(operation.keys())[0]
+                operation_config = self.find_operation(operation_name)
+            else:
+                operation_name = operation
+                operation_config = {}
+
+            op_object = self.find_operation(operation_name).copy()
+            op_object.update(operation_config)
+
+            # If sample is set, sample the input data
+            if op_object.get("sample"):
+                input_data = input_data[: op_object["sample"]]
+
+            op_task = progress.add_task(
+                f"Running operation [cyan]{operation_name}[/cyan]...", total=1
+            )
+            self.console.log("[bold]Running Operation:[/bold]")
+            self.console.log(f"  Type: [cyan]{op_object['type']}[/cyan]")
+            self.console.log(f"  Name: [cyan]{op_object.get('name', 'Unnamed')}[/cyan]")
+
+            operation_class = get_operation(op_object["type"])
+            operation_instance = operation_class(
+                op_object, self.default_model, self.max_threads, self.console
+            )
+            if op_object["type"] == "equijoin":
+                left_data = self.datasets[op_object["left"]]
+                right_data = self.datasets[op_object["right"]]
+                input_data, cost = operation_instance.execute(left_data, right_data)
+            else:
+                input_data, cost = operation_instance.execute(input_data)
+            total_cost += cost
+            progress.update(
+                op_task,
+                advance=1,
+                description=f"Operation [cyan]{operation_name}[/cyan] completed. Cost: [green]${cost:.2f}[/green]",
+            )
+
+        return input_data, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(yaml_file, max_threads=None) + +

+ + +
+ +

Initialize the DSLRunner with a YAML configuration file.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
yaml_file + str + +
+

Path to the YAML configuration file.

+
+
+ required +
max_threads + int + +
+

Maximum number of threads to use. Defaults to None.

+
+
+ None +
+ +
+ Source code in docetl/runner.py +
34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
def __init__(self, yaml_file: str, max_threads: int = None):
+    """
+    Initialize the DSLRunner with a YAML configuration file.
+
+    Args:
+        yaml_file (str): Path to the YAML configuration file.
+        max_threads (int, optional): Maximum number of threads to use. Defaults to None.
+    """
+    self.config = load_config(yaml_file)
+    self.default_model = self.config.get("default_model", "gpt-4o-mini")
+    self.max_threads = max_threads or (os.cpu_count() or 1) * 4
+    self.console = Console()
+    self.status = None
+    self.datasets = {}
+    self.syntax_check()
+
+
+
+ +
+ +
+ + +

+ execute_step(step, input_data, progress) + +

+ + +
+ +

Execute a single step in the pipeline.

+

This method runs all operations defined for a step, updating the progress +and calculating the cost.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
step + Dict + +
+

The step configuration.

+
+
+ required +
input_data + Optional[List[Dict]] + +
+

Input data for the step.

+
+
+ required +
progress + Progress + +
+

Progress tracker for rich output.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the output data and the total cost of the step.

+
+
+ +
+ Source code in docetl/runner.py +
164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
def execute_step(
+    self, step: Dict, input_data: Optional[List[Dict]], progress: Progress
+) -> Tuple[List[Dict], float]:
+    """
+    Execute a single step in the pipeline.
+
+    This method runs all operations defined for a step, updating the progress
+    and calculating the cost.
+
+    Args:
+        step (Dict): The step configuration.
+        input_data (Optional[List[Dict]]): Input data for the step.
+        progress (Progress): Progress tracker for rich output.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the output data and the total cost of the step.
+    """
+    total_cost = 0
+    for operation in step["operations"]:
+        if isinstance(operation, dict):
+            operation_name = list(operation.keys())[0]
+            operation_config = self.find_operation(operation_name)
+        else:
+            operation_name = operation
+            operation_config = {}
+
+        op_object = self.find_operation(operation_name).copy()
+        op_object.update(operation_config)
+
+        # If sample is set, sample the input data
+        if op_object.get("sample"):
+            input_data = input_data[: op_object["sample"]]
+
+        op_task = progress.add_task(
+            f"Running operation [cyan]{operation_name}[/cyan]...", total=1
+        )
+        self.console.log("[bold]Running Operation:[/bold]")
+        self.console.log(f"  Type: [cyan]{op_object['type']}[/cyan]")
+        self.console.log(f"  Name: [cyan]{op_object.get('name', 'Unnamed')}[/cyan]")
+
+        operation_class = get_operation(op_object["type"])
+        operation_instance = operation_class(
+            op_object, self.default_model, self.max_threads, self.console
+        )
+        if op_object["type"] == "equijoin":
+            left_data = self.datasets[op_object["left"]]
+            right_data = self.datasets[op_object["right"]]
+            input_data, cost = operation_instance.execute(left_data, right_data)
+        else:
+            input_data, cost = operation_instance.execute(input_data)
+        total_cost += cost
+        progress.update(
+            op_task,
+            advance=1,
+            description=f"Operation [cyan]{operation_name}[/cyan] completed. Cost: [green]${cost:.2f}[/green]",
+        )
+
+    return input_data, total_cost
+
+
+
+ +
+ +
+ + +

+ load_datasets() + +

+ + +
+ +

Load all datasets defined in the configuration.

+

This method reads datasets from files and stores them in the datasets attribute.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If an unsupported dataset type is encountered.

+
+
+ +
+ Source code in docetl/runner.py +
127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
def load_datasets(self):
+    """
+    Load all datasets defined in the configuration.
+
+    This method reads datasets from files and stores them in the `datasets` attribute.
+
+    Raises:
+        ValueError: If an unsupported dataset type is encountered.
+    """
+    for name, dataset_config in self.config["datasets"].items():
+        if dataset_config["type"] == "file":
+            with open(dataset_config["path"], "r") as file:
+                self.datasets[name] = json.load(file)
+                self.datasets[name] = self.datasets[name]
+        else:
+            raise ValueError(f"Unsupported dataset type: {dataset_config['type']}")
+
+
+
+ +
+ +
+ + +

+ run() + +

+ + +
+ +

Execute the entire pipeline defined in the configuration.

+

This method loads datasets, executes each step in the pipeline, saves the output, +and returns the total cost of execution.

+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
float + float + +
+

The total cost of executing the pipeline.

+
+
+ +
+ Source code in docetl/runner.py +
 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
def run(self) -> float:
+    """
+    Execute the entire pipeline defined in the configuration.
+
+    This method loads datasets, executes each step in the pipeline, saves the output,
+    and returns the total cost of execution.
+
+    Returns:
+        float: The total cost of executing the pipeline.
+    """
+    start_time = time.time()
+    self.load_datasets()
+    total_cost = 0
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        console=self.console,
+    ) as progress:
+        for step in self.config["pipeline"]["steps"]:
+            step_name = step["name"]
+            step_task = progress.add_task(
+                f"Running step [cyan]{step_name}[/cyan]...", total=1
+            )
+            input_data = self.datasets[step["input"]] if "input" in step else None
+            output_data, step_cost = self.execute_step(step, input_data, progress)
+            self.datasets[step_name] = output_data
+            flush_cache(self.console)
+            total_cost += step_cost
+            progress.update(
+                step_task,
+                advance=1,
+                description=f"Step [cyan]{step_name}[/cyan] completed. Cost: [green]${step_cost:.2f}[/green]",
+            )
+
+    self.save_output(self.datasets[self.config["pipeline"]["steps"][-1]["name"]])
+    self.console.log(f"[bold green]Total cost: [green]${total_cost:.2f}[/green]")
+    self.console.log(
+        f"[bold green]Total time: [green]{time.time() - start_time:.2f} seconds[/green]"
+    )
+
+    return total_cost
+
+
+
+ +
+ +
+ + +

+ save_output(data) + +

+ + +
+ +

Save the final output of the pipeline.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
data + List[Dict] + +
+

The data to be saved.

+
+
+ required +
+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If an unsupported output type is specified in the configuration.

+
+
+ +
+ Source code in docetl/runner.py +
144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
def save_output(self, data: List[Dict]):
+    """
+    Save the final output of the pipeline.
+
+    Args:
+        data (List[Dict]): The data to be saved.
+
+    Raises:
+        ValueError: If an unsupported output type is specified in the configuration.
+    """
+    output_config = self.config["pipeline"]["output"]
+    if output_config["type"] == "file":
+        with open(output_config["path"], "w") as file:
+            json.dump(data, file, indent=2)
+        self.console.log(
+            f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
+        )
+    else:
+        raise ValueError(f"Unsupported output type: {output_config['type']}")
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform a syntax check on all operations defined in the configuration.

+

This method validates each operation by attempting to instantiate it. +If any operation fails to instantiate, a ValueError is raised.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If any operation fails the syntax check.

+
+
+ +
+ Source code in docetl/runner.py +
50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
def syntax_check(self):
+    """
+    Perform a syntax check on all operations defined in the configuration.
+
+    This method validates each operation by attempting to instantiate it.
+    If any operation fails to instantiate, a ValueError is raised.
+
+    Raises:
+        ValueError: If any operation fails the syntax check.
+    """
+    for operation_config in self.config["operations"]:
+        operation = operation_config["name"]
+        operation_type = operation_config["type"]
+
+        try:
+            operation_class = get_operation(operation_type)
+            operation_class(
+                operation_config,
+                self.default_model,
+                self.max_threads,
+                self.console,
+            )
+        except Exception as e:
+            raise ValueError(
+                f"Syntax check failed for operation '{operation}': {str(e)}"
+            )
+
+    self.console.log("[green]Syntax check passed for all operations.[/green]")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.Optimizer + + +

+ + +
+ + +
+ Source code in docetl/builder.py +
  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
+1019
+1020
+1021
+1022
+1023
+1024
+1025
+1026
+1027
+1028
+1029
+1030
+1031
+1032
+1033
+1034
+1035
+1036
+1037
+1038
+1039
+1040
+1041
+1042
+1043
+1044
+1045
+1046
+1047
+1048
+1049
+1050
+1051
+1052
+1053
+1054
+1055
+1056
+1057
+1058
+1059
+1060
+1061
+1062
+1063
+1064
+1065
+1066
+1067
+1068
+1069
+1070
+1071
+1072
+1073
+1074
+1075
+1076
+1077
+1078
+1079
+1080
+1081
+1082
+1083
+1084
+1085
+1086
+1087
+1088
+1089
+1090
+1091
+1092
+1093
+1094
+1095
+1096
+1097
+1098
+1099
+1100
+1101
+1102
+1103
+1104
+1105
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1118
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+1130
+1131
+1132
+1133
+1134
+1135
+1136
+1137
+1138
+1139
+1140
+1141
+1142
+1143
+1144
+1145
+1146
+1147
+1148
+1149
+1150
+1151
+1152
+1153
+1154
+1155
+1156
+1157
+1158
+1159
+1160
+1161
+1162
+1163
+1164
+1165
+1166
+1167
+1168
+1169
+1170
+1171
+1172
+1173
+1174
+1175
+1176
+1177
+1178
+1179
+1180
+1181
+1182
+1183
+1184
+1185
+1186
+1187
+1188
+1189
+1190
+1191
+1192
+1193
+1194
+1195
+1196
+1197
+1198
+1199
+1200
+1201
+1202
+1203
+1204
+1205
+1206
+1207
+1208
+1209
+1210
+1211
+1212
+1213
+1214
+1215
+1216
+1217
+1218
+1219
+1220
+1221
+1222
+1223
+1224
+1225
+1226
+1227
+1228
+1229
+1230
+1231
+1232
+1233
+1234
+1235
+1236
+1237
+1238
+1239
+1240
+1241
+1242
+1243
+1244
+1245
+1246
+1247
+1248
+1249
+1250
+1251
+1252
+1253
+1254
+1255
+1256
+1257
+1258
+1259
+1260
+1261
+1262
+1263
+1264
+1265
+1266
+1267
+1268
+1269
+1270
+1271
+1272
+1273
+1274
+1275
+1276
+1277
+1278
+1279
+1280
+1281
+1282
+1283
+1284
+1285
+1286
+1287
+1288
+1289
+1290
+1291
+1292
+1293
+1294
+1295
+1296
+1297
+1298
+1299
+1300
+1301
+1302
+1303
+1304
+1305
+1306
+1307
+1308
+1309
+1310
+1311
+1312
+1313
+1314
+1315
+1316
+1317
+1318
+1319
+1320
+1321
+1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
+1341
+1342
+1343
+1344
+1345
+1346
+1347
+1348
+1349
+1350
+1351
+1352
+1353
+1354
+1355
+1356
+1357
+1358
+1359
+1360
+1361
+1362
+1363
+1364
class Optimizer:
+    def __init__(
+        self,
+        yaml_file: str,
+        max_threads: Optional[int] = None,
+        model: str = "gpt-4o",
+        resume: bool = False,
+        timeout: int = 60,
+    ):
+        """
+        Initialize the Optimizer class.
+
+        This method sets up the optimizer with the given configuration file and parameters.
+        It loads the configuration, initializes the console for output, sets up the LLM client,
+        and prepares various attributes for optimization.
+
+        Args:
+            yaml_file (str): Path to the YAML configuration file.
+            max_threads (Optional[int]): Maximum number of threads to use for parallel processing.
+                If None, it will be set to (number of CPUs * 4).
+            model (str): The name of the language model to use. Defaults to "gpt-4o".
+            resume (bool): Whether to resume optimization from a previous run. Defaults to False.
+            timeout (int): Timeout in seconds for operations. Defaults to 60.
+
+        Attributes:
+            yaml_file_path (str): Stores the path to the YAML file.
+            config (Dict): Stores the loaded configuration from the YAML file.
+            console (Console): Rich console for formatted output.
+            optimized_config (Dict): A copy of the original config to be optimized.
+            llm_client (LLMClient): Client for interacting with the language model.
+            max_threads (int): Maximum number of threads for parallel processing.
+            operations_cost (float): Tracks the total cost of operations.
+            timeout (int): Timeout for operations in seconds.
+            selectivities (defaultdict): Stores selectivity information for operations.
+                Selectivity is the ratio of output size to input size for an operation.
+                It's used to estimate how much data will flow through the pipeline after
+                each operation, which helps in optimizing subsequent operations and
+                determining appropriate sample sizes. For example, a selectivity of 0.5
+                means an operation halves the size of its input data.
+            datasets (Dict): Stores loaded datasets.
+
+        The method also calls print_optimizer_config() to display the initial configuration.
+        """
+        self.yaml_file_path = yaml_file
+        self.config = load_config(yaml_file)
+        self.console = Console()
+        self.optimized_config = copy.deepcopy(self.config)
+        self.llm_client = LLMClient(model)
+        self.max_threads = max_threads or (os.cpu_count() or 1) * 4
+        self.operations_cost = 0
+        self.timeout = timeout
+        self.selectivities = defaultdict(dict)
+        self.resume = resume
+
+        home_dir = os.path.expanduser("~")
+        yaml_file_suffix = yaml_file.split("/")[-1].split(".")[0]
+        cache_dir = os.path.join(home_dir, f".docetl/cache/{yaml_file_suffix}")
+        os.makedirs(cache_dir, exist_ok=True)
+        self.datasets = DatasetOnDisk(dir=cache_dir, console=self.console)
+        self.optimized_ops_path = f"{cache_dir}/optimized_ops"
+        base_name = yaml_file.rsplit(".", 1)[0]
+        self.optimized_config_path = f"{base_name}_opt.yaml"
+
+        # Update sample size map
+        self.sample_size_map = SAMPLE_SIZE_MAP
+        if self.config.get("optimizer_config", {}).get("sample_sizes", {}):
+            self.sample_size_map.update(self.config["optimizer_config"]["sample_sizes"])
+
+        self.status = None
+        self.step_op_to_optimized_ops = {}
+
+        self.print_optimizer_config()
+
+    def find_operation(self, op_name: str) -> Dict:
+        for operation_config in self.config["operations"]:
+            if operation_config["name"] == op_name:
+                return operation_config
+        raise ValueError(f"Operation '{op_name}' not found in configuration.")
+
+    def syntax_check(self):
+        """
+        Perform a syntax check on all operations defined in the configuration.
+
+        This method validates each operation by attempting to instantiate it.
+        If any operation fails to instantiate, a ValueError is raised.
+
+        Raises:
+            ValueError: If any operation fails the syntax check.
+        """
+        for operation_config in self.config["operations"]:
+            operation = operation_config["name"]
+            operation_type = operation_config["type"]
+
+            try:
+                operation_class = get_operation(operation_type)
+                operation_class(
+                    operation_config,
+                    self.config.get("default_model", "gpt-4o-mini"),
+                    self.max_threads,
+                    self.console,
+                )
+            except Exception as e:
+                raise ValueError(
+                    f"Syntax check failed for operation '{operation}': {str(e)}"
+                )
+
+        self.console.log("[green]Syntax check passed for all operations.[/green]")
+
+    def print_optimizer_config(self):
+        """
+        Print the current configuration of the optimizer.
+
+        This method uses the Rich console to display a formatted output of the optimizer's
+        configuration. It includes details such as the YAML file path, sample sizes for
+        different operation types, maximum number of threads, the language model being used,
+        and the timeout setting.
+
+        The output is color-coded and formatted for easy readability, with a header and
+        separator lines to clearly delineate the configuration information.
+        """
+        self.console.rule("[bold cyan]Optimizer Configuration[/bold cyan]")
+        self.console.log(f"[yellow]YAML File:[/yellow] {self.yaml_file_path}")
+        self.console.log(f"[yellow]Sample Size:[/yellow] {self.sample_size_map}")
+        self.console.log(f"[yellow]Max Threads:[/yellow] {self.max_threads}")
+        self.console.log(f"[yellow]Model:[/yellow] {self.llm_client.model}")
+        self.console.log(f"[yellow]Timeout:[/yellow] {self.timeout} seconds")
+
+    def compute_sample_size(
+        self,
+        step_name: str,
+        step_ops: List[str],
+        op_config: Dict[str, Any],
+    ) -> int:
+        """
+        Compute the sample size necessary for optimizing given operation based on upstream operations.
+
+        This method calculates an appropriate sample size for an operation, taking into
+        account the selectivities of upstream operations in the same step. It uses a
+        predefined sample size map (SAMPLE_SIZE_MAP) as a starting point.
+
+        For example, if we have a 'map' operation with a default sample size of 10,
+        and one upstream operation with a selectivity of 0.5, the computed sample size for the upstream operation would be:
+        10 / 0.5 = 20
+
+        This ensures that after applying the selectivity of the upstream operation,
+        we still have a representative sample size for the current operation.
+
+        Args:
+            step_name (str): The name of the current step in the pipeline.
+            step_ops (List[str]): A list of all operations in the current step.
+            op_config (Dict[str, Any]): The configuration dictionary for the current operation.
+
+        Returns:
+            int: The computed sample size for the operation.
+
+        The method works as follows:
+        1. If there are no upstream operations, it returns the default sample size for the operation type.
+        2. Otherwise, it starts with the default sample size and adjusts it based on the selectivities
+           of upstream operations.
+        3. It iterates through upstream operations in reverse order, dividing the sample size by
+           each operation's selectivity.
+        4. The final result is rounded to the nearest integer.
+
+        Raises:
+            ValueError: If the selectivity for any upstream operation is not found.
+
+        Note:
+            - The method assumes that selectivities for all upstream operations have been
+              previously computed and stored in self.selectivities.
+            - The sample size is always at least 1, even after all adjustments.
+        """
+        # If an equijoin, load the default. Equijoins are always first
+        if op_config.get("type") == "equijoin":
+            return SAMPLE_SIZE_MAP.get(op_config.get("type"))
+
+        # If there are no upstream operations, use the default sample_size
+        upstream_ops = []
+        for step_op in step_ops:
+            if step_op != op_config.get("name"):
+                if step_op in self.step_op_to_optimized_ops:
+                    upstream_ops.extend(self.step_op_to_optimized_ops[step_op])
+                else:
+                    upstream_ops.append(step_op)
+            else:
+                break
+
+        if len(upstream_ops) == 0:
+            return self.sample_size_map.get(op_config.get("type"), float("inf"))
+
+        # Otherwise, compute the sample size based on the upstream operations
+        sample_size = self.sample_size_map.get(op_config.get("type"), 100)
+        for op in reversed(upstream_ops):
+            # Use the selectivity of the upstream operation to compute the sample size
+            if op not in self.selectivities[step_name]:
+                raise ValueError(
+                    f"Selectivity for operation {op} not found in selectivities. Other ops are {self.selectivities[step_name]}"
+                )
+
+            sample_size = sample_size / self.selectivities[step_name].get(op)
+
+        return int(math.ceil(sample_size))
+
+    def _insert_empty_resolve_operations(self):
+        """
+        Determines whether to insert resolve operations in the pipeline.
+
+        This method iterates through each step in the pipeline and checks if there's a reduce
+        operation that follows a map operation with no resolver in between. If such a case is
+        found, it synthesizes an empty resolver operation and inserts it into the pipeline.
+
+        The method modifies the pipeline configuration in-place.
+
+        Returns:
+            None
+
+        Side effects:
+        - Modifies self.config["pipeline"]["steps"] by potentially inserting new resolve operations.
+        - Adds new resolve operations to self.config["operations"] if necessary.
+        """
+        for i, step in enumerate(self.config["pipeline"]["steps"]):
+            operations = step.get("operations", [])
+            has_map = False
+            has_reduce = False
+            has_resolve = False
+            map_op = None
+            reduce_op = None
+
+            for op in operations:
+                if isinstance(op, dict):
+                    op = list(op.keys())[0]
+                op_config = self.find_operation(op)
+                op_type = op_config["type"]
+                if op_type == "map":
+                    has_map = True
+                    map_op = op
+                elif op_type == "reduce" and op_config.get("synthesize_resolve", True):
+                    has_reduce = True
+                    reduce_op = op
+                elif op_type == "resolve":
+                    has_resolve = True
+
+            if has_map and has_reduce and not has_resolve:
+                # Synthesize an empty resolver
+                self.console.log(
+                    "[yellow]Synthesizing empty resolver operation:[/yellow]"
+                )
+                self.console.log(
+                    f"  • [cyan]Reduce operation:[/cyan] [bold]{reduce_op}[/bold]"
+                )
+                self.console.log(f"  • [cyan]Step:[/cyan] [bold]{step['name']}[/bold]")
+
+                new_resolve_op = f"synthesized_resolve_{i}"
+                reduce_key = self.find_operation(reduce_op).get("reduce_key")
+                if isinstance(reduce_key, str):
+                    reduce_key = [reduce_key]
+                self.config["operations"].append(
+                    {
+                        "name": new_resolve_op,
+                        "type": "resolve",
+                        "empty": True,
+                        "optimize": True,
+                        "embedding_model": "text-embedding-3-small",
+                        "resolution_model": self.config.get(
+                            "default_model", "gpt-4o-mini"
+                        ),
+                        "comparison_model": self.config.get(
+                            "default_model", "gpt-4o-mini"
+                        ),
+                        "_intermediates": {
+                            "map_prompt": self.find_operation(map_op).get("prompt"),
+                            "reduce_key": reduce_key,
+                        },
+                    }
+                )
+
+                # Insert the new resolve operation before the reduce operation
+                reduce_index = next(
+                    i
+                    for i, op in enumerate(operations)
+                    if self.find_operation(op).get("type") == "reduce"
+                )
+                operations.insert(reduce_index, new_resolve_op)
+
+                has_resolve = True
+
+            self.config["pipeline"]["steps"][i]["operations"] = operations
+
+        # Update the pipeline configuration
+        self.config["pipeline"]["steps"] = self.config["pipeline"]["steps"]
+
+    def _add_map_prompts_to_reduce_operations(self):
+        """
+        Add relevant map prompts to reduce operations based on their reduce keys.
+
+        This method iterates through all map operations to create a dictionary mapping
+        output schema keys to map prompts. It then loops through reduce operations,
+        adding the relevant map prompts based on the reduce keys and output schema.
+
+        Side effects:
+        - Modifies reduce operations in self.config["operations"] by adding map prompts.
+        """
+        # Create a dictionary mapping output schema keys to map prompts
+        output_key_to_prompt = {}
+        for op_config in self.config["operations"]:
+            if op_config.get("type") == "map":
+                output_schema = op_config.get("output", {}).get("schema", {})
+                prompt = op_config.get("prompt", "")
+                for key in output_schema.keys():
+                    output_key_to_prompt[key] = prompt
+
+        # Add relevant map prompts to reduce operations
+        for op_config in self.config["operations"]:
+            if op_config.get("type") == "reduce":
+                reduce_keys = op_config.get("reduce_key", [])
+                if isinstance(reduce_keys, str):
+                    reduce_keys = [reduce_keys]
+
+                relevant_prompts = []
+                for key in reduce_keys:
+                    if key in output_key_to_prompt:
+                        relevant_prompts.append(output_key_to_prompt[key])
+
+                if relevant_prompts:
+                    op_config["_intermediates"] = op_config.get("_intermediates", {})
+                    op_config["_intermediates"]["last_map_prompt"] = relevant_prompts[
+                        -1
+                    ]
+
+    def _load_optimized_ops(self):
+        """
+        Load the optimized operations from disk.
+        """
+        if os.path.exists(self.optimized_ops_path):
+            for filename in os.listdir(self.optimized_ops_path):
+                if filename.endswith(".json"):
+                    original_op_name = filename[:-5]  # Remove '.json' from the filename
+                    with open(
+                        os.path.join(self.optimized_ops_path, filename), "r"
+                    ) as f:
+                        optimized_ops = json.load(f)
+
+                    # Update the config with the optimized operations
+                    if original_op_name in [
+                        op["name"] for op in self.config["operations"]
+                    ]:
+                        # Update the config with the optimized operations
+                        for op in optimized_ops:
+                            op["optimize"] = False
+                            self.config["operations"].append(op)
+
+                        # Update the step operations
+                        for step in self.config["pipeline"]["steps"]:
+                            if original_op_name in step["operations"]:
+                                index = step["operations"].index(original_op_name)
+                                step["operations"] = (
+                                    step["operations"][:index]
+                                    + [op["name"] for op in optimized_ops]
+                                    + step["operations"][index + 1 :]
+                                )
+
+                    self.console.log(
+                        f"Loaded optimized operations for {original_op_name}"
+                    )
+
+            self.console.log("[green]Finished loading optimized operations[/green]")
+
+            # Print out the operations for each step
+            self.console.log("[bold blue]Operations for each step:[/bold blue]")
+            for step in self.config["pipeline"]["steps"]:
+                step_name = step.get("name")
+                operations = step.get("operations", [])
+                self.console.log(f"[cyan]Step: {step_name}[/cyan]")
+                for op in operations:
+                    if isinstance(op, dict):
+                        # Handle the case where the operation is a dictionary (e.g., for equijoin)
+                        op_name = list(op.keys())[0]
+                        op_details = op[op_name]
+                        self.console.log(f"  - {op_name}: {op_details}")
+                    else:
+                        self.console.log(f"  - {op}")
+                self.console.log("")  # Add a blank line between steps
+        else:
+            self.console.log("[yellow]No optimized operations found[/yellow]")
+
+    def optimize(self):
+        """
+        Optimize the entire pipeline defined in the configuration.
+
+        This method is the main entry point for the optimization process. It iterates through
+        each step in the pipeline, optimizing from upstream to downstream, and constructs an
+        optimized version of the configuration.
+
+        The optimization process includes:
+        1. Iterating through each step in the pipeline, from upstream to downstream.
+        2. Optimizing each step using the _optimize_step method.
+        3. Updating the optimized configuration with the new operations and steps.
+        4. Saving the optimized configuration to a file.
+        5. Logging the total costs (agent cost, operations cost, and total cost).
+
+        Returns:
+            None
+
+        Side effects:
+        - Modifies self.optimized_config with the optimized pipeline and operations.
+        - Updates self.datasets with the results of each step.
+        - Calls _save_optimized_config to save the optimized configuration to a file.
+        - Logs cost information to the console.
+
+        Raises:
+            ValueError: If a step in the pipeline does not have a name.
+
+        Note:
+        - This method assumes that all necessary data and configurations are already
+          loaded and initialized in the Optimizer instance.
+        - The optimization process is performed step by step, from upstream to downstream,
+          with each step potentially depending on the results of previous steps.
+        """
+        self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+        self.syntax_check()
+
+        self._insert_empty_resolve_operations()
+
+        # If resume is True, load the optimized operations from disk
+        if self.resume:
+            self._load_optimized_ops()
+
+        for step in self.config["pipeline"]["steps"]:
+            step_name = step.get("name")
+            if not step_name:
+                raise ValueError(
+                    "Step does not have a name. Each step must have a unique name."
+                )
+
+            optimized_step, step_operations, input_data = self._optimize_step(step)
+
+            self.optimized_config["operations"].update(step_operations)
+            for i, op in enumerate(self.optimized_config["operations"]):
+                if op["name"] in step_operations:
+                    self.optimized_config["operations"][i] = step_operations[op["name"]]
+
+            self.optimized_config["pipeline"]["steps"] = [
+                step
+                for step in self.optimized_config["pipeline"]["steps"]
+                if step["name"] != step_name
+            ] + [optimized_step]
+
+            self.step_op_to_optimized_ops[step_name] = optimized_step["operations"]
+
+            step_hash = (
+                hashlib.md5(
+                    json.dumps(
+                        {
+                            "step": [
+                                s
+                                for s in self.optimized_config["pipeline"]["steps"]
+                                if s["name"] == step_name
+                            ][0],
+                            "operations": [
+                                self.find_operation(op)
+                                for op in optimized_step["operations"]
+                            ],
+                        }
+                    ).encode()
+                ).hexdigest()
+                + ".json"
+            )
+            # If the dataset already exists, skip the step
+            if step_hash in self.datasets:
+                continue
+
+            flush_cache(self.console)
+
+            if step_name in self.config.get("optimizer_config", {}).get(
+                "run_full_step", []
+            ):
+                # Run the entire step
+                input_data = self._run_partial_step(
+                    step,
+                    step_operations,
+                    float("inf"),  # TODO: FIX THIS
+                )
+                self.datasets[step_hash] = copy.deepcopy(input_data)
+            else:
+                self.datasets[step_hash] = copy.deepcopy(input_data)
+
+        self._save_optimized_config()
+
+        self.console.log(
+            f"[bold]Total agent cost: ${self.llm_client.total_cost:.2f}[/bold]"
+        )
+        self.console.log(
+            f"[bold]Total operations cost: ${self.operations_cost:.2f}[/bold]"
+        )
+        self.console.log(
+            f"[bold]Total cost: ${self.llm_client.total_cost + self.operations_cost:.2f}[/bold]"
+        )
+
+    def _run_partial_step(
+        self,
+        step: Dict[str, Any],
+        ops_to_run: List[str],
+        sample_size: int,
+        optimized_operations: Dict[str, Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        """
+        Execute a partial step of the pipeline on a sample of the input data.
+
+        This internal method runs a subset of operations for a given step on a sample
+        of the input data. It's used as part of the optimization process to evaluate
+        and optimize individual operations within a step.
+
+        Args:
+            step (Dict[str, Any]): The step configuration dictionary.
+            ops_to_run (List[str]): List of operation names to execute in this partial step.
+            sample_size (int): The number of items to include in the input sample.
+            optimized_operations (Dict[str, Dict[str, Any]]): Dictionary of optimized operations.
+
+        Returns:
+            List[Dict[str, Any]]: The output data after running the specified operations.
+
+        The method performs the following steps:
+        1. Retrieves a sample of the input data using _get_sample_data.
+        2. For equijoin operations, it loads both left and right datasets.
+        3. Iterates through the specified operations, running each on the input sample.
+        4. Returns the final output after all specified operations have been applied.
+
+        Note:
+        - The method handles both regular steps and equijoin steps differently.
+
+        Raises:
+            Any exceptions raised by _get_sample_data or _run_operation methods.
+        """
+        # Take the input data and run the operations in ops_to_run
+        # Return the output data
+        input_sample = self._get_sample_data(step.get("input"), None, sample_size)
+
+        if step.get("input") is None:
+            join_op_name = list(step.get("operations")[0].keys())[0]
+            # this is an equijoin step, load left and right datasets
+            left_data = self._get_sample_data(
+                step.get("operations")[0][join_op_name].get("left"), None, sample_size
+            )
+            right_data = self._get_sample_data(
+                step.get("operations")[0][join_op_name].get("right"), None, sample_size
+            )
+            input_sample = {"left": left_data, "right": right_data}
+
+        for op in ops_to_run:
+            op_object = optimized_operations[op]
+            if "name" not in op_object:
+                op_object["name"] = op
+
+            input_sample = self._run_operation(op_object, input_sample)
+        return input_sample
+
+    def _optimize_step(
+        self, step: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Optimize a single step in the pipeline.
+
+        This method takes a step configuration and optimizes each operation within it.
+        It handles different types of operations, including those that require optimization
+        and those that don't.
+
+        Args:
+            step (Dict[str, Any]): The configuration dictionary for the step to be optimized.
+
+        Returns:
+            Tuple[Dict[str, Any], List[Dict[str, Any]], List[Dict[str, Any]]]:
+                - The optimized step configuration.
+                - A list of optimized operations.
+                - The output data after running all operations in the step.
+
+        The method performs the following for each operation in the step:
+        1. Extracts the operation configuration.
+        2. Computes the appropriate sample size for the operation.
+        3. Runs the operation on a sample of the input data.
+        4. If the operation is optimizable and of a supported type, it calls the appropriate
+           optimization method (e.g., _optimize_map, _optimize_reduce).
+        5. If not optimizable or not supported, it runs the operation as-is.
+        6. Calculates and stores the selectivity of each operation.
+        7. Updates the list of optimized operations and their configurations.
+
+        The method uses rich console to provide status updates during the optimization process.
+
+        Note:
+        - This method is a key part of the overall optimization process, focusing on
+          individual steps in the pipeline.
+        - It relies on several helper methods like _run_partial_step, compute_sample_size,
+          and various _optimize_* methods for specific operation types.
+        - When optimizing an operation in the step, all previous operations are run on the
+          sample size needed for the current operation. This ensures that the input to the
+          operation being optimized is representative of what it would receive in the full pipeline.
+
+        Raises:
+            ValueError: If an unsupported operation type is encountered.
+        """
+        optimized_operations = {}
+        optimized_operation_names = []
+        replacement_operations = {}  # List from old op name to new ops
+
+        for op_idx, operation in enumerate(step["operations"]):
+            if isinstance(operation, dict):
+                operation_name = list(operation.keys())[0]
+                operation_config = operation[operation_name]
+            else:
+                operation_name = operation
+                operation_config = {}
+
+            op_object = self.find_operation(operation_name).copy()
+            op_object.update(operation_config)
+            op_object["name"] = operation_name
+
+            # Run the pipeline
+            step_ops = []
+            for step_op in step.get("operations"):
+                if step_op in replacement_operations:
+                    step_ops.extend(replacement_operations[step_op])
+                else:
+                    step_ops.append(step_op)
+
+            # TODO: incorporate this into the optimizer to not run the most downstream operations
+            downstream_ops_exist = op_idx < len(step["operations"]) - 1
+
+            sample_size = self.compute_sample_size(
+                step.get("name"), step_ops, op_object
+            )
+            input_data = self._run_partial_step(
+                step, optimized_operation_names, sample_size, optimized_operations
+            )
+
+            if (
+                not op_object.get("optimize", False)  # Default don't optimize
+                or op_object.get("type") not in SUPPORTED_OPS
+            ):
+                # If optimize is False or operation type is not supported, just use the operation without optimization
+                output_data = self._run_operation(op_object, input_data)
+                optimized_operations[operation_name] = op_object
+                optimized_operation_names.append(operation_name)
+
+                selectivity = len(output_data) / len(input_data)
+
+                self.selectivities[step.get("name")][operation_name] = selectivity
+            else:
+                # Use rich console status to indicate optimization of the operation
+                with self.console.status(
+                    f"[bold blue]Optimizing operation: {operation_name} (Type: {op_object['type']})[/bold blue]"
+                ) as status:
+                    self.status = status
+
+                    # Print the number of elements in input_data
+                    self.console.rule(
+                        f"[yellow]Optimizing operation {operation_name} (Type: {op_object['type']})[/yellow]"
+                    )
+                    if op_object.get("type") == "equijoin":
+                        self.console.log(
+                            f"[yellow]  Sample size (left): {len(input_data['left'])}[/yellow]"
+                        )
+                        self.console.log(
+                            f"[yellow]  Sample size (right): {len(input_data['right'])}[/yellow]"
+                        )
+                    else:
+                        self.console.log(
+                            f"[yellow]  Sample size: {len(input_data)}[/yellow]"
+                        )
+
+                    # Run optimization
+                    for retry in range(
+                        self.config.get("optimizer_config", {}).get(
+                            "num_retries", NUM_OPTIMIZER_RETRIES
+                        )
+                    ):
+                        try:
+                            if op_object.get("type") == "map":
+                                optimized_ops = self._optimize_map(
+                                    op_object, input_data
+                                )
+                            elif op_object.get("type") == "filter":
+                                optimized_ops = self._optimize_map(
+                                    op_object, input_data, is_filter=True
+                                )
+                            elif op_object.get("type") == "reduce":
+                                optimized_ops = self._optimize_reduce(
+                                    op_object, input_data
+                                )
+                            elif op_object.get("type") == "resolve":
+                                optimized_ops = self._optimize_resolve(
+                                    op_object, input_data
+                                )
+                            elif op_object.get("type") == "equijoin":
+                                (
+                                    optimized_ops,
+                                    input_data,
+                                    new_left_name,
+                                    new_right_name,
+                                ) = self._optimize_equijoin(
+                                    op_object,
+                                    operation["left"],
+                                    operation["right"],
+                                    input_data["left"],
+                                    input_data["right"],
+                                    status,
+                                )
+                            else:
+                                raise ValueError(
+                                    f"Unsupported operation type: {op_object['type']}"
+                                )
+                            break  # If successful, break out of the retry loop
+                        except Exception as e:
+                            if (
+                                retry
+                                == self.config.get("optimizer_config", {}).get(
+                                    "num_retries", NUM_OPTIMIZER_RETRIES
+                                )
+                                - 1
+                            ):
+                                raise  # If this was the last retry, re-raise the exception
+                            self.console.log(
+                                f"Optimization attempt {retry + 1} failed. Retrying..."
+                            )
+
+                    if self.status:
+                        self.status.update(
+                            f"[bold blue]Running optimized operation to estimate selectivities: {operation_name}[/bold blue]"
+                        )
+
+                    for op in optimized_ops:
+                        op_name = op["name"]
+                        optimized_operations[op_name] = op
+                        if op.get("type") == "equijoin":
+                            optimized_operation_names.append(
+                                {
+                                    op_name: {
+                                        "left": new_left_name,
+                                        "right": new_right_name,
+                                    }
+                                }
+                            )
+                        else:
+                            optimized_operation_names.append(op_name)
+
+                        old_input_data_size = len(input_data)
+                        input_data = self._run_operation(op, input_data)
+                        new_input_data_size = len(input_data)
+                        selectivity = new_input_data_size / old_input_data_size
+                        self.selectivities[step.get("name")][op_name] = selectivity
+
+                    # Set replacement_operations
+                    replacement_operations[op_object["name"]] = [
+                        o["name"] for o in optimized_ops
+                    ]
+
+                    # Print new operator configs
+                    self.console.log("[bold green]New op configurations:[/bold green]")
+                    for op_name, op_config in optimized_operations.items():
+                        if op_name in [o["name"] for o in optimized_ops]:
+                            self.console.log(
+                                f"[cyan]{op_name}:[/cyan] {json.dumps(op_config, indent=2)}"
+                            )
+
+                    # Save the optimized operations to disk
+                    os.makedirs(self.optimized_ops_path, exist_ok=True)
+
+                    for original_op, replacement_ops in replacement_operations.items():
+                        optimized_ops_list = [
+                            (
+                                optimized_operations[op_name]
+                                if isinstance(op_name, str)
+                                else {
+                                    list(op_name.keys())[0]: optimized_operations[
+                                        list(op_name.keys())[0]
+                                    ]
+                                }
+                            )
+                            for op_name in replacement_ops
+                        ]
+
+                        # Save to disk
+                        optimized_op_file = os.path.join(
+                            self.optimized_ops_path, f"{original_op}.json"
+                        )
+                        with open(optimized_op_file, "w") as f:
+                            json.dump(optimized_ops_list, f, indent=2)
+
+                    self.console.log(
+                        f"[green]Saved optimized operations to {self.optimized_ops_path}[/green]"
+                    )
+                    self.status = None
+                    output_data = input_data
+
+        optimized_step = step.copy()
+        optimized_step["operations"] = optimized_operation_names
+        return optimized_step, optimized_operations, output_data
+
+    def _get_sample_data(
+        self, dataset_name: str, op_config: Optional[Dict[str, Any]], sample_size: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Retrieve a sample of data from a specified dataset.
+
+        The dataset is resolved in two stages. If `dataset_name` matches a step of the
+        optimized pipeline, the data stored in `self.datasets` under the MD5 hash of that
+        step and its operation configs is used when present. Otherwise the dataset is
+        loaded from the JSON file declared under `config["datasets"]`.
+
+        Args:
+            dataset_name (str): The name of the dataset to sample from. If None, an empty
+                list is returned.
+            op_config (Optional[Dict[str, Any]]): The configuration of the operation to be performed.
+                Operations of type "reduce" are sampled per group via `_get_reduce_sample`.
+            sample_size (int): The desired size of the sample. If set to float('inf'), all data is returned.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries representing the sampled data.
+                Sampling is weighted by the character length of each record and is done
+                with replacement, so a record may appear more than once. An empty
+                dataset yields an empty list.
+
+        Raises:
+            ValueError: If the dataset is not found or if the dataset type is unsupported.
+        """
+        if dataset_name is None:
+            return []
+
+        # The name may refer to the output of a pipeline step; take the first match
+        step = next(
+            (
+                s
+                for s in self.optimized_config["pipeline"]["steps"]
+                if s["name"] == dataset_name
+            ),
+            None,
+        )
+        name_hash = None
+        if step is not None:
+            step_signature = json.dumps(
+                {
+                    "step": step,
+                    "operations": [
+                        self.find_operation(op) for op in step["operations"]
+                    ],
+                }
+            )
+            name_hash = hashlib.md5(step_signature.encode()).hexdigest() + ".json"
+
+        if name_hash and name_hash in self.datasets:
+            data = self.datasets[name_hash]
+        else:
+            dataset = self.config["datasets"].get(dataset_name)
+            if dataset is None:
+                raise ValueError(
+                    f"Dataset '{dataset_name}' not found in config or previous steps."
+                )
+            if dataset["type"] == "file":
+                with open(dataset["path"], "r") as f:
+                    data = json.load(f)
+            else:
+                raise ValueError(f"Unsupported dataset type: {dataset['type']}")
+
+        if sample_size == float("inf"):
+            return data
+
+        # random.choices raises on an empty population, so stop here
+        if len(data) == 0:
+            return []
+
+        if op_config:
+            if op_config.get("type") == "reduce":
+                return self._get_reduce_sample(
+                    data, op_config.get("reduce_key"), sample_size
+                )
+
+        # Take the random 500 examples or all if less than 500
+        initial_data = random.sample(data, min(500, len(data)))
+
+        # Weight each example by the character length of its string form
+        char_counts = [len(str(item)) for item in initial_data]
+        total_counts = sum(char_counts)
+
+        # Fall back to uniform weights when every example stringifies to ""
+        weights = (
+            [count / total_counts for count in char_counts] if total_counts else None
+        )
+
+        # Perform weighted random sampling (with replacement)
+        return random.choices(
+            initial_data, weights=weights, k=min(sample_size, len(initial_data))
+        )
+
+    def _get_reduce_sample(
+        self, data: List[Dict[str, Any]], reduce_key: str, sample_size: int
+    ) -> List[Dict[str, Any]]:
+        """
+        Get a representative sample for a reduce operation.
+
+        Items are grouped by `reduce_key`. The sample is drawn from the 5 largest
+        groups in proportion to their sizes, then topped up first from those same
+        groups and finally from all groups until `sample_size` is reached or the
+        data is exhausted. A histogram of group sizes (the five smallest distinct
+        sizes) is logged to the console.
+
+        Args:
+            data (List[Dict[str, Any]]): The full dataset to sample from.
+            reduce_key (str): The key used for grouping in the reduce operation.
+            sample_size (int): The desired size of the sample.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries representing the sampled data.
+                Empty when `data` is empty.
+        """
+        # Group data by reduce key
+        grouped_data = defaultdict(list)
+        for item in data:
+            grouped_data[item[reduce_key]].append(item)
+
+        # Nothing to sample; the histogram's max() below would fail on no groups
+        if not grouped_data:
+            return []
+
+        # Sort groups by size in descending order
+        sorted_groups = sorted(
+            grouped_data.items(), key=lambda x: len(x[1]), reverse=True
+        )
+
+        # Take the top 5 groups
+        top_5_groups = sorted_groups[:5]
+
+        # Calculate the total count of items in the top 5 groups
+        total_count = sum(len(items) for _, items in top_5_groups)
+
+        sample = []
+        for _, items in top_5_groups:
+            # Calculate the proportion of items to sample from this group
+            group_proportion = len(items) / total_count
+            group_sample_size = int(sample_size * group_proportion)
+
+            # Sample from the group
+            group_sample = random.sample(items, min(group_sample_size, len(items)))
+            sample.extend(group_sample)
+
+        # If we haven't reached the desired sample size, add more items randomly
+        if len(sample) < sample_size:
+            remaining_items = [
+                item
+                for _, items in top_5_groups
+                for item in items
+                if item not in sample
+            ]
+            additional_sample = random.sample(
+                remaining_items,
+                min(sample_size - len(sample), len(remaining_items)),
+            )
+            sample.extend(additional_sample)
+
+        # Add items randomly from non-top groups to meet the sample size
+        if len(sample) < sample_size:
+            remaining_items = [
+                item
+                for _, items in grouped_data.items()
+                for item in items
+                if item not in sample
+            ]
+            additional_sample = random.sample(
+                remaining_items,
+                min(sample_size - len(sample), len(remaining_items)),
+            )
+            sample.extend(additional_sample)
+
+        # Create a histogram of group sizes
+        group_sizes = [len(items) for _, items in grouped_data.items()]
+        size_counts = Counter(group_sizes)
+
+        # Sort the sizes for a more readable output
+        sorted_sizes = sorted(size_counts.items())
+
+        # Print the histogram
+        self.console.log("\n[bold]Histogram of Group Sizes:[/bold]")
+        max_bar_width, max_count = 2, max(size_counts.values())
+        for size, count in sorted_sizes[:5]:
+            normalized_count = int(count / max_count * max_bar_width)
+            bar = "█" * normalized_count
+            self.console.log(f"{size:3d}: {bar} ({count})")
+        self.console.log("\n")
+
+        return sample
+
+    def _optimize_reduce(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Optimize a reduce operation by delegating to a ReduceOptimizer.
+
+        The cost reported by the optimizer is added to the running
+        `operations_cost` total.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the reduce operation.
+            input_data (List[Dict[str, Any]]): The input data for the reduce operation.
+
+        Returns:
+            List[Dict[str, Any]]: The optimized operation configuration.
+        """
+        optimizer_args = (
+            self.config,
+            self.console,
+            self.llm_client,
+            self.max_threads,
+            self._run_operation,
+        )
+        optimizer = ReduceOptimizer(*optimizer_args)
+        # The middle element of the optimizer's result is not needed here
+        new_ops, _, optimization_cost = optimizer.optimize(op_config, input_data)
+        self.operations_cost += optimization_cost
+        return new_ops
+
+    def _optimize_equijoin(
+        self,
+        op_config: Dict[str, Any],
+        left_name: str,
+        right_name: str,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        status: Status,
+    ) -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str, str]:
+        """
+        Optimize an equijoin operation.
+
+        This method creates a JoinOptimizer instance and uses it to optimize the equijoin operation.
+        It adds the optimizer's cost to the running operation cost and merges the optimized values into op_config.
+        If the LLM suggests a map transformation, it synthesizes a map operation as its own step, runs it on the
+        chosen side of the join, and then goes back to optimize the equijoin operation (at most 2 rounds in total).
+
+        Side effects: op_config is mutated in place (updated, and its "left"/"right" keys removed), and any
+        synthesized map operation and step are appended to self.optimized_config.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the equijoin operation.
+            left_name (str): The name of the left dataset.
+            right_name (str): The name of the right dataset.
+            left_data (List[Dict[str, Any]]): The left dataset for the join.
+            right_data (List[Dict[str, Any]]): The right dataset for the join.
+            status (Status): Status object passed through to the JoinOptimizer.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str, str]: The optimized operation configuration, the new left and right datasets, and the new left and right names.
+        """
+        max_iterations = 2
+        new_left_name = left_name
+        new_right_name = right_name
+        for _ in range(max_iterations):
+            join_optimizer = JoinOptimizer(
+                self.config,
+                op_config,
+                self.console,
+                self.llm_client,
+                self.max_threads,
+                target_recall=self.config.get("optimizer_config", {})
+                .get("equijoin", {})
+                .get("target_recall", 0.95),
+                estimated_selectivity=self.config.get("optimizer_config", {})
+                .get("equijoin", {})
+                .get("estimated_selectivity", None),
+                status=status,
+            )
+            optimized_config, cost, agent_results = join_optimizer.optimize_equijoin(
+                left_data, right_data
+            )
+            self.operations_cost += cost
+            # Update the operation config with the optimized values
+            op_config.update(optimized_config)
+
+            if not agent_results.get("optimize_map", False):
+                break  # Exit the loop if no more map optimizations are necessary
+
+            # Update the status to indicate we're optimizing a map operation
+            output_key = agent_results["output_key"]
+            if self.status:
+                self.status.update(
+                    f"Optimizing map operation for {output_key} extraction to help with the equijoin"
+                )
+            map_prompt = agent_results["map_prompt"]
+            # Any value other than "left" selects the right dataset
+            dataset_to_transform = (
+                left_data
+                if agent_results["dataset_to_transform"] == "left"
+                else right_data
+            )
+
+            # Create a new step for the map operation
+            map_operation = {
+                "name": f"synthesized_{output_key}_extraction",
+                "type": "map",
+                "prompt": map_prompt,
+                "model": self.config.get("default_model", "gpt-4o-mini"),
+                "output": {"schema": {output_key: "string"}},
+                "optimize": False,
+            }
+
+            # Optimize the map operation
+            # NOTE(review): "optimize" is hard-coded to False just above, so this
+            # branch is never taken as written and the else branch always runs.
+            if map_operation["optimize"]:
+                dataset_to_transform_sample = random.sample(
+                    dataset_to_transform, self.sample_size_map.get("map")
+                )
+                optimized_map_operations = self._optimize_map(
+                    map_operation, dataset_to_transform_sample
+                )
+            else:
+                optimized_map_operations = [map_operation]
+
+            # The synthesized step reads from the ORIGINAL left/right name and
+            # replaces that side's name in the values returned to the caller
+            new_step = {
+                "name": f"synthesized_{output_key}_extraction",
+                "input": (
+                    left_name
+                    if agent_results["dataset_to_transform"] == "left"
+                    else right_name
+                ),
+                "operations": [mo["name"] for mo in optimized_map_operations],
+            }
+            if agent_results["dataset_to_transform"] == "left":
+                new_left_name = new_step["name"]
+            else:
+                new_right_name = new_step["name"]
+
+            # Register the synthesized operations and step in the optimized config
+            for optimized_map_op in optimized_map_operations:
+                self.optimized_config["operations"].append(optimized_map_op)
+
+            self.optimized_config["pipeline"]["steps"].append(new_step)
+
+            # Now run the optimized map operation on the entire dataset_to_transform
+            for op in optimized_map_operations:
+                dataset_to_transform = self._run_operation(op, dataset_to_transform)
+
+            # Update the appropriate dataset for the next iteration
+            if agent_results["dataset_to_transform"] == "left":
+                left_data = dataset_to_transform
+            else:
+                right_data = dataset_to_transform
+
+            if self.status:
+                self.status.update(
+                    f"Optimizing equijoin operation with {output_key} extraction"
+                )
+
+        # Pop off "left" and "right" from the op_config
+        # (raises KeyError if either key is absent)
+        op_config.pop("left")
+        op_config.pop("right")
+        return (
+            [op_config],
+            {"left": left_data, "right": right_data},
+            new_left_name,
+            new_right_name,
+        )
+
+    def _optimize_map(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        is_filter: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Optimize a map (or filter) operation by delegating to a MapOptimizer.
+
+        The cost reported by the optimizer is added to the running
+        `operations_cost` total.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the map operation.
+            input_data (List[Dict[str, Any]]): The input data for the map operation.
+            is_filter (bool, optional): If True, the operation is a filter operation. Defaults to False.
+
+        Returns:
+            List[Dict[str, Any]]: The optimized operation configuration.
+        """
+        optimizer_args = (
+            self.config,
+            self.console,
+            self.llm_client,
+            self.max_threads,
+            self._run_operation,
+        )
+        optimizer = MapOptimizer(
+            *optimizer_args, timeout=self.timeout, is_filter=is_filter
+        )
+        # The middle element of the optimizer's result is not needed here
+        new_ops, _, optimization_cost = optimizer.optimize(op_config, input_data)
+        self.operations_cost += optimization_cost
+        return new_ops
+
+    def _optimize_resolve(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Optimize a resolve operation.
+
+        This method creates a JoinOptimizer instance and uses it to optimize the resolve operation.
+        The optimizer's cost is added to the running operation cost, and the optimized
+        values are merged into op_config in place.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the resolve operation.
+            input_data (List[Dict[str, Any]]): The input data for the resolve operation.
+
+        Returns:
+            List[Dict[str, Any]]: The optimized operation configuration, or an empty
+                list when the optimizer marks the operation as "empty", meaning it
+                should be dropped from the pipeline.
+        """
+        optimized_config, cost = JoinOptimizer(
+            self.config, op_config, self.console, self.llm_client, self.max_threads
+        ).optimize_resolve(input_data)
+
+        # The optimizer run is paid for whether or not the operation is kept
+        self.operations_cost += cost
+
+        if optimized_config.get("empty", False):
+            # Remove this operation from the pipeline. Return a plain list, as on
+            # the normal path, so callers can iterate the result as operation
+            # configs; the caller's input data is left untouched.
+            return []
+
+        # Update the operation config with the optimized values
+        op_config.update(optimized_config)
+
+        return [op_config]
+
+    def _run_operation(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        return_instance: bool = False,
+        is_build: bool = False,
+    ) -> Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], BaseOperation]]:
+        """
+        Instantiate and execute the operation described by `op_config`.
+
+        The operation class is looked up from the config's "type". The cost reported
+        by the operation is added to the running `operations_cost` total.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the operation to run.
+            input_data (List[Dict[str, Any]]): The input data for the operation. For an
+                equijoin this is indexed by "left" and "right" instead.
+            return_instance (bool, optional): If True, return the operation instance along with the output data.
+            is_build (bool, optional): Forwarded to `execute` for filter operations only.
+
+        Returns:
+            Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], BaseOperation]]:
+            The output data, or a tuple of the output data and the operation
+            instance when return_instance is True.
+        """
+        op_type = op_config["type"]
+        operation_instance = get_operation(op_type)(
+            config=op_config,
+            default_model=self.config["default_model"],
+            max_threads=self.max_threads,
+            console=self.console,
+            status=self.status,
+        )
+
+        # Equijoins take two inputs and filters take an extra flag; everything
+        # else runs on the input data alone
+        if op_type == "equijoin":
+            execute_args = (input_data["left"], input_data["right"])
+        elif op_type == "filter":
+            execute_args = (input_data, is_build)
+        else:
+            execute_args = (input_data,)
+        output_data, cost = operation_instance.execute(*execute_args)
+
+        self.operations_cost += cost
+        if not return_instance:
+            return output_data
+        return output_data, operation_instance
+
+    # Recursively resolve all anchors and aliases
+    @staticmethod
+    def resolve_anchors(data):
+        """
+        Rebuild a nested structure of dictionaries and lists as fresh objects.
+
+        Every dict and list encountered is replaced by a new dict or list whose
+        values are resolved recursively, so shared references (such as those
+        produced by YAML anchors and aliases) become independent copies. Any other
+        value is returned as is.
+
+        Args:
+            data: The data structure to resolve. Can be a dictionary, list, or any other type.
+
+        Returns:
+            The resolved data structure with all anchors and aliases replaced by their actual values.
+        """
+        if isinstance(data, dict):
+            resolved = {}
+            for key, value in data.items():
+                resolved[key] = Optimizer.resolve_anchors(value)
+            return resolved
+        if isinstance(data, list):
+            return list(map(Optimizer.resolve_anchors, data))
+        return data
+
+    def _save_optimized_config(self):
+        """
+        Write the optimized configuration to `self.optimized_config_path` as YAML.
+
+        The configuration is copied and passed through `resolve_anchors`, the
+        `_intermediates` entry is stripped from every operation, and the result is
+        dumped with `yaml.safe_dump`. A confirmation message is logged to the console.
+        """
+        # Work on a resolved copy so the in-memory config is not modified
+        resolved_config = Optimizer.resolve_anchors(self.optimized_config.copy())
+
+        # Remove _intermediates from each operation in resolved_config
+        for op_config in resolved_config.get("operations", ()):
+            if "_intermediates" in op_config:
+                op_config.pop("_intermediates")
+
+        with open(self.optimized_config_path, "w") as f:
+            yaml.safe_dump(resolved_config, f, default_flow_style=False, width=80)
+            self.console.log(
+                f"[green italic]💾 Optimized config saved to {self.optimized_config_path}[/green italic]"
+            )
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(yaml_file, max_threads=None, model='gpt-4o', resume=False, timeout=60) + +

+ + +
+ +

Initialize the Optimizer class.

+

This method sets up the optimizer with the given configuration file and parameters. +It loads the configuration, initializes the console for output, sets up the LLM client, +and prepares various attributes for optimization.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
yaml_file + str + +
+

Path to the YAML configuration file.

+
+
+ required +
max_threads + Optional[int] + +
+

Maximum number of threads to use for parallel processing. +If None, it will be set to (number of CPUs * 4).

+
+
+ None +
model + str + +
+

The name of the language model to use. Defaults to "gpt-4o".

+
+
+ 'gpt-4o' +
resume + bool + +
+

Whether to resume optimization from a previous run. Defaults to False.

+
+
+ False +
timeout + int + +
+

Timeout in seconds for operations. Defaults to 60.

+
+
+ 60 +
+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
yaml_file_path + str + +
+

Stores the path to the YAML file.

+
+
config + Dict + +
+

Stores the loaded configuration from the YAML file.

+
+
console + Console + +
+

Rich console for formatted output.

+
+
optimized_config + Dict + +
+

A copy of the original config to be optimized.

+
+
llm_client + LLMClient + +
+

Client for interacting with the language model.

+
+
max_threads + int + +
+

Maximum number of threads for parallel processing.

+
+
operations_cost + float + +
+

Tracks the total cost of operations.

+
+
timeout + int + +
+

Timeout for operations in seconds.

+
+
selectivities + defaultdict + +
+

Stores selectivity information for operations. +Selectivity is the ratio of output size to input size for an operation. +It's used to estimate how much data will flow through the pipeline after +each operation, which helps in optimizing subsequent operations and +determining appropriate sample sizes. For example, a selectivity of 0.5 +means an operation halves the size of its input data.

+
+
datasets + Dict + +
+

Stores loaded datasets.

+
+
+

The method also calls print_optimizer_config() to display the initial configuration.

+ +
+ Source code in docetl/builder.py +
 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
def __init__(
+    self,
+    yaml_file: str,
+    max_threads: Optional[int] = None,
+    model: str = "gpt-4o",
+    resume: bool = False,
+    timeout: int = 60,
+):
+    """
+    Initialize the Optimizer class.
+
+    This method sets up the optimizer with the given configuration file and parameters.
+    It loads the configuration, initializes the console for output, sets up the LLM client,
+    and prepares various attributes for optimization.
+
+    Args:
+        yaml_file (str): Path to the YAML configuration file.
+        max_threads (Optional[int]): Maximum number of threads to use for parallel processing.
+            If None, it will be set to (number of CPUs * 4).
+        model (str): The name of the language model to use. Defaults to "gpt-4o".
+        resume (bool): Whether to resume optimization from a previous run. Defaults to False.
+        timeout (int): Timeout in seconds for operations. Defaults to 60.
+
+    Attributes:
+        yaml_file_path (str): Stores the path to the YAML file.
+        config (Dict): Stores the loaded configuration from the YAML file.
+        console (Console): Rich console for formatted output.
+        optimized_config (Dict): A copy of the original config to be optimized.
+        llm_client (LLMClient): Client for interacting with the language model.
+        max_threads (int): Maximum number of threads for parallel processing.
+        operations_cost (float): Tracks the total cost of operations.
+        timeout (int): Timeout for operations in seconds.
+        selectivities (defaultdict): Stores selectivity information for operations.
+            Selectivity is the ratio of output size to input size for an operation.
+            It's used to estimate how much data will flow through the pipeline after
+            each operation, which helps in optimizing subsequent operations and
+            determining appropriate sample sizes. For example, a selectivity of 0.5
+            means an operation halves the size of its input data.
+        datasets (Dict): Stores loaded datasets.
+        sample_size_map (Dict): Per-instance copy of SAMPLE_SIZE_MAP, overridden by
+            any `optimizer_config.sample_sizes` entries from the config.
+
+    The method also calls print_optimizer_config() to display the initial configuration.
+    """
+    self.yaml_file_path = yaml_file
+    self.config = load_config(yaml_file)
+    self.console = Console()
+    self.optimized_config = copy.deepcopy(self.config)
+    self.llm_client = LLMClient(model)
+    self.max_threads = max_threads or (os.cpu_count() or 1) * 4
+    self.operations_cost = 0
+    self.timeout = timeout
+    self.selectivities = defaultdict(dict)
+    self.resume = resume
+
+    home_dir = os.path.expanduser("~")
+    yaml_file_suffix = yaml_file.split("/")[-1].split(".")[0]
+    cache_dir = os.path.join(home_dir, f".docetl/cache/{yaml_file_suffix}")
+    os.makedirs(cache_dir, exist_ok=True)
+    self.datasets = DatasetOnDisk(dir=cache_dir, console=self.console)
+    self.optimized_ops_path = f"{cache_dir}/optimized_ops"
+    # splitext only strips an extension of the final path component, so a dot in
+    # a directory name (e.g. "./configs/pipeline") does not truncate the path
+    base_name = os.path.splitext(yaml_file)[0]
+    self.optimized_config_path = f"{base_name}_opt.yaml"
+
+    # Update sample size map on a private copy so that config overrides do not
+    # leak into the module-level SAMPLE_SIZE_MAP shared by other instances
+    self.sample_size_map = copy.deepcopy(SAMPLE_SIZE_MAP)
+    if self.config.get("optimizer_config", {}).get("sample_sizes", {}):
+        self.sample_size_map.update(self.config["optimizer_config"]["sample_sizes"])
+
+    self.status = None
+    self.step_op_to_optimized_ops = {}
+
+    self.print_optimizer_config()
+
+
+
+ +
+ +
+ + +

+ compute_sample_size(step_name, step_ops, op_config) + +

+ + +
+ +

Compute the sample size necessary for optimizing given operation based on upstream operations.

+

This method calculates an appropriate sample size for an operation, taking into +account the selectivities of upstream operations in the same step. It uses a +predefined sample size map (SAMPLE_SIZE_MAP) as a starting point.

+

For example, if we have a 'map' operation with a default sample size of 10, +and one upstream operation with a selectivity of 0.5, the computed sample size for the upstream operation would be: +10 / 0.5 = 20

+

This ensures that after applying the selectivity of the upstream operation, +we still have a representative sample size for the current operation.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
step_name + str + +
+

The name of the current step in the pipeline.

+
+
+ required +
step_ops + List[str] + +
+

A list of all operations in the current step.

+
+
+ required +
op_config + Dict[str, Any] + +
+

The configuration dictionary for the current operation.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
int + int + +
+

The computed sample size for the operation.

+
+
+

The method works as follows: +1. If there are no upstream operations, it returns the default sample size for the operation type. +2. Otherwise, it starts with the default sample size and adjusts it based on the selectivities + of upstream operations. +3. It iterates through upstream operations in reverse order, dividing the sample size by + each operation's selectivity. +4. The final result is rounded up to the next integer (math.ceil), so a fractional sample size is never truncated.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If the selectivity for any upstream operation is not found.

+
+
+ + +
+ Note +
    +
  • The method assumes that selectivities for all upstream operations have been + previously computed and stored in self.selectivities.
  • +
  • The sample size is always at least 1, even after all adjustments.
  • +
+
+
+ Source code in docetl/builder.py +
206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
def compute_sample_size(
+    self,
+    step_name: str,
+    step_ops: List[str],
+    op_config: Dict[str, Any],
+) -> int:
+    """
+    Compute the sample size necessary for optimizing given operation based on upstream operations.
+
+    This method calculates an appropriate sample size for an operation, taking into
+    account the selectivities of upstream operations in the same step. It uses a
+    predefined sample size map (SAMPLE_SIZE_MAP) as a starting point.
+
+    For example, if we have a 'map' operation with a default sample size of 10,
+    and one upstream operation with a selectivity of 0.5, the computed sample size for the upstream operation would be:
+    10 / 0.5 = 20
+
+    This ensures that after applying the selectivity of the upstream operation,
+    we still have a representative sample size for the current operation.
+
+    Args:
+        step_name (str): The name of the current step in the pipeline.
+        step_ops (List[str]): A list of all operations in the current step.
+        op_config (Dict[str, Any]): The configuration dictionary for the current operation.
+
+    Returns:
+        int: The computed sample size for the operation.
+
+    The method works as follows:
+    1. If there are no upstream operations, it returns the default sample size for the operation type.
+    2. Otherwise, it starts with the default sample size and adjusts it based on the selectivities
+       of upstream operations.
+    3. It iterates through upstream operations in reverse order, dividing the sample size by
+       each operation's selectivity.
+    4. The final result is rounded to the nearest integer.
+
+    Raises:
+        ValueError: If the selectivity for any upstream operation is not found.
+
+    Note:
+        - The method assumes that selectivities for all upstream operations have been
+          previously computed and stored in self.selectivities.
+        - The sample size is always at least 1, even after all adjustments.
+    """
+    # If an equijoin, load the default. Equijoins are always first
+    if op_config.get("type") == "equijoin":
+        return SAMPLE_SIZE_MAP.get(op_config.get("type"))
+
+    # If there are no upstream operations, use the default sample_size
+    upstream_ops = []
+    for step_op in step_ops:
+        if step_op != op_config.get("name"):
+            if step_op in self.step_op_to_optimized_ops:
+                upstream_ops.extend(self.step_op_to_optimized_ops[step_op])
+            else:
+                upstream_ops.append(step_op)
+        else:
+            break
+
+    if len(upstream_ops) == 0:
+        return self.sample_size_map.get(op_config.get("type"), float("inf"))
+
+    # Otherwise, compute the sample size based on the upstream operations
+    sample_size = self.sample_size_map.get(op_config.get("type"), 100)
+    for op in reversed(upstream_ops):
+        # Use the selectivity of the upstream operation to compute the sample size
+        if op not in self.selectivities[step_name]:
+            raise ValueError(
+                f"Selectivity for operation {op} not found in selectivities. Other ops are {self.selectivities[step_name]}"
+            )
+
+        sample_size = sample_size / self.selectivities[step_name].get(op)
+
+    return int(math.ceil(sample_size))
+
+
+
+ +
+ +
+ + +

+ optimize() + +

+ + +
+ +

Optimize the entire pipeline defined in the configuration.

+

This method is the main entry point for the optimization process. It iterates through +each step in the pipeline, optimizing from upstream to downstream, and constructs an +optimized version of the configuration.

+

The optimization process includes: +1. Iterating through each step in the pipeline, from upstream to downstream. +2. Optimizing each step using the _optimize_step method. +3. Updating the optimized configuration with the new operations and steps. +4. Saving the optimized configuration to a file. +5. Logging the total costs (agent cost, operations cost, and total cost).

+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ +
+

None

+
+
+

Side effects: +- Modifies self.optimized_config with the optimized pipeline and operations. +- Updates self.datasets with the results of each step. +- Calls _save_optimized_config to save the optimized configuration to a file. +- Logs cost information to the console.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If a step in the pipeline does not have a name.

+
+
+

Note: +- This method assumes that all necessary data and configurations are already + loaded and initialized in the Optimizer instance. +- The optimization process is performed step by step, from upstream to downstream, + with each step potentially depending on the results of previous steps.

+ +
+ Source code in docetl/builder.py +
463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
def optimize(self):
+    """
+    Optimize the entire pipeline defined in the configuration.
+
+    This method is the main entry point for the optimization process. It iterates through
+    each step in the pipeline, optimizing from upstream to downstream, and constructs an
+    optimized version of the configuration.
+
+    The optimization process includes:
+    1. Iterating through each step in the pipeline, from upstream to downstream.
+    2. Optimizing each step using the _optimize_step method.
+    3. Updating the optimized configuration with the new operations and steps.
+    4. Saving the optimized configuration to a file.
+    5. Logging the total costs (agent cost, operations cost, and total cost).
+
+    Returns:
+        None
+
+    Side effects:
+    - Modifies self.optimized_config with the optimized pipeline and operations.
+    - Updates self.datasets with the results of each step.
+    - Calls _save_optimized_config to save the optimized configuration to a file.
+    - Logs cost information to the console.
+
+    Raises:
+        ValueError: If a step in the pipeline does not have a name.
+
+    Note:
+    - This method assumes that all necessary data and configurations are already
+      loaded and initialized in the Optimizer instance.
+    - The optimization process is performed step by step, from upstream to downstream,
+      with each step potentially depending on the results of previous steps.
+    """
+    self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+    self.syntax_check()
+
+    self._insert_empty_resolve_operations()
+
+    # If resume is True, load the optimized operations from disk
+    if self.resume:
+        self._load_optimized_ops()
+
+    for step in self.config["pipeline"]["steps"]:
+        step_name = step.get("name")
+        if not step_name:
+            raise ValueError(
+                "Step does not have a name. Each step must have a unique name."
+            )
+
+        optimized_step, step_operations, input_data = self._optimize_step(step)
+
+        self.optimized_config["operations"].update(step_operations)
+        for i, op in enumerate(self.optimized_config["operations"]):
+            if op["name"] in step_operations:
+                self.optimized_config["operations"][i] = step_operations[op["name"]]
+
+        self.optimized_config["pipeline"]["steps"] = [
+            step
+            for step in self.optimized_config["pipeline"]["steps"]
+            if step["name"] != step_name
+        ] + [optimized_step]
+
+        self.step_op_to_optimized_ops[step_name] = optimized_step["operations"]
+
+        step_hash = (
+            hashlib.md5(
+                json.dumps(
+                    {
+                        "step": [
+                            s
+                            for s in self.optimized_config["pipeline"]["steps"]
+                            if s["name"] == step_name
+                        ][0],
+                        "operations": [
+                            self.find_operation(op)
+                            for op in optimized_step["operations"]
+                        ],
+                    }
+                ).encode()
+            ).hexdigest()
+            + ".json"
+        )
+        # If the dataset already exists, skip the step
+        if step_hash in self.datasets:
+            continue
+
+        flush_cache(self.console)
+
+        if step_name in self.config.get("optimizer_config", {}).get(
+            "run_full_step", []
+        ):
+            # Run the entire step
+            input_data = self._run_partial_step(
+                step,
+                step_operations,
+                float("inf"),  # TODO: FIX THIS
+            )
+            self.datasets[step_hash] = copy.deepcopy(input_data)
+        else:
+            self.datasets[step_hash] = copy.deepcopy(input_data)
+
+    self._save_optimized_config()
+
+    self.console.log(
+        f"[bold]Total agent cost: ${self.llm_client.total_cost:.2f}[/bold]"
+    )
+    self.console.log(
+        f"[bold]Total operations cost: ${self.operations_cost:.2f}[/bold]"
+    )
+    self.console.log(
+        f"[bold]Total cost: ${self.llm_client.total_cost + self.operations_cost:.2f}[/bold]"
+    )
+
+
+
+ +
+ +
+ + +

+ print_optimizer_config() + +

+ + +
+ +

Print the current configuration of the optimizer.

+

This method uses the Rich console to display a formatted output of the optimizer's +configuration. It includes details such as the YAML file path, sample sizes for +different operation types, maximum number of threads, the language model being used, +and the timeout setting.

+

The output is color-coded and formatted for easy readability, with a header and +separator lines to clearly delineate the configuration information.

+ +
+ Source code in docetl/builder.py +
187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
def print_optimizer_config(self):
+    """
+    Print the current configuration of the optimizer.
+
+    This method uses the Rich console to display a formatted output of the optimizer's
+    configuration. It includes details such as the YAML file path, sample sizes for
+    different operation types, maximum number of threads, the language model being used,
+    and the timeout setting.
+
+    The output is color-coded and formatted for easy readability, with a header and
+    separator lines to clearly delineate the configuration information.
+    """
+    self.console.rule("[bold cyan]Optimizer Configuration[/bold cyan]")
+    self.console.log(f"[yellow]YAML File:[/yellow] {self.yaml_file_path}")
+    self.console.log(f"[yellow]Sample Size:[/yellow] {self.sample_size_map}")
+    self.console.log(f"[yellow]Max Threads:[/yellow] {self.max_threads}")
+    self.console.log(f"[yellow]Model:[/yellow] {self.llm_client.model}")
+    self.console.log(f"[yellow]Timeout:[/yellow] {self.timeout} seconds")
+
+
+
+ +
+ +
+ + +

+ resolve_anchors(data) + + + staticmethod + + +

+ + +
+ +

Recursively resolve all anchors and aliases in a nested data structure.

+

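This static method recursively traverses dictionaries and lists, returning freshly built copies of each container (so nodes shared via YAML anchors and aliases are no longer shared); any other value is returned unchanged.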

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
data + +
+

The data structure to resolve. Can be a dictionary, list, or any other type.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ +
+

The resolved data structure with all anchors and aliases replaced by their actual values.

+
+
+ +
+ Source code in docetl/builder.py +
1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
@staticmethod
+def resolve_anchors(data):
+    """
+    Recursively resolve all anchors and aliases in a nested data structure.
+
+    This static method traverses through dictionaries and lists, resolving any YAML anchors and aliases.
+
+    Args:
+        data: The data structure to resolve. Can be a dictionary, list, or any other type.
+
+    Returns:
+        The resolved data structure with all anchors and aliases replaced by their actual values.
+    """
+    if isinstance(data, dict):
+        return {k: Optimizer.resolve_anchors(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [Optimizer.resolve_anchors(item) for item in data]
+    else:
+        return data
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform a syntax check on all operations defined in the configuration.

+

This method validates each operation by attempting to instantiate it. +If any operation fails to instantiate, a ValueError is raised.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If any operation fails the syntax check.

+
+
+ +
+ Source code in docetl/builder.py +
158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
def syntax_check(self):
+    """
+    Perform a syntax check on all operations defined in the configuration.
+
+    This method validates each operation by attempting to instantiate it.
+    If any operation fails to instantiate, a ValueError is raised.
+
+    Raises:
+        ValueError: If any operation fails the syntax check.
+    """
+    for operation_config in self.config["operations"]:
+        operation = operation_config["name"]
+        operation_type = operation_config["type"]
+
+        try:
+            operation_class = get_operation(operation_type)
+            operation_class(
+                operation_config,
+                self.config.get("default_model", "gpt-4o-mini"),
+                self.max_threads,
+                self.console,
+            )
+        except Exception as e:
+            raise ValueError(
+                f"Syntax check failed for operation '{operation}': {str(e)}"
+            )
+
+    self.console.log("[green]Syntax check passed for all operations.[/green]")
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/operations/index.html b/api-reference/operations/index.html new file mode 100644 index 00000000..3de27e04 --- /dev/null +++ b/api-reference/operations/index.html @@ -0,0 +1,12709 @@ + + + + + + + + + + + + + + + + + + + + + + + + + docetl.operations - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

LLM-Powered Operators

+ + +
+ + + +

+ docetl.operations.map.MapOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +
+ Source code in docetl/operations/map.py +
 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
class MapOperation(BaseOperation):
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the MapOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys ('prompt' or 'output') are missing in the configuration.
+            ValueError: If 'schema' is missing in the 'output' configuration.
+            ValueError: If 'schema' in the 'output' configuration is empty.
+            ValueError: If the 'prompt' is not a valid Jinja2 template.
+            TypeError: If 'schema' in the 'output' configuration is not a dictionary.
+            TypeError: If 'model' is present in the configuration but is not a string.
+            ValueError: If any gleaning-related configuration is invalid (raised by self.gleaning_check()).
+        """
+        required_keys = ["prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in MapOperation configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        # Check if the prompt is a valid Jinja2 template
+        try:
+            Template(self.config["prompt"])
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template in 'prompt': {str(e)}")
+
+        # Check if the model is specified (optional)
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError("'model' in configuration must be a string")
+
+        # Check if tools are specified and validate their structure
+        if "tools" in self.config:
+            if not isinstance(self.config["tools"], list):
+                raise TypeError("'tools' in configuration must be a list")
+
+            for i, tool in enumerate(self.config["tools"]):
+                if not isinstance(tool, dict):
+                    raise TypeError(f"Tool {i} in 'tools' must be a dictionary")
+
+                if "code" not in tool or "function" not in tool:
+                    raise ValueError(
+                        f"Tool {i} is missing required 'code' or 'function' key"
+                    )
+
+                function = tool.get("function", {})
+                if not isinstance(function, dict):
+                    raise TypeError(f"'function' in tool {i} must be a dictionary")
+
+                required_function_keys = ["name", "description", "parameters"]
+                for key in required_function_keys:
+                    if key not in function:
+                        raise ValueError(
+                            f"Tool {i} is missing required '{key}' in 'function'"
+                        )
+
+        self.gleaning_check()
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the map operation on the provided input data.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Processes each input item using the specified prompt and LLM model
+        2. Applies gleaning if configured
+        3. Validates the output
+        4. Aggregates results and calculates total cost
+
+        The method uses parallel processing to improve performance.
+        """
+
+        def _process_map_item(item: Dict) -> Tuple[Optional[Dict], float]:
+            prompt_template = Template(self.config["prompt"])
+            prompt = prompt_template.render(input=item)
+
+            def validation_fn(response: Dict[str, Any]):
+                output = parse_llm_response(
+                    response, tools=self.config.get("tools", None)
+                )[0]
+                for key, value in item.items():
+                    if key not in self.config["output"]["schema"]:
+                        output[key] = value
+                if validate_output(self.config, output, self.console):
+                    return output, True
+                return output, False
+
+            if "gleaning" in self.config:
+                output, cost, success = call_llm_with_validation(
+                    [{"role": "user", "content": prompt}],
+                    llm_call_fn=lambda messages: call_llm_with_gleaning(
+                        self.config.get("model", self.default_model),
+                        "map",
+                        messages,
+                        self.config["output"]["schema"],
+                        self.config["gleaning"]["validation_prompt"],
+                        self.config["gleaning"]["num_rounds"],
+                        self.console,
+                    ),
+                    validation_fn=validation_fn,
+                    val_rule=self.config.get("validate", []),
+                    num_retries=self.num_retries_on_validate_failure,
+                    console=self.console,
+                )
+            else:
+                output, cost, success = call_llm_with_validation(
+                    [{"role": "user", "content": prompt}],
+                    llm_call_fn=lambda messages: call_llm(
+                        self.config.get("model", self.default_model),
+                        "map",
+                        messages,
+                        self.config["output"]["schema"],
+                        tools=self.config.get("tools", None),
+                        console=self.console,
+                    ),
+                    validation_fn=validation_fn,
+                    val_rule=self.config.get("validate", []),
+                    num_retries=self.num_retries_on_validate_failure,
+                    console=self.console,
+                )
+
+            if success:
+                return output, cost
+
+            return None, cost
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [executor.submit(_process_map_item, item) for item in input_data]
+            results = []
+            total_cost = 0
+            pbar = RichLoopBar(
+                range(len(futures)),
+                desc="Processing map items",
+                console=self.console,
+            )
+            for i in pbar:
+                result, item_cost = futures[i].result()
+                if result is not None:
+                    results.append(result)
+                total_cost += item_cost
+                pbar.update(i)
+
+        return results, total_cost
+
+    def validate_output(self, output: Dict) -> bool:
+        """
+        Validates the output of a single map operation against the specified schema.
+
+        Args:
+            output (Dict): The output to validate.
+
+        Returns:
+            bool: True if the output is valid, False otherwise.
+        """
+        schema = self.config["output"]["schema"]
+        for key in schema:
+            if key not in output:
+                self.console.log(f"[red]Error: Missing key '{key}' in output[/red]")
+                return False
+        return True
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the map operation on the provided input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

A tuple containing the processed results and the total cost of the operation.

+
+
+

This method performs the following steps: +1. Processes each input item using the specified prompt and LLM model +2. Applies gleaning if configured +3. Validates the output +4. Aggregates results and calculates total cost

+

The method uses parallel processing to improve performance.

+ +
+ Source code in docetl/operations/map.py +
 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Executes the map operation on the provided input data.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+
+    This method performs the following steps:
+    1. Processes each input item using the specified prompt and LLM model
+    2. Applies gleaning if configured
+    3. Validates the output
+    4. Aggregates results and calculates total cost
+
+    The method uses parallel processing to improve performance.
+    """
+
+    def _process_map_item(item: Dict) -> Tuple[Optional[Dict], float]:
+        prompt_template = Template(self.config["prompt"])
+        prompt = prompt_template.render(input=item)
+
+        def validation_fn(response: Dict[str, Any]):
+            output = parse_llm_response(
+                response, tools=self.config.get("tools", None)
+            )[0]
+            for key, value in item.items():
+                if key not in self.config["output"]["schema"]:
+                    output[key] = value
+            if validate_output(self.config, output, self.console):
+                return output, True
+            return output, False
+
+        if "gleaning" in self.config:
+            output, cost, success = call_llm_with_validation(
+                [{"role": "user", "content": prompt}],
+                llm_call_fn=lambda messages: call_llm_with_gleaning(
+                    self.config.get("model", self.default_model),
+                    "map",
+                    messages,
+                    self.config["output"]["schema"],
+                    self.config["gleaning"]["validation_prompt"],
+                    self.config["gleaning"]["num_rounds"],
+                    self.console,
+                ),
+                validation_fn=validation_fn,
+                val_rule=self.config.get("validate", []),
+                num_retries=self.num_retries_on_validate_failure,
+                console=self.console,
+            )
+        else:
+            output, cost, success = call_llm_with_validation(
+                [{"role": "user", "content": prompt}],
+                llm_call_fn=lambda messages: call_llm(
+                    self.config.get("model", self.default_model),
+                    "map",
+                    messages,
+                    self.config["output"]["schema"],
+                    tools=self.config.get("tools", None),
+                    console=self.console,
+                ),
+                validation_fn=validation_fn,
+                val_rule=self.config.get("validate", []),
+                num_retries=self.num_retries_on_validate_failure,
+                console=self.console,
+            )
+
+        if success:
+            return output, cost
+
+        return None, cost
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        futures = [executor.submit(_process_map_item, item) for item in input_data]
+        results = []
+        total_cost = 0
+        pbar = RichLoopBar(
+            range(len(futures)),
+            desc="Processing map items",
+            console=self.console,
+        )
+        for i in pbar:
+            result, item_cost = futures[i].result()
+            if result is not None:
+                results.append(result)
+            total_cost += item_cost
+            pbar.update(i)
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the MapOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys ('prompt' or 'output') are missing in the configuration.

+
+
+ ValueError + +
+

If 'schema' is missing in the 'output' configuration.

+
+
+ ValueError + +
+

If 'schema' in the 'output' configuration is empty.

+
+
+ ValueError + +
+

If the 'prompt' is not a valid Jinja2 template.

+
+
+ TypeError + +
+

If 'schema' in the 'output' configuration is not a dictionary.

+
+
+ TypeError + +
+

If 'model' is present in the configuration but is not a string.

+
+
+ ValueError + +
+

If any gleaning-related configuration is invalid (raised by self.gleaning_check()).

+
+
+ +
+ Source code in docetl/operations/map.py +
23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the MapOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys ('prompt' or 'output') are missing in the configuration.
+        ValueError: If 'schema' is missing in the 'output' configuration.
+        ValueError: If 'schema' in the 'output' configuration is empty.
+        ValueError: If the 'prompt' is not a valid Jinja2 template.
+        TypeError: If 'schema' in the 'output' configuration is not a dictionary.
+        TypeError: If 'model' is present in the configuration but is not a string.
+        ValueError: If any gleaning-related configuration is invalid (raised by self.gleaning_check()).
+    """
+    required_keys = ["prompt", "output"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in MapOperation configuration"
+            )
+
+    if "schema" not in self.config["output"]:
+        raise ValueError("Missing 'schema' in 'output' configuration")
+
+    if not isinstance(self.config["output"]["schema"], dict):
+        raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+    if not self.config["output"]["schema"]:
+        raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+    # Check if the prompt is a valid Jinja2 template
+    try:
+        Template(self.config["prompt"])
+    except Exception as e:
+        raise ValueError(f"Invalid Jinja2 template in 'prompt': {str(e)}")
+
+    # Check if the model is specified (optional)
+    if "model" in self.config and not isinstance(self.config["model"], str):
+        raise TypeError("'model' in configuration must be a string")
+
+    # Check if tools are specified and validate their structure
+    if "tools" in self.config:
+        if not isinstance(self.config["tools"], list):
+            raise TypeError("'tools' in configuration must be a list")
+
+        for i, tool in enumerate(self.config["tools"]):
+            if not isinstance(tool, dict):
+                raise TypeError(f"Tool {i} in 'tools' must be a dictionary")
+
+            if "code" not in tool or "function" not in tool:
+                raise ValueError(
+                    f"Tool {i} is missing required 'code' or 'function' key"
+                )
+
+            function = tool.get("function", {})
+            if not isinstance(function, dict):
+                raise TypeError(f"'function' in tool {i} must be a dictionary")
+
+            required_function_keys = ["name", "description", "parameters"]
+            for key in required_function_keys:
+                if key not in function:
+                    raise ValueError(
+                        f"Tool {i} is missing required '{key}' in 'function'"
+                    )
+
+    self.gleaning_check()
+
+
+
+ +
+ +
+ + +

+ validate_output(output) + +

+ + +
+ +

Validates the output of a single map operation against the specified schema.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
output + Dict + +
+

The output to validate.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
bool + bool + +
+

True if the output is valid, False otherwise.

+
+
+ +
+ Source code in docetl/operations/map.py +
180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
def validate_output(self, output: Dict) -> bool:
+    """
+    Validates the output of a single map operation against the specified schema.
+
+    Args:
+        output (Dict): The output to validate.
+
+    Returns:
+        bool: True if the output is valid, False otherwise.
+    """
+    schema = self.config["output"]["schema"]
+    for key in schema:
+        if key not in output:
+            self.console.log(f"[red]Error: Missing key '{key}' in output[/red]")
+            return False
+    return True
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.resolve.ResolveOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +
+ Source code in docetl/operations/resolve.py +
 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
class ResolveOperation(BaseOperation):
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the ResolveOperation for required keys and valid structure.
+
+        This method performs the following checks:
+        1. Verifies the presence of required keys: 'comparison_prompt' and 'output'.
+        2. Ensures 'output' contains a 'schema' key.
+        3. Validates that 'schema' in 'output' is a non-empty dictionary.
+        4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables.
+        5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable.
+        6. Optionally checks if 'model' is a string (if present).
+        7. Optionally checks 'blocking_keys' (if present, further checks are performed).
+
+        Raises:
+            ValueError: If required keys are missing, if templates are invalid or missing required variables,
+                        or if any other configuration aspect is incorrect or inconsistent.
+            TypeError: If the types of configuration values are incorrect, such as 'schema' not being a dict
+                       or 'model' not being a string.
+        """
+        required_keys = ["comparison_prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in ResolveOperation configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        # Check if the comparison_prompt is a valid Jinja2 template
+        try:
+            comparison_template = Template(self.config["comparison_prompt"])
+            comparison_vars = comparison_template.environment.parse(
+                self.config["comparison_prompt"]
+            ).find_all(jinja2.nodes.Name)
+            comparison_var_names = {var.name for var in comparison_vars}
+            if (
+                "input1" not in comparison_var_names
+                or "input2" not in comparison_var_names
+            ):
+                raise ValueError(
+                    "'comparison_prompt' must contain both 'input1' and 'input2' variables"
+                )
+
+            if "resolution_prompt" in self.config:
+                reduction_template = Template(self.config["resolution_prompt"])
+                reduction_vars = reduction_template.environment.parse(
+                    self.config["resolution_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                reduction_var_names = {var.name for var in reduction_vars}
+                if "inputs" not in reduction_var_names:
+                    raise ValueError(
+                        "'resolution_prompt' must contain 'inputs' variable"
+                    )
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template: {str(e)}")
+
+        # Check if the model is specified (optional)
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError("'model' in configuration must be a string")
+
+        # Check blocking_keys (optional)
+        if "blocking_keys" in self.config:
+            if not isinstance(self.config["blocking_keys"], list):
+                raise TypeError("'blocking_keys' must be a list")
+            if not all(isinstance(key, str) for key in self.config["blocking_keys"]):
+                raise TypeError("All items in 'blocking_keys' must be strings")
+
+        # Check blocking_threshold (optional)
+        if "blocking_threshold" in self.config:
+            if not isinstance(self.config["blocking_threshold"], (int, float)):
+                raise TypeError("'blocking_threshold' must be a number")
+            if not 0 <= self.config["blocking_threshold"] <= 1:
+                raise ValueError("'blocking_threshold' must be between 0 and 1")
+
+        # Check blocking_conditions (optional)
+        if "blocking_conditions" in self.config:
+            if not isinstance(self.config["blocking_conditions"], list):
+                raise TypeError("'blocking_conditions' must be a list")
+            if not all(
+                isinstance(cond, str) for cond in self.config["blocking_conditions"]
+            ):
+                raise TypeError("All items in 'blocking_conditions' must be strings")
+
+        # Check if input schema is provided and valid (optional)
+        if "input" in self.config:
+            if "schema" not in self.config["input"]:
+                raise ValueError("Missing 'schema' in 'input' configuration")
+            if not isinstance(self.config["input"]["schema"], dict):
+                raise TypeError(
+                    "'schema' in 'input' configuration must be a dictionary"
+                )
+
+        # Check limit_comparisons (optional)
+        if "limit_comparisons" in self.config:
+            if not isinstance(self.config["limit_comparisons"], int):
+                raise TypeError("'limit_comparisons' must be an integer")
+            if self.config["limit_comparisons"] <= 0:
+                raise ValueError("'limit_comparisons' must be a positive integer")
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the resolve operation on the provided dataset.
+
+        Args:
+            input_data (List[Dict]): The dataset to resolve.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Initial blocking based on specified conditions and/or embedding similarity
+        2. Pairwise comparison of potentially matching entries using LLM
+        3. Clustering of matched entries
+        4. Resolution of each cluster into a single entry (if applicable)
+        5. Result aggregation and validation
+
+        The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.
+        """
+        if len(input_data) == 0:
+            return [], 0
+
+        blocking_keys = self.config.get("blocking_keys", [])
+        blocking_threshold = self.config.get("blocking_threshold")
+        blocking_conditions = self.config.get("blocking_conditions", [])
+        input_schema = self.config.get("input", {}).get("schema", {})
+        if not blocking_keys:
+            # Set them to all keys in the input data
+            blocking_keys = list(input_data[0].keys())
+        limit_comparisons = self.config.get("limit_comparisons")
+        total_cost = 0
+
+        def is_match(item1: Dict[str, Any], item2: Dict[str, Any]) -> bool:
+            return any(
+                eval(condition, {"input1": item1, "input2": item2})
+                for condition in blocking_conditions
+            )
+
+        # Calculate embeddings if blocking_threshold is set
+        embeddings = None
+        if blocking_threshold is not None:
+            embedding_model = self.config.get("embedding_model", self.default_model)
+
+            def get_embeddings_batch(
+                items: List[Dict[str, Any]]
+            ) -> List[Tuple[List[float], float]]:
+                texts = [
+                    " ".join(str(item[key]) for key in blocking_keys if key in item)
+                    for item in items
+                ]
+                response = gen_embedding(model=embedding_model, input=texts)
+                return [
+                    (data["embedding"], completion_cost(response))
+                    for data in response["data"]
+                ]
+
+            embeddings = []
+            costs = []
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                for i in range(
+                    0, len(input_data), self.config.get("embedding_batch_size", 1000)
+                ):
+                    batch = input_data[
+                        i : i + self.config.get("embedding_batch_size", 1000)
+                    ]
+                    batch_results = list(executor.map(get_embeddings_batch, [batch]))
+
+                    for result in batch_results:
+                        embeddings.extend([r[0] for r in result])
+                        costs.extend([r[1] for r in result])
+
+                total_cost += sum(costs)
+
+        # Initialize clusters
+        clusters = [{i} for i in range(len(input_data))]
+        cluster_map = {i: i for i in range(len(input_data))}
+
+        def find_cluster(item):
+            while item != cluster_map[item]:
+                cluster_map[item] = cluster_map[cluster_map[item]]
+                item = cluster_map[item]
+            return item
+
+        def merge_clusters(item1, item2):
+            root1, root2 = find_cluster(item1), find_cluster(item2)
+            if root1 != root2:
+                if len(clusters[root1]) < len(clusters[root2]):
+                    root1, root2 = root2, root1
+                clusters[root1] |= clusters[root2]
+                cluster_map[root2] = root1
+                clusters[root2] = set()
+
+        # Generate all pairs to compare
+        # TODO: virtualize this if possible
+        all_pairs = [
+            (i, j)
+            for i in range(len(input_data))
+            for j in range(i + 1, len(input_data))
+        ]
+
+        # Filter pairs based on blocking conditions
+        def meets_blocking_conditions(pair):
+            i, j = pair
+            return (
+                is_match(input_data[i], input_data[j]) if blocking_conditions else False
+            )
+
+        blocked_pairs = list(filter(meets_blocking_conditions, all_pairs))
+
+        # Apply limit_comparisons to blocked pairs
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            self.console.log(
+                f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+            )
+            blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+        # If there are remaining comparisons, fill with highest cosine similarities
+        remaining_comparisons = (
+            limit_comparisons - len(blocked_pairs)
+            if limit_comparisons is not None
+            else float("inf")
+        )
+        if remaining_comparisons > 0 and blocking_threshold is not None:
+            # Compute cosine similarity for all pairs at once
+            all_embeddings = np.array([embeddings[i] for i in range(len(input_data))])
+            similarity_matrix = cosine_similarity(all_embeddings)
+
+            cosine_pairs = []
+            for i, j in all_pairs:
+                if (i, j) not in blocked_pairs and find_cluster(i) != find_cluster(j):
+                    similarity = similarity_matrix[i, j]
+                    if similarity >= blocking_threshold:
+                        cosine_pairs.append((i, j, similarity))
+
+            if remaining_comparisons != float("inf"):
+                cosine_pairs.sort(key=lambda x: x[2], reverse=True)
+                additional_pairs = [
+                    (i, j) for i, j, _ in cosine_pairs[: int(remaining_comparisons)]
+                ]
+                blocked_pairs.extend(additional_pairs)
+            else:
+                blocked_pairs.extend((i, j) for i, j, _ in cosine_pairs)
+
+        filtered_pairs = blocked_pairs
+
+        # Calculate and print statistics
+        total_possible_comparisons = len(input_data) * (len(input_data) - 1) // 2
+        comparisons_made = len(filtered_pairs)
+        comparisons_saved = total_possible_comparisons - comparisons_made
+        self.console.log(
+            f"[green]Comparisons saved by blocking: {comparisons_saved} "
+            f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+        )
+
+        # Compare pairs and update clusters in real-time
+        batch_size = self.config.get("compare_batch_size", 100)
+        pair_costs = 0
+
+        pbar = RichLoopBar(
+            range(0, len(filtered_pairs), batch_size),
+            desc=f"Processing batches of {batch_size} LLM comparisons",
+            console=self.console,
+        )
+        for i in pbar:
+            batch = filtered_pairs[i : i + batch_size]
+
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                future_to_pair = {
+                    executor.submit(
+                        compare_pair,
+                        self.config["comparison_prompt"],
+                        self.config.get("comparison_model", self.default_model),
+                        input_data[pair[0]],
+                        input_data[pair[1]],
+                        blocking_keys,
+                    ): pair
+                    for pair in batch
+                }
+
+                for future in as_completed(future_to_pair):
+                    pair = future_to_pair[future]
+                    is_match_result, cost = future.result()
+                    pair_costs += cost
+                    if is_match_result:
+                        merge_clusters(pair[0], pair[1])
+
+                    pbar.update(i)
+
+        total_cost += pair_costs
+
+        # Collect final clusters
+        final_clusters = [cluster for cluster in clusters if cluster]
+
+        # Process each cluster
+        results = []
+
+        def process_cluster(cluster):
+            if len(cluster) > 1:
+                cluster_items = [input_data[i] for i in cluster]
+                reduction_template = Template(self.config["resolution_prompt"])
+                if input_schema:
+                    cluster_items = [
+                        {k: item[k] for k in input_schema.keys() if k in item}
+                        for item in cluster_items
+                    ]
+
+                resolution_prompt = reduction_template.render(inputs=cluster_items)
+                reduction_response = call_llm(
+                    self.config.get("resolution_model", self.default_model),
+                    "reduce",
+                    [{"role": "user", "content": resolution_prompt}],
+                    self.config["output"]["schema"],
+                    console=self.console,
+                )
+                reduction_output = parse_llm_response(reduction_response)[0]
+                reduction_cost = completion_cost(reduction_response)
+
+                if validate_output(self.config, reduction_output, self.console):
+                    return (
+                        [
+                            {
+                                **item,
+                                **{
+                                    k: reduction_output[k]
+                                    for k in self.config["output"]["schema"]
+                                },
+                            }
+                            for item in [input_data[i] for i in cluster]
+                        ],
+                        reduction_cost,
+                    )
+                return [], reduction_cost
+            else:
+                return [input_data[list(cluster)[0]]], 0
+
+        # Calculate the number of records before and clusters after
+        num_records_before = len(input_data)
+        num_clusters_after = len(final_clusters)
+        self.console.log(f"Number of documents before: {num_records_before}")
+        self.console.log(f"Number of distinct documents after: {num_clusters_after}")
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(process_cluster, cluster) for cluster in final_clusters
+            ]
+            for future in rich_as_completed(
+                futures,
+                total=len(futures),
+                desc="Determining resolved key for each group of equivalent keys",
+                console=self.console,
+            ):
+                cluster_results, cluster_cost = future.result()
+                results.extend(cluster_results)
+                total_cost += cluster_cost
+
+        total_pairs = len(input_data) * (len(input_data) - 1) // 2
+        true_match_count = sum(
+            len(cluster) * (len(cluster) - 1) // 2
+            for cluster in final_clusters
+            if len(cluster) > 1
+        )
+        true_match_selectivity = (
+            true_match_count / total_pairs if total_pairs > 0 else 0
+        )
+        self.console.log(f"Self-join selectivity: {true_match_selectivity:.4f}")
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the resolve operation on the provided dataset.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

The dataset to resolve.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.

+
+
+

This method performs the following steps: +1. Initial blocking based on specified conditions and/or embedding similarity +2. Pairwise comparison of potentially matching entries using LLM +3. Clustering of matched entries +4. Resolution of each cluster into a single entry (if applicable) +5. Result aggregation and validation

+

The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.

+ +
+ Source code in docetl/operations/resolve.py +
173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
    """
    Executes the resolve operation on the provided dataset.

    Args:
        input_data (List[Dict]): The dataset to resolve.

    Returns:
        Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.

    This method performs the following steps:
    1. Initial blocking based on specified conditions and/or embedding similarity
    2. Pairwise comparison of potentially matching entries using LLM
    3. Clustering of matched entries (union-find over record indices)
    4. Resolution of each multi-record cluster via the resolution prompt
    5. Result aggregation and validation

    The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.

    Notes:
        - Embedding cost is accumulated once per embedding request (batch), not once per record.
        - A single-record input is valid: there are no pairs to compare, and the
          percentage statistics are reported as 0 instead of dividing by zero.
        - Clusters whose resolved output fails validation contribute no records
          to the result (their cost is still counted).
    """
    from itertools import combinations

    if len(input_data) == 0:
        return [], 0

    num_records = len(input_data)

    blocking_keys = self.config.get("blocking_keys", [])
    blocking_threshold = self.config.get("blocking_threshold")
    blocking_conditions = self.config.get("blocking_conditions", [])
    input_schema = self.config.get("input", {}).get("schema", {})
    if not blocking_keys:
        # Set them to all keys in the input data
        blocking_keys = list(input_data[0].keys())
    limit_comparisons = self.config.get("limit_comparisons")
    total_cost = 0

    def is_match(item1: Dict[str, Any], item2: Dict[str, Any]) -> bool:
        # NOTE(security): blocking conditions are evaluated with eval(); they must
        # come from a trusted pipeline configuration, never from untrusted input.
        return any(
            eval(condition, {"input1": item1, "input2": item2})
            for condition in blocking_conditions
        )

    # Calculate embeddings if blocking_threshold is set
    embeddings = None
    if blocking_threshold is not None:
        embedding_model = self.config.get("embedding_model", self.default_model)
        embedding_batch_size = self.config.get("embedding_batch_size", 1000)

        def get_embeddings_batch(
            items: List[Dict[str, Any]]
        ) -> Tuple[List[List[float]], float]:
            # One embedding request per batch; the request cost is returned once
            # for the whole batch so it is not multiplied by the batch size.
            texts = [
                " ".join(str(item[key]) for key in blocking_keys if key in item)
                for item in items
            ]
            response = gen_embedding(model=embedding_model, input=texts)
            return (
                [data["embedding"] for data in response["data"]],
                completion_cost(response),
            )

        batches = [
            input_data[start : start + embedding_batch_size]
            for start in range(0, num_records, embedding_batch_size)
        ]

        embeddings = []
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            # executor.map yields results in input order, so embeddings stay
            # aligned with input_data even though batches run concurrently.
            for batch_embeddings, batch_cost in executor.map(
                get_embeddings_batch, batches
            ):
                embeddings.extend(batch_embeddings)
                total_cost += batch_cost

    # Initialize clusters: every record starts in its own singleton cluster
    clusters = [{i} for i in range(num_records)]
    cluster_map = {i: i for i in range(num_records)}

    def find_cluster(item):
        # Follow parent links to the root, halving the path along the way
        while item != cluster_map[item]:
            cluster_map[item] = cluster_map[cluster_map[item]]
            item = cluster_map[item]
        return item

    def merge_clusters(item1, item2):
        root1, root2 = find_cluster(item1), find_cluster(item2)
        if root1 != root2:
            # Always merge the smaller cluster into the larger one
            if len(clusters[root1]) < len(clusters[root2]):
                root1, root2 = root2, root1
            clusters[root1] |= clusters[root2]
            cluster_map[root2] = root1
            clusters[root2] = set()

    def iter_pairs():
        # Lazily yields every (i, j) with i < j in the same order as a nested
        # loop, without materializing all O(n^2) pairs in memory.
        return combinations(range(num_records), 2)

    # Filter pairs based on blocking conditions (no conditions -> no blocked pairs)
    if blocking_conditions:
        blocked_pairs = [
            (i, j) for i, j in iter_pairs() if is_match(input_data[i], input_data[j])
        ]
    else:
        blocked_pairs = []

    # Apply limit_comparisons to blocked pairs
    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
        self.console.log(
            f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
        )
        blocked_pairs = random.sample(blocked_pairs, limit_comparisons)

    # If there are remaining comparisons, fill with highest cosine similarities
    remaining_comparisons = (
        limit_comparisons - len(blocked_pairs)
        if limit_comparisons is not None
        else float("inf")
    )
    if remaining_comparisons > 0 and blocking_threshold is not None:
        # Compute cosine similarity for all pairs at once
        similarity_matrix = cosine_similarity(np.array(embeddings))

        # Set membership keeps the duplicate check O(1) per pair
        already_blocked = set(blocked_pairs)

        cosine_pairs = []
        for i, j in iter_pairs():
            if (i, j) in already_blocked or find_cluster(i) == find_cluster(j):
                continue
            similarity = similarity_matrix[i, j]
            if similarity >= blocking_threshold:
                cosine_pairs.append((i, j, similarity))

        if remaining_comparisons != float("inf"):
            # Budgeted: keep only the most similar pairs
            cosine_pairs.sort(key=lambda x: x[2], reverse=True)
            additional_pairs = [
                (i, j) for i, j, _ in cosine_pairs[: int(remaining_comparisons)]
            ]
            blocked_pairs.extend(additional_pairs)
        else:
            blocked_pairs.extend((i, j) for i, j, _ in cosine_pairs)

    filtered_pairs = blocked_pairs

    # Calculate and print statistics
    total_possible_comparisons = num_records * (num_records - 1) // 2
    comparisons_made = len(filtered_pairs)
    comparisons_saved = total_possible_comparisons - comparisons_made
    # A single record yields zero possible comparisons; avoid dividing by zero
    saved_percentage = (
        (comparisons_saved / total_possible_comparisons) * 100
        if total_possible_comparisons > 0
        else 0.0
    )
    self.console.log(
        f"[green]Comparisons saved by blocking: {comparisons_saved} "
        f"({saved_percentage:.2f}%)[/green]"
    )

    # Compare pairs and update clusters in real-time
    batch_size = self.config.get("compare_batch_size", 100)
    pair_costs = 0

    pbar = RichLoopBar(
        range(0, len(filtered_pairs), batch_size),
        desc=f"Processing batches of {batch_size} LLM comparisons",
        console=self.console,
    )
    for i in pbar:
        batch = filtered_pairs[i : i + batch_size]

        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            future_to_pair = {
                executor.submit(
                    compare_pair,
                    self.config["comparison_prompt"],
                    self.config.get("comparison_model", self.default_model),
                    input_data[pair[0]],
                    input_data[pair[1]],
                    blocking_keys,
                ): pair
                for pair in batch
            }

            # Merges happen on this thread only, as each comparison finishes
            for future in as_completed(future_to_pair):
                pair = future_to_pair[future]
                is_match_result, cost = future.result()
                pair_costs += cost
                if is_match_result:
                    merge_clusters(pair[0], pair[1])

                pbar.update(i)

    total_cost += pair_costs

    # Collect final clusters (merged-away roots were emptied by merge_clusters)
    final_clusters = [cluster for cluster in clusters if cluster]

    # Process each cluster
    results = []

    def process_cluster(cluster):
        if len(cluster) > 1:
            cluster_items = [input_data[i] for i in cluster]
            # NOTE(review): 'resolution_prompt' is read unconditionally here;
            # confirm it is always configured when clusters can contain >1 record.
            reduction_template = Template(self.config["resolution_prompt"])
            if input_schema:
                # Only expose the declared input fields to the prompt
                cluster_items = [
                    {k: item[k] for k in input_schema.keys() if k in item}
                    for item in cluster_items
                ]

            resolution_prompt = reduction_template.render(inputs=cluster_items)
            reduction_response = call_llm(
                self.config.get("resolution_model", self.default_model),
                "reduce",
                [{"role": "user", "content": resolution_prompt}],
                self.config["output"]["schema"],
                console=self.console,
            )
            reduction_output = parse_llm_response(reduction_response)[0]
            reduction_cost = completion_cost(reduction_response)

            if validate_output(self.config, reduction_output, self.console):
                # Every record of the cluster is kept, with the output-schema
                # keys overwritten by the resolved values.
                return (
                    [
                        {
                            **item,
                            **{
                                k: reduction_output[k]
                                for k in self.config["output"]["schema"]
                            },
                        }
                        for item in [input_data[i] for i in cluster]
                    ],
                    reduction_cost,
                )
            return [], reduction_cost
        else:
            # Singleton clusters pass through unchanged at no cost
            return [input_data[list(cluster)[0]]], 0

    # Calculate the number of records before and clusters after
    num_records_before = len(input_data)
    num_clusters_after = len(final_clusters)
    self.console.log(f"Number of documents before: {num_records_before}")
    self.console.log(f"Number of distinct documents after: {num_clusters_after}")

    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
        futures = [
            executor.submit(process_cluster, cluster) for cluster in final_clusters
        ]
        for future in rich_as_completed(
            futures,
            total=len(futures),
            desc="Determining resolved key for each group of equivalent keys",
            console=self.console,
        ):
            cluster_results, cluster_cost = future.result()
            results.extend(cluster_results)
            total_cost += cluster_cost

    total_pairs = len(input_data) * (len(input_data) - 1) // 2
    true_match_count = sum(
        len(cluster) * (len(cluster) - 1) // 2
        for cluster in final_clusters
        if len(cluster) > 1
    )
    true_match_selectivity = (
        true_match_count / total_pairs if total_pairs > 0 else 0
    )
    self.console.log(f"Self-join selectivity: {true_match_selectivity:.4f}")

    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the ResolveOperation for required keys and valid structure.

+

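This method performs the following checks: +1. Verifies the presence of required keys: 'comparison_prompt' and 'output'. +2. Ensures 'output' contains a 'schema' key. +3. Validates that 'schema' in 'output' is a non-empty dictionary. +4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables. +5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable. +6. If 'model' is present, checks that it is a string. +7. If 'blocking_keys' is present, checks that it is a list whose items are all strings. +8. If 'blocking_threshold' is present, checks that it is a number between 0 and 1. +9. If 'blocking_conditions' is present, checks that it is a list whose items are all strings. +10. If 'input' is present, checks that it contains a 'schema' key holding a dictionary. +11. If 'limit_comparisons' is present, checks that it is a positive integer.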

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing, if templates are invalid or missing required variables, + or if any other configuration aspect is incorrect or inconsistent.

+
+
+ TypeError + +
+

If the types of configuration values are incorrect, such as 'schema' not being a dict + or 'model' not being a string.

+
+
+ +
+ Source code in docetl/operations/resolve.py +
 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
def syntax_check(self) -> None:
    """
    Checks the configuration of the ResolveOperation for required keys and valid structure.

    This method performs the following checks:
    1. Verifies the presence of required keys: 'comparison_prompt' and 'output'.
    2. Ensures 'output' is a dictionary that contains a 'schema' key.
    3. Validates that 'schema' in 'output' is a non-empty dictionary.
    4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables.
    5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable.
    6. If 'model' is present, checks that it is a string.
    7. If 'blocking_keys' is present, checks that it is a list of strings.
    8. If 'blocking_threshold' is present, checks that it is a number (not a bool) between 0 and 1.
    9. If 'blocking_conditions' is present, checks that it is a list of strings.
    10. If 'input' is present, checks that it is a dictionary with a dictionary 'schema'.
    11. If 'limit_comparisons' is present, checks that it is a positive integer (not a bool).

    Raises:
        ValueError: If required keys are missing, if templates are invalid or missing required variables,
                    or if any other configuration aspect is incorrect or inconsistent.
        TypeError: If the types of configuration values are incorrect, such as 'schema' not being a dict
                   or 'model' not being a string.
    """

    def template_variable_names(prompt):
        # Parse the prompt and collect every variable name it references.
        # Any parsing failure is reported as an invalid template, with the
        # original exception chained for debugging.
        try:
            template = Template(prompt)
            name_nodes = template.environment.parse(prompt).find_all(
                jinja2.nodes.Name
            )
            return {node.name for node in name_nodes}
        except Exception as e:
            raise ValueError(f"Invalid Jinja2 template: {str(e)}") from e

    required_keys = ["comparison_prompt", "output"]
    for key in required_keys:
        if key not in self.config:
            raise ValueError(
                f"Missing required key '{key}' in ResolveOperation configuration"
            )

    # Guard against non-mapping values before subscripting 'output'
    if not isinstance(self.config["output"], dict):
        raise TypeError("'output' configuration must be a dictionary")

    if "schema" not in self.config["output"]:
        raise ValueError("Missing 'schema' in 'output' configuration")

    if not isinstance(self.config["output"]["schema"], dict):
        raise TypeError("'schema' in 'output' configuration must be a dictionary")

    if not self.config["output"]["schema"]:
        raise ValueError("'schema' in 'output' configuration cannot be empty")

    # Check if the comparison_prompt is a valid Jinja2 template. The
    # missing-variable errors are raised outside the parsing guard so they are
    # not mislabelled as "Invalid Jinja2 template".
    comparison_var_names = template_variable_names(self.config["comparison_prompt"])
    if (
        "input1" not in comparison_var_names
        or "input2" not in comparison_var_names
    ):
        raise ValueError(
            "'comparison_prompt' must contain both 'input1' and 'input2' variables"
        )

    if "resolution_prompt" in self.config:
        reduction_var_names = template_variable_names(
            self.config["resolution_prompt"]
        )
        if "inputs" not in reduction_var_names:
            raise ValueError("'resolution_prompt' must contain 'inputs' variable")

    # Check if the model is specified (optional)
    if "model" in self.config and not isinstance(self.config["model"], str):
        raise TypeError("'model' in configuration must be a string")

    # Check blocking_keys (optional)
    if "blocking_keys" in self.config:
        if not isinstance(self.config["blocking_keys"], list):
            raise TypeError("'blocking_keys' must be a list")
        if not all(isinstance(key, str) for key in self.config["blocking_keys"]):
            raise TypeError("All items in 'blocking_keys' must be strings")

    # Check blocking_threshold (optional)
    if "blocking_threshold" in self.config:
        threshold = self.config["blocking_threshold"]
        # bool is a subclass of int, so it must be rejected explicitly
        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
            raise TypeError("'blocking_threshold' must be a number")
        if not 0 <= threshold <= 1:
            raise ValueError("'blocking_threshold' must be between 0 and 1")

    # Check blocking_conditions (optional)
    if "blocking_conditions" in self.config:
        if not isinstance(self.config["blocking_conditions"], list):
            raise TypeError("'blocking_conditions' must be a list")
        if not all(
            isinstance(cond, str) for cond in self.config["blocking_conditions"]
        ):
            raise TypeError("All items in 'blocking_conditions' must be strings")

    # Check if input schema is provided and valid (optional)
    if "input" in self.config:
        if not isinstance(self.config["input"], dict):
            raise TypeError("'input' configuration must be a dictionary")
        if "schema" not in self.config["input"]:
            raise ValueError("Missing 'schema' in 'input' configuration")
        if not isinstance(self.config["input"]["schema"], dict):
            raise TypeError(
                "'schema' in 'input' configuration must be a dictionary"
            )

    # Check limit_comparisons (optional)
    if "limit_comparisons" in self.config:
        limit = self.config["limit_comparisons"]
        # bool is a subclass of int, so True/False must be rejected explicitly
        if isinstance(limit, bool) or not isinstance(limit, int):
            raise TypeError("'limit_comparisons' must be an integer")
        if limit <= 0:
            raise ValueError("'limit_comparisons' must be a positive integer")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.reduce.ReduceOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a reduce operation on input data using language models.

+

This class extends BaseOperation to provide functionality for reducing grouped data +using various strategies including batch reduce, incremental reduce, and parallel fold and merge.

+ +
+ Source code in docetl/operations/reduce.py +
 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
+724
+725
+726
+727
+728
+729
+730
+731
+732
+733
+734
+735
+736
+737
+738
+739
+740
+741
+742
+743
+744
+745
+746
+747
+748
+749
+750
+751
+752
+753
+754
+755
+756
+757
+758
+759
+760
+761
+762
+763
+764
+765
+766
+767
+768
+769
+770
+771
+772
+773
+774
+775
+776
+777
+778
+779
+780
class ReduceOperation(BaseOperation):
+    """
+    A class that implements a reduce operation on input data using language models.
+
+    This class extends BaseOperation to provide functionality for reducing grouped data
+    using various strategies including batch reduce, incremental reduce, and parallel fold and merge.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the ReduceOperation.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
+        super().__init__(*args, **kwargs)
+        self.min_samples = 5
+        self.max_samples = 1000
+        self.fold_times = deque(maxlen=self.max_samples)
+        self.merge_times = deque(maxlen=self.max_samples)
+        self.lock = Lock()
+        self.config["reduce_key"] = (
+            [self.config["reduce_key"]]
+            if isinstance(self.config["reduce_key"], str)
+            else self.config["reduce_key"]
+        )
+
+    def syntax_check(self) -> None:
+        """
+        Perform comprehensive syntax checks on the configuration of the ReduceOperation.
+
+        This method validates the presence and correctness of all required configuration keys, Jinja2 templates, and ensures the correct
+        structure and types of the entire configuration.
+
+        The method performs the following checks:
+        1. Verifies the presence of all required keys in the configuration.
+        2. Validates the structure and content of the 'output' configuration, including its 'schema'.
+        3. Checks if the main 'prompt' is a valid Jinja2 template and contains the required 'inputs' variable.
+        4. If 'merge_prompt' is specified, ensures that 'fold_prompt' is also present.
+        5. If 'fold_prompt' is present, verifies the existence of 'fold_batch_size'.
+        6. Validates the 'fold_prompt' as a Jinja2 template with required variables 'inputs' and 'output'.
+        7. If present, checks 'merge_prompt' as a valid Jinja2 template with required 'outputs' variable.
+        8. Verifies types of various configuration inputs (e.g., 'fold_batch_size' as int).
+        9. Checks for the presence and validity of optional configurations like 'model'.
+
+        Raises:
+            ValueError: If any required configuration is missing, if templates are invalid or missing required
+                        variables, or if any other configuration aspect is incorrect or inconsistent.
+            TypeError: If any configuration value has an incorrect type, such as 'schema' not being a dict
+                       or 'fold_batch_size' not being an integer.
+        """
+        required_keys = ["reduce_key", "prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in ReduceOperation configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        # Check if the prompt is a valid Jinja2 template
+        try:
+            template = Template(self.config["prompt"])
+            template_vars = template.environment.parse(self.config["prompt"]).find_all(
+                jinja2.nodes.Name
+            )
+            template_var_names = {var.name for var in template_vars}
+            if "inputs" not in template_var_names:
+                raise ValueError("Template must include the 'inputs' variable")
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template in 'prompt': {str(e)}")
+
+        # Check if fold_prompt is a valid Jinja2 template (now required if merge exists)
+        if "merge_prompt" in self.config:
+            if "fold_prompt" not in self.config:
+                raise ValueError(
+                    "'fold_prompt' is required when 'merge_prompt' is specified"
+                )
+
+        if "fold_prompt" in self.config:
+            if "fold_batch_size" not in self.config:
+                raise ValueError(
+                    "'fold_batch_size' is required when 'fold_prompt' is specified"
+                )
+
+            try:
+                fold_template = Template(self.config["fold_prompt"])
+                fold_template_vars = fold_template.environment.parse(
+                    self.config["fold_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                fold_template_var_names = {var.name for var in fold_template_vars}
+                required_vars = {"inputs", "output"}
+                if not required_vars.issubset(fold_template_var_names):
+                    raise ValueError(
+                        f"Fold template must include variables: {required_vars}. Current template includes: {fold_template_var_names}"
+                    )
+            except Exception as e:
+                raise ValueError(f"Invalid Jinja2 template in 'fold_prompt': {str(e)}")
+
+        # Check merge_prompt and merge_batch_size
+        if "merge_prompt" in self.config:
+            if "merge_batch_size" not in self.config:
+                raise ValueError(
+                    "'merge_batch_size' is required when 'merge_prompt' is specified"
+                )
+
+            try:
+                merge_template = Template(self.config["merge_prompt"])
+                merge_template_vars = merge_template.environment.parse(
+                    self.config["merge_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                merge_template_var_names = {var.name for var in merge_template_vars}
+                if "outputs" not in merge_template_var_names:
+                    raise ValueError(
+                        "Merge template must include the 'outputs' variable"
+                    )
+            except Exception as e:
+                raise ValueError(f"Invalid Jinja2 template in 'merge_prompt': {str(e)}")
+
+        # Check if the model is specified (optional)
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError("'model' in configuration must be a string")
+
+        # Check if reduce_key is a string or a list of strings
+        if not isinstance(self.config["reduce_key"], (str, list)):
+            raise TypeError("'reduce_key' must be a string or a list of strings")
+        if isinstance(self.config["reduce_key"], list):
+            if not all(isinstance(key, str) for key in self.config["reduce_key"]):
+                raise TypeError("All elements in 'reduce_key' list must be strings")
+
+        # Check if input schema is provided and valid (optional)
+        if "input" in self.config:
+            if "schema" not in self.config["input"]:
+                raise ValueError("Missing 'schema' in 'input' configuration")
+            if not isinstance(self.config["input"]["schema"], dict):
+                raise TypeError(
+                    "'schema' in 'input' configuration must be a dictionary"
+                )
+
+        # Check if fold_batch_size and merge_batch_size are positive integers
+        for key in ["fold_batch_size", "merge_batch_size"]:
+            if key in self.config:
+                if not isinstance(self.config[key], int) or self.config[key] <= 0:
+                    raise ValueError(f"'{key}' must be a positive integer")
+
+        if "value_sampling" in self.config:
+            sampling = self.config["value_sampling"]
+            if not isinstance(sampling, dict):
+                raise TypeError("'value_sampling' must be a dictionary")
+
+            if "enabled" not in sampling:
+                raise ValueError(
+                    "'enabled' is required in 'value_sampling' configuration"
+                )
+            if not isinstance(sampling["enabled"], bool):
+                raise TypeError("'enabled' in 'value_sampling' must be a boolean")
+
+            if sampling["enabled"]:
+                if "sample_size" not in sampling:
+                    raise ValueError(
+                        "'sample_size' is required when value_sampling is enabled"
+                    )
+                if (
+                    not isinstance(sampling["sample_size"], int)
+                    or sampling["sample_size"] <= 0
+                ):
+                    raise ValueError("'sample_size' must be a positive integer")
+
+                if "method" not in sampling:
+                    raise ValueError(
+                        "'method' is required when value_sampling is enabled"
+                    )
+                if sampling["method"] not in [
+                    "random",
+                    "first_n",
+                    "cluster",
+                    "sem_sim",
+                ]:
+                    raise ValueError(
+                        "Invalid 'method'. Must be 'random', 'first_n', or 'embedding'"
+                    )
+
+                if sampling["method"] == "embedding":
+                    if "embedding_model" not in sampling:
+                        raise ValueError(
+                            "'embedding_model' is required when using embedding-based sampling"
+                        )
+                    if "embedding_keys" not in sampling:
+                        raise ValueError(
+                            "'embedding_keys' is required when using embedding-based sampling"
+                        )
+
+        self.gleaning_check()
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Execute the reduce operation on the provided input data.
+
+        This method sorts and groups the input data by the reduce key(s), then processes each group
+        using either parallel fold and merge, incremental reduce, or batch reduce strategies.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+        """
+        reduce_keys = self.config["reduce_key"]
+        if isinstance(reduce_keys, str):
+            reduce_keys = [reduce_keys]
+        input_schema = self.config.get("input", {}).get("schema", {})
+
+        # Group the input data by the reduce key(s) while maintaining original order
+        def get_group_key(item):
+            return tuple(item[key] for key in reduce_keys)
+
+        grouped_data = {}
+        for item in input_data:
+            key = get_group_key(item)
+            if key not in grouped_data:
+                grouped_data[key] = []
+            grouped_data[key].append(item)
+
+        # Convert the grouped data to a list of tuples
+        grouped_data = list(grouped_data.items())
+
+        def process_group(
+            key: Tuple, group_elems: List[Dict]
+        ) -> Tuple[Optional[Dict], float]:
+            if input_schema:
+                group_list = [
+                    {k: item[k] for k in input_schema.keys() if k in item}
+                    for item in group_elems
+                ]
+            else:
+                group_list = group_elems
+
+            total_cost = 0.0
+
+            # Apply value sampling if enabled
+            value_sampling = self.config.get("value_sampling", {})
+            if value_sampling.get("enabled", False):
+                sample_size = min(value_sampling["sample_size"], len(group_list))
+                method = value_sampling["method"]
+
+                if method == "random":
+                    group_sample = random.sample(group_list, sample_size)
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                elif method == "first_n":
+                    group_sample = group_list[:sample_size]
+                elif method == "cluster":
+                    group_sample, embedding_cost = self._cluster_based_sampling(
+                        group_list, value_sampling, sample_size
+                    )
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                    total_cost += embedding_cost
+                elif method == "sem_sim":
+                    group_sample, embedding_cost = self._semantic_similarity_sampling(
+                        key, group_list, value_sampling, sample_size
+                    )
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                    total_cost += embedding_cost
+
+                group_list = group_sample
+
+            # Only execute merge-based plans if associative = True
+            if "merge_prompt" in self.config and self.config.get("associative", True):
+                result, cost = self._parallel_fold_and_merge(key, group_list)
+            elif "fold_prompt" in self.config:
+                result, cost = self._incremental_reduce(key, group_list)
+            else:
+                result, cost = self._batch_reduce(key, group_list)
+
+            total_cost += cost
+
+            # Apply pass-through at the group level
+            if (
+                result is not None
+                and self.config.get("pass_through", False)
+                and group_elems
+            ):
+                for k, v in group_elems[0].items():
+                    if k not in self.config["output"]["schema"] and k not in result:
+                        result[k] = v
+
+            return result, total_cost
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(process_group, key, group)
+                for key, group in grouped_data
+            ]
+            results = []
+            total_cost = 0
+            for future in rich_as_completed(
+                futures,
+                total=len(futures),
+                desc="Processing reduce items",
+                leave=True,
+                console=self.console,
+            ):
+                output, item_cost = future.result()
+                total_cost += item_cost
+                if output is not None:
+                    results.append(output)
+
+        return results, total_cost
+
+    def _get_embeddings(
+        self, items: List[Dict], value_sampling: Dict
+    ) -> Tuple[List[List[float]], float]:
+        embedding_model = value_sampling["embedding_model"]
+        embedding_keys = value_sampling["embedding_keys"]
+        if not embedding_keys:
+            embedding_keys = list(items[0].keys())
+        embeddings = []
+        cost = 0
+        batch_size = 1000
+
+        for i in range(0, len(items), batch_size):
+            batch = items[i : i + batch_size]
+            texts = [
+                " ".join(str(item[key]) for key in embedding_keys if key in item)[
+                    :10000
+                ]
+                for item in batch
+            ]
+            response = gen_embedding(embedding_model, texts)
+            embeddings.extend([data["embedding"] for data in response["data"]])
+            cost += completion_cost(response)
+
+        return embeddings, cost
+
+    def _cluster_based_sampling(
+        self, group_list: List[Dict], value_sampling: Dict, sample_size: int
+    ) -> Tuple[List[Dict], float]:
+        embeddings, cost = self._get_embeddings(group_list, value_sampling)
+
+        kmeans = KMeans(n_clusters=sample_size, random_state=42)
+        cluster_labels = kmeans.fit_predict(embeddings)
+
+        sampled_items = []
+        for i in range(sample_size):
+            cluster_items = [
+                item for item, label in zip(group_list, cluster_labels) if label == i
+            ]
+            if cluster_items:
+                sampled_items.append(random.choice(cluster_items))
+
+        return sampled_items, cost
+
+    def _semantic_similarity_sampling(
+        self, key: Tuple, group_list: List[Dict], value_sampling: Dict, sample_size: int
+    ) -> Tuple[List[Dict], float]:
+        embedding_model = value_sampling["embedding_model"]
+        query_text_template = Template(value_sampling["query_text"])
+        query_text = query_text_template.render(
+            reduce_key=dict(zip(self.config["reduce_key"], key))
+        )
+
+        embeddings, cost = self._get_embeddings(group_list, value_sampling)
+
+        query_response = gen_embedding(embedding_model, [query_text])
+        query_embedding = query_response["data"][0]["embedding"]
+        cost += completion_cost(query_response)
+
+        similarities = cosine_similarity([query_embedding], embeddings)[0]
+
+        top_k_indices = np.argsort(similarities)[-sample_size:]
+
+        return [group_list[i] for i in top_k_indices], cost
+
+    def _parallel_fold_and_merge(
+        self, key: Tuple, group_list: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Perform parallel folding and merging on a group of items.
+
+        This method implements a strategy that combines parallel folding of input items
+        and merging of intermediate results to efficiently process large groups. It works as follows:
+        1. The input group is initially divided into smaller batches for efficient processing.
+        2. The method performs an initial round of folding operations on these batches.
+        3. After the first round of folds, a few merges are performed to estimate the merge runtime.
+        4. Based on the estimated merge runtime and observed fold runtime, it calculates the optimal number of parallel folds. Subsequent rounds of folding are then performed concurrently, with the number of parallel folds determined by the runtime estimates.
+        5. The folding process repeats in rounds, progressively reducing the number of items to be processed.
+        6. Once all folding operations are complete, the method recursively performs final merges on the fold results to combine them into a final result.
+        7. Throughout this process, the method may adjust the number of parallel folds based on updated performance metrics (i.e., fold and merge runtimes) to maintain efficiency.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items in the group to be processed.
+
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the final merged result (or None if processing failed)
+            and the total cost of the operation.
+        """
+        fold_batch_size = self.config["fold_batch_size"]
+        merge_batch_size = self.config["merge_batch_size"]
+        total_cost = 0
+
+        def calculate_num_parallel_folds():
+            fold_time, fold_default = self.get_fold_time()
+            merge_time, merge_default = self.get_merge_time()
+            num_group_items = len(group_list)
+            return (
+                max(
+                    1,
+                    int(
+                        (fold_time * num_group_items * math.log(merge_batch_size))
+                        / (fold_batch_size * merge_time)
+                    ),
+                ),
+                fold_default or merge_default,
+            )
+
+        num_parallel_folds, used_default_times = calculate_num_parallel_folds()
+        fold_results = []
+        remaining_items = group_list
+
+        # Parallel folding and merging
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            while remaining_items:
+                # Folding phase
+                fold_futures = []
+                for i in range(min(num_parallel_folds, len(remaining_items))):
+                    batch = remaining_items[:fold_batch_size]
+                    remaining_items = remaining_items[fold_batch_size:]
+                    current_output = fold_results[i] if i < len(fold_results) else None
+                    fold_futures.append(
+                        executor.submit(
+                            self._increment_fold, key, batch, current_output
+                        )
+                    )
+
+                new_fold_results = []
+                for future in as_completed(fold_futures):
+                    result, cost = future.result()
+                    total_cost += cost
+                    if result is not None:
+                        new_fold_results.append(result)
+
+                # Update fold_results with new results
+                fold_results = new_fold_results + fold_results[len(new_fold_results) :]
+
+                # Single pass merging phase
+                if (
+                    len(self.merge_times) < self.min_samples
+                    and len(fold_results) >= merge_batch_size
+                ):
+                    merge_futures = []
+                    for i in range(0, len(fold_results), merge_batch_size):
+                        batch = fold_results[i : i + merge_batch_size]
+                        merge_futures.append(
+                            executor.submit(self._merge_results, key, batch)
+                        )
+
+                    new_results = []
+                    for future in as_completed(merge_futures):
+                        result, cost = future.result()
+                        total_cost += cost
+                        if result is not None:
+                            new_results.append(result)
+
+                    fold_results = new_results
+
+                # Recalculate num_parallel_folds if we used default times
+                if used_default_times:
+                    new_num_parallel_folds, used_default_times = (
+                        calculate_num_parallel_folds()
+                    )
+                    if not used_default_times:
+                        self.console.log(
+                            f"Recalculated num_parallel_folds from {num_parallel_folds} to {new_num_parallel_folds}"
+                        )
+                        num_parallel_folds = new_num_parallel_folds
+
+        # Final merging if needed
+        while len(fold_results) > 1:
+            self.console.log(f"Finished folding! Merging {len(fold_results)} items.")
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                merge_futures = []
+                for i in range(0, len(fold_results), merge_batch_size):
+                    batch = fold_results[i : i + merge_batch_size]
+                    merge_futures.append(
+                        executor.submit(self._merge_results, key, batch)
+                    )
+
+                new_results = []
+                for future in as_completed(merge_futures):
+                    result, cost = future.result()
+                    total_cost += cost
+                    if result is not None:
+                        new_results.append(result)
+
+                fold_results = new_results
+
+        return (fold_results[0], total_cost) if fold_results else (None, total_cost)
+
+    def _incremental_reduce(
+        self, key: Tuple, group_list: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Perform an incremental reduce operation on a group of items.
+
+        This method processes the group in batches, incrementally folding the results.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items in the group to be processed.
+
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the final reduced result (or None if processing failed)
+            and the total cost of the operation.
+        """
+        fold_batch_size = self.config["fold_batch_size"]
+        total_cost = 0
+        current_output = None
+
+        # Calculate and log the number of folds to be performed
+        num_folds = (len(group_list) + fold_batch_size - 1) // fold_batch_size
+
+        scratchpad = ""
+        for i in range(0, len(group_list), fold_batch_size):
+            # Log the current iteration and total number of folds
+            current_fold = i // fold_batch_size + 1
+            self.console.log(
+                f"Processing fold {current_fold} of {num_folds} for group with key {key}"
+            )
+            batch = group_list[i : i + fold_batch_size]
+
+            folded_output, fold_cost = self._increment_fold(
+                key, batch, current_output, scratchpad
+            )
+            total_cost += fold_cost
+
+            if folded_output is None:
+                continue
+
+            # Pop off updated_scratchpad
+            if "updated_scratchpad" in folded_output:
+                scratchpad = folded_output["updated_scratchpad"]
+                self.console.log(f"Updated notes: {scratchpad}")
+                del folded_output["updated_scratchpad"]
+
+            current_output = folded_output
+
+        return current_output, total_cost
+
+    def _increment_fold(
+        self,
+        key: Tuple,
+        batch: List[Dict],
+        current_output: Optional[Dict],
+        scratchpad: Optional[str] = None,
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Perform an incremental fold operation on a batch of items.
+
+        This method folds a batch of items into the current output using the fold prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            batch (List[Dict]): The batch of items to be folded.
+            current_output (Optional[Dict]): The current accumulated output, if any.
+            scratchpad (Optional[str]): The scratchpad to use for the fold operation.
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the folded output (or None if processing failed)
+            and the cost of the fold operation.
+        """
+        if current_output is None:
+            return self._batch_reduce(key, batch, scratchpad)
+
+        start_time = time.time()
+        fold_prompt_template = Template(self.config["fold_prompt"])
+        fold_prompt = fold_prompt_template.render(
+            inputs=batch,
+            output=current_output,
+            reduce_key=dict(zip(self.config["reduce_key"], key)),
+        )
+        response = call_llm(
+            self.config.get("model", self.default_model),
+            "reduce",
+            [{"role": "user", "content": fold_prompt}],
+            self.config["output"]["schema"],
+            scratchpad=scratchpad,
+            console=self.console,
+        )
+        folded_output = parse_llm_response(response)[0]
+
+        folded_output.update(dict(zip(self.config["reduce_key"], key)))
+        fold_cost = completion_cost(response)
+        end_time = time.time()
+        self._update_fold_time(end_time - start_time)
+
+        if validate_output(self.config, folded_output, self.console):
+            return folded_output, fold_cost
+        return None, fold_cost
+
+    def _merge_results(
+        self, key: Tuple, outputs: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Merge multiple outputs into a single result.
+
+        This method merges a list of outputs using the merge prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            outputs (List[Dict]): The list of outputs to be merged.
+
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the merged output (or None if processing failed)
+            and the cost of the merge operation.
+        """
+        start_time = time.time()
+        merge_prompt_template = Template(self.config["merge_prompt"])
+        merge_prompt = merge_prompt_template.render(
+            outputs=outputs, reduce_key=dict(zip(self.config["reduce_key"], key))
+        )
+        response = call_llm(
+            self.config.get("model", self.default_model),
+            "merge",
+            [{"role": "user", "content": merge_prompt}],
+            self.config["output"]["schema"],
+            console=self.console,
+        )
+        merged_output = parse_llm_response(response)[0]
+        merged_output.update(dict(zip(self.config["reduce_key"], key)))
+        merge_cost = completion_cost(response)
+        end_time = time.time()
+        self._update_merge_time(end_time - start_time)
+
+        if validate_output(self.config, merged_output, self.console):
+            return merged_output, merge_cost
+        return None, merge_cost
+
+    def get_fold_time(self) -> Tuple[float, bool]:
+        """
+        Get the average fold time or a default value.
+
+        Returns:
+            Tuple[float, bool]: A tuple containing the average fold time (or default) and a boolean
+            indicating whether the default value was used.
+        """
+        if "fold_time" in self.config:
+            return self.config["fold_time"], False
+        with self.lock:
+            if len(self.fold_times) >= self.min_samples:
+                return sum(self.fold_times) / len(self.fold_times), False
+        return 1.0, True  # Default to 1 second if no data is available
+
+    def get_merge_time(self) -> Tuple[float, bool]:
+        """
+        Get the average merge time or a default value.
+
+        Returns:
+            Tuple[float, bool]: A tuple containing the average merge time (or default) and a boolean
+            indicating whether the default value was used.
+        """
+        if "merge_time" in self.config:
+            return self.config["merge_time"], False
+        with self.lock:
+            if len(self.merge_times) >= self.min_samples:
+                return sum(self.merge_times) / len(self.merge_times), False
+        return 1.0, True  # Default to 1 second if no data is available
+
+    def _update_fold_time(self, time: float) -> None:
+        """
+        Record the duration of one fold operation.
+
+        The value is appended to `self.fold_times` while holding `self.lock`.
+        Note that the parameter name shadows the `time` module inside this method.
+
+        Args:
+            time (float): The time taken for a fold operation.
+        """
+        with self.lock:
+            self.fold_times.append(time)
+
+    def _update_merge_time(self, time: float) -> None:
+        """
+        Record the duration of one merge operation.
+
+        The value is appended to `self.merge_times` while holding `self.lock`.
+        Note that the parameter name shadows the `time` module inside this method.
+
+        Args:
+            time (float): The time taken for a merge operation.
+        """
+        with self.lock:
+            self.merge_times.append(time)
+
+    def _batch_reduce(
+        self, key: Tuple, group_list: List[Dict], scratchpad: Optional[str] = None
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Perform a batch reduce operation on a group of items.
+
+        This method reduces a group of items into a single output using the reduce prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items to be reduced.
+            scratchpad (Optional[str]): The scratchpad to use for the reduce operation.
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the reduced output (or None if processing failed)
+            and the cost of the reduce operation.
+        """
+        prompt_template = Template(self.config["prompt"])
+        prompt = prompt_template.render(
+            reduce_key=dict(zip(self.config["reduce_key"], key)), inputs=group_list
+        )
+        item_cost = 0
+
+        if "gleaning" in self.config:
+            response, gleaning_cost = call_llm_with_gleaning(
+                self.config.get("model", self.default_model),
+                "reduce",
+                [{"role": "user", "content": prompt}],
+                self.config["output"]["schema"],
+                self.config["gleaning"]["validation_prompt"],
+                self.config["gleaning"]["num_rounds"],
+                console=self.console,
+            )
+            item_cost += gleaning_cost
+        else:
+            response = call_llm(
+                self.config.get("model", self.default_model),
+                "reduce",
+                [{"role": "user", "content": prompt}],
+                self.config["output"]["schema"],
+                console=self.console,
+                scratchpad=scratchpad,
+            )
+
+        item_cost += completion_cost(response)
+
+        output = parse_llm_response(response)[0]
+        output.update(dict(zip(self.config["reduce_key"], key)))
+
+        if validate_output(self.config, output, self.console):
+            return output, item_cost
+        return None, item_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(*args, **kwargs) + +

+ + +
+ +

Initialize the ReduceOperation.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
*args + +
+

Variable length argument list.

+
+
+ () +
**kwargs + +
+

Arbitrary keyword arguments.

+
+
+ {} +
+ +
+ Source code in docetl/operations/reduce.py +
44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
def __init__(self, *args, **kwargs):
+    """
+    Initialize the ReduceOperation.
+
+    Args:
+        *args: Variable length argument list.
+        **kwargs: Arbitrary keyword arguments.
+    """
+    super().__init__(*args, **kwargs)
+    self.min_samples = 5
+    self.max_samples = 1000
+    self.fold_times = deque(maxlen=self.max_samples)
+    self.merge_times = deque(maxlen=self.max_samples)
+    self.lock = Lock()
+    self.config["reduce_key"] = (
+        [self.config["reduce_key"]]
+        if isinstance(self.config["reduce_key"], str)
+        else self.config["reduce_key"]
+    )
+
+
+
+ +
+ +
+ + +

+ execute(input_data) + +

+ + +
+ +

Execute the reduce operation on the provided input data.

+

This method groups the input data by the reduce key(s) in first-seen order, then processes each group +using either parallel fold and merge, incremental reduce, or batch reduce strategies.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Execute the reduce operation on the provided input data.
+
+    This method sorts and groups the input data by the reduce key(s), then processes each group
+    using either parallel fold and merge, incremental reduce, or batch reduce strategies.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+    """
+    reduce_keys = self.config["reduce_key"]
+    if isinstance(reduce_keys, str):
+        reduce_keys = [reduce_keys]
+    input_schema = self.config.get("input", {}).get("schema", {})
+
+    # Group the input data by the reduce key(s) while maintaining original order
+    def get_group_key(item):
+        return tuple(item[key] for key in reduce_keys)
+
+    grouped_data = {}
+    for item in input_data:
+        key = get_group_key(item)
+        if key not in grouped_data:
+            grouped_data[key] = []
+        grouped_data[key].append(item)
+
+    # Convert the grouped data to a list of tuples
+    grouped_data = list(grouped_data.items())
+
+    def process_group(
+        key: Tuple, group_elems: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        if input_schema:
+            group_list = [
+                {k: item[k] for k in input_schema.keys() if k in item}
+                for item in group_elems
+            ]
+        else:
+            group_list = group_elems
+
+        total_cost = 0.0
+
+        # Apply value sampling if enabled
+        value_sampling = self.config.get("value_sampling", {})
+        if value_sampling.get("enabled", False):
+            sample_size = min(value_sampling["sample_size"], len(group_list))
+            method = value_sampling["method"]
+
+            if method == "random":
+                group_sample = random.sample(group_list, sample_size)
+                group_sample.sort(key=lambda x: group_list.index(x))
+            elif method == "first_n":
+                group_sample = group_list[:sample_size]
+            elif method == "cluster":
+                group_sample, embedding_cost = self._cluster_based_sampling(
+                    group_list, value_sampling, sample_size
+                )
+                group_sample.sort(key=lambda x: group_list.index(x))
+                total_cost += embedding_cost
+            elif method == "sem_sim":
+                group_sample, embedding_cost = self._semantic_similarity_sampling(
+                    key, group_list, value_sampling, sample_size
+                )
+                group_sample.sort(key=lambda x: group_list.index(x))
+                total_cost += embedding_cost
+
+            group_list = group_sample
+
+        # Only execute merge-based plans if associative = True
+        if "merge_prompt" in self.config and self.config.get("associative", True):
+            result, cost = self._parallel_fold_and_merge(key, group_list)
+        elif "fold_prompt" in self.config:
+            result, cost = self._incremental_reduce(key, group_list)
+        else:
+            result, cost = self._batch_reduce(key, group_list)
+
+        total_cost += cost
+
+        # Apply pass-through at the group level
+        if (
+            result is not None
+            and self.config.get("pass_through", False)
+            and group_elems
+        ):
+            for k, v in group_elems[0].items():
+                if k not in self.config["output"]["schema"] and k not in result:
+                    result[k] = v
+
+        return result, total_cost
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        futures = [
+            executor.submit(process_group, key, group)
+            for key, group in grouped_data
+        ]
+        results = []
+        total_cost = 0
+        for future in rich_as_completed(
+            futures,
+            total=len(futures),
+            desc="Processing reduce items",
+            leave=True,
+            console=self.console,
+        ):
+            output, item_cost = future.result()
+            total_cost += item_cost
+            if output is not None:
+                results.append(output)
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ get_fold_time() + +

+ + +
+ +

Get the average fold time or a default value.

+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ float + +
+

Tuple[float, bool]: A tuple containing the average fold time (or default) and a boolean

+
+
+ bool + +
+

indicating whether the default value was used.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
def get_fold_time(self) -> Tuple[float, bool]:
+    """
+    Get the average fold time or a default value.
+
+    Returns:
+        Tuple[float, bool]: A tuple containing the average fold time (or default) and a boolean
+        indicating whether the default value was used.
+    """
+    if "fold_time" in self.config:
+        return self.config["fold_time"], False
+    with self.lock:
+        if len(self.fold_times) >= self.min_samples:
+            return sum(self.fold_times) / len(self.fold_times), False
+    return 1.0, True  # Default to 1 second if no data is available
+
+
+
+ +
+ +
+ + +

+ get_merge_time() + +

+ + +
+ +

Get the average merge time or a default value.

+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ float + +
+

Tuple[float, bool]: A tuple containing the average merge time (or default) and a boolean

+
+
+ bool + +
+

indicating whether the default value was used.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
def get_merge_time(self) -> Tuple[float, bool]:
+    """
+    Return the merge-time estimate and whether it is the built-in fallback.
+
+    Resolution order: an explicit 'merge_time' entry in the config, then the mean of the
+    recorded merge times once at least min_samples of them exist, then a fixed 1.0.
+
+    Returns:
+        Tuple[float, bool]: (estimate, used_default). used_default is True only when the
+        1.0 fallback is returned; a configured value or a measured average yields False.
+    """
+    if "merge_time" in self.config:
+        return self.config["merge_time"], False
+    # Hold the lock for both the length check and the mean computation.
+    with self.lock:
+        if len(self.merge_times) >= self.min_samples:
+            return sum(self.merge_times) / len(self.merge_times), False
+    return 1.0, True  # Fallback: fewer than min_samples timings recorded (presumably seconds)
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform comprehensive syntax checks on the configuration of the ReduceOperation.

+

This method validates the presence and correctness of all required configuration keys, Jinja2 templates, and ensures the correct +structure and types of the entire configuration.

+

The method performs the following checks: +1. Verifies the presence of all required keys in the configuration. +2. Validates the structure and content of the 'output' configuration, including its 'schema'. +3. Checks if the main 'prompt' is a valid Jinja2 template and contains the required 'inputs' variable. +4. If 'merge_prompt' is specified, ensures that 'fold_prompt' is also present. +5. If 'fold_prompt' is present, verifies the existence of 'fold_batch_size'. +6. Validates the 'fold_prompt' as a Jinja2 template with required variables 'inputs' and 'output'. +7. If present, checks 'merge_prompt' as a valid Jinja2 template with required 'outputs' variable. +8. Verifies types of various configuration inputs (e.g., 'fold_batch_size' as int). +9. Checks for the presence and validity of optional configurations like 'model'.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If any required configuration is missing, if templates are invalid or missing required + variables, or if any other configuration aspect is incorrect or inconsistent.

+
+
+ TypeError + +
+

If any configuration value has an incorrect type, such as 'schema' not being a dict + or 'fold_batch_size' not being an integer.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
def syntax_check(self) -> None:
+    """
+    Perform comprehensive syntax checks on the configuration of the ReduceOperation.
+
+    This method validates the presence and correctness of all required configuration keys, Jinja2 templates, and ensures the correct
+    structure and types of the entire configuration.
+
+    The method performs the following checks:
+    1. Verifies the presence of all required keys in the configuration.
+    2. Validates the structure and content of the 'output' configuration, including its 'schema'.
+    3. Checks if the main 'prompt' is a valid Jinja2 template and contains the required 'inputs' variable.
+    4. If 'merge_prompt' is specified, ensures that 'fold_prompt' is also present.
+    5. If 'fold_prompt' is present, verifies the existence of 'fold_batch_size'.
+    6. Validates the 'fold_prompt' as a Jinja2 template with required variables 'inputs' and 'output'.
+    7. If present, checks 'merge_prompt' as a valid Jinja2 template with required 'outputs' variable.
+    8. Verifies types of various configuration inputs (e.g., 'fold_batch_size' as int).
+    9. Checks for the presence and validity of optional configurations like 'model'.
+
+    Raises:
+        ValueError: If any required configuration is missing, if templates are invalid or missing required
+                    variables, or if any other configuration aspect is incorrect or inconsistent.
+        TypeError: If any configuration value has an incorrect type, such as 'schema' not being a dict
+                   or 'fold_batch_size' not being an integer.
+    """
+    required_keys = ["reduce_key", "prompt", "output"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in ReduceOperation configuration"
+            )
+
+    if "schema" not in self.config["output"]:
+        raise ValueError("Missing 'schema' in 'output' configuration")
+
+    if not isinstance(self.config["output"]["schema"], dict):
+        raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+    if not self.config["output"]["schema"]:
+        raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+    # Check if the prompt is a valid Jinja2 template
+    try:
+        template = Template(self.config["prompt"])
+        template_vars = template.environment.parse(self.config["prompt"]).find_all(
+            jinja2.nodes.Name
+        )
+        template_var_names = {var.name for var in template_vars}
+        if "inputs" not in template_var_names:
+            raise ValueError("Template must include the 'inputs' variable")
+    except Exception as e:
+        raise ValueError(f"Invalid Jinja2 template in 'prompt': {str(e)}")
+
+    # Check if fold_prompt is a valid Jinja2 template (now required if merge exists)
+    if "merge_prompt" in self.config:
+        if "fold_prompt" not in self.config:
+            raise ValueError(
+                "'fold_prompt' is required when 'merge_prompt' is specified"
+            )
+
+    if "fold_prompt" in self.config:
+        if "fold_batch_size" not in self.config:
+            raise ValueError(
+                "'fold_batch_size' is required when 'fold_prompt' is specified"
+            )
+
+        try:
+            fold_template = Template(self.config["fold_prompt"])
+            fold_template_vars = fold_template.environment.parse(
+                self.config["fold_prompt"]
+            ).find_all(jinja2.nodes.Name)
+            fold_template_var_names = {var.name for var in fold_template_vars}
+            required_vars = {"inputs", "output"}
+            if not required_vars.issubset(fold_template_var_names):
+                raise ValueError(
+                    f"Fold template must include variables: {required_vars}. Current template includes: {fold_template_var_names}"
+                )
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template in 'fold_prompt': {str(e)}")
+
+    # Check merge_prompt and merge_batch_size
+    if "merge_prompt" in self.config:
+        if "merge_batch_size" not in self.config:
+            raise ValueError(
+                "'merge_batch_size' is required when 'merge_prompt' is specified"
+            )
+
+        try:
+            merge_template = Template(self.config["merge_prompt"])
+            merge_template_vars = merge_template.environment.parse(
+                self.config["merge_prompt"]
+            ).find_all(jinja2.nodes.Name)
+            merge_template_var_names = {var.name for var in merge_template_vars}
+            if "outputs" not in merge_template_var_names:
+                raise ValueError(
+                    "Merge template must include the 'outputs' variable"
+                )
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template in 'merge_prompt': {str(e)}")
+
+    # Check if the model is specified (optional)
+    if "model" in self.config and not isinstance(self.config["model"], str):
+        raise TypeError("'model' in configuration must be a string")
+
+    # Check if reduce_key is a string or a list of strings
+    if not isinstance(self.config["reduce_key"], (str, list)):
+        raise TypeError("'reduce_key' must be a string or a list of strings")
+    if isinstance(self.config["reduce_key"], list):
+        if not all(isinstance(key, str) for key in self.config["reduce_key"]):
+            raise TypeError("All elements in 'reduce_key' list must be strings")
+
+    # Check if input schema is provided and valid (optional)
+    if "input" in self.config:
+        if "schema" not in self.config["input"]:
+            raise ValueError("Missing 'schema' in 'input' configuration")
+        if not isinstance(self.config["input"]["schema"], dict):
+            raise TypeError(
+                "'schema' in 'input' configuration must be a dictionary"
+            )
+
+    # Check if fold_batch_size and merge_batch_size are positive integers
+    for key in ["fold_batch_size", "merge_batch_size"]:
+        if key in self.config:
+            if not isinstance(self.config[key], int) or self.config[key] <= 0:
+                raise ValueError(f"'{key}' must be a positive integer")
+
+    if "value_sampling" in self.config:
+        sampling = self.config["value_sampling"]
+        if not isinstance(sampling, dict):
+            raise TypeError("'value_sampling' must be a dictionary")
+
+        if "enabled" not in sampling:
+            raise ValueError(
+                "'enabled' is required in 'value_sampling' configuration"
+            )
+        if not isinstance(sampling["enabled"], bool):
+            raise TypeError("'enabled' in 'value_sampling' must be a boolean")
+
+        if sampling["enabled"]:
+            if "sample_size" not in sampling:
+                raise ValueError(
+                    "'sample_size' is required when value_sampling is enabled"
+                )
+            if (
+                not isinstance(sampling["sample_size"], int)
+                or sampling["sample_size"] <= 0
+            ):
+                raise ValueError("'sample_size' must be a positive integer")
+
+            if "method" not in sampling:
+                raise ValueError(
+                    "'method' is required when value_sampling is enabled"
+                )
+            if sampling["method"] not in [
+                "random",
+                "first_n",
+                "cluster",
+                "sem_sim",
+            ]:
+                raise ValueError(
+                    "Invalid 'method'. Must be 'random', 'first_n', or 'embedding'"
+                )
+
+            if sampling["method"] == "embedding":
+                if "embedding_model" not in sampling:
+                    raise ValueError(
+                        "'embedding_model' is required when using embedding-based sampling"
+                    )
+                if "embedding_keys" not in sampling:
+                    raise ValueError(
+                        "'embedding_keys' is required when using embedding-based sampling"
+                    )
+
+    self.gleaning_check()
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.map.ParallelMapOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +
+ Source code in docetl/operations/map.py +
198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
class ParallelMapOperation(BaseOperation):
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the ParallelMapOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or if the configuration structure is invalid.
+            TypeError: If the configuration values have incorrect types.
+        """
+        if "prompts" not in self.config or not isinstance(self.config["prompts"], list):
+            raise ValueError(
+                "ParallelMapOperation requires a 'prompts' list in the configuration"
+            )
+
+        if not self.config["prompts"]:
+            raise ValueError("The 'prompts' list cannot be empty")
+
+        for i, prompt_config in enumerate(self.config["prompts"]):
+            if not isinstance(prompt_config, dict):
+                raise TypeError(f"Prompt configuration {i} must be a dictionary")
+
+            required_keys = ["name", "prompt", "output_keys"]
+            for key in required_keys:
+                if key not in prompt_config:
+                    raise ValueError(
+                        f"Missing required key '{key}' in prompt configuration {i}"
+                    )
+
+            if not isinstance(prompt_config["name"], str):
+                raise TypeError(f"'name' in prompt configuration {i} must be a string")
+
+            if not isinstance(prompt_config["prompt"], str):
+                raise TypeError(
+                    f"'prompt' in prompt configuration {i} must be a string"
+                )
+
+            if not isinstance(prompt_config["output_keys"], list):
+                raise TypeError(
+                    f"'output_keys' in prompt configuration {i} must be a list"
+                )
+
+            if not prompt_config["output_keys"]:
+                raise ValueError(
+                    f"'output_keys' list in prompt configuration {i} cannot be empty"
+                )
+
+            # Check if the prompt is a valid Jinja2 template
+            try:
+                Template(prompt_config["prompt"])
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in prompt configuration {i}: {str(e)}"
+                )
+
+            # Check if the model is specified (optional)
+            if "model" in prompt_config and not isinstance(prompt_config["model"], str):
+                raise TypeError(f"'model' in prompt configuration {i} must be a string")
+
+        # Check if all output schema keys are covered by the prompts
+        output_schema = self.config["output"]["schema"]
+        output_keys_covered = set()
+        for prompt_config in self.config["prompts"]:
+            output_keys_covered.update(prompt_config["output_keys"])
+
+        missing_keys = set(output_schema.keys()) - output_keys_covered
+        if missing_keys:
+            raise ValueError(
+                f"The following output schema keys are not covered by any prompt: {missing_keys}"
+            )
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the parallel map operation on the provided input data.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Processes each input item using multiple prompts in parallel
+        2. Aggregates results from different prompts for each input item
+        3. Validates the combined output for each item
+        4. Calculates total cost of the operation
+        """
+        results = {}
+        total_cost = 0
+        output_schema = self.config["output"]["schema"]
+
+        def process_prompt(item, prompt_config):
+            prompt_template = Template(prompt_config["prompt"])
+            prompt = prompt_template.render(input=item)
+            local_output_schema = {
+                key: output_schema[key] for key in prompt_config["output_keys"]
+            }
+
+            # If there are tools, we need to pass in the tools
+            response = call_llm(
+                prompt_config.get("model", self.default_model),
+                "parallel_map",
+                [{"role": "user", "content": prompt}],
+                local_output_schema,
+                tools=prompt_config.get("tools", None),
+                console=self.console,
+            )
+            output = parse_llm_response(
+                response, tools=prompt_config.get("tools", None)
+            )[0]
+            return output, completion_cost(response)
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            # Create all futures at once
+            all_futures = [
+                executor.submit(process_prompt, item, prompt_config)
+                for item in input_data
+                for prompt_config in self.config["prompts"]
+            ]
+
+            # Process results in order
+            pbar = RichLoopBar(
+                range(len(all_futures)),
+                desc="Processing parallel map items",
+                console=self.console,
+            )
+            for i in pbar:
+                future = all_futures[i]
+                output, cost = future.result()
+                total_cost += cost
+
+                # Determine which item this future corresponds to
+                item_index = i // len(self.config["prompts"])
+                prompt_index = i % len(self.config["prompts"])
+
+                # Initialize or update the item_result
+                if prompt_index == 0:
+                    item_result = input_data[item_index].copy()
+                    results[item_index] = item_result
+
+                # Fetch the item_result
+                item_result = results[item_index]
+
+                # Update the item_result with the output
+                item_result.update(output)
+
+                pbar.update(i)
+
+        # Return the results in order
+        return [results[i] for i in range(len(input_data)) if i in results], total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the parallel map operation on the provided input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

A tuple containing the processed results and the total cost of the operation.

+
+
+

This method performs the following steps: +1. Processes each input item using multiple prompts in parallel +2. Aggregates results from different prompts for each input item +3. Validates the combined output for each item +4. Calculates total cost of the operation

+ +
+ Source code in docetl/operations/map.py +
268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Executes the parallel map operation on the provided input data.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+
+    This method performs the following steps:
+    1. Processes each input item using multiple prompts in parallel
+    2. Aggregates results from different prompts for each input item
+    3. Validates the combined output for each item
+    4. Calculates total cost of the operation
+    """
+    results = {}
+    total_cost = 0
+    output_schema = self.config["output"]["schema"]
+
+    def process_prompt(item, prompt_config):
+        prompt_template = Template(prompt_config["prompt"])
+        prompt = prompt_template.render(input=item)
+        local_output_schema = {
+            key: output_schema[key] for key in prompt_config["output_keys"]
+        }
+
+        # If there are tools, we need to pass in the tools
+        response = call_llm(
+            prompt_config.get("model", self.default_model),
+            "parallel_map",
+            [{"role": "user", "content": prompt}],
+            local_output_schema,
+            tools=prompt_config.get("tools", None),
+            console=self.console,
+        )
+        output = parse_llm_response(
+            response, tools=prompt_config.get("tools", None)
+        )[0]
+        return output, completion_cost(response)
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        # Create all futures at once
+        all_futures = [
+            executor.submit(process_prompt, item, prompt_config)
+            for item in input_data
+            for prompt_config in self.config["prompts"]
+        ]
+
+        # Process results in order
+        pbar = RichLoopBar(
+            range(len(all_futures)),
+            desc="Processing parallel map items",
+            console=self.console,
+        )
+        for i in pbar:
+            future = all_futures[i]
+            output, cost = future.result()
+            total_cost += cost
+
+            # Determine which item this future corresponds to
+            item_index = i // len(self.config["prompts"])
+            prompt_index = i % len(self.config["prompts"])
+
+            # Initialize or update the item_result
+            if prompt_index == 0:
+                item_result = input_data[item_index].copy()
+                results[item_index] = item_result
+
+            # Fetch the item_result
+            item_result = results[item_index]
+
+            # Update the item_result with the output
+            item_result.update(output)
+
+            pbar.update(i)
+
+    # Return the results in order
+    return [results[i] for i in range(len(input_data)) if i in results], total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the ParallelMapOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the configuration structure is invalid.

+
+
+ TypeError + +
+

If the configuration values have incorrect types.

+
+
+ +
+ Source code in docetl/operations/map.py +
199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the ParallelMapOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or if the configuration structure is invalid.
+        TypeError: If the configuration values have incorrect types.
+    """
+    if "prompts" not in self.config or not isinstance(self.config["prompts"], list):
+        raise ValueError(
+            "ParallelMapOperation requires a 'prompts' list in the configuration"
+        )
+
+    if not self.config["prompts"]:
+        raise ValueError("The 'prompts' list cannot be empty")
+
+    for i, prompt_config in enumerate(self.config["prompts"]):
+        if not isinstance(prompt_config, dict):
+            raise TypeError(f"Prompt configuration {i} must be a dictionary")
+
+        required_keys = ["name", "prompt", "output_keys"]
+        for key in required_keys:
+            if key not in prompt_config:
+                raise ValueError(
+                    f"Missing required key '{key}' in prompt configuration {i}"
+                )
+
+        if not isinstance(prompt_config["name"], str):
+            raise TypeError(f"'name' in prompt configuration {i} must be a string")
+
+        if not isinstance(prompt_config["prompt"], str):
+            raise TypeError(
+                f"'prompt' in prompt configuration {i} must be a string"
+            )
+
+        if not isinstance(prompt_config["output_keys"], list):
+            raise TypeError(
+                f"'output_keys' in prompt configuration {i} must be a list"
+            )
+
+        if not prompt_config["output_keys"]:
+            raise ValueError(
+                f"'output_keys' list in prompt configuration {i} cannot be empty"
+            )
+
+        # Check if the prompt is a valid Jinja2 template
+        try:
+            Template(prompt_config["prompt"])
+        except Exception as e:
+            raise ValueError(
+                f"Invalid Jinja2 template in prompt configuration {i}: {str(e)}"
+            )
+
+        # Check if the model is specified (optional)
+        if "model" in prompt_config and not isinstance(prompt_config["model"], str):
+            raise TypeError(f"'model' in prompt configuration {i} must be a string")
+
+    # Check if all output schema keys are covered by the prompts
+    output_schema = self.config["output"]["schema"]
+    output_keys_covered = set()
+    for prompt_config in self.config["prompts"]:
+        output_keys_covered.update(prompt_config["output_keys"])
+
+    missing_keys = set(output_schema.keys()) - output_keys_covered
+    if missing_keys:
+        raise ValueError(
+            f"The following output schema keys are not covered by any prompt: {missing_keys}"
+        )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.filter.FilterOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +
+ Source code in docetl/operations/filter.py +
 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
class FilterOperation(BaseOperation):
+    """
+    Operation that asks an LLM for one boolean field per input item and keeps the items
+    for which that field is truthy (or every valid item when run with is_build=True).
+    """
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the FilterOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or if the output schema structure is invalid.
+            TypeError: If the schema in the output configuration is not a dictionary or if the schema value is not of type bool.
+
+        This method checks for the following:
+        - Presence of required keys: 'prompt' and 'output'
+        - Presence of 'schema' in the 'output' configuration
+        - The 'schema' is a non-empty dictionary with exactly one key-value pair,
+          not counting an optional '_short_explanation' key
+        - The value in the schema is the type name "bool" or "boolean"
+        """
+        required_keys = ["prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in FilterOperation configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        # '_short_explanation' may sit next to the filter key; ignore it when counting keys.
+        schema = self.config["output"]["schema"]
+        if "_short_explanation" in schema:
+            schema = {k: v for k, v in schema.items() if k != "_short_explanation"}
+        if len(schema) != 1:
+            raise ValueError(
+                "The 'schema' in 'output' configuration must have exactly one key-value pair that maps to a boolean value"
+            )
+
+        # Schema values are type names given as strings, so compare against the names.
+        key, value = next(iter(schema.items()))
+        if value not in ["bool", "boolean"]:
+            raise TypeError(
+                f"The value in the 'schema' must be of type bool, got {value}"
+            )
+
+    def execute(
+        self, input_data: List[Dict], is_build: bool = False
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the filter operation on the input data.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+            is_build (bool): Whether the operation is being executed in the build phase. Defaults to False.
+                When True, every valid result is returned regardless of its filter value.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the filtered list of dictionaries
+            and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Processes each input item using an LLM model
+        2. Validates the output
+        3. Filters the results based on the specified filter key
+        4. Calculates the total cost of the operation
+
+        The method uses multi-threading to process items in parallel, improving performance
+        for large datasets.
+
+        Usage:
+        ```python
+        from docetl.operations import FilterOperation
+
+        config = {
+            "prompt": "Determine if the following item is important: {{input}}",
+            "output": {
+                "schema": {"is_important": "bool"}
+            },
+            "model": "gpt-3.5-turbo"
+        }
+        filter_op = FilterOperation(config)
+        input_data = [
+            {"id": 1, "text": "Critical update"},
+            {"id": 2, "text": "Regular maintenance"}
+        ]
+        results, cost = filter_op.execute(input_data)
+        print(f"Filtered results: {results}")
+        print(f"Total cost: {cost}")
+        ```
+        """
+        # The filter key is the first schema key other than '_short_explanation'.
+        filter_key = next(
+            iter(
+                [
+                    k
+                    for k in self.config["output"]["schema"].keys()
+                    if k != "_short_explanation"
+                ]
+            )
+        )
+
+        def _process_filter_item(item: Dict) -> Tuple[Optional[Dict], float]:
+            # Returns (output, cost) for a valid result and (None, cost) otherwise.
+            prompt_template = Template(self.config["prompt"])
+            prompt = prompt_template.render(input=item)
+
+            def validation_fn(response: Dict[str, Any]):
+                output = parse_llm_response(response)[0]
+                # Carry over input fields that are not part of the output schema.
+                for key, value in item.items():
+                    if key not in self.config["output"]["schema"]:
+                        output[key] = value
+                if validate_output(self.config, output, self.console):
+                    return output, True
+                return output, False
+
+            output, cost, is_valid = call_llm_with_validation(
+                [{"role": "user", "content": prompt}],
+                llm_call_fn=lambda messages: call_llm(
+                    self.config.get("model", self.default_model),
+                    "filter",
+                    messages,
+                    self.config["output"]["schema"],
+                    console=self.console,
+                ),
+                validation_fn=validation_fn,
+                val_rule=self.config.get("validate", []),
+                num_retries=self.num_retries_on_validate_failure,
+                console=self.console,
+            )
+
+            if is_valid:
+                return output, cost
+
+            return None, cost
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(_process_filter_item, item) for item in input_data
+            ]
+            results = []
+            total_cost = 0
+            pbar = RichLoopBar(
+                range(len(futures)),
+                desc="Processing filter items",
+                console=self.console,
+            )
+            # Consume futures in submission order so results keep the input order.
+            for i in pbar:
+                future = futures[i]
+                result, item_cost = future.result()
+                # Cost is counted even when the item is dropped or failed validation.
+                total_cost += item_cost
+                if result is not None:
+                    if is_build:
+                        results.append(result)
+                    else:
+                        # A missing filter key is treated as False, i.e. the item is dropped.
+                        if result.get(filter_key, False):
+                            results.append(result)
+                pbar.update(1)
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data, is_build=False) + +

+ + +
+ +

Executes the filter operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

A list of dictionaries to process.

+
+
+ required +
is_build + bool + +
+

Whether the operation is being executed in the build phase. Defaults to False.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict] + +
+

The filtered list of dictionaries.

+
+
+ float + +
+

The total cost of the operation.

+
+
+

This method performs the following steps: +1. Processes each input item using an LLM model +2. Validates the output +3. Filters the results based on the specified filter key +4. Calculates the total cost of the operation

+

The method uses multi-threading to process items in parallel, improving performance +for large datasets.

+

Usage: +

from docetl.operations import FilterOperation
+
+config = {
+    "prompt": "Determine if the following item is important: {{input}}",
+    "output": {
+        "schema": {"is_important": "bool"}
+    },
+    "model": "gpt-3.5-turbo"
+}
+filter_op = FilterOperation(config)
+input_data = [
+    {"id": 1, "text": "Critical update"},
+    {"id": 2, "text": "Regular maintenance"}
+]
+results, cost = filter_op.execute(input_data)
+print(f"Filtered results: {results}")
+print(f"Total cost: {cost}")
+

+ +
+ Source code in docetl/operations/filter.py +
 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
def execute(
+    self, input_data: List[Dict], is_build: bool = False
+) -> Tuple[List[Dict], float]:
+    """
+    Executes the filter operation on the input data.
+
+    Args:
+        input_data (List[Dict]): A list of dictionaries to process.
+        is_build (bool): Whether the operation is being executed in the build phase. Defaults to False.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the filtered list of dictionaries
+        and the total cost of the operation.
+
+    This method performs the following steps:
+    1. Processes each input item using an LLM model
+    2. Validates the output
+    3. Filters the results based on the specified filter key
+    4. Calculates the total cost of the operation
+
+    The method uses multi-threading to process items in parallel, improving performance
+    for large datasets.
+
+    Usage:
+    ```python
+    from docetl.operations import FilterOperation
+
+    config = {
+        "prompt": "Determine if the following item is important: {{input}}",
+        "output": {
+            "schema": {"is_important": "bool"}
+        },
+        "model": "gpt-3.5-turbo"
+    }
+    filter_op = FilterOperation(config)
+    input_data = [
+        {"id": 1, "text": "Critical update"},
+        {"id": 2, "text": "Regular maintenance"}
+    ]
+    results, cost = filter_op.execute(input_data)
+    print(f"Filtered results: {results}")
+    print(f"Total cost: {cost}")
+    ```
+    """
+    filter_key = next(
+        iter(
+            [
+                k
+                for k in self.config["output"]["schema"].keys()
+                if k != "_short_explanation"
+            ]
+        )
+    )
+
+    def _process_filter_item(item: Dict) -> Tuple[Optional[Dict], float]:
+        prompt_template = Template(self.config["prompt"])
+        prompt = prompt_template.render(input=item)
+
+        def validation_fn(response: Dict[str, Any]):
+            output = parse_llm_response(response)[0]
+            for key, value in item.items():
+                if key not in self.config["output"]["schema"]:
+                    output[key] = value
+            if validate_output(self.config, output, self.console):
+                return output, True
+            return output, False
+
+        output, cost, is_valid = call_llm_with_validation(
+            [{"role": "user", "content": prompt}],
+            llm_call_fn=lambda messages: call_llm(
+                self.config.get("model", self.default_model),
+                "filter",
+                messages,
+                self.config["output"]["schema"],
+                console=self.console,
+            ),
+            validation_fn=validation_fn,
+            val_rule=self.config.get("validate", []),
+            num_retries=self.num_retries_on_validate_failure,
+            console=self.console,
+        )
+
+        if is_valid:
+            return output, cost
+
+        return None, cost
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        futures = [
+            executor.submit(_process_filter_item, item) for item in input_data
+        ]
+        results = []
+        total_cost = 0
+        pbar = RichLoopBar(
+            range(len(futures)),
+            desc="Processing filter items",
+            console=self.console,
+        )
+        for i in pbar:
+            future = futures[i]
+            result, item_cost = future.result()
+            total_cost += item_cost
+            if result is not None:
+                if is_build:
+                    results.append(result)
+                else:
+                    if result.get(filter_key, False):
+                        results.append(result)
+            pbar.update(1)
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the FilterOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the output schema structure is invalid.

+
+
+ TypeError + +
+

If the schema in the output configuration is not a dictionary or if the schema value is not of type bool.

+
+
+

This method checks for the following: +- Presence of required keys: 'prompt' and 'output' +- Presence of 'schema' in the 'output' configuration +- The 'schema' is a non-empty dictionary with exactly one key-value pair (an optional '_short_explanation' key is ignored) +- The value in the schema is the string 'bool' or 'boolean'

+ +
+ Source code in docetl/operations/filter.py +
19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the FilterOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or if the output schema structure is invalid.
+        TypeError: If the schema in the output configuration is not a dictionary or if the schema value is not of type bool.
+
+    This method checks for the following:
+    - Presence of required keys: 'prompt' and 'output'
+    - Presence of 'schema' in the 'output' configuration
+    - The 'schema' is a non-empty dictionary with exactly one key-value pair
+    - The value in the schema is of type bool
+    """
+    required_keys = ["prompt", "output"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in FilterOperation configuration"
+            )
+
+    if "schema" not in self.config["output"]:
+        raise ValueError("Missing 'schema' in 'output' configuration")
+
+    if not isinstance(self.config["output"]["schema"], dict):
+        raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+    if not self.config["output"]["schema"]:
+        raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+    schema = self.config["output"]["schema"]
+    if "_short_explanation" in schema:
+        schema = {k: v for k, v in schema.items() if k != "_short_explanation"}
+    if len(schema) != 1:
+        raise ValueError(
+            "The 'schema' in 'output' configuration must have exactly one key-value pair that maps to a boolean value"
+        )
+
+    key, value = next(iter(schema.items()))
+    if value not in ["bool", "boolean"]:
+        raise TypeError(
+            f"The value in the 'schema' must be of type bool, got {value}"
+        )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.equijoin.EquijoinOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +
+ Source code in docetl/operations/equijoin.py +
 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
class EquijoinOperation(BaseOperation):
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the EquijoinOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or if the blocking_keys structure is invalid.
+            Specifically:
+            - Raises if 'comparison_prompt' is missing from the config.
+            - Raises if 'left' or 'right' are missing from the 'blocking_keys' structure (if present).
+            - Raises if 'left' or 'right' are missing from the 'limits' structure (if present).
+        """
+        if "comparison_prompt" not in self.config:
+            raise ValueError(
+                "Missing required key 'comparison_prompt' in EquijoinOperation configuration"
+            )
+
+        if "blocking_keys" in self.config:
+            if (
+                "left" not in self.config["blocking_keys"]
+                or "right" not in self.config["blocking_keys"]
+            ):
+                raise ValueError(
+                    "Both 'left' and 'right' must be specified in 'blocking_keys'"
+                )
+
+        if "limits" in self.config:
+            if (
+                "left" not in self.config["limits"]
+                or "right" not in self.config["limits"]
+            ):
+                raise ValueError(
+                    "Both 'left' and 'right' must be specified in 'limits'"
+                )
+
+        if "limit_comparisons" in self.config:
+            if not isinstance(self.config["limit_comparisons"], int):
+                raise ValueError("limit_comparisons must be an integer")
+
+    def execute(
+        self, left_data: List[Dict], right_data: List[Dict]
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the equijoin operation on the provided datasets.
+
+        Args:
+            left_data (List[Dict]): The left dataset to join.
+            right_data (List[Dict]): The right dataset to join.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the joined results and the total cost of the operation.
+
+        Usage:
+        ```python
+        from docetl.operations import EquijoinOperation
+
+        config = {
+            "blocking_keys": {
+                "left": ["id"],
+                "right": ["user_id"]
+            },
+            "limits": {
+                "left": 1,
+                "right": 1
+            },
+            "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
+            "blocking_threshold": 0.8,
+            "blocking_conditions": ["left['id'] == right['user_id']"],
+            "limit_comparisons": 1000
+        }
+        equijoin_op = EquijoinOperation(config)
+        left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+        right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
+        results, cost = equijoin_op.execute(left_data, right_data)
+        print(f"Joined results: {results}")
+        print(f"Total cost: {cost}")
+        ```
+
+        This method performs the following steps:
+        1. Initial blocking based on specified conditions (if any)
+        2. Embedding-based blocking (if threshold is provided)
+        3. LLM-based comparison for blocked pairs
+        4. Result aggregation and validation
+
+        The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.
+        """
+
+        blocking_keys = self.config.get("blocking_keys", {})
+        left_keys = blocking_keys.get(
+            "left", list(left_data[0].keys()) if left_data else []
+        )
+        right_keys = blocking_keys.get(
+            "right", list(right_data[0].keys()) if right_data else []
+        )
+        limits = self.config.get(
+            "limits", {"left": float("inf"), "right": float("inf")}
+        )
+        left_limit = limits["left"]
+        right_limit = limits["right"]
+        blocking_threshold = self.config.get("blocking_threshold")
+        blocking_conditions = self.config.get("blocking_conditions", [])
+        limit_comparisons = self.config.get("limit_comparisons")
+        total_cost = 0
+
+        # LLM-based comparison for blocked pairs
+        def get_hashable_key(item: Dict) -> str:
+            return json.dumps(item, sort_keys=True)
+
+        if len(left_data) == 0 or len(right_data) == 0:
+            return [], 0
+
+        # Initial blocking using multiprocessing
+        num_processes = min(cpu_count(), len(left_data))
+
+        self.console.log(
+            f"Starting to run code-based blocking rules for {len(left_data)} left and {len(right_data)} right rows ({len(left_data) * len(right_data)} total pairs) with {num_processes} processes..."
+        )
+
+        with Pool(
+            processes=num_processes,
+            initializer=init_worker,
+            initargs=(right_data, blocking_conditions),
+        ) as pool:
+            blocked_pairs_nested = pool.map(process_left_item, left_data)
+
+        # Flatten the nested list of blocked pairs
+        blocked_pairs = [pair for sublist in blocked_pairs_nested for pair in sublist]
+
+        # Check if we have exceeded the pairwise comparison limit
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            # Sample pairs randomly
+            sampled_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+            # Calculate number of dropped pairs
+            dropped_pairs = len(blocked_pairs) - limit_comparisons
+
+            # Prompt the user for confirmation
+            if self.status:
+                self.status.stop()
+            if not Confirm.ask(
+                f"[yellow]Warning: {dropped_pairs} pairs will be dropped due to the comparison limit. "
+                f"Proceeding with {limit_comparisons} randomly sampled pairs. "
+                f"Do you want to continue?[/yellow]",
+            ):
+                raise ValueError("Operation cancelled by user due to pair limit.")
+
+            if self.status:
+                self.status.start()
+
+            blocked_pairs = sampled_pairs
+
+        self.console.log(
+            f"Number of blocked pairs after initial blocking: {len(blocked_pairs)}"
+        )
+
+        if blocking_threshold is not None:
+            embedding_model = self.config.get("embedding_model", self.default_model)
+            model_input_context_length = model_cost.get(embedding_model, {}).get(
+                "max_input_tokens", 8192
+            )
+
+            def get_embeddings(
+                input_data: List[Dict[str, Any]], keys: List[str], name: str
+            ) -> Tuple[List[List[float]], float]:
+                texts = [
+                    " ".join(str(item[key]) for key in keys if key in item)[
+                        : model_input_context_length * 4
+                    ]
+                    for item in input_data
+                ]
+
+                embeddings = []
+                total_cost = 0
+                batch_size = 2000
+                for i in range(0, len(texts), batch_size):
+                    batch = texts[i : i + batch_size]
+                    self.console.log(
+                        f"On iteration {i} for creating embeddings for {name} data"
+                    )
+                    response = gen_embedding(
+                        model=embedding_model,
+                        input=batch,
+                    )
+                    embeddings.extend([data["embedding"] for data in response["data"]])
+                    total_cost += completion_cost(response)
+                return embeddings, total_cost
+
+            left_embeddings, left_cost = get_embeddings(left_data, left_keys, "left")
+            right_embeddings, right_cost = get_embeddings(
+                right_data, right_keys, "right"
+            )
+            total_cost += left_cost + right_cost
+            self.console.log(
+                f"Created embeddings for datasets. Total embedding creation cost: {total_cost}"
+            )
+
+            # Compute all cosine similarities in one call
+            similarities = cosine_similarity(left_embeddings, right_embeddings)
+
+            # Additional blocking based on embeddings
+            # Find indices where similarity is above threshold
+            above_threshold = np.argwhere(similarities >= blocking_threshold)
+            self.console.log(
+                f"There are {above_threshold.shape[0]} pairs above the threshold."
+            )
+            block_pair_set = set(
+                (get_hashable_key(left_item), get_hashable_key(right_item))
+                for left_item, right_item in blocked_pairs
+            )
+
+            # If limit_comparisons is set, take only the top pairs
+            if limit_comparisons is not None:
+                # First, get all pairs above threshold
+                above_threshold_pairs = [(int(i), int(j)) for i, j in above_threshold]
+
+                # Sort these pairs by their similarity scores
+                sorted_pairs = sorted(
+                    above_threshold_pairs,
+                    key=lambda pair: similarities[pair[0], pair[1]],
+                    reverse=True,
+                )
+
+                # Take the top 'limit_comparisons' pairs
+                top_pairs = sorted_pairs[:limit_comparisons]
+
+                # Create new blocked_pairs based on top similarities and existing blocked pairs
+                new_blocked_pairs = []
+                remaining_limit = limit_comparisons - len(blocked_pairs)
+
+                # First, include all existing blocked pairs
+                final_blocked_pairs = blocked_pairs.copy()
+
+                # Then, add new pairs from top similarities until we reach the limit
+                for i, j in top_pairs:
+                    if remaining_limit <= 0:
+                        break
+                    left_item, right_item = left_data[i], right_data[j]
+                    left_key = get_hashable_key(left_item)
+                    right_key = get_hashable_key(right_item)
+                    if (left_key, right_key) not in block_pair_set:
+                        new_blocked_pairs.append((left_item, right_item))
+                        block_pair_set.add((left_key, right_key))
+                        remaining_limit -= 1
+
+                final_blocked_pairs.extend(new_blocked_pairs)
+                blocked_pairs = final_blocked_pairs
+
+                self.console.log(
+                    f"Limited comparisons to top {limit_comparisons} pairs, including {len(blocked_pairs) - len(new_blocked_pairs)} from code-based blocking and {len(new_blocked_pairs)} based on cosine similarity. Lowest cosine similarity included: {similarities[top_pairs[-1]]:.4f}"
+                )
+            else:
+                # Add new pairs to blocked_pairs
+                for i, j in above_threshold:
+                    left_item, right_item = left_data[i], right_data[j]
+                    left_key = get_hashable_key(left_item)
+                    right_key = get_hashable_key(right_item)
+                    if (left_key, right_key) not in block_pair_set:
+                        blocked_pairs.append((left_item, right_item))
+                        block_pair_set.add((left_key, right_key))
+
+        # If there are no blocking conditions or embedding threshold, use all pairs
+        if not blocking_conditions and blocking_threshold is None:
+            blocked_pairs = [
+                (left_item, right_item)
+                for left_item in left_data
+                for right_item in right_data
+            ]
+
+        # If there's a limit on the number of comparisons, randomly sample pairs
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            self.console.log(
+                f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+            )
+            blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+        self.console.log(
+            f"Total pairs to compare after blocking and sampling: {len(blocked_pairs)}"
+        )
+
+        # Calculate and print statistics
+        total_possible_comparisons = len(left_data) * len(right_data)
+        comparisons_made = len(blocked_pairs)
+        comparisons_saved = total_possible_comparisons - comparisons_made
+        self.console.log(
+            f"[green]Comparisons saved by blocking: {comparisons_saved} "
+            f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+        )
+
+        left_match_counts = defaultdict(int)
+        right_match_counts = defaultdict(int)
+        results = []
+        comparison_costs = 0
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            future_to_pair = {
+                executor.submit(
+                    compare_pair,
+                    self.config["comparison_prompt"],
+                    self.config.get("comparison_model", self.default_model),
+                    left,
+                    right,
+                ): (left, right)
+                for left, right in blocked_pairs
+            }
+
+            for future in rich_as_completed(
+                future_to_pair,
+                total=len(future_to_pair),
+                desc="Comparing pairs",
+                console=self.console,
+            ):
+                pair = future_to_pair[future]
+                is_match, cost = future.result()
+                comparison_costs += cost
+
+                if is_match:
+                    joined_item = {}
+                    left_item, right_item = pair
+                    left_key_hash = get_hashable_key(left_item)
+                    right_key_hash = get_hashable_key(right_item)
+                    if (
+                        left_match_counts[left_key_hash] >= left_limit
+                        or right_match_counts[right_key_hash] >= right_limit
+                    ):
+                        continue
+
+                    for key, value in left_item.items():
+                        joined_item[f"{key}_left" if key in right_item else key] = value
+                    for key, value in right_item.items():
+                        joined_item[f"{key}_right" if key in left_item else key] = value
+                    if validate_output(self.config, joined_item, self.console):
+                        results.append(joined_item)
+                        left_match_counts[left_key_hash] += 1
+                        right_match_counts[right_key_hash] += 1
+
+                    # TODO: support retry in validation failure
+
+        total_cost += comparison_costs
+
+        # Calculate and print the join selectivity
+        join_selectivity = (
+            len(results) / (len(left_data) * len(right_data))
+            if len(left_data) * len(right_data) > 0
+            else 0
+        )
+        self.console.log(f"Equijoin selectivity: {join_selectivity:.4f}")
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(left_data, right_data) + +

+ + +
+ +

Executes the equijoin operation on the provided datasets.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
left_data + List[Dict] + +
+

The left dataset to join.

+
+
+ required +
right_data + List[Dict] + +
+

The right dataset to join.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

A tuple containing the joined results and the total cost of the operation.

+
+
+

Usage: +

from docetl.operations import EquijoinOperation
+
+config = {
+    "blocking_keys": {
+        "left": ["id"],
+        "right": ["user_id"]
+    },
+    "limits": {
+        "left": 1,
+        "right": 1
+    },
+    "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
+    "blocking_threshold": 0.8,
+    "blocking_conditions": ["left['id'] == right['user_id']"],
+    "limit_comparisons": 1000
+}
+equijoin_op = EquijoinOperation(config)
+left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
+results, cost = equijoin_op.execute(left_data, right_data)
+print(f"Joined results: {results}")
+print(f"Total cost: {cost}")
+

+

This method performs the following steps: +1. Initial blocking based on specified conditions (if any) +2. Embedding-based blocking (if threshold is provided) +3. LLM-based comparison for blocked pairs +4. Result aggregation and validation

+

The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.

+ +
+ Source code in docetl/operations/equijoin.py +
124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
def execute(
+    self, left_data: List[Dict], right_data: List[Dict]
+) -> Tuple[List[Dict], float]:
+    """
+    Executes the equijoin operation on the provided datasets.
+
+    Args:
+        left_data (List[Dict]): The left dataset to join.
+        right_data (List[Dict]): The right dataset to join.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the joined results and the total cost of the operation.
+
+    Usage:
+    ```python
+    from docetl.operations import EquijoinOperation
+
+    config = {
+        "blocking_keys": {
+            "left": ["id"],
+            "right": ["user_id"]
+        },
+        "limits": {
+            "left": 1,
+            "right": 1
+        },
+        "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
+        "blocking_threshold": 0.8,
+        "blocking_conditions": ["left['id'] == right['user_id']"],
+        "limit_comparisons": 1000
+    }
+    equijoin_op = EquijoinOperation(config)
+    left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
+    results, cost = equijoin_op.execute(left_data, right_data)
+    print(f"Joined results: {results}")
+    print(f"Total cost: {cost}")
+    ```
+
+    This method performs the following steps:
+    1. Initial blocking based on specified conditions (if any)
+    2. Embedding-based blocking (if threshold is provided)
+    3. LLM-based comparison for blocked pairs
+    4. Result aggregation and validation
+
+    The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.
+    """
+
+    blocking_keys = self.config.get("blocking_keys", {})
+    left_keys = blocking_keys.get(
+        "left", list(left_data[0].keys()) if left_data else []
+    )
+    right_keys = blocking_keys.get(
+        "right", list(right_data[0].keys()) if right_data else []
+    )
+    limits = self.config.get(
+        "limits", {"left": float("inf"), "right": float("inf")}
+    )
+    left_limit = limits["left"]
+    right_limit = limits["right"]
+    blocking_threshold = self.config.get("blocking_threshold")
+    blocking_conditions = self.config.get("blocking_conditions", [])
+    limit_comparisons = self.config.get("limit_comparisons")
+    total_cost = 0
+
+    # Helper: serialize an item to a stable JSON string so it can be used as a set/dict key
+    def get_hashable_key(item: Dict) -> str:
+        return json.dumps(item, sort_keys=True)
+
+    if len(left_data) == 0 or len(right_data) == 0:
+        return [], 0
+
+    # Initial blocking using multiprocessing
+    num_processes = min(cpu_count(), len(left_data))
+
+    self.console.log(
+        f"Starting to run code-based blocking rules for {len(left_data)} left and {len(right_data)} right rows ({len(left_data) * len(right_data)} total pairs) with {num_processes} processes..."
+    )
+
+    with Pool(
+        processes=num_processes,
+        initializer=init_worker,
+        initargs=(right_data, blocking_conditions),
+    ) as pool:
+        blocked_pairs_nested = pool.map(process_left_item, left_data)
+
+    # Flatten the nested list of blocked pairs
+    blocked_pairs = [pair for sublist in blocked_pairs_nested for pair in sublist]
+
+    # Check if we have exceeded the pairwise comparison limit
+    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+        # Sample pairs randomly
+        sampled_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+        # Calculate number of dropped pairs
+        dropped_pairs = len(blocked_pairs) - limit_comparisons
+
+        # Prompt the user for confirmation
+        if self.status:
+            self.status.stop()
+        if not Confirm.ask(
+            f"[yellow]Warning: {dropped_pairs} pairs will be dropped due to the comparison limit. "
+            f"Proceeding with {limit_comparisons} randomly sampled pairs. "
+            f"Do you want to continue?[/yellow]",
+        ):
+            raise ValueError("Operation cancelled by user due to pair limit.")
+
+        if self.status:
+            self.status.start()
+
+        blocked_pairs = sampled_pairs
+
+    self.console.log(
+        f"Number of blocked pairs after initial blocking: {len(blocked_pairs)}"
+    )
+
+    if blocking_threshold is not None:
+        embedding_model = self.config.get("embedding_model", self.default_model)
+        model_input_context_length = model_cost.get(embedding_model, {}).get(
+            "max_input_tokens", 8192
+        )
+
+        def get_embeddings(
+            input_data: List[Dict[str, Any]], keys: List[str], name: str
+        ) -> Tuple[List[List[float]], float]:
+            texts = [
+                " ".join(str(item[key]) for key in keys if key in item)[
+                    : model_input_context_length * 4
+                ]
+                for item in input_data
+            ]
+
+            embeddings = []
+            total_cost = 0
+            batch_size = 2000
+            for i in range(0, len(texts), batch_size):
+                batch = texts[i : i + batch_size]
+                self.console.log(
+                    f"On iteration {i} for creating embeddings for {name} data"
+                )
+                response = gen_embedding(
+                    model=embedding_model,
+                    input=batch,
+                )
+                embeddings.extend([data["embedding"] for data in response["data"]])
+                total_cost += completion_cost(response)
+            return embeddings, total_cost
+
+        left_embeddings, left_cost = get_embeddings(left_data, left_keys, "left")
+        right_embeddings, right_cost = get_embeddings(
+            right_data, right_keys, "right"
+        )
+        total_cost += left_cost + right_cost
+        self.console.log(
+            f"Created embeddings for datasets. Total embedding creation cost: {total_cost}"
+        )
+
+        # Compute all cosine similarities in one call
+        similarities = cosine_similarity(left_embeddings, right_embeddings)
+
+        # Additional blocking based on embeddings
+        # Find indices where similarity is above threshold
+        above_threshold = np.argwhere(similarities >= blocking_threshold)
+        self.console.log(
+            f"There are {above_threshold.shape[0]} pairs above the threshold."
+        )
+        block_pair_set = set(
+            (get_hashable_key(left_item), get_hashable_key(right_item))
+            for left_item, right_item in blocked_pairs
+        )
+
+        # If limit_comparisons is set, take only the top pairs
+        if limit_comparisons is not None:
+            # First, get all pairs above threshold
+            above_threshold_pairs = [(int(i), int(j)) for i, j in above_threshold]
+
+            # Sort these pairs by their similarity scores
+            sorted_pairs = sorted(
+                above_threshold_pairs,
+                key=lambda pair: similarities[pair[0], pair[1]],
+                reverse=True,
+            )
+
+            # Take the top 'limit_comparisons' pairs
+            top_pairs = sorted_pairs[:limit_comparisons]
+
+            # Create new blocked_pairs based on top similarities and existing blocked pairs
+            new_blocked_pairs = []
+            remaining_limit = limit_comparisons - len(blocked_pairs)
+
+            # First, include all existing blocked pairs
+            final_blocked_pairs = blocked_pairs.copy()
+
+            # Then, add new pairs from top similarities until we reach the limit
+            for i, j in top_pairs:
+                if remaining_limit <= 0:
+                    break
+                left_item, right_item = left_data[i], right_data[j]
+                left_key = get_hashable_key(left_item)
+                right_key = get_hashable_key(right_item)
+                if (left_key, right_key) not in block_pair_set:
+                    new_blocked_pairs.append((left_item, right_item))
+                    block_pair_set.add((left_key, right_key))
+                    remaining_limit -= 1
+
+            final_blocked_pairs.extend(new_blocked_pairs)
+            blocked_pairs = final_blocked_pairs
+
+            self.console.log(
+                f"Limited comparisons to top {limit_comparisons} pairs, including {len(blocked_pairs) - len(new_blocked_pairs)} from code-based blocking and {len(new_blocked_pairs)} based on cosine similarity. Lowest cosine similarity included: {similarities[top_pairs[-1]]:.4f}"
+            )
+        else:
+            # Add new pairs to blocked_pairs
+            for i, j in above_threshold:
+                left_item, right_item = left_data[i], right_data[j]
+                left_key = get_hashable_key(left_item)
+                right_key = get_hashable_key(right_item)
+                if (left_key, right_key) not in block_pair_set:
+                    blocked_pairs.append((left_item, right_item))
+                    block_pair_set.add((left_key, right_key))
+
+    # If there are no blocking conditions or embedding threshold, use all pairs
+    if not blocking_conditions and blocking_threshold is None:
+        blocked_pairs = [
+            (left_item, right_item)
+            for left_item in left_data
+            for right_item in right_data
+        ]
+
+    # If there's a limit on the number of comparisons, randomly sample pairs
+    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+        self.console.log(
+            f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+        )
+        blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+    self.console.log(
+        f"Total pairs to compare after blocking and sampling: {len(blocked_pairs)}"
+    )
+
+    # Calculate and print statistics
+    total_possible_comparisons = len(left_data) * len(right_data)
+    comparisons_made = len(blocked_pairs)
+    comparisons_saved = total_possible_comparisons - comparisons_made
+    self.console.log(
+        f"[green]Comparisons saved by blocking: {comparisons_saved} "
+        f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+    )
+
+    left_match_counts = defaultdict(int)
+    right_match_counts = defaultdict(int)
+    results = []
+    comparison_costs = 0
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        future_to_pair = {
+            executor.submit(
+                compare_pair,
+                self.config["comparison_prompt"],
+                self.config.get("comparison_model", self.default_model),
+                left,
+                right,
+            ): (left, right)
+            for left, right in blocked_pairs
+        }
+
+        for future in rich_as_completed(
+            future_to_pair,
+            total=len(future_to_pair),
+            desc="Comparing pairs",
+            console=self.console,
+        ):
+            pair = future_to_pair[future]
+            is_match, cost = future.result()
+            comparison_costs += cost
+
+            if is_match:
+                joined_item = {}
+                left_item, right_item = pair
+                left_key_hash = get_hashable_key(left_item)
+                right_key_hash = get_hashable_key(right_item)
+                if (
+                    left_match_counts[left_key_hash] >= left_limit
+                    or right_match_counts[right_key_hash] >= right_limit
+                ):
+                    continue
+
+                for key, value in left_item.items():
+                    joined_item[f"{key}_left" if key in right_item else key] = value
+                for key, value in right_item.items():
+                    joined_item[f"{key}_right" if key in left_item else key] = value
+                if validate_output(self.config, joined_item, self.console):
+                    results.append(joined_item)
+                    left_match_counts[left_key_hash] += 1
+                    right_match_counts[right_key_hash] += 1
+
+                # TODO: support retry in validation failure
+
+    total_cost += comparison_costs
+
+    # Calculate and print the join selectivity
+    join_selectivity = (
+        len(results) / (len(left_data) * len(right_data))
+        if len(left_data) * len(right_data) > 0
+        else 0
+    )
+    self.console.log(f"Equijoin selectivity: {join_selectivity:.4f}")
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the EquijoinOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the blocking_keys structure is invalid.

+
+
+ Specifically + +
+ +
+
+ +
+ Source code in docetl/operations/equijoin.py +
 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the EquijoinOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or if the blocking_keys structure is invalid.
+        Specifically:
+        - Raises if 'comparison_prompt' is missing from the config.
+        - Raises if 'left' or 'right' are missing from the 'blocking_keys' structure (if present).
+        - Raises if 'left' or 'right' are missing from the 'limits' structure (if present).
+    """
+    if "comparison_prompt" not in self.config:
+        raise ValueError(
+            "Missing required key 'comparison_prompt' in EquijoinOperation configuration"
+        )
+
+    if "blocking_keys" in self.config:
+        if (
+            "left" not in self.config["blocking_keys"]
+            or "right" not in self.config["blocking_keys"]
+        ):
+            raise ValueError(
+                "Both 'left' and 'right' must be specified in 'blocking_keys'"
+            )
+
+    if "limits" in self.config:
+        if (
+            "left" not in self.config["limits"]
+            or "right" not in self.config["limits"]
+        ):
+            raise ValueError(
+                "Both 'left' and 'right' must be specified in 'limits'"
+            )
+
+    if "limit_comparisons" in self.config:
+        if not isinstance(self.config["limit_comparisons"], int):
+            raise ValueError("limit_comparisons must be an integer")
+
+
+
+ +
+ + + +
+ +
+ +

Auxiliary Operators

+ + +
+ + + +

+ docetl.operations.split.SplitOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a split operation on input data, dividing it into manageable chunks.

+

This class extends BaseOperation to: +1. Split input data into chunks of specified size based on the 'split_key' and 'token_count' configuration. +2. Assign unique identifiers to each original document and number chunks sequentially. +3. Return results containing: + - {split_key}_chunk: The content of the split chunk. + - {name}_id: A unique identifier for each original document. + - {name}_chunk_num: The sequential number of the chunk within its original document.

+ +
+ Source code in docetl/operations/split.py +
  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
class SplitOperation(BaseOperation):
+    """
+    A class that implements a split operation on input data, dividing it into manageable chunks.
+
+    This class extends BaseOperation to:
+    1. Split input data into chunks of specified size based on the 'split_key' and 'token_count' configuration.
+    2. Assign unique identifiers to each original document and number chunks sequentially.
+    3. Return results containing:
+       - {split_key}_chunk: The content of the split chunk.
+       - {name}_id: A unique identifier for each original document.
+       - {name}_chunk_num: The sequential number of the chunk within its original document.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.name = self.config["name"]
+
+    def syntax_check(self) -> None:
+        required_keys = ["split_key", "method", "method_kwargs"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in SplitOperation configuration"
+                )
+
+        if not isinstance(self.config["split_key"], str):
+            raise TypeError("'split_key' must be a string")
+
+        if self.config["method"] not in ["token_count", "delimiter"]:
+            raise ValueError(f"Invalid method '{self.config['method']}'")
+
+        if self.config["method"] == "token_count":
+            if (
+                not isinstance(self.config["method_kwargs"]["token_count"], int)
+                or self.config["method_kwargs"]["token_count"] <= 0
+            ):
+                raise ValueError("'token_count' must be a positive integer")
+        elif self.config["method"] == "delimiter":
+            if not isinstance(self.config["method_kwargs"]["delimiter"], str):
+                raise ValueError("'delimiter' must be a string")
+
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError("'model' in configuration must be a string")
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        split_key = self.config["split_key"]
+        method = self.config["method"]
+        method_kwargs = self.config["method_kwargs"]
+        encoder = tiktoken.encoding_for_model(
+            self.config.get("model", self.default_model)
+        )
+        results = []
+        cost = 0.0
+
+        for item in input_data:
+            if split_key not in item:
+                raise KeyError(f"Split key '{split_key}' not found in item")
+
+            content = item[split_key]
+            doc_id = str(uuid.uuid4())
+
+            if method == "token_count":
+                token_count = method_kwargs["token_count"]
+                tokens = encoder.encode(content)
+
+                for chunk_num, i in enumerate(
+                    range(0, len(tokens), token_count), start=1
+                ):
+                    chunk_tokens = tokens[i : i + token_count]
+                    chunk = encoder.decode(chunk_tokens)
+
+                    result = item.copy()
+                    result.update(
+                        {
+                            f"{split_key}_chunk": chunk,
+                            f"{self.name}_id": doc_id,
+                            f"{self.name}_chunk_num": chunk_num,
+                        }
+                    )
+                    results.append(result)
+
+            elif method == "delimiter":
+                delimiter = method_kwargs["delimiter"]
+                num_splits_to_group = method_kwargs.get("num_splits_to_group", 1)
+                chunks = content.split(delimiter)
+
+                # Get rid of empty chunks
+                chunks = [chunk for chunk in chunks if chunk.strip()]
+
+                for chunk_num, i in enumerate(
+                    range(0, len(chunks), num_splits_to_group), start=1
+                ):
+                    grouped_chunks = chunks[i : i + num_splits_to_group]
+                    joined_chunk = delimiter.join(grouped_chunks).strip()
+
+                    result = item.copy()
+                    result.update(
+                        {
+                            f"{split_key}_chunk": joined_chunk,
+                            f"{self.name}_id": doc_id,
+                            f"{self.name}_chunk_num": chunk_num,
+                        }
+                    )
+                    results.append(result)
+
+        return results, cost
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.gather.GatherOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a gather operation on input data, adding contextual information from surrounding chunks.

+

This class extends BaseOperation to: +1. Group chunks by their document ID. +2. Order chunks within each group. +3. Add peripheral context to each chunk based on the configuration. +4. Include headers for each chunk and its upward hierarchy. +5. Return results containing the rendered chunks with added context, including information about skipped characters and headers.

+ +
+ Source code in docetl/operations/gather.py +
  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
class GatherOperation(BaseOperation):
+    """
+    A class that implements a gather operation on input data, adding contextual information from surrounding chunks.
+
+    This class extends BaseOperation to:
+    1. Group chunks by their document ID.
+    2. Order chunks within each group.
+    3. Add peripheral context to each chunk based on the configuration.
+    4. Include headers for each chunk and its upward hierarchy.
+    5. Return results containing the rendered chunks with added context, including information about skipped characters and headers.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Initialize the GatherOperation.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
+        super().__init__(*args, **kwargs)
+
+    def syntax_check(self) -> None:
+        """
+        Perform a syntax check on the operation configuration.
+
+        Raises:
+            ValueError: If required keys are missing or if there are configuration errors.
+            TypeError: If main_chunk_start or main_chunk_end are not strings.
+        """
+        required_keys = ["content_key", "doc_id_key", "order_key"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in GatherOperation configuration"
+                )
+
+        if "peripheral_chunks" not in self.config:
+            raise ValueError(
+                "Missing 'peripheral_chunks' configuration in GatherOperation"
+            )
+
+        peripheral_config = self.config["peripheral_chunks"]
+        for direction in ["previous", "next"]:
+            if direction not in peripheral_config:
+                continue
+            for section in ["head", "middle", "tail"]:
+                if section in peripheral_config[direction]:
+                    section_config = peripheral_config[direction][section]
+                    if section != "middle" and "count" not in section_config:
+                        raise ValueError(
+                            f"Missing 'count' in {direction}.{section} configuration"
+                        )
+
+        if "main_chunk_start" in self.config and not isinstance(
+            self.config["main_chunk_start"], str
+        ):
+            raise TypeError("'main_chunk_start' must be a string")
+        if "main_chunk_end" in self.config and not isinstance(
+            self.config["main_chunk_end"], str
+        ):
+            raise TypeError("'main_chunk_end' must be a string")
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Execute the gather operation on the input data.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the cost of the operation.
+        """
+        content_key = self.config["content_key"]
+        doc_id_key = self.config["doc_id_key"]
+        order_key = self.config["order_key"]
+        peripheral_config = self.config["peripheral_chunks"]
+        main_chunk_start = self.config.get(
+            "main_chunk_start", "--- Begin Main Chunk ---"
+        )
+        main_chunk_end = self.config.get("main_chunk_end", "--- End Main Chunk ---")
+        doc_header_key = self.config.get("doc_header_key", None)
+        results = []
+        cost = 0.0
+
+        # Group chunks by document ID
+        grouped_chunks = {}
+        for item in input_data:
+            doc_id = item[doc_id_key]
+            if doc_id not in grouped_chunks:
+                grouped_chunks[doc_id] = []
+            grouped_chunks[doc_id].append(item)
+
+        # Process each group of chunks
+        for doc_id, chunks in grouped_chunks.items():
+            # Sort chunks by their order within the document
+            chunks.sort(key=lambda x: x[order_key])
+
+            # Process each chunk with its peripheral context and headers
+            for i, chunk in enumerate(chunks):
+                rendered_chunk = self.render_chunk_with_context(
+                    chunks,
+                    i,
+                    peripheral_config,
+                    content_key,
+                    order_key,
+                    main_chunk_start,
+                    main_chunk_end,
+                    doc_header_key,
+                )
+
+                result = chunk.copy()
+                result[f"{content_key}_rendered"] = rendered_chunk
+                results.append(result)
+
+        return results, cost
+
+    def render_chunk_with_context(
+        self,
+        chunks: List[Dict],
+        current_index: int,
+        peripheral_config: Dict,
+        content_key: str,
+        order_key: str,
+        main_chunk_start: str,
+        main_chunk_end: str,
+        doc_header_key: str,
+    ) -> str:
+        """
+        Render a chunk with its peripheral context and headers.
+
+        Args:
+            chunks (List[Dict]): List of all chunks in the document.
+            current_index (int): Index of the current chunk being processed.
+            peripheral_config (Dict): Configuration for peripheral chunks.
+            content_key (str): Key for the content in each chunk.
+            order_key (str): Key for the order of each chunk.
+            main_chunk_start (str): String to mark the start of the main chunk.
+            main_chunk_end (str): String to mark the end of the main chunk.
+            doc_header_key (str): The key for the headers in the current chunk.
+
+        Returns:
+            str: Rendered chunk with context and headers.
+        """
+        combined_parts = []
+
+        # Process previous chunks
+        combined_parts.append("--- Previous Context ---")
+        combined_parts.extend(
+            self.process_peripheral_chunks(
+                chunks[:current_index],
+                peripheral_config.get("previous", {}),
+                content_key,
+                order_key,
+            )
+        )
+        combined_parts.append("--- End Previous Context ---\n")
+
+        # Process main chunk
+        main_chunk = chunks[current_index]
+        headers = self.render_hierarchy_headers(
+            main_chunk, chunks[: current_index + 1], doc_header_key
+        )
+        if headers:
+            combined_parts.append(headers)
+        combined_parts.append(f"{main_chunk_start}")
+        combined_parts.append(f"{main_chunk[content_key]}")
+        combined_parts.append(f"{main_chunk_end}")
+
+        # Process next chunks
+        combined_parts.append("\n--- Next Context ---")
+        combined_parts.extend(
+            self.process_peripheral_chunks(
+                chunks[current_index + 1 :],
+                peripheral_config.get("next", {}),
+                content_key,
+                order_key,
+            )
+        )
+        combined_parts.append("--- End Next Context ---")
+
+        return "\n".join(combined_parts)
+
+    def process_peripheral_chunks(
+        self,
+        chunks: List[Dict],
+        config: Dict,
+        content_key: str,
+        order_key: str,
+        reverse: bool = False,
+    ) -> List[str]:
+        """
+        Process peripheral chunks according to the configuration.
+
+        Args:
+            chunks (List[Dict]): List of chunks to process.
+            config (Dict): Configuration for processing peripheral chunks.
+            content_key (str): Key for the content in each chunk.
+            order_key (str): Key for the order of each chunk.
+            reverse (bool, optional): Whether to process chunks in reverse order. Defaults to False.
+
+        Returns:
+            List[str]: List of processed chunk strings.
+        """
+        if reverse:
+            chunks = list(reversed(chunks))
+
+        processed_parts = []
+        included_chunks = []
+        total_chunks = len(chunks)
+
+        head_config = config.get("head", {})
+        tail_config = config.get("tail", {})
+
+        head_count = int(head_config.get("count", 0))
+        tail_count = int(tail_config.get("count", 0))
+        in_skip = False
+        skip_char_count = 0
+
+        for i, chunk in enumerate(chunks):
+            if i < head_count:
+                section = "head"
+            elif i >= total_chunks - tail_count:
+                section = "tail"
+            elif "middle" in config:
+                section = "middle"
+            else:
+                # Show number of characters skipped
+                skipped_chars = len(chunk[content_key])
+                if not in_skip:
+                    skip_char_count = skipped_chars
+                    in_skip = True
+                else:
+                    skip_char_count += skipped_chars
+
+                continue
+
+            if in_skip:
+                processed_parts.append(
+                    f"[... {skip_char_count} characters skipped ...]"
+                )
+                in_skip = False
+                skip_char_count = 0
+
+            section_config = config.get(section, {})
+            section_content_key = section_config.get("content_key", content_key)
+
+            is_summary = section_content_key != content_key
+            summary_suffix = " (Summary)" if is_summary else ""
+
+            chunk_prefix = f"[Chunk {chunk[order_key]}{summary_suffix}]"
+            processed_parts.append(chunk_prefix)
+            processed_parts.append(f"{chunk[section_content_key]}")
+            included_chunks.append(chunk)
+
+        if in_skip:
+            processed_parts.append(f"[... {skip_char_count} characters skipped ...]")
+
+        if reverse:
+            processed_parts = list(reversed(processed_parts))
+
+        return processed_parts
+
+    def render_hierarchy_headers(
+        self,
+        current_chunk: Dict,
+        chunks: List[Dict],
+        doc_header_key: str,
+    ) -> str:
+        """
+        Render headers for the current chunk's hierarchy.
+
+        Args:
+            current_chunk (Dict): The current chunk being processed.
+            chunks (List[Dict]): List of chunks up to and including the current chunk.
+            doc_header_key (str): The key for the headers in the current chunk.
+        Returns:
+            str: Rendered headers in the current chunk's hierarchy.
+        """
+        rendered_headers = []
+        current_hierarchy = {}
+
+        if doc_header_key is None:
+            return ""
+
+        # Find the top-most header level in the current chunk (the numerically smallest "level" value)
+        current_chunk_headers = current_chunk.get(doc_header_key, [])
+        highest_level = float("inf")  # Initialize with positive infinity
+        for header_info in current_chunk_headers:
+            level = header_info.get("level")
+            if level is not None and level < highest_level:
+                highest_level = level
+
+        # If no headers found in the current chunk, set highest_level to None
+        if highest_level == float("inf"):
+            highest_level = None
+
+        for chunk in chunks:
+            for header_info in chunk.get(doc_header_key, []):
+                header = header_info["header"]
+                level = header_info["level"]
+                if header and level:
+                    current_hierarchy[level] = header
+                    # Clear lower levels when a higher level header is found
+                    for lower_level in range(level + 1, len(current_hierarchy) + 1):
+                        if lower_level in current_hierarchy:
+                            current_hierarchy[lower_level] = None
+
+        # Render the headers in the current hierarchy, everything above the highest level in the current chunk (if the highest level in the current chunk is None, render everything)
+        for level, header in sorted(current_hierarchy.items()):
+            if header is not None and (highest_level is None or level < highest_level):
+                rendered_headers.append(f"{'#' * level} {header}")
+
+        rendered_headers = " > ".join(rendered_headers)
+        return f"_Current Section:_ {rendered_headers}" if rendered_headers else ""
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(*args, **kwargs) + +

+ + +
+ +

Initialize the GatherOperation.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
*args + Any + +
+

Variable length argument list.

+
+
+ () +
**kwargs + Any + +
+

Arbitrary keyword arguments.

+
+
+ {} +
+ +
+ Source code in docetl/operations/gather.py +
18
+19
+20
+21
+22
+23
+24
+25
+26
def __init__(self, *args: Any, **kwargs: Any) -> None:
+    """
+    Initialize the GatherOperation.
+
+    Args:
+        *args: Variable length argument list.
+        **kwargs: Arbitrary keyword arguments.
+    """
+    super().__init__(*args, **kwargs)
+
+
+
+ +
+ +
+ + +

+ execute(input_data) + +

+ + +
+ +

Execute the gather operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the processed results and the cost of the operation.

+
+
+ +
+ Source code in docetl/operations/gather.py +
 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Execute the gather operation on the input data.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results and the cost of the operation.
+    """
+    content_key = self.config["content_key"]
+    doc_id_key = self.config["doc_id_key"]
+    order_key = self.config["order_key"]
+    peripheral_config = self.config["peripheral_chunks"]
+    main_chunk_start = self.config.get(
+        "main_chunk_start", "--- Begin Main Chunk ---"
+    )
+    main_chunk_end = self.config.get("main_chunk_end", "--- End Main Chunk ---")
+    doc_header_key = self.config.get("doc_header_key", None)
+    results = []
+    cost = 0.0
+
+    # Group chunks by document ID
+    grouped_chunks = {}
+    for item in input_data:
+        doc_id = item[doc_id_key]
+        if doc_id not in grouped_chunks:
+            grouped_chunks[doc_id] = []
+        grouped_chunks[doc_id].append(item)
+
+    # Process each group of chunks
+    for doc_id, chunks in grouped_chunks.items():
+        # Sort chunks by their order within the document
+        chunks.sort(key=lambda x: x[order_key])
+
+        # Process each chunk with its peripheral context and headers
+        for i, chunk in enumerate(chunks):
+            rendered_chunk = self.render_chunk_with_context(
+                chunks,
+                i,
+                peripheral_config,
+                content_key,
+                order_key,
+                main_chunk_start,
+                main_chunk_end,
+                doc_header_key,
+            )
+
+            result = chunk.copy()
+            result[f"{content_key}_rendered"] = rendered_chunk
+            results.append(result)
+
+    return results, cost
+
+
+
+ +
+ +
+ + +

+ process_peripheral_chunks(chunks, config, content_key, order_key, reverse=False) + +

+ + +
+ +

Process peripheral chunks according to the configuration.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
chunks + List[Dict] + +
+

List of chunks to process.

+
+
+ required +
config + Dict + +
+

Configuration for processing peripheral chunks.

+
+
+ required +
content_key + str + +
+

Key for the content in each chunk.

+
+
+ required +
order_key + str + +
+

Key for the order of each chunk.

+
+
+ required +
reverse + bool + +
+

Whether to process chunks in reverse order. Defaults to False.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ List[str] + +
+

List[str]: List of processed chunk strings.

+
+
+ +
+ Source code in docetl/operations/gather.py +
189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
def process_peripheral_chunks(
+    self,
+    chunks: List[Dict],
+    config: Dict,
+    content_key: str,
+    order_key: str,
+    reverse: bool = False,
+) -> List[str]:
+    """
+    Process peripheral chunks according to the configuration.
+
+    Args:
+        chunks (List[Dict]): List of chunks to process.
+        config (Dict): Configuration for processing peripheral chunks.
+        content_key (str): Key for the content in each chunk.
+        order_key (str): Key for the order of each chunk.
+        reverse (bool, optional): Whether to process chunks in reverse order. Defaults to False.
+
+    Returns:
+        List[str]: List of processed chunk strings.
+    """
+    if reverse:
+        chunks = list(reversed(chunks))
+
+    processed_parts = []
+    included_chunks = []
+    total_chunks = len(chunks)
+
+    head_config = config.get("head", {})
+    tail_config = config.get("tail", {})
+
+    head_count = int(head_config.get("count", 0))
+    tail_count = int(tail_config.get("count", 0))
+    in_skip = False
+    skip_char_count = 0
+
+    for i, chunk in enumerate(chunks):
+        if i < head_count:
+            section = "head"
+        elif i >= total_chunks - tail_count:
+            section = "tail"
+        elif "middle" in config:
+            section = "middle"
+        else:
+            # Show number of characters skipped
+            skipped_chars = len(chunk[content_key])
+            if not in_skip:
+                skip_char_count = skipped_chars
+                in_skip = True
+            else:
+                skip_char_count += skipped_chars
+
+            continue
+
+        if in_skip:
+            processed_parts.append(
+                f"[... {skip_char_count} characters skipped ...]"
+            )
+            in_skip = False
+            skip_char_count = 0
+
+        section_config = config.get(section, {})
+        section_content_key = section_config.get("content_key", content_key)
+
+        is_summary = section_content_key != content_key
+        summary_suffix = " (Summary)" if is_summary else ""
+
+        chunk_prefix = f"[Chunk {chunk[order_key]}{summary_suffix}]"
+        processed_parts.append(chunk_prefix)
+        processed_parts.append(f"{chunk[section_content_key]}")
+        included_chunks.append(chunk)
+
+    if in_skip:
+        processed_parts.append(f"[... {skip_char_count} characters skipped ...]")
+
+    if reverse:
+        processed_parts = list(reversed(processed_parts))
+
+    return processed_parts
+
+
+
+ +
+ +
+ + +

+ render_chunk_with_context(chunks, current_index, peripheral_config, content_key, order_key, main_chunk_start, main_chunk_end, doc_header_key) + +

+ + +
+ +

Render a chunk with its peripheral context and headers.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
chunks + List[Dict] + +
+

List of all chunks in the document.

+
+
+ required +
current_index + int + +
+

Index of the current chunk being processed.

+
+
+ required +
peripheral_config + Dict + +
+

Configuration for peripheral chunks.

+
+
+ required +
content_key + str + +
+

Key for the content in each chunk.

+
+
+ required +
order_key + str + +
+

Key for the order of each chunk.

+
+
+ required +
main_chunk_start + str + +
+

String to mark the start of the main chunk.

+
+
+ required +
main_chunk_end + str + +
+

String to mark the end of the main chunk.

+
+
+ required +
doc_header_key + str + +
+

The key for the headers in the current chunk.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
str + str + +
+

Renderted chunk with context and headers.

+
+
+ +
+ Source code in docetl/operations/gather.py +
123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
def render_chunk_with_context(
+    self,
+    chunks: List[Dict],
+    current_index: int,
+    peripheral_config: Dict,
+    content_key: str,
+    order_key: str,
+    main_chunk_start: str,
+    main_chunk_end: str,
+    doc_header_key: str,
+) -> str:
+    """
+    Render a chunk with its peripheral context and headers.
+
+    Args:
+        chunks (List[Dict]): List of all chunks in the document.
+        current_index (int): Index of the current chunk being processed.
+        peripheral_config (Dict): Configuration for peripheral chunks.
+        content_key (str): Key for the content in each chunk.
+        order_key (str): Key for the order of each chunk.
+        main_chunk_start (str): String to mark the start of the main chunk.
+        main_chunk_end (str): String to mark the end of the main chunk.
+        doc_header_key (str): The key for the headers in the current chunk.
+
+    Returns:
+        str: Renderted chunk with context and headers.
+    """
+    combined_parts = []
+
+    # Process previous chunks
+    combined_parts.append("--- Previous Context ---")
+    combined_parts.extend(
+        self.process_peripheral_chunks(
+            chunks[:current_index],
+            peripheral_config.get("previous", {}),
+            content_key,
+            order_key,
+        )
+    )
+    combined_parts.append("--- End Previous Context ---\n")
+
+    # Process main chunk
+    main_chunk = chunks[current_index]
+    headers = self.render_hierarchy_headers(
+        main_chunk, chunks[: current_index + 1], doc_header_key
+    )
+    if headers:
+        combined_parts.append(headers)
+    combined_parts.append(f"{main_chunk_start}")
+    combined_parts.append(f"{main_chunk[content_key]}")
+    combined_parts.append(f"{main_chunk_end}")
+
+    # Process next chunks
+    combined_parts.append("\n--- Next Context ---")
+    combined_parts.extend(
+        self.process_peripheral_chunks(
+            chunks[current_index + 1 :],
+            peripheral_config.get("next", {}),
+            content_key,
+            order_key,
+        )
+    )
+    combined_parts.append("--- End Next Context ---")
+
+    return "\n".join(combined_parts)
+
+
+
+ +
+ +
+ + +

+ render_hierarchy_headers(current_chunk, chunks, doc_header_key) + +

+ + +
+ +

Render headers for the current chunk's hierarchy.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
current_chunk + Dict + +
+

The current chunk being processed.

+
+
+ required +
chunks + List[Dict] + +
+

List of chunks up to and including the current chunk.

+
+
+ required +
doc_header_key + str + +
+

The key for the headers in the current chunk.

+
+
+ required +
+

Returns: + str: Renderted headers in the current chunk's hierarchy.

+ +
+ Source code in docetl/operations/gather.py +
269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
def render_hierarchy_headers(
+    self,
+    current_chunk: Dict,
+    chunks: List[Dict],
+    doc_header_key: str,
+) -> str:
+    """
+    Render headers for the current chunk's hierarchy.
+
+    Args:
+        current_chunk (Dict): The current chunk being processed.
+        chunks (List[Dict]): List of chunks up to and including the current chunk.
+        doc_header_key (str): The key for the headers in the current chunk.
+    Returns:
+        str: Renderted headers in the current chunk's hierarchy.
+    """
+    rendered_headers = []
+    current_hierarchy = {}
+
+    if doc_header_key is None:
+        return ""
+
+    # Find the largest/highest level in the current chunk
+    current_chunk_headers = current_chunk.get(doc_header_key, [])
+    highest_level = float("inf")  # Initialize with positive infinity
+    for header_info in current_chunk_headers:
+        level = header_info.get("level")
+        if level is not None and level < highest_level:
+            highest_level = level
+
+    # If no headers found in the current chunk, set highest_level to None
+    if highest_level == float("inf"):
+        highest_level = None
+
+    for chunk in chunks:
+        for header_info in chunk.get(doc_header_key, []):
+            header = header_info["header"]
+            level = header_info["level"]
+            if header and level:
+                current_hierarchy[level] = header
+                # Clear lower levels when a higher level header is found
+                for lower_level in range(level + 1, len(current_hierarchy) + 1):
+                    if lower_level in current_hierarchy:
+                        current_hierarchy[lower_level] = None
+
+    # Render the headers in the current hierarchy, everything above the highest level in the current chunk (if the highest level in the current chunk is None, render everything)
+    for level, header in sorted(current_hierarchy.items()):
+        if header is not None and (highest_level is None or level < highest_level):
+            rendered_headers.append(f"{'#' * level} {header}")
+
+    rendered_headers = " > ".join(rendered_headers)
+    return f"_Current Section:_ {rendered_headers}" if rendered_headers else ""
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform a syntax check on the operation configuration.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if there are configuration errors.

+
+
+ TypeError + +
+

If main_chunk_start or main_chunk_end are not strings.

+
+
+ +
+ Source code in docetl/operations/gather.py +
28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
def syntax_check(self) -> None:
+    """
+    Perform a syntax check on the operation configuration.
+
+    Raises:
+        ValueError: If required keys are missing or if there are configuration errors.
+        TypeError: If main_chunk_start or main_chunk_end are not strings.
+    """
+    required_keys = ["content_key", "doc_id_key", "order_key"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in GatherOperation configuration"
+            )
+
+    if "peripheral_chunks" not in self.config:
+        raise ValueError(
+            "Missing 'peripheral_chunks' configuration in GatherOperation"
+        )
+
+    peripheral_config = self.config["peripheral_chunks"]
+    for direction in ["previous", "next"]:
+        if direction not in peripheral_config:
+            continue
+        for section in ["head", "middle", "tail"]:
+            if section in peripheral_config[direction]:
+                section_config = peripheral_config[direction][section]
+                if section != "middle" and "count" not in section_config:
+                    raise ValueError(
+                        f"Missing 'count' in {direction}.{section} configuration"
+                    )
+
+    if "main_chunk_start" in self.config and not isinstance(
+        self.config["main_chunk_start"], str
+    ):
+        raise TypeError("'main_chunk_start' must be a string")
+    if "main_chunk_end" in self.config and not isinstance(
+        self.config["main_chunk_end"], str
+    ):
+        raise TypeError("'main_chunk_end' must be a string")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.unnest.UnnestOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that represents an operation to unnest a list-like or dictionary value in a dictionary into multiple dictionaries.

+

This operation takes a list of dictionaries and a specified key, and creates new dictionaries based on the value type: +- For list-like values: Creates a new dictionary for each element in the list, copying all other key-value pairs. +- For dictionary values: Expands specified fields from the nested dictionary into the parent dictionary.

+ + +
+ Inherits from +

BaseOperation

+

Usage: +

from docetl.operations import UnnestOperation
+
+# Unnesting a list
+config_list = {"unnest_key": "tags"}
+input_data_list = [
+    {"id": 1, "tags": ["a", "b", "c"]},
+    {"id": 2, "tags": ["d", "e"]}
+]
+
+unnest_op_list = UnnestOperation(config_list)
+result_list, _ = unnest_op_list.execute(input_data_list)
+
+# Result will be:
+# [
+#     {"id": 1, "tags": "a"},
+#     {"id": 1, "tags": "b"},
+#     {"id": 1, "tags": "c"},
+#     {"id": 2, "tags": "d"},
+#     {"id": 2, "tags": "e"}
+# ]
+
+# Unnesting a dictionary
+config_dict = {"unnest_key": "user", "expand_fields": ["name", "age"]}
+input_data_dict = [
+    {"id": 1, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+    {"id": 2, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+]
+
+unnest_op_dict = UnnestOperation(config_dict)
+result_dict, _ = unnest_op_dict.execute(input_data_dict)
+
+# Result will be:
+# [
+#     {"id": 1, "name": "Alice", "age": 30, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+#     {"id": 2, "name": "Bob", "age": 25, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+# ]
+

+ +
+ Source code in docetl/operations/unnest.py +
  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
class UnnestOperation(BaseOperation):
+    """
+    A class that represents an operation to unnest a list-like or dictionary value in a dictionary into multiple dictionaries.
+
+    This operation takes a list of dictionaries and a specified key, and creates new dictionaries based on the value type:
+    - For list-like values: Creates a new dictionary for each element in the list, copying all other key-value pairs.
+    - For dictionary values: Expands specified fields from the nested dictionary into the parent dictionary.
+
+    Inherits from:
+        BaseOperation
+
+    Usage:
+    ```python
+    from docetl.operations import UnnestOperation
+
+    # Unnesting a list
+    config_list = {"unnest_key": "tags"}
+    input_data_list = [
+        {"id": 1, "tags": ["a", "b", "c"]},
+        {"id": 2, "tags": ["d", "e"]}
+    ]
+
+    unnest_op_list = UnnestOperation(config_list)
+    result_list, _ = unnest_op_list.execute(input_data_list)
+
+    # Result will be:
+    # [
+    #     {"id": 1, "tags": "a"},
+    #     {"id": 1, "tags": "b"},
+    #     {"id": 1, "tags": "c"},
+    #     {"id": 2, "tags": "d"},
+    #     {"id": 2, "tags": "e"}
+    # ]
+
+    # Unnesting a dictionary
+    config_dict = {"unnest_key": "user", "expand_fields": ["name", "age"]}
+    input_data_dict = [
+        {"id": 1, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+        {"id": 2, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+    ]
+
+    unnest_op_dict = UnnestOperation(config_dict)
+    result_dict, _ = unnest_op_dict.execute(input_data_dict)
+
+    # Result will be:
+    # [
+    #     {"id": 1, "name": "Alice", "age": 30, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+    #     {"id": 2, "name": "Bob", "age": 25, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+    # ]
+    ```
+    """
+
+    def syntax_check(self) -> None:
+        """
+        Checks if the required configuration key is present in the operation's config.
+
+        Raises:
+            ValueError: If the required 'unnest_key' is missing from the configuration.
+        """
+
+        required_keys = ["unnest_key"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in UnnestOperation configuration"
+                )
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the unnest operation on the input data.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries
+            and a float value (always 0 in this implementation).
+
+        Raises:
+            KeyError: If the specified unnest_key is not found in an input dictionary.
+            TypeError: If the value of the unnest_key is not iterable (list, tuple, set, or dict).
+            ValueError: If unnesting a dictionary and 'expand_fields' is not provided in the config.
+
+        The operation supports unnesting of both list-like values and dictionary values:
+
+        1. For list-like values (list, tuple, set):
+           Each element in the list becomes a separate dictionary in the output.
+
+        2. For dictionary values:
+           The operation expands specified fields from the nested dictionary into the parent dictionary.
+           The 'expand_fields' config parameter must be provided to specify which fields to expand.
+
+        Examples:
+        ```python
+        # Unnesting a list
+        unnest_op = UnnestOperation({"unnest_key": "colors"})
+        input_data = [
+            {"id": 1, "colors": ["red", "blue"]},
+            {"id": 2, "colors": ["green"]}
+        ]
+        result, _ = unnest_op.execute(input_data)
+        # Result will be:
+        # [
+        #     {"id": 1, "colors": "red"},
+        #     {"id": 1, "colors": "blue"},
+        #     {"id": 2, "colors": "green"}
+        # ]
+
+        # Unnesting a dictionary
+        unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
+        input_data = [
+            {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
+            {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
+        ]
+        result, _ = unnest_op.execute(input_data)
+        # Result will be:
+        # [
+        #     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
+        #     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
+        # ]
+        ```
+
+        Note: When unnesting dictionaries, the original nested dictionary is preserved in the output,
+        and the specified fields are expanded into the parent dictionary.
+        """
+
+        unnest_key = self.config["unnest_key"]
+        recursive = self.config.get("recursive", False)
+        depth = self.config.get("depth", None)
+        if not depth:
+            depth = 1 if not recursive else float("inf")
+        results = []
+
+        def unnest_recursive(item, key, level=0):
+            if level == 0 and not isinstance(item[key], (list, tuple, set, dict)):
+                raise TypeError(f"Value of unnest key '{key}' is not iterable")
+
+            if level > 0 and not isinstance(item[key], (list, tuple, set, dict)):
+                return [item]
+
+            if level >= depth:
+                return [item]
+
+            if isinstance(item[key], dict):
+                expand_fields = self.config.get("expand_fields")
+                if expand_fields is None:
+                    expand_fields = item[key].keys()
+                new_item = copy.deepcopy(item)
+                for field in expand_fields:
+                    if field in new_item[key]:
+                        new_item[field] = new_item[key][field]
+                    else:
+                        new_item[field] = None
+                return [new_item]
+            else:
+                nested_results = []
+                for value in item[key]:
+                    new_item = copy.deepcopy(item)
+                    new_item[key] = value
+                    if recursive and isinstance(value, (list, tuple, set, dict)):
+                        nested_results.extend(
+                            unnest_recursive(new_item, key, level + 1)
+                        )
+                    else:
+                        nested_results.append(new_item)
+                return nested_results
+
+        for item in input_data:
+            if unnest_key not in item:
+                raise KeyError(
+                    f"Unnest key '{unnest_key}' not found in item. Other keys are {item.keys()}"
+                )
+
+            results.extend(unnest_recursive(item, unnest_key))
+
+            if not item[unnest_key] and self.config.get("keep_empty", False):
+                expand_fields = self.config.get("expand_fields")
+                new_item = copy.deepcopy(item)
+                if isinstance(item[unnest_key], dict):
+                    if expand_fields is None:
+                        expand_fields = item[unnest_key].keys()
+                    for field in expand_fields:
+                        new_item[field] = None
+                else:
+                    new_item[unnest_key] = None
+                results.append(new_item)
+
+        # Assert that no keys are missing after the operation
+        if results:
+            original_keys = set(input_data[0].keys())
+            assert original_keys.issubset(
+                set(results[0].keys())
+            ), "Keys lost during unnest operation"
+
+        return results, 0
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the unnest operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
input_data + List[Dict] + +
+

A list of dictionaries to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict] + +
+

Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries

+
+
+ float + +
+

and a float value (always 0 in this implementation).

+
+
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If the specified unnest_key is not found in an input dictionary.

+
+
+ TypeError + +
+

If the value of the unnest_key is not iterable (list, tuple, set, or dict).

+
+
+ ValueError + +
+

If unnesting a dictionary and 'expand_fields' is not provided in the config.

+
+
+

The operation supports unnesting of both list-like values and dictionary values:

+
    +
  1. +

    For list-like values (list, tuple, set): + Each element in the list becomes a separate dictionary in the output.

    +
  2. +
  3. +

    For dictionary values: + The operation expands specified fields from the nested dictionary into the parent dictionary. + The 'expand_fields' config parameter must be provided to specify which fields to expand.

    +
  4. +
+

Examples: +

# Unnesting a list
+unnest_op = UnnestOperation({"unnest_key": "colors"})
+input_data = [
+    {"id": 1, "colors": ["red", "blue"]},
+    {"id": 2, "colors": ["green"]}
+]
+result, _ = unnest_op.execute(input_data)
+# Result will be:
+# [
+#     {"id": 1, "colors": "red"},
+#     {"id": 1, "colors": "blue"},
+#     {"id": 2, "colors": "green"}
+# ]
+
+# Unnesting a dictionary
+unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
+input_data = [
+    {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
+    {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
+]
+result, _ = unnest_op.execute(input_data)
+# Result will be:
+# [
+#     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
+#     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
+# ]
+

+

Note: When unnesting dictionaries, the original nested dictionary is preserved in the output, +and the specified fields are expanded into the parent dictionary.

+ +
+ Source code in docetl/operations/unnest.py +
 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Executes the unnest operation on the input data.
+
+    Args:
+        input_data (List[Dict]): A list of dictionaries to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries
+        and a float value (always 0 in this implementation).
+
+    Raises:
+        KeyError: If the specified unnest_key is not found in an input dictionary.
+        TypeError: If the value of the unnest_key is not iterable (list, tuple, set, or dict).
+        ValueError: If unnesting a dictionary and 'expand_fields' is not provided in the config.
+
+    The operation supports unnesting of both list-like values and dictionary values:
+
+    1. For list-like values (list, tuple, set):
+       Each element in the list becomes a separate dictionary in the output.
+
+    2. For dictionary values:
+       The operation expands specified fields from the nested dictionary into the parent dictionary.
+       The 'expand_fields' config parameter must be provided to specify which fields to expand.
+
+    Examples:
+    ```python
+    # Unnesting a list
+    unnest_op = UnnestOperation({"unnest_key": "colors"})
+    input_data = [
+        {"id": 1, "colors": ["red", "blue"]},
+        {"id": 2, "colors": ["green"]}
+    ]
+    result, _ = unnest_op.execute(input_data)
+    # Result will be:
+    # [
+    #     {"id": 1, "colors": "red"},
+    #     {"id": 1, "colors": "blue"},
+    #     {"id": 2, "colors": "green"}
+    # ]
+
+    # Unnesting a dictionary
+    unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
+    input_data = [
+        {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
+        {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
+    ]
+    result, _ = unnest_op.execute(input_data)
+    # Result will be:
+    # [
+    #     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
+    #     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
+    # ]
+    ```
+
+    Note: When unnesting dictionaries, the original nested dictionary is preserved in the output,
+    and the specified fields are expanded into the parent dictionary.
+    """
+
+    unnest_key = self.config["unnest_key"]
+    recursive = self.config.get("recursive", False)
+    depth = self.config.get("depth", None)
+    if not depth:
+        depth = 1 if not recursive else float("inf")
+    results = []
+
+    def unnest_recursive(item, key, level=0):
+        if level == 0 and not isinstance(item[key], (list, tuple, set, dict)):
+            raise TypeError(f"Value of unnest key '{key}' is not iterable")
+
+        if level > 0 and not isinstance(item[key], (list, tuple, set, dict)):
+            return [item]
+
+        if level >= depth:
+            return [item]
+
+        if isinstance(item[key], dict):
+            expand_fields = self.config.get("expand_fields")
+            if expand_fields is None:
+                expand_fields = item[key].keys()
+            new_item = copy.deepcopy(item)
+            for field in expand_fields:
+                if field in new_item[key]:
+                    new_item[field] = new_item[key][field]
+                else:
+                    new_item[field] = None
+            return [new_item]
+        else:
+            nested_results = []
+            for value in item[key]:
+                new_item = copy.deepcopy(item)
+                new_item[key] = value
+                if recursive and isinstance(value, (list, tuple, set, dict)):
+                    nested_results.extend(
+                        unnest_recursive(new_item, key, level + 1)
+                    )
+                else:
+                    nested_results.append(new_item)
+            return nested_results
+
+    for item in input_data:
+        if unnest_key not in item:
+            raise KeyError(
+                f"Unnest key '{unnest_key}' not found in item. Other keys are {item.keys()}"
+            )
+
+        results.extend(unnest_recursive(item, unnest_key))
+
+        if not item[unnest_key] and self.config.get("keep_empty", False):
+            expand_fields = self.config.get("expand_fields")
+            new_item = copy.deepcopy(item)
+            if isinstance(item[unnest_key], dict):
+                if expand_fields is None:
+                    expand_fields = item[unnest_key].keys()
+                for field in expand_fields:
+                    new_item[field] = None
+            else:
+                new_item[unnest_key] = None
+            results.append(new_item)
+
+    # Assert that no keys are missing after the operation
+    if results:
+        original_keys = set(input_data[0].keys())
+        assert original_keys.issubset(
+            set(results[0].keys())
+        ), "Keys lost during unnest operation"
+
+    return results, 0
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks if the required configuration key is present in the operation's config.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If the required 'unnest_key' is missing from the configuration.

+
+
+ +
+ Source code in docetl/operations/unnest.py +
59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
def syntax_check(self) -> None:
+    """
+    Checks if the required configuration key is present in the operation's config.
+
+    Raises:
+        ValueError: If the required 'unnest_key' is missing from the configuration.
+    """
+
+    required_keys = ["unnest_key"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in UnnestOperation configuration"
+            )
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/optimizers/index.html b/api-reference/optimizers/index.html new file mode 100644 index 00000000..e4183ebf --- /dev/null +++ b/api-reference/optimizers/index.html @@ -0,0 +1,10171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + docetl.optimizers - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

docetl.optimizers

+ +
+ + + +

+ docetl.optimizers.map_optimizer.optimizer.MapOptimizer + + +

+ + +
+ + +

A class for optimizing map operations in data processing pipelines.

+

This optimizer analyzes the input operation configuration and data, +and generates optimized plans for executing the operation. It can +create plans for chunking, metadata extraction, gleaning, chain +decomposition, and parallel execution.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict[str, Any] + +
+

The configuration dictionary for the optimizer.

+
+
console + Console + +
+

A Rich console object for pretty printing.

+
+
llm_client + LLMClient + +
+

A client for interacting with a language model.

+
+
_run_operation + Callable + +
+

A function to execute operations.

+
+
max_threads + int + +
+

The maximum number of threads to use for parallel execution.

+
+
timeout + int + +
+

The timeout in seconds for operation execution.

+
+
+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
class MapOptimizer:
+    """
+    A class for optimizing map operations in data processing pipelines.
+
+    This optimizer analyzes the input operation configuration and data,
+    and generates optimized plans for executing the operation. It can
+    create plans for chunking, metadata extraction, gleaning, chain
+    decomposition, and parallel execution.
+
+    Attributes:
+        config (Dict[str, Any]): The configuration dictionary for the optimizer.
+        console (Console): A Rich console object for pretty printing.
+        llm_client (LLMClient): A client for interacting with a language model.
+        _run_operation (Callable): A function to execute operations.
+        max_threads (int): The maximum number of threads to use for parallel execution.
+        timeout (int): The timeout in seconds for operation execution.
+
+    """
+
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        console: Console,
+        llm_client: LLMClient,
+        max_threads: int,
+        run_operation: Callable,
+        timeout: int = 10,
+        is_filter: bool = False,
+    ):
+        """
+        Initialize the MapOptimizer.
+
+        Args:
+            config (Dict[str, Any]): The configuration dictionary for the optimizer.
+            console (Console): A Rich console object for pretty printing.
+            llm_client (LLMClient): A client for interacting with a language model.
+            max_threads (int): The maximum number of threads to use for parallel execution.
+            run_operation (Callable): A function to execute operations.
+            timeout (int, optional): The timeout in seconds for operation execution. Defaults to 10.
+            is_filter (bool, optional): If True, the operation is a filter operation. Defaults to False.
+        """
+        self.config = config
+        self.console = console
+        self.llm_client = llm_client
+        self._run_operation = run_operation
+        self.max_threads = max_threads
+        self.timeout = timeout
+        self._num_plans_to_evaluate_in_parallel = 5
+        self.is_filter = is_filter
+
+        self.plan_generator = PlanGenerator(
+            llm_client, console, config, run_operation, max_threads, is_filter
+        )
+        self.evaluator = Evaluator(
+            llm_client,
+            console,
+            run_operation,
+            timeout,
+            self._num_plans_to_evaluate_in_parallel,
+            is_filter,
+        )
+        self.prompt_generator = PromptGenerator(
+            llm_client, console, config, max_threads, is_filter
+        )
+
+    def optimize(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize the given operation configuration for the input data.
+        This method analyzes the operation and input data, generates various
+        optimization plans, evaluates them, and returns the best plan along
+        with its output. A key part of this process is creating a custom
+        validator prompt for evaluation. The validator prompt is generated
+        based on the specific task, input data, and output data. It serves
+        as a critical tool for assessing the quality and correctness of
+        each optimization plan's output. This custom prompt ensures that
+        the evaluation is tailored to the unique requirements and nuances
+        of the given operation. The types of optimization plans include:
+
+        1. Improved Prompt Plan: Enhances the original prompt based on evaluation, aiming to improve output quality.
+
+        2. Chunk Size Plan: Splits input data into chunks of different sizes,
+           processes each chunk separately, and then combines the results. This
+           can improve performance for large inputs.
+
+        3. Gleaning Plans: Implements an iterative refinement process where the
+           output is validated and improved over multiple rounds, enhancing accuracy.
+
+        4. Chain Decomposition Plan: Breaks down complex operations into a series
+           of simpler sub-operations, potentially improving overall performance
+           and interpretability.
+
+        5. Parallel Map Plan: Decomposes the task into subtasks that can be
+           executed in parallel, potentially speeding up processing for
+           independent operations.
+
+        The method generates these plans, evaluates their performance using
+        a custom validator, and selects the best performing plan based on
+        output quality and execution time.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the operation to optimize.
+            input_data (List[Dict[str, Any]]): The input data for the operation.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing
+            the best optimization plan and its output. The plan is a list of
+            operation configurations that achieve the best performance.
+            The cost is the cost of the optimizer (from possibly synthesizing resolves).
+
+        """
+        input_data = copy.deepcopy(input_data)
+        # Add id to each input_data
+        for i in range(len(input_data)):
+            input_data[i]["_map_opt_id"] = str(uuid.uuid4())
+
+        # Define the token limit (adjust as needed)
+        model_input_context_length = model_cost.get(
+            op_config.get("model", self.config.get("default_model")), {}
+        ).get("max_input_tokens", 8192)
+
+        # Render the prompt with all sample inputs and count tokens
+        total_tokens = 0
+        exceed_count = 0
+        for sample in input_data:
+            rendered_prompt = Template(op_config["prompt"]).render(input=sample)
+            prompt_tokens = count_tokens(
+                rendered_prompt,
+                op_config.get("model", self.config.get("default_model")),
+            )
+            total_tokens += prompt_tokens
+
+            if prompt_tokens > model_input_context_length:
+                exceed_count += 1
+
+        # Calculate average tokens and percentage of samples exceeding limit
+        avg_tokens = total_tokens / len(input_data)
+        exceed_percentage = (exceed_count / len(input_data)) * 100
+
+        data_exceeds_limit = exceed_count > 0
+        if exceed_count > 0:
+            self.console.log(
+                f"[yellow]Warning: {exceed_percentage:.2f}% of prompts exceed token limit. "
+                f"Average token count: {avg_tokens:.2f}. "
+                f"Truncating input data when generating validators.[/yellow]"
+            )
+
+        # Execute the original operation on the sample data
+        no_change_start = time.time()
+        output_data = self._run_operation(op_config, input_data, is_build=True)
+        no_change_runtime = time.time() - no_change_start
+
+        # Generate custom validator prompt
+        validator_prompt = self.prompt_generator._generate_validator_prompt(
+            op_config, input_data, output_data
+        )
+
+        # Log the validator prompt
+        self.console.log("[bold]Validator Prompt:[/bold]")
+        self.console.log(validator_prompt)
+        self.console.log("\n")  # Add a newline for better readability
+
+        # Step 2: Use the validator prompt to assess the operation's performance
+        assessment = self.evaluator._assess_operation(
+            op_config, input_data, output_data, validator_prompt
+        )
+
+        # Print out the assessment
+        self.console.log(
+            f"[bold]Assessment for whether we should improve operation {op_config['name']}:[/bold]"
+        )
+        self.console.log(json.dumps(assessment, indent=2))
+        self.console.log("\n")  # Add a newline for better readability
+
+        # Check if improvement is needed based on the assessment
+        if not data_exceeds_limit and not assessment.get("needs_improvement", True):
+            self.console.log(
+                f"[green]No improvement needed for operation {op_config['name']}[/green]"
+            )
+            return [op_config], output_data, self.plan_generator.reduce_optimizer_cost
+
+        candidate_plans = {}
+
+        # Generate improved prompt plan
+        if not data_exceeds_limit:
+            #     improved_prompt_plan = self.prompt_generator._get_improved_prompt(
+            #         op_config, assessment, input_data
+            #     )
+            #     candidate_plans["improved_instructions"] = improved_prompt_plan
+            candidate_plans["no_change"] = [op_config]
+
+        # Generate chunk size plans
+        chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
+            op_config, input_data, validator_prompt, model_input_context_length
+        )
+        for pname, plan in chunk_size_plans.items():
+            candidate_plans[pname] = plan
+
+        # Generate gleaning plans
+        if not data_exceeds_limit:
+            gleaning_plans = self.plan_generator._generate_gleaning_plans(
+                op_config, validator_prompt
+            )
+            for pname, plan in gleaning_plans.items():
+                candidate_plans[pname] = plan
+
+        # Generate chain decomposition plans
+        if not data_exceeds_limit:
+            if not self.is_filter:
+                chain_plans = self.plan_generator._generate_chain_plans(
+                    op_config, input_data
+                )
+                for pname, plan in chain_plans.items():
+                    candidate_plans[pname] = plan
+
+                # Generate parallel map plans
+                parallel_plans = self.plan_generator._generate_parallel_plans(
+                    op_config, input_data
+                )
+                for pname, plan in parallel_plans.items():
+                    candidate_plans[pname] = plan
+
+        # Select consistent evaluation samples
+        num_evaluations = min(5, len(input_data))
+        evaluation_samples = select_evaluation_samples(input_data, num_evaluations)
+
+        results = {}
+        plans_list = list(candidate_plans.items())
+        for i in range(0, len(plans_list), self._num_plans_to_evaluate_in_parallel):
+            batch = plans_list[i : i + self._num_plans_to_evaluate_in_parallel]
+            with ThreadPoolExecutor(
+                max_workers=self._num_plans_to_evaluate_in_parallel
+            ) as executor:
+                futures = {
+                    executor.submit(
+                        self.evaluator._evaluate_plan,
+                        plan_name,
+                        op_config,
+                        plan,
+                        copy.deepcopy(evaluation_samples),
+                        validator_prompt,
+                    ): plan_name
+                    for plan_name, plan in batch
+                }
+                for future in as_completed(futures):
+                    plan_name = futures[future]
+                    try:
+                        score, runtime, output = future.result(timeout=self.timeout)
+                        results[plan_name] = (score, runtime, output)
+                    except concurrent.futures.TimeoutError:
+                        self.console.log(
+                            f"[yellow]Plan {plan_name} timed out and will be skipped.[/yellow]"
+                        )
+                    except Exception as e:
+                        # TODO: raise this error if the error is related to a Jinja error
+                        self.console.log(
+                            f"[red]Error in plan {plan_name}: {str(e)}[/red]"
+                        )
+                        import traceback
+
+                        print(traceback.format_exc())
+
+        # Add no change plan
+        if not data_exceeds_limit:
+            results["no_change"] = (
+                results["no_change"][0],
+                no_change_runtime,
+                results["no_change"][2],
+            )
+
+        # Create a table of scores sorted in descending order
+        scores = sorted(
+            [(score, runtime, plan) for plan, (score, runtime, _) in results.items()],
+            reverse=True,
+        )
+
+        # Sort results by score in descending order
+        sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
+
+        # Take the top 6 plans
+        top_plans = sorted_results[:6]
+
+        # Check if there are no top plans
+        if len(top_plans) == 0:
+            raise ValueError(
+                "Agent did not generate any plans. Unable to proceed with optimization. Try again."
+            )
+
+        # Include any additional plans that are tied with the last plan
+        tail_score = top_plans[-1][1][0] if len(top_plans) == 6 else float("-inf")
+        filtered_results = dict(
+            top_plans
+            + [
+                item
+                for item in sorted_results[len(top_plans) :]
+                if item[1][0] == tail_score
+            ]
+        )
+
+        # Perform pairwise comparisons on filtered plans
+        if len(filtered_results) > 1:
+            pairwise_rankings = self.evaluator._pairwise_compare_plans(
+                filtered_results, validator_prompt, op_config, evaluation_samples
+            )
+            best_plan_name = max(pairwise_rankings, key=pairwise_rankings.get)
+        else:
+            pairwise_rankings = {k: 0 for k in results.keys()}
+            best_plan_name = (
+                next(iter(filtered_results))
+                if filtered_results
+                else max(results, key=lambda x: results[x][0])
+            )
+
+        self.console.log(
+            f"\n[bold]Plan Evaluation Results for {op_config['name']} ({op_config['type']}, {len(scores)} plans, {num_evaluations} samples):[/bold]"
+        )
+        table = Table(show_header=True, header_style="bold magenta")
+        table.add_column("Plan", style="dim")
+        table.add_column("Score", justify="right", width=10)
+        table.add_column("Runtime", justify="right", width=10)
+        table.add_column("Pairwise Wins", justify="right", width=10)
+
+        for score, runtime, plan in scores:
+            table.add_row(
+                plan,
+                f"{score:.2f}",
+                f"{runtime:.2f}s",
+                f"{pairwise_rankings.get(plan, 0)}",
+            )
+
+        self.console.log(table)
+        self.console.log("\n")
+
+        _, _, best_output = results[best_plan_name]
+        self.console.log(
+            f"[green]Choosing {best_plan_name} for operation {op_config['name']} (Score: {results[best_plan_name][0]:.2f}, Runtime: {results[best_plan_name][1]:.2f}s)[/green]"
+        )
+
+        return (
+            candidate_plans[best_plan_name],
+            best_output,
+            self.plan_generator.reduce_optimizer_cost,
+        )
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(config, console, llm_client, max_threads, run_operation, timeout=10, is_filter=False) + +

+ + +
+ +

Initialize the MapOptimizer.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
config + Dict[str, Any] + +
+

The configuration dictionary for the optimizer.

+
+
+ required +
console + Console + +
+

A Rich console object for pretty printing.

+
+
+ required +
llm_client + LLMClient + +
+

A client for interacting with a language model.

+
+
+ required +
max_threads + int + +
+

The maximum number of threads to use for parallel execution.

+
+
+ required +
run_operation + Callable + +
+

A function to execute operations.

+
+
+ required +
timeout + int + +
+

The timeout in seconds for operation execution. Defaults to 10.

+
+
+ 10 +
is_filter + bool + +
+

If True, the operation is a filter operation. Defaults to False.

+
+
+ False +
+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
def __init__(
+    self,
+    config: Dict[str, Any],
+    console: Console,
+    llm_client: LLMClient,
+    max_threads: int,
+    run_operation: Callable,
+    timeout: int = 10,
+    is_filter: bool = False,
+):
+    """
+    Initialize the MapOptimizer.
+
+    Args:
+        config (Dict[str, Any]): The configuration dictionary for the optimizer.
+        console (Console): A Rich console object for pretty printing.
+        llm_client (LLMClient): A client for interacting with a language model.
+        max_threads (int): The maximum number of threads to use for parallel execution.
+        run_operation (Callable): A function to execute operations.
+        timeout (int, optional): The timeout in seconds for operation execution. Defaults to 10.
+        is_filter (bool, optional): If True, the operation is a filter operation. Defaults to False.
+    """
+    self.config = config
+    self.console = console
+    self.llm_client = llm_client
+    self._run_operation = run_operation
+    self.max_threads = max_threads
+    self.timeout = timeout
+    self._num_plans_to_evaluate_in_parallel = 5
+    self.is_filter = is_filter
+
+    self.plan_generator = PlanGenerator(
+        llm_client, console, config, run_operation, max_threads, is_filter
+    )
+    self.evaluator = Evaluator(
+        llm_client,
+        console,
+        run_operation,
+        timeout,
+        self._num_plans_to_evaluate_in_parallel,
+        is_filter,
+    )
+    self.prompt_generator = PromptGenerator(
+        llm_client, console, config, max_threads, is_filter
+    )
+
+
+
+ +
+ +
+ + +

+ optimize(op_config, input_data) + +

+ + +
+ +

Optimize the given operation configuration for the input data. +This method analyzes the operation and input data, generates various +optimization plans, evaluates them, and returns the best plan along +with its output. A key part of this process is creating a custom +validator prompt for evaluation. The validator prompt is generated +based on the specific task, input data, and output data. It serves +as a critical tool for assessing the quality and correctness of +each optimization plan's output. This custom prompt ensures that +the evaluation is tailored to the unique requirements and nuances +of the given operation. The types of optimization plans include:

+
    +
  1. +

    Improved Prompt Plan: Enhances the original prompt based on evaluation, aiming to improve output quality.

    +
  2. +
  3. +

    Chunk Size Plan: Splits input data into chunks of different sizes, + processes each chunk separately, and then combines the results. This + can improve performance for large inputs.

    +
  4. +
  5. +

    Gleaning Plans: Implements an iterative refinement process where the + output is validated and improved over multiple rounds, enhancing accuracy.

    +
  6. +
  7. +

    Chain Decomposition Plan: Breaks down complex operations into a series + of simpler sub-operations, potentially improving overall performance + and interpretability.

    +
  8. +
  9. +

    Parallel Map Plan: Decomposes the task into subtasks that can be + executed in parallel, potentially speeding up processing for + independent operations.

    +
  10. +
+

The method generates these plans, evaluates their performance using +a custom validator, and selects the best performing plan based on +output quality and execution time.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
op_config + Dict[str, Any] + +
+

The configuration of the operation to optimize.

+
+
+ required +
input_data + List[Dict[str, Any]] + +
+

The input data for the operation.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict[str, Any]] + +
+

The best optimization plan: a list of operation configurations that achieve the best performance.

+
+
+ List[Dict[str, Any]] + +
+

The output produced by the best optimization plan.

+
+
+ float + +
+

The cost of the optimizer (from possibly synthesizing resolves).

+
+
+ Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float] + +
+

The full return value: a tuple of the best optimization plan, that plan's output, and the cost of the optimizer (from possibly synthesizing resolves).

+
+
+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
def optimize(
+    self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+    """
+    Optimize the given operation configuration for the input data.
+    This method analyzes the operation and input data, generates various
+    optimization plans, evaluates them, and returns the best plan along
+    with its output. A key part of this process is creating a custom
+    validator prompt for evaluation. The validator prompt is generated
+    based on the specific task, input data, and output data. It serves
+    as a critical tool for assessing the quality and correctness of
+    each optimization plan's output. This custom prompt ensures that
+    the evaluation is tailored to the unique requirements and nuances
+    of the given operation. The types of optimization plans include:
+
+    1. Improved Prompt Plan: Enhances the original prompt based on evaluation, aiming to improve output quality.
+
+    2. Chunk Size Plan: Splits input data into chunks of different sizes,
+       processes each chunk separately, and then combines the results. This
+       can improve performance for large inputs.
+
+    3. Gleaning Plans: Implements an iterative refinement process where the
+       output is validated and improved over multiple rounds, enhancing accuracy.
+
+    4. Chain Decomposition Plan: Breaks down complex operations into a series
+       of simpler sub-operations, potentially improving overall performance
+       and interpretability.
+
+    5. Parallel Map Plan: Decomposes the task into subtasks that can be
+       executed in parallel, potentially speeding up processing for
+       independent operations.
+
+    The method generates these plans, evaluates their performance using
+    a custom validator, and selects the best performing plan based on
+    output quality and execution time.
+
+    Args:
+        op_config (Dict[str, Any]): The configuration of the operation to optimize.
+        input_data (List[Dict[str, Any]]): The input data for the operation.
+
+    Returns:
+        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing
+        the best optimization plan and its output. The plan is a list of
+        operation configurations that achieve the best performance.
+        The cost is the cost of the optimizer (from possibly synthesizing resolves).
+
+    """
+    input_data = copy.deepcopy(input_data)
+    # Add id to each input_data
+    for i in range(len(input_data)):
+        input_data[i]["_map_opt_id"] = str(uuid.uuid4())
+
+    # Define the token limit (adjust as needed)
+    model_input_context_length = model_cost.get(
+        op_config.get("model", self.config.get("default_model")), {}
+    ).get("max_input_tokens", 8192)
+
+    # Render the prompt with all sample inputs and count tokens
+    total_tokens = 0
+    exceed_count = 0
+    for sample in input_data:
+        rendered_prompt = Template(op_config["prompt"]).render(input=sample)
+        prompt_tokens = count_tokens(
+            rendered_prompt,
+            op_config.get("model", self.config.get("default_model")),
+        )
+        total_tokens += prompt_tokens
+
+        if prompt_tokens > model_input_context_length:
+            exceed_count += 1
+
+    # Calculate average tokens and percentage of samples exceeding limit
+    avg_tokens = total_tokens / len(input_data)
+    exceed_percentage = (exceed_count / len(input_data)) * 100
+
+    data_exceeds_limit = exceed_count > 0
+    if exceed_count > 0:
+        self.console.log(
+            f"[yellow]Warning: {exceed_percentage:.2f}% of prompts exceed token limit. "
+            f"Average token count: {avg_tokens:.2f}. "
+            f"Truncating input data when generating validators.[/yellow]"
+        )
+
+    # Execute the original operation on the sample data
+    no_change_start = time.time()
+    output_data = self._run_operation(op_config, input_data, is_build=True)
+    no_change_runtime = time.time() - no_change_start
+
+    # Generate custom validator prompt
+    validator_prompt = self.prompt_generator._generate_validator_prompt(
+        op_config, input_data, output_data
+    )
+
+    # Log the validator prompt
+    self.console.log("[bold]Validator Prompt:[/bold]")
+    self.console.log(validator_prompt)
+    self.console.log("\n")  # Add a newline for better readability
+
+    # Step 2: Use the validator prompt to assess the operation's performance
+    assessment = self.evaluator._assess_operation(
+        op_config, input_data, output_data, validator_prompt
+    )
+
+    # Print out the assessment
+    self.console.log(
+        f"[bold]Assessment for whether we should improve operation {op_config['name']}:[/bold]"
+    )
+    self.console.log(json.dumps(assessment, indent=2))
+    self.console.log("\n")  # Add a newline for better readability
+
+    # Check if improvement is needed based on the assessment
+    if not data_exceeds_limit and not assessment.get("needs_improvement", True):
+        self.console.log(
+            f"[green]No improvement needed for operation {op_config['name']}[/green]"
+        )
+        return [op_config], output_data, self.plan_generator.reduce_optimizer_cost
+
+    candidate_plans = {}
+
+    # Generate improved prompt plan
+    if not data_exceeds_limit:
+        #     improved_prompt_plan = self.prompt_generator._get_improved_prompt(
+        #         op_config, assessment, input_data
+        #     )
+        #     candidate_plans["improved_instructions"] = improved_prompt_plan
+        candidate_plans["no_change"] = [op_config]
+
+    # Generate chunk size plans
+    chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
+        op_config, input_data, validator_prompt, model_input_context_length
+    )
+    for pname, plan in chunk_size_plans.items():
+        candidate_plans[pname] = plan
+
+    # Generate gleaning plans
+    if not data_exceeds_limit:
+        gleaning_plans = self.plan_generator._generate_gleaning_plans(
+            op_config, validator_prompt
+        )
+        for pname, plan in gleaning_plans.items():
+            candidate_plans[pname] = plan
+
+    # Generate chain decomposition plans
+    if not data_exceeds_limit:
+        if not self.is_filter:
+            chain_plans = self.plan_generator._generate_chain_plans(
+                op_config, input_data
+            )
+            for pname, plan in chain_plans.items():
+                candidate_plans[pname] = plan
+
+            # Generate parallel map plans
+            parallel_plans = self.plan_generator._generate_parallel_plans(
+                op_config, input_data
+            )
+            for pname, plan in parallel_plans.items():
+                candidate_plans[pname] = plan
+
+    # Select consistent evaluation samples
+    num_evaluations = min(5, len(input_data))
+    evaluation_samples = select_evaluation_samples(input_data, num_evaluations)
+
+    results = {}
+    plans_list = list(candidate_plans.items())
+    for i in range(0, len(plans_list), self._num_plans_to_evaluate_in_parallel):
+        batch = plans_list[i : i + self._num_plans_to_evaluate_in_parallel]
+        with ThreadPoolExecutor(
+            max_workers=self._num_plans_to_evaluate_in_parallel
+        ) as executor:
+            futures = {
+                executor.submit(
+                    self.evaluator._evaluate_plan,
+                    plan_name,
+                    op_config,
+                    plan,
+                    copy.deepcopy(evaluation_samples),
+                    validator_prompt,
+                ): plan_name
+                for plan_name, plan in batch
+            }
+            for future in as_completed(futures):
+                plan_name = futures[future]
+                try:
+                    score, runtime, output = future.result(timeout=self.timeout)
+                    results[plan_name] = (score, runtime, output)
+                except concurrent.futures.TimeoutError:
+                    self.console.log(
+                        f"[yellow]Plan {plan_name} timed out and will be skipped.[/yellow]"
+                    )
+                except Exception as e:
+                    # TODO: raise this error if the error is related to a Jinja error
+                    self.console.log(
+                        f"[red]Error in plan {plan_name}: {str(e)}[/red]"
+                    )
+                    import traceback
+
+                    print(traceback.format_exc())
+
+    # Add no change plan
+    if not data_exceeds_limit:
+        results["no_change"] = (
+            results["no_change"][0],
+            no_change_runtime,
+            results["no_change"][2],
+        )
+
+    # Create a table of scores sorted in descending order
+    scores = sorted(
+        [(score, runtime, plan) for plan, (score, runtime, _) in results.items()],
+        reverse=True,
+    )
+
+    # Sort results by score in descending order
+    sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
+
+    # Take the top 6 plans
+    top_plans = sorted_results[:6]
+
+    # Check if there are no top plans
+    if len(top_plans) == 0:
+        raise ValueError(
+            "Agent did not generate any plans. Unable to proceed with optimization. Try again."
+        )
+
+    # Include any additional plans that are tied with the last plan
+    tail_score = top_plans[-1][1][0] if len(top_plans) == 6 else float("-inf")
+    filtered_results = dict(
+        top_plans
+        + [
+            item
+            for item in sorted_results[len(top_plans) :]
+            if item[1][0] == tail_score
+        ]
+    )
+
+    # Perform pairwise comparisons on filtered plans
+    if len(filtered_results) > 1:
+        pairwise_rankings = self.evaluator._pairwise_compare_plans(
+            filtered_results, validator_prompt, op_config, evaluation_samples
+        )
+        best_plan_name = max(pairwise_rankings, key=pairwise_rankings.get)
+    else:
+        pairwise_rankings = {k: 0 for k in results.keys()}
+        best_plan_name = (
+            next(iter(filtered_results))
+            if filtered_results
+            else max(results, key=lambda x: results[x][0])
+        )
+
+    self.console.log(
+        f"\n[bold]Plan Evaluation Results for {op_config['name']} ({op_config['type']}, {len(scores)} plans, {num_evaluations} samples):[/bold]"
+    )
+    table = Table(show_header=True, header_style="bold magenta")
+    table.add_column("Plan", style="dim")
+    table.add_column("Score", justify="right", width=10)
+    table.add_column("Runtime", justify="right", width=10)
+    table.add_column("Pairwise Wins", justify="right", width=10)
+
+    for score, runtime, plan in scores:
+        table.add_row(
+            plan,
+            f"{score:.2f}",
+            f"{runtime:.2f}s",
+            f"{pairwise_rankings.get(plan, 0)}",
+        )
+
+    self.console.log(table)
+    self.console.log("\n")
+
+    _, _, best_output = results[best_plan_name]
+    self.console.log(
+        f"[green]Choosing {best_plan_name} for operation {op_config['name']} (Score: {results[best_plan_name][0]:.2f}, Runtime: {results[best_plan_name][1]:.2f}s)[/green]"
+    )
+
+    return (
+        candidate_plans[best_plan_name],
+        best_output,
+        self.plan_generator.reduce_optimizer_cost,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.optimizers.reduce_optimizer.ReduceOptimizer + + +

+ + +
+ + +

A class that optimizes reduce operations in data processing pipelines.

+

This optimizer analyzes the input and output of a reduce operation, creates and evaluates multiple reduce plans, and selects the best plan for optimizing the operation's performance.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict[str, Any] + +
+

Configuration dictionary for the optimizer.

+
+
console + Console + +
+

Rich console object for pretty printing.

+
+
llm_client + LLMClient + +
+

Client for interacting with a language model.

+
+
_run_operation + Callable + +
+

Function to run an operation.

+
+
max_threads + int + +
+

Maximum number of threads to use for parallel processing.

+
+
num_fold_prompts + int + +
+

Number of fold prompts to generate.

+
+
num_samples_in_validation + int + +
+

Number of samples to use in validation.

+
+
+ +
+ Source code in docetl/optimizers/reduce_optimizer.py +
  19
+  20
+  21
+  22
+  23
+  24
+  25
+  26
+  27
+  28
+  29
+  30
+  31
+  32
+  33
+  34
+  35
+  36
+  37
+  38
+  39
+  40
+  41
+  42
+  43
+  44
+  45
+  46
+  47
+  48
+  49
+  50
+  51
+  52
+  53
+  54
+  55
+  56
+  57
+  58
+  59
+  60
+  61
+  62
+  63
+  64
+  65
+  66
+  67
+  68
+  69
+  70
+  71
+  72
+  73
+  74
+  75
+  76
+  77
+  78
+  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
+1019
+1020
+1021
+1022
+1023
+1024
+1025
+1026
+1027
+1028
+1029
+1030
+1031
+1032
+1033
+1034
+1035
+1036
+1037
+1038
+1039
+1040
+1041
+1042
+1043
+1044
+1045
+1046
+1047
+1048
+1049
+1050
+1051
+1052
+1053
+1054
+1055
+1056
+1057
+1058
+1059
+1060
+1061
+1062
+1063
+1064
+1065
+1066
+1067
+1068
+1069
+1070
+1071
+1072
+1073
+1074
+1075
+1076
+1077
+1078
+1079
+1080
+1081
+1082
+1083
+1084
+1085
+1086
+1087
+1088
+1089
+1090
+1091
+1092
+1093
+1094
+1095
+1096
+1097
+1098
+1099
+1100
+1101
+1102
+1103
+1104
+1105
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1118
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+1130
+1131
+1132
+1133
+1134
+1135
+1136
+1137
+1138
+1139
+1140
+1141
+1142
+1143
+1144
+1145
+1146
+1147
+1148
+1149
+1150
+1151
+1152
+1153
+1154
+1155
+1156
+1157
+1158
+1159
+1160
+1161
+1162
+1163
+1164
+1165
+1166
+1167
+1168
+1169
+1170
+1171
+1172
+1173
+1174
+1175
+1176
+1177
+1178
+1179
+1180
+1181
+1182
+1183
+1184
+1185
+1186
+1187
+1188
+1189
+1190
+1191
+1192
+1193
+1194
+1195
+1196
+1197
+1198
+1199
+1200
+1201
+1202
+1203
+1204
+1205
+1206
+1207
+1208
+1209
+1210
+1211
+1212
+1213
+1214
+1215
+1216
+1217
+1218
+1219
+1220
+1221
+1222
+1223
+1224
+1225
+1226
+1227
+1228
+1229
+1230
+1231
+1232
+1233
+1234
+1235
+1236
+1237
+1238
+1239
+1240
+1241
+1242
+1243
+1244
+1245
+1246
+1247
+1248
+1249
+1250
+1251
+1252
+1253
+1254
+1255
+1256
+1257
+1258
+1259
+1260
+1261
+1262
+1263
+1264
+1265
+1266
+1267
+1268
+1269
+1270
+1271
+1272
+1273
+1274
+1275
+1276
+1277
+1278
+1279
+1280
+1281
+1282
+1283
+1284
+1285
+1286
+1287
+1288
+1289
+1290
+1291
+1292
+1293
+1294
+1295
+1296
+1297
+1298
+1299
+1300
+1301
+1302
+1303
+1304
+1305
+1306
+1307
+1308
+1309
+1310
+1311
+1312
+1313
+1314
+1315
+1316
+1317
+1318
+1319
+1320
+1321
+1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
+1341
+1342
+1343
+1344
+1345
+1346
+1347
+1348
+1349
+1350
+1351
+1352
+1353
+1354
+1355
+1356
+1357
+1358
+1359
+1360
+1361
+1362
+1363
+1364
+1365
+1366
+1367
+1368
+1369
+1370
+1371
+1372
+1373
+1374
+1375
+1376
+1377
+1378
+1379
+1380
+1381
+1382
+1383
+1384
+1385
+1386
+1387
+1388
+1389
+1390
+1391
+1392
+1393
+1394
+1395
+1396
+1397
+1398
+1399
+1400
+1401
+1402
+1403
+1404
+1405
+1406
+1407
+1408
+1409
+1410
+1411
+1412
+1413
+1414
+1415
+1416
+1417
+1418
+1419
+1420
+1421
+1422
+1423
+1424
+1425
+1426
+1427
+1428
+1429
+1430
+1431
+1432
+1433
+1434
+1435
+1436
+1437
+1438
+1439
+1440
+1441
+1442
+1443
+1444
+1445
+1446
+1447
+1448
+1449
+1450
+1451
+1452
+1453
+1454
+1455
+1456
+1457
+1458
+1459
+1460
+1461
+1462
+1463
+1464
+1465
+1466
+1467
+1468
+1469
+1470
+1471
+1472
+1473
+1474
+1475
+1476
+1477
+1478
+1479
+1480
+1481
+1482
+1483
+1484
+1485
+1486
+1487
+1488
+1489
+1490
+1491
+1492
+1493
+1494
+1495
+1496
+1497
+1498
+1499
+1500
+1501
+1502
+1503
+1504
+1505
+1506
+1507
+1508
+1509
+1510
+1511
+1512
+1513
+1514
+1515
+1516
+1517
+1518
+1519
+1520
+1521
+1522
+1523
+1524
+1525
+1526
+1527
+1528
+1529
+1530
+1531
+1532
+1533
+1534
+1535
+1536
+1537
+1538
+1539
+1540
+1541
+1542
+1543
+1544
+1545
+1546
+1547
+1548
+1549
+1550
+1551
+1552
+1553
+1554
+1555
+1556
+1557
+1558
+1559
+1560
+1561
+1562
+1563
+1564
+1565
+1566
+1567
+1568
+1569
+1570
+1571
+1572
+1573
+1574
+1575
+1576
+1577
+1578
+1579
+1580
+1581
+1582
+1583
+1584
+1585
+1586
+1587
+1588
+1589
+1590
+1591
+1592
+1593
+1594
+1595
+1596
+1597
+1598
+1599
+1600
+1601
+1602
+1603
+1604
+1605
+1606
+1607
+1608
+1609
+1610
+1611
+1612
+1613
+1614
+1615
+1616
+1617
+1618
+1619
+1620
+1621
+1622
+1623
+1624
+1625
+1626
+1627
+1628
+1629
+1630
+1631
+1632
+1633
+1634
+1635
+1636
+1637
+1638
+1639
+1640
+1641
+1642
+1643
+1644
+1645
+1646
+1647
+1648
+1649
+1650
+1651
+1652
+1653
+1654
+1655
+1656
+1657
+1658
+1659
+1660
+1661
+1662
+1663
+1664
+1665
+1666
+1667
+1668
+1669
+1670
+1671
+1672
+1673
+1674
+1675
+1676
+1677
+1678
+1679
+1680
+1681
+1682
+1683
+1684
+1685
+1686
+1687
+1688
+1689
+1690
+1691
+1692
+1693
+1694
+1695
+1696
+1697
+1698
+1699
+1700
+1701
+1702
+1703
+1704
+1705
+1706
+1707
+1708
+1709
+1710
+1711
+1712
+1713
+1714
+1715
+1716
+1717
+1718
+1719
+1720
+1721
+1722
+1723
+1724
+1725
+1726
+1727
+1728
+1729
+1730
class ReduceOptimizer:
+    """
+    A class that optimizes reduce operations in data processing pipelines.
+
+    This optimizer analyzes the input and output of a reduce operation, creates and evaluates
+    multiple reduce plans, and selects the best plan for optimizing the operation's performance.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for the optimizer.
+        console (Console): Rich console object for pretty printing.
+        llm_client (LLMClient): Client for interacting with a language model.
+        _run_operation (Callable): Function to run an operation.
+        max_threads (int): Maximum number of threads to use for parallel processing.
+        num_fold_prompts (int): Number of fold prompts to generate.
+        num_samples_in_validation (int): Number of samples to use in validation.
+    """
+
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        console: Console,
+        llm_client: LLMClient,
+        max_threads: int,
+        run_operation: Callable,
+        num_fold_prompts: int = 1,
+        num_samples_in_validation: int = 10,
+    ):
+        """
+        Initialize the ReduceOptimizer.
+
+        Args:
+            config (Dict[str, Any]): Configuration dictionary for the optimizer.
+            console (Console): Rich console object for pretty printing.
+            llm_client (LLMClient): Client for interacting with a language model.
+            max_threads (int): Maximum number of threads to use for parallel processing.
+            run_operation (Callable): Function to run an operation.
+            num_fold_prompts (int, optional): Number of fold prompts to generate. Defaults to 1.
+            num_samples_in_validation (int, optional): Number of samples to use in validation. Defaults to 10.
+        """
+        self.config = config
+        self.console = console
+        self.llm_client = llm_client
+        self._run_operation = run_operation
+        self.max_threads = max_threads
+        self.num_fold_prompts = num_fold_prompts
+        self.num_samples_in_validation = num_samples_in_validation
+
+    def optimize(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize the reduce operation based on the given configuration and input data.
+
+        This method performs the following steps:
+        1. Run the original operation
+        2. Generate a validator prompt
+        3. Validate the output
+        4. If improvement is needed:
+           a. Evaluate if decomposition is beneficial
+           b. If decomposition is beneficial, recursively optimize each sub-operation
+           c. If not, proceed with single operation optimization
+        5. Run the optimized operation(s)
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+            and the list of outputs from the optimized operation(s), and the cost of the operation due to synthesizing any resolve operations.
+        """
+        # Check if we're running out of token limits for the reduce prompt
+        model = op_config.get("model", self.config.get("default_model", "gpt-4o-mini"))
+        model_input_context_length = model_cost.get(model, {}).get(
+            "max_input_tokens", 4096
+        )
+
+        # Find the key with the longest value
+        longest_key = max(
+            op_config["reduce_key"], key=lambda k: len(str(input_data[0][k]))
+        )
+        sample_key = tuple(
+            input_data[0][k] if k == longest_key else input_data[0][k]
+            for k in op_config["reduce_key"]
+        )
+
+        # Render the prompt with a sample input
+        prompt_template = Template(op_config["prompt"])
+        sample_prompt = prompt_template.render(
+            reduce_key=dict(zip(op_config["reduce_key"], sample_key)),
+            inputs=[input_data[0]],
+        )
+
+        # Count tokens in the sample prompt
+        prompt_tokens = count_tokens(sample_prompt, model)
+
+        add_map_op = False
+        if prompt_tokens * 2 > model_input_context_length:
+            add_map_op = True
+            self.console.log(
+                f"[yellow]Warning: The reduce prompt exceeds the token limit for model {model}. "
+                f"Token count: {prompt_tokens}, Limit: {model_input_context_length}. "
+                f"Add a map operation to the pipeline.[/yellow]"
+            )
+
+        # # Also query an agent to look at a sample of the inputs and see if they think a map operation would be helpful
+        # preprocessing_steps = ""
+        # should_use_map, preprocessing_steps = self._should_use_map(
+        #     op_config, input_data
+        # )
+        # if should_use_map or add_map_op:
+        #     # Synthesize a map operation
+        #     map_prompt, map_output_schema = self._synthesize_map_operation(
+        #         op_config, preprocessing_steps, input_data
+        #     )
+        #     # Change the reduce operation prompt to use the map schema
+        #     new_reduce_prompt = self._change_reduce_prompt_to_use_map_schema(
+        #         op_config["prompt"], map_output_schema
+        #     )
+        #     op_config["prompt"] = new_reduce_prompt
+
+        #     # Return unoptimized map and reduce operations
+        #     return [map_prompt, op_config], input_data, 0.0
+
+        original_output = self._run_operation(op_config, input_data)
+
+        # Step 1: Synthesize a validator prompt
+        validator_prompt = self._generate_validator_prompt(
+            op_config, input_data, original_output
+        )
+
+        # Log the validator prompt
+        self.console.log("[bold]Validator Prompt:[/bold]")
+        self.console.log(validator_prompt)
+        self.console.log("\n")  # Add a newline for better readability
+
+        # Step 2: validate the output
+        validator_inputs = self._create_validation_inputs(
+            input_data, op_config["reduce_key"]
+        )
+        validation_results = self._validate_reduce_output(
+            op_config, validator_inputs, original_output, validator_prompt
+        )
+
+        # Print the validation results
+        self.console.log("[bold]Validation Results:[/bold]")
+        if validation_results["needs_improvement"]:
+            self.console.log(
+                "\n".join(
+                    [
+                        f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                        for result in validation_results["validation_results"]
+                    ]
+                )
+            )
+
+            # Step 3: Evaluate if decomposition is beneficial
+            decomposition_result = self._evaluate_decomposition(
+                op_config, input_data, level
+            )
+
+            if decomposition_result["should_decompose"]:
+                return self._optimize_decomposed_reduce(
+                    decomposition_result, op_config, input_data, level
+                )
+
+            return self._optimize_single_reduce(op_config, input_data, validator_prompt)
+        else:
+            self.console.log("No improvements identified.")
+            return [op_config], original_output, 0.0
+
+    def _should_use_map(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Tuple[bool, str]:
+        """
+        Determine if a map operation should be used based on the input data.
+        """
+        # Sample a random input item
+        sample_input = random.choice(input_data)
+
+        # Format the prompt with the sample input
+        prompt_template = Template(op_config["prompt"])
+        formatted_prompt = prompt_template.render(
+            reduce_key=dict(
+                zip(op_config["reduce_key"], sample_input[op_config["reduce_key"]])
+            ),
+            inputs=[sample_input],
+        )
+
+        # Prepare the message for the LLM
+        messages = [{"role": "user", "content": formatted_prompt}]
+
+        # Truncate the messages to fit the model's context window
+        truncated_messages = truncate_messages(
+            messages, self.config.get("model", self.default_model)
+        )
+
+        # Query the LLM for preprocessing suggestions
+        preprocessing_prompt = (
+            "Based on the following reduce operation prompt, should we do any preprocessing on the input data? "
+            "Consider if we need to remove unnecessary context, or logically construct an output that will help in the task. "
+            "If preprocessing would be beneficial, explain why and suggest specific steps. If not, explain why preprocessing isn't necessary.\n\n"
+            f"Reduce operation prompt:\n{truncated_messages[0]['content']}"
+        )
+
+        preprocessing_response = self.llm_client.generate(
+            model=self.config.get("model", self.default_model),
+            messages=[{"role": "user", "content": preprocessing_prompt}],
+            response_format={
+                "type": "json_object",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "preprocessing_needed": {"type": "boolean"},
+                        "rationale": {"type": "string"},
+                        "suggested_steps": {"type": "string"},
+                    },
+                    "required": [
+                        "preprocessing_needed",
+                        "rationale",
+                        "suggested_steps",
+                    ],
+                },
+            },
+        )
+
+        preprocessing_result = preprocessing_response.choices[0].message.content
+
+        should_preprocess = preprocessing_result["preprocessing_needed"]
+        preprocessing_rationale = preprocessing_result["rationale"]
+
+        self.console.log(f"[bold]Map-Reduce Decomposition Analysis:[/bold]")
+        self.console.log(f"Should write a map operation: {should_preprocess}")
+        self.console.log(f"Rationale: {preprocessing_rationale}")
+
+        if should_preprocess:
+            self.console.log(
+                f"Suggested steps: {preprocessing_result['suggested_steps']}"
+            )
+
+        return should_preprocess, preprocessing_result["suggested_steps"]
+
+    def _optimize_single_reduce(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize a single reduce operation.
+
+        This method performs the following steps:
+        1. Determine and configure value sampling
+        2. Determine if the reduce operation is associative
+        3. Create and evaluate multiple reduce plans
+        4. Run the best reduce plan
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            validator_prompt (str): The validator prompt for evaluating reduce plans.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing a single-item list with the optimized configuration
+            and a single-item list with the output from the optimized operation, and the cost of the operation due to synthesizing any resolve operations.
+        """
+        # Step 1: Determine and configure value sampling
+        value_sampling_config = self._determine_value_sampling(op_config, input_data)
+        if value_sampling_config["enabled"]:
+            op_config["value_sampling"] = value_sampling_config
+            self.console.log("[bold]Value Sampling Configuration:[/bold]")
+            self.console.log(json.dumps(value_sampling_config, indent=2))
+
+        # Step 2: Determine if the reduce operation is associative
+        is_associative = self._is_associative(op_config, input_data)
+
+        # Step 3: Create and evaluate multiple reduce plans
+        reduce_plans = self._create_reduce_plans(op_config, input_data, is_associative)
+        best_plan = self._evaluate_reduce_plans(
+            op_config, reduce_plans, input_data, validator_prompt
+        )
+
+        # Step 4: Run the best reduce plan
+        optimized_output = self._run_operation(best_plan, input_data)
+
+        return [best_plan], optimized_output, 0.0
+
+    def _optimize_decomposed_reduce(
+        self,
+        decomposition_result: Dict[str, Any],
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize a decomposed reduce operation.
+
+        This method performs the following steps:
+        1. Group the input data by the sub-group key.
+        2. Optimize the first reduce operation.
+        3. Run the optimized first reduce operation on all groups.
+        4. Optimize the second reduce operation using the results of the first.
+        5. Run the optimized second reduce operation.
+
+        Args:
+            decomposition_result (Dict[str, Any]): The result of the decomposition evaluation.
+            op_config (Dict[str, Any]): The original reduce operation configuration.
+            input_data (List[Dict[str, Any]]): The input data for the reduce operation.
+            level (int): The current level of decomposition.
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+            for both reduce operations and the final output of the second reduce operation, and the cost of the operation due to synthesizing any resolve operations.
+        """
+        sub_group_key = decomposition_result["sub_group_key"]
+        first_reduce_prompt = decomposition_result["first_reduce_prompt"]
+        second_reduce_prompt = decomposition_result["second_reduce_prompt"]
+        pipeline = []
+        all_cost = 0.0
+
+        first_reduce_config = op_config.copy()
+        first_reduce_config["prompt"] = first_reduce_prompt
+        if isinstance(op_config["reduce_key"], list):
+            first_reduce_config["reduce_key"] = [sub_group_key] + op_config[
+                "reduce_key"
+            ]
+        else:
+            first_reduce_config["reduce_key"] = [sub_group_key, op_config["reduce_key"]]
+        first_reduce_config["pass_through"] = True
+
+        if first_reduce_config.get("synthesize_resolve", True):
+            resolve_config = {
+                "type": "resolve",
+                "empty": True,
+                "embedding_model": "text-embedding-3-small",
+                "resolution_model": self.config.get("default_model", "gpt-4o-mini"),
+                "comparison_model": self.config.get("default_model", "gpt-4o-mini"),
+                "_intermediates": {
+                    "map_prompt": op_config.get("_intermediates", {}).get(
+                        "last_map_prompt"
+                    ),
+                    "reduce_key": first_reduce_config["reduce_key"],
+                },
+            }
+            optimized_resolve_config, resolve_cost = JoinOptimizer(
+                self.config,
+                resolve_config,
+                self.console,
+                self.llm_client,
+                self.max_threads,
+            ).optimize_resolve(input_data)
+            all_cost += resolve_cost
+
+            if not optimized_resolve_config.get("empty", False):
+                # Add this to the pipeline
+                pipeline += [optimized_resolve_config]
+
+                # Run the resolver
+                optimized_output = self._run_operation(
+                    optimized_resolve_config, input_data
+                )
+                input_data = optimized_output
+
+        first_optimized_configs, first_outputs, first_cost = self.optimize(
+            first_reduce_config, input_data, level + 1
+        )
+        pipeline += first_optimized_configs
+        all_cost += first_cost
+
+        # Optimize second reduce operation
+        second_reduce_config = op_config.copy()
+        second_reduce_config["prompt"] = second_reduce_prompt
+        second_reduce_config["pass_through"] = True
+
+        second_optimized_configs, second_outputs, second_cost = self.optimize(
+            second_reduce_config, first_outputs, level + 1
+        )
+
+        # Combine optimized configs and return with final output
+        pipeline += second_optimized_configs
+        all_cost += second_cost
+
+        return pipeline, second_outputs, all_cost
+
+    def _evaluate_decomposition(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Evaluate whether decomposing the reduce operation would be beneficial.
+
+        This method first determines if decomposition would be helpful, and if so,
+        it then determines the sub-group key and prompts for the decomposed operations.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            level (int): The current level of decomposition.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the decomposition decision and details.
+        """
+        should_decompose = self._should_decompose(op_config, input_data, level)
+
+        # Log the decomposition decision
+        if should_decompose["should_decompose"]:
+            self.console.log(
+                f"[bold green]Decomposition recommended:[/bold green] {should_decompose['explanation']}"
+            )
+        else:
+            self.console.log(
+                f"[bold yellow]Decomposition not recommended:[/bold yellow] {should_decompose['explanation']}"
+            )
+
+        # Return early if decomposition is not recommended
+        if not should_decompose["should_decompose"]:
+            return should_decompose
+
+        decomposition_details = self._get_decomposition_details(op_config, input_data)
+        result = {**should_decompose, **decomposition_details}
+        if decomposition_details["sub_group_key"] in op_config["reduce_key"]:
+            result["should_decompose"] = False
+            result[
+                "explanation"
+            ] += " However, the suggested sub-group key is already part of the current reduce key(s), so decomposition is not recommended."
+            result["sub_group_key"] = ""
+
+        return result
+
+    def _should_decompose(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Determine if decomposing the reduce operation would be beneficial.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            level (int): The current level of decomposition.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the decomposition decision and explanation.
+        """
+        # TODO: we have not enabled recursive decomposition yet
+        if level > 1 and not op_config.get("recursively_optimize", False):
+            return {
+                "should_decompose": False,
+                "explanation": "Recursive decomposition is not enabled.",
+            }
+
+        system_prompt = (
+            "You are an AI assistant tasked with optimizing data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(10, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        # Get all keys from the input data
+        all_keys = set().union(*(item.keys() for item in sample_input))
+        reduce_key = op_config["reduce_key"]
+        reduce_keys = [reduce_key] if isinstance(reduce_key, str) else reduce_key
+        other_keys = [key for key in all_keys if key not in reduce_keys]
+
+        # See if there's an input schema and constrain the sample_input to that schema
+        input_schema = op_config.get("input", {}).get("schema", {})
+        if input_schema:
+            sample_input = [
+                {key: item[key] for key in input_schema} for item in sample_input
+            ]
+
+        # Create a sample of values for other keys
+        sample_values = {
+            key: list(set(str(item.get(key))[:50] for item in sample_input))[:5]
+            for key in other_keys
+        }
+
+        prompt = f"""Analyze the following reduce operation and determine if it should be decomposed into two reduce operations chained together:
+
+        Reduce Operation Prompt:
+        ```
+        {op_config['prompt']}
+        ```
+
+        Current Reduce Key(s): {reduce_keys}
+        Other Available Keys: {', '.join(other_keys)}
+
+        Sample values for other keys:
+        {json.dumps(sample_values, indent=2)}
+
+        Based on this information, determine if it would be beneficial to decompose this reduce operation into a sub-reduce operation followed by a final reduce operation. Consider the following:
+
+        1. Is there a natural hierarchy in the data (e.g., country -> state -> city) among the other available keys, with a key at a finer level of granularity than the current reduce key(s)?
+        2. Are the current reduce key(s) some form of ID, and are there many different types of inputs for that ID among the other available keys?
+        3. Does the prompt implicitly ask for sub-grouping based on the other available keys (e.g., "summarize policies by state, then by country")?
+        4. Would splitting the operation improve accuracy (i.e., make sure information isn't lost when reducing)?
+        5. Are all the keys of the potential hierarchy provided in the other available keys? If not, we should not decompose.
+        6. Importantly, do not suggest decomposition using any key that is already part of the current reduce key(s). We are looking for a new key from the other available keys to use for sub-grouping.
+
+        Provide your analysis in the following format:
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "should_decompose": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["should_decompose", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)
+
+    def _get_decomposition_details(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+    ) -> Dict[str, Any]:
+        """
+        Determine the sub-group key and prompts for decomposed reduce operations.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the sub-group key and prompts for decomposed operations.
+        """
+        system_prompt = (
+            "You are an AI assistant tasked with optimizing data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(10, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        # Get all keys from the input data
+        all_keys = set().union(*(item.keys() for item in sample_input))
+        reduce_key = op_config["reduce_key"]
+        reduce_keys = [reduce_key] if isinstance(reduce_key, str) else reduce_key
+        other_keys = [key for key in all_keys if key not in reduce_keys]
+
+        prompt = f"""Given that we've decided to decompose the following reduce operation, suggest a two-step reduce process:
+
+        Reduce Operation Prompt:
+        ```
+        {op_config['prompt']}
+        ```
+
+        Reduce Key(s): {reduce_key}
+        Other Keys: {', '.join(other_keys)}
+
+        Provide the following:
+        1. A sub-group key to use for the first reduce operation
+        2. A prompt for the first reduce operation
+        3. A prompt for the second (final) reduce operation
+
+        For the reduce operation prompts, you should only minimally modify the original prompt. The prompts should be Jinja templates, and the only variables they can access are the `reduce_key` and `inputs` variables.
+
+        Provide your suggestions in the following format:
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "sub_group_key": {"type": "string"},
+                "first_reduce_prompt": {"type": "string"},
+                "second_reduce_prompt": {"type": "string"},
+            },
+            "required": [
+                "sub_group_key",
+                "first_reduce_prompt",
+                "second_reduce_prompt",
+            ],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)
+
+    def _determine_value_sampling(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Determine whether value sampling should be enabled and configure its parameters.
+        """
+        system_prompt = (
+            "You are an AI assistant helping to optimize data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(100, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        prompt = f"""
+        Analyze the following reduce operation and determine if value sampling should be enabled:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data (first 2 items):
+        {json.dumps(sample_input[:2], indent=2)}
+
+        Value sampling is appropriate for reduce operations that don't need to look at all the values for each key to produce a good result, such as generic summarization tasks.
+
+        Based on the reduce operation prompt and the sample input data, determine if value sampling should be enabled.
+        Answer with 'yes' if value sampling should be enabled or 'no' if it should not be enabled. Explain your reasoning briefly.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "enable_sampling": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["enable_sampling", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+
+        if not result["enable_sampling"]:
+            return {"enabled": False}
+
+        # Print the explanation for enabling value sampling
+        self.console.log(f"Value sampling enabled: {result['explanation']}")
+
+        # Determine sampling method
+        prompt = f"""
+        We are optimizing a reduce operation in a data processing pipeline. The reduce operation is defined by the following prompt:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data (first 2 items):
+        {json.dumps(sample_input[:2], indent=2)}
+
+        We have determined that value sampling should be enabled for this reduce operation. Value sampling is a technique used to process only a subset of the input data for each reduce key, rather than processing all items. This can significantly reduce processing time and costs for very large datasets, especially when the reduce operation doesn't require looking at every single item to produce a good result (e.g., summarization tasks).
+
+        Now we need to choose the most appropriate sampling method. The available methods are:
+
+        1. "random": Randomly select a subset of values.
+        Example: In a customer review analysis task, randomly selecting a subset of reviews to summarize the overall sentiment.
+
+        2. "cluster": Use K-means clustering to select representative samples.
+        Example: In a document categorization task, clustering documents based on their content and selecting representative documents from each cluster to determine the overall categories.
+
+        3. "sem_sim": Use semantic similarity to select the most relevant samples to a query text.
+        Example: In a news article summarization task, selecting articles that are semantically similar to a query like "Major economic events of {{reduce_key}}" to produce a focused summary.
+
+        Based on the reduce operation prompt, the nature of the task, and the sample input data, which sampling method would be most appropriate?
+
+        Provide your answer as either "random", "cluster", or "sem_sim", and explain your reasoning in detail. Consider the following in your explanation:
+        - The nature of the reduce task (e.g., summarization, aggregation, analysis)
+        - The structure and content of the input data
+        - The potential benefits and drawbacks of each sampling method for this specific task
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "method": {"type": "string", "enum": ["random", "cluster", "sem_sim"]},
+                "explanation": {"type": "string"},
+            },
+            "required": ["method", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+        method = result["method"]
+
+        value_sampling_config = {
+            "enabled": True,
+            "method": method,
+            "sample_size": 100,  # Default sample size
+            "embedding_model": "text-embedding-3-small",
+        }
+
+        if method in ["cluster", "sem_sim"]:
+            # Determine embedding keys
+            prompt = f"""
+            For the {method} sampling method, we need to determine which keys from the input data should be used for generating embeddings.
+
+            Input data keys:
+            {', '.join(sample_input[0].keys())}
+
+            Sample Input Data:
+            {json.dumps(sample_input[0], indent=2)[:1000]}...
+
+            Based on the reduce operation prompt and the sample input data, which keys should be used for generating embeddings? Use keys that will create meaningful embeddings (i.e., not id-related keys).
+            Provide your answer as a list of key names that is a subset of the input data keys. You should pick only the 1-3 keys that are necessary for generating meaningful embeddings, that have relatively short values.
+            """
+
+            parameters = {
+                "type": "object",
+                "properties": {
+                    "embedding_keys": {"type": "array", "items": {"type": "string"}},
+                    "explanation": {"type": "string"},
+                },
+                "required": ["embedding_keys", "explanation"],
+            }
+
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            result = json.loads(response.choices[0].message.content)
+            # TODO: validate that these exist
+            embedding_keys = result["embedding_keys"]
+            for key in result["embedding_keys"]:
+                if key not in sample_input[0]:
+                    embedding_keys.remove(key)
+
+            if not embedding_keys:
+                # Select the reduce key
+                self.console.log(
+                    "No embedding keys found, selecting reduce key for embedding key"
+                )
+                embedding_keys = (
+                    op_config["reduce_key"]
+                    if isinstance(op_config["reduce_key"], list)
+                    else [op_config["reduce_key"]]
+                )
+
+            value_sampling_config["embedding_keys"] = embedding_keys
+
+        if method == "sem_sim":
+            # Determine query text
+            prompt = f"""
+            For the semantic similarity (sem_sim) sampling method, we need to determine the query text to compare against when selecting samples.
+
+            Reduce Operation Prompt:
+            {op_config['prompt']}
+
+            The query text should be a Jinja template with access to the `reduce_key` variable.
+            Based on the reduce operation prompt, what would be an appropriate query text for selecting relevant samples?
+            """
+
+            parameters = {
+                "type": "object",
+                "properties": {
+                    "query_text": {"type": "string"},
+                    "explanation": {"type": "string"},
+                },
+                "required": ["query_text", "explanation"],
+            }
+
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            result = json.loads(response.choices[0].message.content)
+            value_sampling_config["query_text"] = result["query_text"]
+
+        return value_sampling_config
+
+    def _is_associative(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> bool:
+        """
+        Determine if the reduce operation is associative.
+
+        This method analyzes the reduce operation configuration and a sample of the input data
+        to determine if the operation is associative (i.e., the order of combining elements
+        doesn't affect the final result).
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            bool: True if the operation is determined to be associative, False otherwise.
+        """
+        system_prompt = (
+            "You are an AI assistant helping to optimize data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(5, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        prompt = f"""
+        Analyze the following reduce operation and determine if it is associative:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data:
+        {json.dumps(sample_input, indent=2)[:1000]}...
+
+        Based on the reduce operation prompt, determine whether the order in which we process data matters.
+        Answer with 'yes' if order matters or 'no' if order doesn't matter.
+        Explain your reasoning briefly.
+
+        For example:
+        - Merging extracted key-value pairs from documents does not require order: combining {{"name": "John", "age": 30}} with {{"city": "New York", "job": "Engineer"}} yields the same result regardless of order
+        - Generating a timeline of events requires order: the order of events matters for maintaining chronological accuracy.
+
+        Consider these examples when determining whether the order in which we process data matters. You might also have to consider the specific data.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "order_matters": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["order_matters", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+        result["is_associative"] = not result["order_matters"]
+
+        self.console.log(
+            f"Reduce operation {'is associative' if result['is_associative'] else 'is not associative'}. Analysis: {result['explanation']}"
+        )
+        return result["is_associative"]
+
+    def _generate_validator_prompt(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        original_output: List[Dict[str, Any]],
+    ) -> str:
+        """
+        Generate a custom validator prompt for assessing the quality of the reduce operation output.
+
+        This method creates a prompt that will be used to validate the output of the reduce operation.
+        It includes specific questions about the quality and completeness of the output.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            original_output (List[Dict[str, Any]]): Original output of the reduce operation.
+
+        Returns:
+            str: A custom validator prompt as a string.
+        """
+        system_prompt = "You are an AI assistant tasked with creating custom validation prompts for reduce operations in data processing pipelines."
+
+        sample_input = random.choice(input_data)
+        input_keys = op_config.get("input", {}).get("schema", {})
+        if input_keys:
+            sample_input = {k: sample_input[k] for k in input_keys}
+
+        reduce_key = op_config.get("reduce_key")
+        if reduce_key and original_output:
+            if isinstance(reduce_key, list):
+                key = next(
+                    (
+                        tuple(item[k] for k in reduce_key)
+                        for item in original_output
+                        if all(k in item for k in reduce_key)
+                    ),
+                    tuple(None for _ in reduce_key),
+                )
+                sample_output = next(
+                    (
+                        item
+                        for item in original_output
+                        if all(item.get(k) == v for k, v in zip(reduce_key, key))
+                    ),
+                    {},
+                )
+            else:
+                key = next(
+                    (
+                        item[reduce_key]
+                        for item in original_output
+                        if reduce_key in item
+                    ),
+                    None,
+                )
+                sample_output = next(
+                    (item for item in original_output if item.get(reduce_key) == key),
+                    {},
+                )
+        else:
+            sample_output = original_output[0] if original_output else {}
+
+        output_keys = op_config.get("output", {}).get("schema", {})
+        sample_output = {k: sample_output[k] for k in output_keys}
+
+        prompt = f"""
+        Analyze the following reduce operation and its input/output:
+
+        Reduce Operation Prompt:
+        {op_config["prompt"]}
+
+        Sample Input (just one item):
+        {json.dumps(sample_input, indent=2)}
+
+        Sample Output:
+        {json.dumps(sample_output, indent=2)}
+
+        Create a custom validator prompt that will assess how well the reduce operation performed its intended task. The prompt should ask specific 2-3 questions about the quality of the output, such as:
+        1. Does the output accurately reflect the aggregation method specified in the task? For example, if summing numeric values, are the totals correct?
+        2. Are there any missing fields, unexpected null values, or data type mismatches in the output compared to the expected schema?
+        3. Does the output maintain the key information from the input while appropriately condensing or summarizing it? For instance, in a text summarization task, are the main points preserved?
+        4. How well does the output adhere to any specific formatting requirements mentioned in the original prompt, such as character limits for summaries or specific data types for aggregated values?
+
+        Note that the output may reflect more than just the input provided, since we only provide a one-item sample input. Provide your response as a single string containing the custom validator prompt. The prompt should be tailored to the task and avoid generic criteria.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {"validator_prompt": {"type": "string"}},
+            "required": ["validator_prompt"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)["validator_prompt"]
+
+    def _validate_reduce_output(
+        self,
+        op_config: Dict[str, Any],
+        validation_inputs: Dict[Any, List[Dict[str, Any]]],
+        output_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Dict[str, Any]:
+        """
+        Validate the output of the reduce operation using the generated validator prompt.
+
+        This method assesses the quality of the reduce operation output by applying the validator prompt
+        to multiple samples of the input and output data.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            validation_inputs (Dict[Any, List[Dict[str, Any]]]): Validation inputs for the reduce operation.
+            output_data (List[Dict[str, Any]]): Output data from the reduce operation.
+            validator_prompt (str): The validator prompt generated earlier.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing validation results and a flag indicating if improvement is needed.
+        """
+        system_prompt = "You are an AI assistant tasked with validating the output of reduce operations in data processing pipelines."
+
+        validation_results = []
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = []
+            for reduce_key, inputs in validation_inputs.items():
+                if isinstance(op_config["reduce_key"], list):
+                    sample_output = next(
+                        (
+                            item
+                            for item in output_data
+                            if all(
+                                item[key] == reduce_key[i]
+                                for i, key in enumerate(op_config["reduce_key"])
+                            )
+                        ),
+                        None,
+                    )
+                else:
+                    sample_output = next(
+                        (
+                            item
+                            for item in output_data
+                            if item[op_config["reduce_key"]] == reduce_key
+                        ),
+                        None,
+                    )
+
+                if sample_output is None:
+                    self.console.log(
+                        f"Warning: No output found for reduce key {reduce_key}"
+                    )
+                    continue
+
+                input_str = json.dumps(inputs, indent=2)
+                # truncate input_str to 40,000 words
+                input_str = input_str.split()[:40000]
+                input_str = " ".join(input_str) + "..."
+
+                prompt = f"""{validator_prompt}
+
+                Reduce Operation Task:
+                {op_config["prompt"]}
+
+                Input Data Samples:
+                {input_str}
+
+                Output Data Sample:
+                {json.dumps(sample_output, indent=2)}
+
+                Based on the validator prompt and the input/output samples, assess the quality (e.g., correctness, completeness) of the reduce operation output.
+                Provide your assessment in the following format:
+                """
+
+                parameters = {
+                    "type": "object",
+                    "properties": {
+                        "is_valid": {"type": "boolean"},
+                        "issues": {"type": "array", "items": {"type": "string"}},
+                        "suggestions": {"type": "array", "items": {"type": "string"}},
+                    },
+                    "required": ["is_valid", "issues", "suggestions"],
+                }
+
+                futures.append(
+                    executor.submit(
+                        self.llm_client.generate,
+                        [{"role": "user", "content": prompt}],
+                        system_prompt,
+                        parameters,
+                    )
+                )
+
+            for future, (reduce_key, inputs) in zip(futures, validation_inputs.items()):
+                response = future.result()
+                result = json.loads(response.choices[0].message.content)
+                validation_results.append(result)
+
+        # Determine if optimization is needed based on validation results
+        invalid_count = sum(
+            1 for result in validation_results if not result["is_valid"]
+        )
+        needs_improvement = invalid_count > 1
+
+        return {
+            "needs_improvement": needs_improvement,
+            "validation_results": validation_results,
+        }
+
+    def _create_validation_inputs(
+        self, input_data: List[Dict[str, Any]], reduce_key: Union[str, List[str]]
+    ) -> Dict[Any, List[Dict[str, Any]]]:
+        # Group input data by reduce_key
+        grouped_data = {}
+        for item in input_data:
+            if isinstance(reduce_key, list):
+                key = tuple(item[k] for k in reduce_key)
+            else:
+                key = item[reduce_key]
+            if key not in grouped_data:
+                grouped_data[key] = []
+            grouped_data[key].append(item)
+
+        # Select a fixed number of reduce keys
+        selected_keys = random.sample(
+            list(grouped_data.keys()),
+            min(self.num_samples_in_validation, len(grouped_data)),
+        )
+
+        # Create a new dict with only the selected keys
+        validation_inputs = {key: grouped_data[key] for key in selected_keys}
+
+        return validation_inputs
+
+    def _create_reduce_plans(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        is_associative: bool,
+    ) -> List[Dict[str, Any]]:
+        """
+        Create multiple reduce plans based on the input data and operation configuration.
+
+        This method generates various reduce plans by varying batch sizes and fold prompts.
+        It takes into account the LLM's context window size to determine appropriate batch sizes.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            is_associative (bool): Flag indicating whether the reduce operation is associative.
+
+        Returns:
+            List[Dict[str, Any]]: A list of reduce plans, each with different batch sizes and fold prompts.
+        """
+        model = op_config.get("model", "gpt-4o-mini")
+        model_input_context_length = model_cost.get(model, {}).get(
+            "max_input_tokens", 8192
+        )
+
+        # Estimate tokens for prompt, input, and output
+        prompt_tokens = count_tokens(op_config["prompt"], model)
+        sample_input = input_data[:100]
+        sample_output = self._run_operation(op_config, input_data[:100])
+
+        prompt_vars = extract_jinja_variables(op_config["prompt"])
+        prompt_vars = [var.split(".")[-1] for var in prompt_vars]
+        avg_input_tokens = mean(
+            [
+                count_tokens(
+                    json.dumps({k: item[k] for k in prompt_vars if k in item}), model
+                )
+                for item in sample_input
+            ]
+        )
+        avg_output_tokens = mean(
+            [
+                count_tokens(
+                    json.dumps({k: item[k] for k in prompt_vars if k in item}), model
+                )
+                for item in sample_output
+            ]
+        )
+
+        # Calculate max batch size that fits in context window
+        max_batch_size = (
+            model_input_context_length - prompt_tokens - avg_output_tokens
+        ) // avg_input_tokens
+
+        # Generate 6 candidate batch sizes
+        batch_sizes = [
+            max(1, int(max_batch_size * ratio))
+            for ratio in [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
+        ]
+        # Log the generated batch sizes
+        self.console.log("[cyan]Generating plans for batch sizes:[/cyan]")
+        for size in batch_sizes:
+            self.console.log(f"  - {size}")
+        batch_sizes = sorted(set(batch_sizes))  # Remove duplicates and sort
+
+        plans = []
+
+        # Generate multiple fold prompts
+        max_retries = 5
+        retry_count = 0
+        fold_prompts = []
+
+        while retry_count < max_retries and not fold_prompts:
+            try:
+                fold_prompts = self._synthesize_fold_prompts(
+                    op_config,
+                    sample_input,
+                    sample_output,
+                    num_prompts=2,
+                )
+                if not fold_prompts:
+                    raise ValueError("No fold prompts generated")
+            except Exception as e:
+                retry_count += 1
+                if retry_count == max_retries:
+                    raise RuntimeError(
+                        f"Failed to generate fold prompts after {max_retries} attempts: {str(e)}"
+                    )
+                self.console.log(
+                    f"Retry {retry_count}/{max_retries}: Failed to generate fold prompts. Retrying..."
+                )
+
+        for batch_size in batch_sizes:
+            for fold_prompt in fold_prompts:
+                plan = op_config.copy()
+                plan["fold_prompt"] = fold_prompt
+                plan["fold_batch_size"] = batch_size
+                plan["associative"] = is_associative
+                plans.append(plan)
+
+        return plans
+
+    def _calculate_compression_ratio(
+        self,
+        op_config: Dict[str, Any],
+        sample_input: List[Dict[str, Any]],
+        sample_output: List[Dict[str, Any]],
+    ) -> float:
+        """
+        Calculate the compression ratio of the reduce operation.
+
+        This method compares the size of the input data to the size of the output data
+        to determine how much the data is being compressed by the reduce operation.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            sample_input (List[Dict[str, Any]]): Sample input data.
+            sample_output (List[Dict[str, Any]]): Sample output data.
+
+        Returns:
+            float: The calculated compression ratio.
+        """
+        reduce_key = op_config["reduce_key"]
+        input_schema = op_config.get("input", {}).get("schema", {})
+        output_schema = op_config["output"]["schema"]
+        model = op_config.get("model", "gpt-4o")
+
+        compression_ratios = {}
+
+        # Handle both single key and list of keys
+        if isinstance(reduce_key, list):
+            distinct_keys = set(
+                tuple(item[k] for k in reduce_key) for item in sample_input
+            )
+        else:
+            distinct_keys = set(item[reduce_key] for item in sample_input)
+
+        for key in distinct_keys:
+            if isinstance(reduce_key, list):
+                key_input = [
+                    item
+                    for item in sample_input
+                    if tuple(item[k] for k in reduce_key) == key
+                ]
+                key_output = [
+                    item
+                    for item in sample_output
+                    if tuple(item[k] for k in reduce_key) == key
+                ]
+            else:
+                key_input = [item for item in sample_input if item[reduce_key] == key]
+                key_output = [item for item in sample_output if item[reduce_key] == key]
+
+            if input_schema:
+                key_input_tokens = sum(
+                    count_tokens(
+                        json.dumps({k: item[k] for k in input_schema if k in item}),
+                        model,
+                    )
+                    for item in key_input
+                )
+            else:
+                key_input_tokens = sum(
+                    count_tokens(json.dumps(item), model) for item in key_input
+                )
+
+            key_output_tokens = sum(
+                count_tokens(
+                    json.dumps({k: item[k] for k in output_schema if k in item}), model
+                )
+                for item in key_output
+            )
+
+            compression_ratios[key] = (
+                key_output_tokens / key_input_tokens if key_input_tokens > 0 else 1
+            )
+
+        if not compression_ratios:
+            return 1
+
+        # Calculate importance weights based on the number of items for each key
+        total_items = len(sample_input)
+        if isinstance(reduce_key, list):
+            importance_weights = {
+                key: len(
+                    [
+                        item
+                        for item in sample_input
+                        if tuple(item[k] for k in reduce_key) == key
+                    ]
+                )
+                / total_items
+                for key in compression_ratios
+            }
+        else:
+            importance_weights = {
+                key: len([item for item in sample_input if item[reduce_key] == key])
+                / total_items
+                for key in compression_ratios
+            }
+
+        # Calculate weighted average of compression ratios
+        weighted_sum = sum(
+            compression_ratios[key] * importance_weights[key]
+            for key in compression_ratios
+        )
+        return weighted_sum
+
+    def _synthesize_fold_prompts(
+        self,
+        op_config: Dict[str, Any],
+        sample_input: List[Dict[str, Any]],
+        sample_output: List[Dict[str, Any]],
+        num_prompts: int = 2,
+    ) -> List[str]:
+        """
+        Synthesize fold prompts for the reduce operation. We generate multiple
+        fold prompts in case one is bad.
+
+        A fold operation is a higher-order function that iterates through a data structure,
+        accumulating the results of applying a given combining operation to its elements.
+        In the context of reduce operations, folding allows processing of data in batches,
+        which can significantly improve performance for large datasets.
+
+        This method generates multiple fold prompts that can be used to optimize the reduce operation
+        by allowing it to run on batches of inputs. It uses the language model to create prompts
+        that are variations of the original reduce prompt, adapted for folding operations.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the reduce operation.
+            sample_input (List[Dict[str, Any]]): A sample of the input data.
+            sample_output (List[Dict[str, Any]]): A sample of the output data.
+            num_prompts (int, optional): The number of fold prompts to generate. Defaults to 2.
+
+        Returns:
+            List[str]: A list of synthesized fold prompts.
+
+        The method performs the following steps:
+        1. Sets up the system prompt and parameters for the language model.
+        2. Defines a function to get random examples from the sample data.
+        3. Creates a prompt template for generating fold prompts.
+        4. Uses multi-threading to generate multiple fold prompts in parallel.
+        5. Returns the list of generated fold prompts.
+        """
+        system_prompt = "You are an AI assistant tasked with creating a fold prompt for reduce operations in data processing pipelines."
+        original_prompt = op_config["prompt"]
+
+        input_schema = op_config.get("input", {}).get("schema", {})
+        output_schema = op_config["output"]["schema"]
+        reduce_key = op_config["reduce_key"]
+
+        def get_random_examples():
+            if isinstance(reduce_key, list):
+                random_key = tuple(
+                    random.choice(
+                        [
+                            tuple(item[k] for k in reduce_key if k in item)
+                            for item in sample_input
+                            if all(k in item for k in reduce_key)
+                        ]
+                    )
+                )
+                input_example = random.choice(
+                    [
+                        item
+                        for item in sample_input
+                        if all(item.get(k) == v for k, v in zip(reduce_key, random_key))
+                    ]
+                )
+                output_example = random.choice(
+                    [
+                        item
+                        for item in sample_output
+                        if all(item.get(k) == v for k, v in zip(reduce_key, random_key))
+                    ]
+                )
+            else:
+                random_key = random.choice(
+                    [item[reduce_key] for item in sample_input if reduce_key in item]
+                )
+                input_example = random.choice(
+                    [item for item in sample_input if item[reduce_key] == random_key]
+                )
+                output_example = random.choice(
+                    [item for item in sample_output if item[reduce_key] == random_key]
+                )
+
+            if input_schema:
+                input_example = {
+                    k: input_example[k] for k in input_schema if k in input_example
+                }
+            output_example = {
+                k: output_example[k] for k in output_schema if k in output_example
+            }
+            return input_example, output_example
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "fold_prompt": {
+                    "type": "string",
+                }
+            },
+            "required": ["fold_prompt"],
+        }
+
+        def generate_single_prompt():
+            input_example, output_example = get_random_examples()
+            prompt = f"""
+            Original Reduce Operation Prompt:
+            {original_prompt}
+
+            Sample Input:
+            {json.dumps(input_example, indent=2)}
+
+            Sample Output:
+            {json.dumps(output_example, indent=2)}
+
+            Create a fold prompt for the reduce operation to run on batches of inputs. The fold prompt should:
+            1. Minimally modify the original reduce prompt
+            2. Describe how to combine the new values with the current reduced value
+            3. Be designed to work iteratively, allowing for multiple fold operations. The first iteration will use the original prompt, and all successive iterations will use the fold prompt.
+
+            The fold prompt should be a Jinja2 template with the following variables available:
+            - {{ output }}: The current reduced value (a dictionary with the current output schema)
+            - {{ inputs }}: A list of new values to be folded in
+            - {{ reduce_key }}: The key used for grouping in the reduce operation
+
+            Provide the fold prompt as a string.
+            """
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            fold_prompt = json.loads(response.choices[0].message.content)["fold_prompt"]
+
+            # Run the operation with the fold prompt
+            # Create a temporary plan with the fold prompt
+            temp_plan = op_config.copy()
+            temp_plan["fold_prompt"] = fold_prompt
+            temp_plan["fold_batch_size"] = min(
+                len(sample_input), 2
+            )  # Use a small batch size for testing
+
+            # Run the operation with the fold prompt
+            self._run_operation(temp_plan, sample_input[: temp_plan["fold_batch_size"]])
+
+            # If the operation runs successfully, return the fold prompt
+            return fold_prompt
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            fold_prompts = list(
+                executor.map(lambda _: generate_single_prompt(), range(num_prompts))
+            )
+
+        return fold_prompts
+
+    def _evaluate_reduce_plans(
+        self,
+        op_config: Dict[str, Any],
+        plans: List[Dict[str, Any]],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Dict[str, Any]:
+        """
+        Evaluate multiple reduce plans and select the best one.
+
+        This method takes a list of reduce plans, evaluates each one using the input data
+        and a validator prompt, and selects the best plan based on the evaluation scores.
+        It also attempts to create and evaluate a merged plan that enhances the runtime performance
+        of the best plan.
+
+        A merged plan is an optimization technique applied to the best-performing plan
+        that uses the fold operation. It allows the best plan to run even faster by
+        executing parallel folds and then merging the results of these individual folds
+        together. We default to a merge batch size of 2, but one can increase this.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the reduce operation.
+            plans (List[Dict[str, Any]]): A list of reduce plans to evaluate.
+            input_data (List[Dict[str, Any]]): The input data to use for evaluation.
+            validator_prompt (str): The prompt to use for validating the output of each plan.
+
+        Returns:
+            Dict[str, Any]: The best reduce plan, either the top-performing original plan
+                            or a merged plan if it performs well enough.
+
+        The method performs the following steps:
+        1. Evaluates each plan using multi-threading.
+        2. Sorts the plans based on their evaluation scores.
+        3. Selects the best plan and attempts to create a merged plan.
+        4. Evaluates the merged plan and compares it to the best original plan.
+        5. Returns either the merged plan or the best original plan based on their scores.
+        """
+        self.console.log("\n[bold]Evaluating Reduce Plans:[/bold]")
+        for i, plan in enumerate(plans):
+            self.console.log(f"Plan {i+1} (batch size: {plan['fold_batch_size']})")
+
+        plan_scores = []
+        plan_outputs = {}
+
+        # Create a fixed random sample for evaluation
+        sample_size = min(100, len(input_data))
+        evaluation_sample = random.sample(input_data, sample_size)
+
+        # Create a fixed set of validation samples
+        validation_inputs = self._create_validation_inputs(
+            evaluation_sample, plan["reduce_key"]
+        )
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    self._evaluate_single_plan,
+                    plan,
+                    evaluation_sample,
+                    validator_prompt,
+                    validation_inputs,
+                )
+                for plan in plans
+            ]
+            for future in as_completed(futures):
+                plan, score, output = future.result()
+                plan_scores.append((plan, score))
+                plan_outputs[id(plan)] = output
+
+        # Sort plans by score in descending order, then by fold_batch_size in descending order
+        sorted_plans = sorted(
+            plan_scores, key=lambda x: (x[1], x[0]["fold_batch_size"]), reverse=True
+        )
+
+        self.console.log("\n[bold]Reduce Plan Scores:[/bold]")
+        for i, (plan, score) in enumerate(sorted_plans):
+            self.console.log(
+                f"Plan {i+1} (batch size: {plan['fold_batch_size']}): {score:.2f}"
+            )
+
+        best_plan, best_score = sorted_plans[0]
+        self.console.log(
+            f"\n[green]Selected best plan with score: {best_score:.2f} and batch size: {best_plan['fold_batch_size']}[/green]"
+        )
+
+        if op_config.get("synthesize_merge", True):
+            # Create a new plan with merge prompt and updated parameters
+            merged_plan = best_plan.copy()
+
+            # Synthesize merge prompt if it doesn't exist
+            if "merge_prompt" not in merged_plan:
+                merged_plan["merge_prompt"] = self._synthesize_merge_prompt(
+                    merged_plan, plan_outputs[id(best_plan)]
+                )
+                # Print the synthesized merge prompt
+                self.console.log("\n[bold]Synthesized Merge Prompt:[/bold]")
+                self.console.log(merged_plan["merge_prompt"])
+
+            # Set merge_batch_size to 2 and num_parallel_folds to 5
+            merged_plan["merge_batch_size"] = 2
+
+            # Evaluate the merged plan
+            _, merged_plan_score, _, operation_instance = self._evaluate_single_plan(
+                merged_plan,
+                evaluation_sample,
+                validator_prompt,
+                validation_inputs,
+                return_instance=True,
+            )
+
+            # Get the merge and fold times from the operation instance
+            merge_times = operation_instance.merge_times
+            fold_times = operation_instance.fold_times
+            merge_avg_time = mean(merge_times) if merge_times else None
+            fold_avg_time = mean(fold_times) if fold_times else None
+
+            self.console.log("\n[bold]Scores:[/bold]")
+            self.console.log(f"Original plan: {best_score:.2f}")
+            self.console.log(f"Merged plan: {merged_plan_score:.2f}")
+
+            # Compare scores and decide which plan to use
+            if merged_plan_score >= best_score * 0.75:
+                self.console.log(
+                    f"\n[green]Using merged plan with score: {merged_plan_score:.2f}[/green]"
+                )
+                if merge_avg_time and fold_avg_time:
+                    merged_plan["merge_time"] = merge_avg_time
+                    merged_plan["fold_time"] = fold_avg_time
+                return merged_plan
+            else:
+                self.console.log(
+                    f"\n[yellow]Merged plan quality too low. Using original plan with score: {best_score:.2f}[/yellow]"
+                )
+                return best_plan
+        else:
+            return best_plan
+
+    def _evaluate_single_plan(
+        self,
+        plan: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+        validation_inputs: List[Dict[str, Any]],
+        return_instance: bool = False,
+    ) -> Union[
+        Tuple[Dict[str, Any], float, List[Dict[str, Any]]],
+        Tuple[Dict[str, Any], float, List[Dict[str, Any]], BaseOperation],
+    ]:
+        """
+        Evaluate a single reduce plan using the provided input data and validator prompt.
+
+        This method runs the reduce operation with the given plan, validates the output,
+        and calculates a score based on the validation results. The scoring works as follows:
+        1. It counts the number of valid results from the validation.
+        2. The score is calculated as the ratio of valid results to the total number of validation results.
+        3. This produces a score between 0 and 1, where 1 indicates all results were valid, and 0 indicates none were valid.
+
+        TODO: We should come up with a better scoring method here, maybe pairwise comparisons.
+
+        Args:
+            plan (Dict[str, Any]): The reduce plan to evaluate.
+            input_data (List[Dict[str, Any]]): The input data to use for evaluation.
+            validator_prompt (str): The prompt to use for validating the output.
+            return_instance (bool, optional): Whether to return the operation instance. Defaults to False.
+
+        Returns:
+            Union[
+                Tuple[Dict[str, Any], float, List[Dict[str, Any]]],
+                Tuple[Dict[str, Any], float, List[Dict[str, Any]], BaseOperation],
+            ]: A tuple containing the plan, its score, the output data, and optionally the operation instance.
+
+        The method performs the following steps:
+        1. Runs the reduce operation with the given plan on the input data.
+        2. Validates the output using the validator prompt.
+        3. Calculates a score based on the validation results.
+        4. Returns the plan, score, output data, and optionally the operation instance.
+        """
+        output = self._run_operation(plan, input_data, return_instance)
+        if return_instance:
+            output, operation_instance = output
+
+        validation_result = self._validate_reduce_output(
+            plan, validation_inputs, output, validator_prompt
+        )
+
+        # Calculate a score based on validation results
+        valid_count = sum(
+            1
+            for result in validation_result["validation_results"]
+            if result["is_valid"]
+        )
+        score = valid_count / len(validation_result["validation_results"])
+
+        if return_instance:
+            return plan, score, output, operation_instance
+        else:
+            return plan, score, output
+
+    def _synthesize_merge_prompt(
+        self, plan: Dict[str, Any], sample_outputs: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Synthesize a merge prompt for combining multiple folded outputs in a reduce operation.
+
+        This method generates a merge prompt that can be used to combine the results of multiple
+        parallel fold operations into a single output. It uses the language model to create a prompt
+        that is consistent with the original reduce and fold prompts while addressing the specific
+        requirements of merging multiple outputs.
+
+        Args:
+            plan (Dict[str, Any]): The reduce plan containing the original prompt and fold prompt.
+            sample_outputs (List[Dict[str, Any]]): Sample outputs from the fold operation to use as examples.
+
+        Returns:
+            str: The synthesized merge prompt as a string.
+
+        The method performs the following steps:
+        1. Sets up the system prompt for the language model.
+        2. Prepares a random sample output to use as an example.
+        3. Creates a detailed prompt for the language model, including the original reduce prompt,
+           fold prompt, sample output, and instructions for creating the merge prompt.
+        4. Uses the language model to generate the merge prompt.
+        5. Returns the generated merge prompt.
+        """
+        system_prompt = "You are an AI assistant tasked with creating a merge prompt for reduce operations in data processing pipelines. The pipeline has a reduce operation, and incrementally folds inputs into a single output. We want to optimize the pipeline for speed by running multiple folds on different inputs in parallel, and then merging the fold outputs into a single output."
+
+        output_schema = plan["output"]["schema"]
+        random_output = random.choice(sample_outputs)
+        random_output = {
+            k: random_output[k] for k in output_schema if k in random_output
+        }
+
+        prompt = f"""Reduce Operation Prompt (runs on the first batch of inputs):
+        {plan["prompt"]}
+
+        Fold Prompt (runs on the second and subsequent batches of inputs):
+        {plan["fold_prompt"]}
+
+        Sample output of the fold operation (an input to the merge operation):
+        {json.dumps(random_output, indent=2)}
+
+        Create a merge prompt for the reduce operation to combine 2+ folded outputs. The merge prompt should:
+        1. Give context on the task & fold operations, describing that the prompt will be used to combine multiple outputs from the fold operation (as if the original prompt was run on all inputs at once)
+        2. Describe how to combine multiple folded outputs into a single output
+        3. Minimally deviate from the reduce and fold prompts
+
+        The merge prompt should be a Jinja2 template with the following variables available:
+        - {{ outputs }}: A list of reduced outputs to be merged (each following the output schema). You can access the first output with {{ outputs[0] }} and the second with {{ outputs[1] }}
+
+        Output Schema:
+        {json.dumps(output_schema, indent=2)}
+
+        Provide the merge prompt as a string.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "merge_prompt": {
+                    "type": "string",
+                }
+            },
+            "required": ["merge_prompt"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)["merge_prompt"]
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(config, console, llm_client, max_threads, run_operation, num_fold_prompts=1, num_samples_in_validation=10) + +

+ + +
+ +

Initialize the ReduceOptimizer.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
config + Dict[str, Any] + +
+

Configuration dictionary for the optimizer.

+
+
+ required +
console + Console + +
+

Rich console object for pretty printing.

+
+
+ required +
llm_client + LLMClient + +
+

Client for interacting with a language model.

+
+
+ required +
max_threads + int + +
+

Maximum number of threads to use for parallel processing.

+
+
+ required +
run_operation + Callable + +
+

Function to run an operation.

+
+
+ required +
num_fold_prompts + int + +
+

Number of fold prompts to generate. Defaults to 1.

+
+
+ 1 +
num_samples_in_validation + int + +
+

Number of samples to use in validation. Defaults to 10.

+
+
+ 10 +
+ +
+ Source code in docetl/optimizers/reduce_optimizer.py +
36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
def __init__(
+    self,
+    config: Dict[str, Any],
+    console: Console,
+    llm_client: LLMClient,
+    max_threads: int,
+    run_operation: Callable,
+    num_fold_prompts: int = 1,
+    num_samples_in_validation: int = 10,
+):
+    """
+    Initialize the ReduceOptimizer.
+
+    Args:
+        config (Dict[str, Any]): Configuration dictionary for the optimizer.
+        console (Console): Rich console object for pretty printing.
+        llm_client (LLMClient): Client for interacting with a language model.
+        max_threads (int): Maximum number of threads to use for parallel processing.
+        run_operation (Callable): Function to run an operation.
+        num_fold_prompts (int, optional): Number of fold prompts to generate. Defaults to 1.
+        num_samples_in_validation (int, optional): Number of samples to use in validation. Defaults to 10.
+    """
+    self.config = config
+    self.console = console
+    self.llm_client = llm_client
+    self._run_operation = run_operation
+    self.max_threads = max_threads
+    self.num_fold_prompts = num_fold_prompts
+    self.num_samples_in_validation = num_samples_in_validation
+
+
+
+ +
+ +
+ + +

+ optimize(op_config, input_data, level=1) + +

+ + +
+ +

Optimize the reduce operation based on the given configuration and input data.

+

This method performs the following steps: (1) run the original operation; (2) generate a validator prompt; (3) validate the output; (4) if improvement is needed: (a) evaluate if decomposition is beneficial, (b) if decomposition is beneficial, recursively optimize each sub-operation, (c) if not, proceed with single operation optimization; (5) run the optimized operation(s).

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
op_config + Dict[str, Any] + +
+

Configuration for the reduce operation.

+
+
+ required +
input_data + List[Dict[str, Any]] + +
+

Input data for the reduce operation.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict[str, Any]] + +
+

Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations

+
+
+ List[Dict[str, Any]] + +
+

and the list of outputs from the optimized operation(s), and the cost of the operation due to synthesizing any resolve operations.

+
+
+ +
+ Source code in docetl/optimizers/reduce_optimizer.py +
 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
def optimize(
+    self,
+    op_config: Dict[str, Any],
+    input_data: List[Dict[str, Any]],
+    level: int = 1,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+    """
+    Optimize the reduce operation based on the given configuration and input data.
+
+    This method performs the following steps:
+    1. Run the original operation
+    2. Generate a validator prompt
+    3. Validate the output
+    4. If improvement is needed:
+       a. Evaluate if decomposition is beneficial
+       b. If decomposition is beneficial, recursively optimize each sub-operation
+       c. If not, proceed with single operation optimization
+    5. Run the optimized operation(s)
+
+    Args:
+        op_config (Dict[str, Any]): Configuration for the reduce operation.
+        input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+    Returns:
+        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+        and the list of outputs from the optimized operation(s), and the cost of the operation due to synthesizing any resolve operations.
+    """
+    # Check if we're running out of token limits for the reduce prompt
+    model = op_config.get("model", self.config.get("default_model", "gpt-4o-mini"))
+    model_input_context_length = model_cost.get(model, {}).get(
+        "max_input_tokens", 4096
+    )
+
+    # Find the key with the longest value
+    longest_key = max(
+        op_config["reduce_key"], key=lambda k: len(str(input_data[0][k]))
+    )
+    sample_key = tuple(
+        input_data[0][k] if k == longest_key else input_data[0][k]
+        for k in op_config["reduce_key"]
+    )
+
+    # Render the prompt with a sample input
+    prompt_template = Template(op_config["prompt"])
+    sample_prompt = prompt_template.render(
+        reduce_key=dict(zip(op_config["reduce_key"], sample_key)),
+        inputs=[input_data[0]],
+    )
+
+    # Count tokens in the sample prompt
+    prompt_tokens = count_tokens(sample_prompt, model)
+
+    add_map_op = False
+    if prompt_tokens * 2 > model_input_context_length:
+        add_map_op = True
+        self.console.log(
+            f"[yellow]Warning: The reduce prompt exceeds the token limit for model {model}. "
+            f"Token count: {prompt_tokens}, Limit: {model_input_context_length}. "
+            f"Add a map operation to the pipeline.[/yellow]"
+        )
+
+    # # Also query an agent to look at a sample of the inputs and see if they think a map operation would be helpful
+    # preprocessing_steps = ""
+    # should_use_map, preprocessing_steps = self._should_use_map(
+    #     op_config, input_data
+    # )
+    # if should_use_map or add_map_op:
+    #     # Synthesize a map operation
+    #     map_prompt, map_output_schema = self._synthesize_map_operation(
+    #         op_config, preprocessing_steps, input_data
+    #     )
+    #     # Change the reduce operation prompt to use the map schema
+    #     new_reduce_prompt = self._change_reduce_prompt_to_use_map_schema(
+    #         op_config["prompt"], map_output_schema
+    #     )
+    #     op_config["prompt"] = new_reduce_prompt
+
+    #     # Return unoptimized map and reduce operations
+    #     return [map_prompt, op_config], input_data, 0.0
+
+    original_output = self._run_operation(op_config, input_data)
+
+    # Step 1: Synthesize a validator prompt
+    validator_prompt = self._generate_validator_prompt(
+        op_config, input_data, original_output
+    )
+
+    # Log the validator prompt
+    self.console.log("[bold]Validator Prompt:[/bold]")
+    self.console.log(validator_prompt)
+    self.console.log("\n")  # Add a newline for better readability
+
+    # Step 2: validate the output
+    validator_inputs = self._create_validation_inputs(
+        input_data, op_config["reduce_key"]
+    )
+    validation_results = self._validate_reduce_output(
+        op_config, validator_inputs, original_output, validator_prompt
+    )
+
+    # Print the validation results
+    self.console.log("[bold]Validation Results:[/bold]")
+    if validation_results["needs_improvement"]:
+        self.console.log(
+            "\n".join(
+                [
+                    f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                    for result in validation_results["validation_results"]
+                ]
+            )
+        )
+
+        # Step 3: Evaluate if decomposition is beneficial
+        decomposition_result = self._evaluate_decomposition(
+            op_config, input_data, level
+        )
+
+        if decomposition_result["should_decompose"]:
+            return self._optimize_decomposed_reduce(
+                decomposition_result, op_config, input_data, level
+            )
+
+        return self._optimize_single_reduce(op_config, input_data, validator_prompt)
+    else:
+        self.console.log("No improvements identified.")
+        return [op_config], original_output, 0.0
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.optimizers.join_optimizer.JoinOptimizer + + +

+ + +
+ + +
+ Source code in docetl/optimizers/join_optimizer.py +
  20
+  21
+  22
+  23
+  24
+  25
+  26
+  27
+  28
+  29
+  30
+  31
+  32
+  33
+  34
+  35
+  36
+  37
+  38
+  39
+  40
+  41
+  42
+  43
+  44
+  45
+  46
+  47
+  48
+  49
+  50
+  51
+  52
+  53
+  54
+  55
+  56
+  57
+  58
+  59
+  60
+  61
+  62
+  63
+  64
+  65
+  66
+  67
+  68
+  69
+  70
+  71
+  72
+  73
+  74
+  75
+  76
+  77
+  78
+  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
+1019
+1020
+1021
+1022
+1023
+1024
+1025
+1026
+1027
+1028
+1029
+1030
+1031
+1032
+1033
+1034
+1035
+1036
+1037
+1038
+1039
+1040
+1041
+1042
+1043
+1044
+1045
+1046
+1047
+1048
+1049
+1050
+1051
+1052
+1053
+1054
+1055
+1056
+1057
+1058
+1059
+1060
+1061
+1062
+1063
+1064
+1065
+1066
+1067
+1068
+1069
+1070
+1071
+1072
+1073
+1074
+1075
+1076
+1077
+1078
+1079
+1080
+1081
+1082
+1083
+1084
+1085
+1086
+1087
+1088
+1089
+1090
+1091
+1092
+1093
+1094
+1095
+1096
+1097
+1098
+1099
+1100
+1101
+1102
+1103
+1104
+1105
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1118
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+1130
+1131
+1132
+1133
+1134
+1135
+1136
+1137
+1138
+1139
+1140
+1141
+1142
+1143
+1144
+1145
+1146
+1147
+1148
+1149
+1150
+1151
+1152
+1153
+1154
+1155
+1156
+1157
+1158
+1159
+1160
+1161
+1162
+1163
+1164
+1165
+1166
+1167
+1168
+1169
+1170
+1171
+1172
+1173
+1174
+1175
+1176
+1177
+1178
+1179
+1180
+1181
+1182
+1183
+1184
+1185
+1186
+1187
+1188
+1189
+1190
+1191
+1192
+1193
+1194
+1195
+1196
+1197
+1198
+1199
+1200
+1201
+1202
+1203
+1204
+1205
+1206
+1207
+1208
+1209
+1210
+1211
+1212
+1213
+1214
+1215
+1216
+1217
+1218
+1219
+1220
+1221
+1222
+1223
+1224
+1225
+1226
+1227
+1228
+1229
+1230
+1231
+1232
+1233
+1234
+1235
+1236
+1237
+1238
+1239
+1240
+1241
+1242
+1243
+1244
+1245
+1246
+1247
+1248
+1249
+1250
+1251
+1252
+1253
+1254
+1255
+1256
+1257
+1258
+1259
+1260
+1261
+1262
+1263
+1264
+1265
+1266
+1267
+1268
+1269
+1270
+1271
+1272
+1273
+1274
+1275
+1276
+1277
+1278
+1279
+1280
+1281
+1282
+1283
+1284
+1285
+1286
+1287
+1288
+1289
+1290
+1291
+1292
+1293
+1294
+1295
+1296
+1297
+1298
+1299
+1300
+1301
+1302
+1303
+1304
+1305
+1306
+1307
+1308
+1309
+1310
+1311
+1312
+1313
+1314
+1315
+1316
+1317
+1318
+1319
+1320
+1321
+1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
+1341
+1342
+1343
+1344
+1345
+1346
+1347
+1348
+1349
+1350
+1351
+1352
+1353
+1354
+1355
+1356
+1357
+1358
+1359
+1360
+1361
+1362
+1363
+1364
+1365
+1366
+1367
+1368
+1369
+1370
+1371
+1372
+1373
+1374
+1375
+1376
+1377
+1378
+1379
+1380
+1381
+1382
+1383
+1384
+1385
+1386
+1387
+1388
+1389
+1390
+1391
+1392
+1393
+1394
+1395
+1396
+1397
+1398
+1399
+1400
+1401
+1402
+1403
+1404
+1405
+1406
+1407
+1408
+1409
+1410
+1411
+1412
+1413
+1414
+1415
+1416
+1417
+1418
+1419
+1420
+1421
+1422
+1423
+1424
+1425
+1426
+1427
+1428
+1429
+1430
+1431
+1432
+1433
+1434
+1435
+1436
+1437
+1438
+1439
+1440
+1441
+1442
+1443
+1444
+1445
+1446
+1447
+1448
+1449
+1450
+1451
+1452
+1453
+1454
+1455
+1456
+1457
+1458
+1459
+1460
+1461
+1462
+1463
+1464
+1465
+1466
+1467
+1468
+1469
+1470
+1471
+1472
+1473
+1474
+1475
+1476
+1477
+1478
+1479
+1480
+1481
+1482
+1483
+1484
+1485
+1486
+1487
+1488
+1489
+1490
+1491
+1492
+1493
+1494
+1495
+1496
+1497
+1498
+1499
+1500
+1501
+1502
+1503
+1504
+1505
+1506
+1507
+1508
+1509
+1510
+1511
+1512
+1513
+1514
+1515
+1516
+1517
+1518
+1519
+1520
+1521
+1522
+1523
+1524
+1525
+1526
+1527
+1528
+1529
+1530
+1531
+1532
+1533
+1534
+1535
+1536
+1537
+1538
+1539
+1540
+1541
+1542
+1543
+1544
+1545
+1546
+1547
+1548
+1549
+1550
+1551
+1552
+1553
+1554
+1555
+1556
+1557
+1558
+1559
+1560
+1561
+1562
+1563
+1564
+1565
+1566
+1567
+1568
+1569
+1570
+1571
+1572
+1573
+1574
+1575
+1576
+1577
+1578
+1579
+1580
+1581
+1582
+1583
+1584
+1585
+1586
+1587
+1588
+1589
+1590
+1591
+1592
+1593
+1594
+1595
+1596
+1597
+1598
+1599
+1600
+1601
+1602
+1603
+1604
+1605
+1606
+1607
+1608
+1609
+1610
+1611
+1612
+1613
+1614
+1615
+1616
+1617
+1618
+1619
+1620
+1621
+1622
+1623
+1624
+1625
+1626
+1627
+1628
+1629
+1630
+1631
+1632
+1633
+1634
+1635
+1636
+1637
+1638
+1639
+1640
+1641
+1642
+1643
+1644
+1645
+1646
+1647
+1648
+1649
+1650
+1651
+1652
+1653
+1654
+1655
+1656
+1657
+1658
+1659
+1660
+1661
+1662
+1663
+1664
+1665
+1666
+1667
+1668
+1669
+1670
+1671
+1672
+1673
+1674
+1675
+1676
+1677
+1678
+1679
+1680
+1681
+1682
+1683
+1684
+1685
+1686
+1687
class JoinOptimizer:
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        op_config: Dict[str, Any],
+        console: Console,
+        llm_client: Any,
+        max_threads: int,
+        target_recall: float = 0.95,
+        sample_size: int = 500,
+        sampling_weight: float = 20,
+        agent_max_retries: int = 5,
+        estimated_selectivity: float = None,
+        status: Status = None,
+    ):
+        self.config = config
+        self.op_config = op_config
+        self.llm_client = llm_client
+        self.max_threads = max_threads
+        self.console = console
+        self.target_recall = target_recall
+        self.sample_size = sample_size
+        self.sampling_weight = sampling_weight
+        self.agent_max_retries = agent_max_retries
+        self.estimated_selectivity = estimated_selectivity
+        self.console.log(f"Target Recall: {self.target_recall}")
+        self.status = status
+        # if self.estimated_selectivity is not None:
+        #     self.console.log(
+        #         f"[yellow]Using estimated selectivity of {self.estimated_selectivity}[/yellow]"
+        #     )
+
+    def _analyze_map_prompt_categorization(self, map_prompt: str) -> bool:
+        """
+        Analyze the map prompt to determine if it's explicitly categorical.
+
+        Args:
+            map_prompt (str): The map prompt to analyze.
+
+        Returns:
+            bool: True if the prompt is explicitly categorical, False otherwise.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an AI assistant tasked with analyzing prompts for data processing operations.",
+            },
+            {
+                "role": "user",
+                "content": f"""Analyze the following map operation prompt and determine if it is explicitly categorical,
+                meaning it details a specific set of possible outputs:
+
+                {map_prompt}
+
+                Respond with 'Yes' if the prompt is explicitly categorical, detailing a finite set of possible outputs.
+                Respond with 'No' if the prompt allows for open-ended or non-categorical responses.
+                Provide a brief explanation for your decision.""",
+            },
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in analyzing natural language prompts for data processing tasks.",
+            {
+                "type": "object",
+                "properties": {
+                    "is_categorical": {
+                        "type": "string",
+                        "enum": ["Yes", "No"],
+                        "description": "Whether the prompt is explicitly categorical",
+                    },
+                    "explanation": {
+                        "type": "string",
+                        "description": "Brief explanation for the decision",
+                    },
+                },
+                "required": ["is_categorical", "explanation"],
+            },
+        )
+
+        analysis = json.loads(response.choices[0].message.content)
+
+        self.console.log("[bold]Map Prompt Analysis:[/bold]")
+        self.console.log(f"Is Categorical: {analysis['is_categorical']}")
+        self.console.log(f"Explanation: {analysis['explanation']}")
+
+        return analysis["is_categorical"].lower() == "yes"
+
+    def _determine_duplicate_keys(
+        self,
+        input_data: List[Dict[str, Any]],
+        reduce_key: List[str],
+        map_prompt: Optional[str] = None,
+    ) -> bool:
+        # Prepare a sample of the input data for analysis
+        sample_size = min(10, len(input_data))
+        data_sample = random.sample(
+            [{rk: item[rk] for rk in reduce_key} for item in input_data], sample_size
+        )
+
+        context_prefix = ""
+        if map_prompt:
+            context_prefix = f"For context, these values came out of a pipeline with the following prompt:\n\n{map_prompt}\n\n"
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"{context_prefix}I want to do a reduce operation on these values, and I need to determine if there are semantic duplicates in the data, where the strings are different but they technically belong in the same group. Note that exact string duplicates should not be considered here.\n\nHere's a sample of the data (showing the '{reduce_key}' field(s)): {data_sample}\n\nBased on this {'context and ' if map_prompt else ''}sample, are there likely to be such semantic duplicates (not exact string matches) in the dataset? Respond with 'yes' only if you think there are semantic duplicates, or 'no' if you don't see evidence of semantic duplicates or if you only see exact string duplicates.",
+            },
+        ]
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert data analyst. Analyze the given data sample and determine if there are likely to be semantic duplicate values that belong in the same group, even if the strings are different.",
+            {
+                "type": "object",
+                "properties": {
+                    "likely_duplicates": {
+                        "type": "string",
+                        "enum": ["Yes", "No"],
+                        "description": "Whether duplicates are likely to exist in the full dataset",
+                    },
+                    "explanation": {
+                        "type": "string",
+                        "description": "Brief explanation for the decision",
+                    },
+                },
+                "required": ["likely_duplicates", "explanation"],
+            },
+        )
+
+        analysis = json.loads(response.choices[0].message.content)
+
+        self.console.log(f"[bold]Duplicate Analysis for '{reduce_key}':[/bold]")
+        self.console.log(f"Likely Duplicates: {analysis['likely_duplicates']}")
+        self.console.log(f"Explanation: {analysis['explanation']}")
+
+        if analysis["likely_duplicates"].lower() == "yes":
+            self.console.log(
+                "[yellow]Duplicates are likely. Consider using a deduplication strategy in the resolution step.[/yellow]"
+            )
+            return True
+        return False
+
+    def _sample_random_pairs(
+        self, input_data: List[Dict[str, Any]], n: int
+    ) -> List[Tuple[int, int]]:
+        """Sample random pairs of indices, excluding exact matches."""
+        pairs = set()
+        max_attempts = n * 10  # Avoid infinite loop
+        attempts = 0
+
+        while len(pairs) < n and attempts < max_attempts:
+            i, j = random.sample(range(len(input_data)), 2)
+            if i != j and input_data[i] != input_data[j]:
+                pairs.add((min(i, j), max(i, j)))  # Ensure ordered pairs
+            attempts += 1
+
+        return list(pairs)
+
+    def _check_duplicates_with_llm(
+        self,
+        input_data: List[Dict[str, Any]],
+        pairs: List[Tuple[int, int]],
+        reduce_key: List[str],
+        map_prompt: Optional[str],
+    ) -> bool:
+        """Use LLM to check if any pairs are duplicates."""
+
+        content = "Analyze the following pairs of entries and determine if any of them are likely duplicates. Respond with 'Yes' if you find any likely duplicates, or 'No' if none of the pairs seem to be duplicates. Provide a brief explanation for your decision.\n\n"
+
+        if map_prompt:
+            content = (
+                f"For reference, here is the map prompt used earlier in the pipeline: {map_prompt}\n\n"
+                + content
+            )
+
+        for i, (idx1, idx2) in enumerate(pairs, 1):
+            content += f"Pair {i}:\n"
+            content += "Entry 1:\n"
+            for key in reduce_key:
+                content += f"{key}: {json.dumps(input_data[idx1][key], indent=2)}\n"
+            content += "\nEntry 2:\n"
+            for key in reduce_key:
+                content += f"{key}: {json.dumps(input_data[idx2][key], indent=2)}\n"
+            content += "\n"
+
+        messages = [{"role": "user", "content": content}]
+
+        system_prompt = "You are an AI assistant tasked with identifying potential duplicate entries in a dataset."
+        response_schema = {
+            "type": "object",
+            "properties": {
+                "duplicates_found": {"type": "string", "enum": ["Yes", "No"]},
+                "explanation": {"type": "string"},
+            },
+            "required": ["duplicates_found", "explanation"],
+        }
+
+        response = self.llm_client.generate(messages, system_prompt, response_schema)
+
+        # Print the duplicates_found and explanation
+        self.console.log(
+            f"[bold]Duplicates in keys found:[/bold] {response['duplicates_found']}\n"
+            f"[bold]Explanation:[/bold] {response['explanation']}"
+        )
+
+        return response["duplicates_found"].lower() == "yes"
+
+    def synthesize_compare_prompt(
+        self, map_prompt: Optional[str], reduce_key: List[str]
+    ) -> str:
+        """
+        Ask the LLM to write a comparison prompt for entity resolution.
+
+        The requested prompt compares two entities, ``input1`` and ``input2``, on
+        the given reduce key(s). The synthesized prompt is logged to the console.
+
+        Args:
+            map_prompt (Optional[str]): Prompt used earlier in the pipeline; when
+                truthy it is appended to the system prompt as context.
+            reduce_key (List[str]): Key names interpolated into the system prompt
+                and the request.
+
+        Returns:
+            str: The ``comparison_prompt`` field of the model's JSON reply.
+
+        Raises:
+            ValueError: If the returned comparison prompt is empty.
+        """
+
+        system_prompt = f"You are an AI assistant tasked with creating a comparison prompt for LLM-assisted entity resolution. Your task is to create a comparison prompt that will be used to compare two entities, referred to as input1 and input2, to see if they are likely the same entity based on the following reduce key(s): {', '.join(reduce_key)}."
+        if map_prompt:
+            system_prompt += f"\n\nFor context, here is the prompt used earlier in the pipeline to create the inputs to resolve: {map_prompt}"
+
+        # The doubled/quadrupled braces below are f-string escapes so that the
+        # model sees literal Jinja-style placeholders in the example.
+        messages = [
+            {
+                "role": "user",
+                "content": f"""
+    Create a comparison prompt for entity resolution: The prompt should:
+    1. Be tailored to the specific domain and type of data being compared, based on the context provided.
+    2. Instruct to compare two entities, referred to as input1 and input2.
+    3. Specifically mention comparing each reduce key in input1 and input2 (e.g., input1.{{key}} and input2.{{key}} for each key in {reduce_key}).
+    4. Include instructions to consider relevant attributes or characteristics for comparison.
+    5. Ask to respond with "True" if the entities are likely the same, or "False" if they are likely different.
+
+    Example structure:
+    ```
+    Compare the following two [entity type]:
+
+    [Entity 1]:
+    {{{{ input1.key1 }}}}
+
+    [Entity 2]:
+    {{{{ input2.key1 }}}}
+
+    Are these [entities] likely referring to the same [entity type]? Consider [list relevant attributes or characteristics to compare]. Respond with "True" if they are likely the same [entity type], or "False" if they are likely different [entity types].
+    ```
+
+    Please generate the comparison prompt:
+    """,
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            system_prompt,
+            {
+                "type": "object",
+                "properties": {
+                    "comparison_prompt": {
+                        "type": "string",
+                        "description": "Detailed comparison prompt for entity resolution",
+                    }
+                },
+                "required": ["comparison_prompt"],
+            },
+        )
+
+        comparison_prompt = json.loads(response.choices[0].message.content)[
+            "comparison_prompt"
+        ]
+
+        # Log the synthesized comparison prompt
+        self.console.log("[green]Synthesized comparison prompt:[/green]")
+        self.console.log(comparison_prompt)
+
+        # Note: the emptiness check runs after logging, so an empty prompt is
+        # still written to the console before the error is raised.
+        if not comparison_prompt:
+            raise ValueError(
+                "Could not synthesize a comparison prompt. Please provide a comparison prompt in the config."
+            )
+
+        return comparison_prompt
+
+    def synthesize_resolution_prompt(
+        self,
+        map_prompt: Optional[str],
+        reduce_key: List[str],
+        output_schema: Dict[str, str],
+    ) -> str:
+        """
+        Ask the LLM to write a resolution prompt that merges duplicate keys into one.
+
+        The request tells the model that the duplicates arrive as a Jinja2 list
+        named ``inputs`` and that the merged result must conform to
+        ``output_schema``. The synthesized prompt is logged to the console.
+
+        Args:
+            map_prompt (Optional[str]): Prompt used earlier in the pipeline; when
+                truthy it is appended to the system prompt as context.
+            reduce_key (List[str]): Key names interpolated into the system prompt
+                and the request.
+            output_schema (Dict[str, str]): Schema embedded in the request as
+                indented JSON.
+
+        Returns:
+            str: The ``resolution_prompt`` field of the model's JSON reply.
+
+        Raises:
+            ValueError: If the returned resolution prompt is empty.
+        """
+        system_prompt = f"""You are an AI assistant tasked with creating a resolution prompt for LLM-assisted entity resolution.
+        Your task is to create a prompt that will be used to merge multiple duplicate keys into a single, consolidated key.
+        The key(s) being resolved (known as the reduce_key) are {', '.join(reduce_key)}.
+        The duplicate keys will be provided in a list called 'inputs' in a Jinja2 template.
+        """
+
+        if map_prompt:
+            system_prompt += f"\n\nFor context, here is the prompt used earlier in the pipeline to create the inputs to resolve: {map_prompt}"
+
+        # The doubled/quadrupled braces below are f-string escapes so that the
+        # model sees literal Jinja2 tags and placeholders in the example.
+        messages = [
+            {
+                "role": "user",
+                "content": f"""
+    Create a resolution prompt for merging duplicate keys into a single key. The prompt should:
+    1. Be tailored to the specific domain and type of data being merged, based on the context provided.
+    2. Use a Jinja2 template to iterate over the duplicate keys (accessed as 'inputs', where each item is a dictionary containing the reduce_key fields, which you can access as entry.reduce_key for each reduce_key in {reduce_key}).
+    3. Instruct to create a single, consolidated key from the duplicate keys.
+    4. Include guidelines for resolving conflicts (e.g., choosing the most recent, most complete, or most reliable information).
+    5. Specify that the output of the resolution prompt should conform to the given output schema: {json.dumps(output_schema, indent=2)}
+
+    Example structure:
+    ```
+    Analyze the following duplicate entries:
+
+    {{% for key in inputs %}}
+    Entry {{{{ loop.index }}}}:
+    {{{{ key | tojson }}}}
+
+    {{% endfor %}}
+
+    Create a single, consolidated key that combines the information from all duplicate entries.
+    When merging, follow these guidelines:
+    1. [Provide specific merging instructions relevant to the data type]
+    2. [Provide conflict resolution guidelines]
+    3. [Any other relevant instructions]
+
+    Ensure that the merged key conforms to the following schema:
+    {json.dumps(output_schema, indent=2)}
+
+    Return the consolidated key as a single [appropriate data type] value.
+    ```
+
+    Please generate the resolution prompt:
+    """,
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            system_prompt,
+            {
+                "type": "object",
+                "properties": {
+                    "resolution_prompt": {
+                        "type": "string",
+                        "description": "Detailed resolution prompt for merging duplicate keys",
+                    }
+                },
+                "required": ["resolution_prompt"],
+            },
+        )
+
+        resolution_prompt = json.loads(response.choices[0].message.content)[
+            "resolution_prompt"
+        ]
+
+        # Log the synthesized resolution prompt
+        self.console.log("[green]Synthesized resolution prompt:[/green]")
+        self.console.log(resolution_prompt)
+
+        # Note: the emptiness check runs after logging, so an empty prompt is
+        # still written to the console before the error is raised.
+        if not resolution_prompt:
+            raise ValueError(
+                "Could not synthesize a resolution prompt. Please provide a resolution prompt in the config."
+            )
+
+        return resolution_prompt
+
+    def optimize_resolve(
+        self, input_data: List[Dict[str, Any]]
+    ) -> Tuple[Dict[str, Any], float]:
+        """
+        Optimize the configuration of a resolve operation using a data sample.
+
+        If the operation is flagged as ``empty`` in ``self.op_config``, this first
+        decides whether deduplication is needed at all and, if so, synthesizes the
+        output schema, comparison prompt and resolution prompt in place on
+        ``self.op_config``. It then computes embeddings, samples pairs, compares
+        them, picks a similarity threshold and tries to generate and verify a
+        blocking rule.
+
+        Args:
+            input_data: Sample records to optimize on.
+
+        Returns:
+            A tuple ``(config, cost)``. When an empty operation is judged not to
+            need deduplication, this is ``(self.op_config, 0.0)``; otherwise it is
+            the updated config and the sum of embedding and comparison costs.
+
+        Raises:
+            ValueError: If an empty operation has no reduce key in its
+                intermediates. Prompt synthesis may also raise ValueError.
+        """
+
+        # Check if the operation is marked as empty
+        if self.op_config.get("empty", False):
+            # Extract the map prompt from the intermediates
+            map_prompt = self.op_config["_intermediates"]["map_prompt"]
+            reduce_key = self.op_config["_intermediates"]["reduce_key"]
+
+            if reduce_key is None:
+                raise ValueError(
+                    "[yellow]Warning: No reduce key found in intermediates for synthesized resolve operation.[/yellow]"
+                )
+
+            # Deduplication is assumed necessary unless the checks below say otherwise.
+            dedup = True
+
+            if map_prompt:
+                # Analyze the map prompt
+                analysis = self._analyze_map_prompt_categorization(map_prompt)
+
+                # A truthy analysis tentatively turns deduplication off; the two
+                # follow-up checks below can turn it back on.
+                if analysis:
+                    dedup = False
+            else:
+                self.console.log(
+                    "[yellow]No map prompt found in intermediates for analysis.[/yellow]"
+                )
+
+            # TODO: figure out why this would ever be the case
+            if not map_prompt:
+                map_prompt = "N/A"
+
+            # Second opinion: let the helper decide from the data itself.
+            if dedup is False:
+                dedup = self._determine_duplicate_keys(
+                    input_data, reduce_key, map_prompt
+                )
+
+            # Now do the last attempt of pairwise comparisons
+            if dedup is False:
+                # Sample up to 20 random pairs of keys for duplicate analysis
+                sampled_pairs = self._sample_random_pairs(input_data, 20)
+
+                # Use LLM to check for duplicates
+                duplicates_found = self._check_duplicates_with_llm(
+                    input_data, sampled_pairs, reduce_key, map_prompt
+                )
+
+                if duplicates_found:
+                    dedup = True
+
+            if dedup is False:
+                # If no deduplication is needed, return the same config with 0 cost
+                # NOTE(review): the "empty" flag is still set on this path.
+                return self.op_config, 0.0
+
+            # Add the reduce key to the output schema in the config
+            self.op_config["output"] = {"schema": {rk: "string" for rk in reduce_key}}
+            # The comparison prompt must mention both 'input1' and 'input2';
+            # this is a plain substring check, not template validation.
+            for attempt in range(2):  # Try up to 2 times
+                self.op_config["comparison_prompt"] = self.synthesize_compare_prompt(
+                    map_prompt, reduce_key
+                )
+                if (
+                    "input1" in self.op_config["comparison_prompt"]
+                    and "input2" in self.op_config["comparison_prompt"]
+                ):
+                    break
+                elif attempt == 0:
+                    self.console.log(
+                        "[yellow]Warning: 'input1' or 'input2' not found in comparison prompt. Retrying...[/yellow]"
+                    )
+            # After both attempts the last prompt is kept even if it is still invalid.
+            if (
+                "input1" not in self.op_config["comparison_prompt"]
+                or "input2" not in self.op_config["comparison_prompt"]
+            ):
+                self.console.log(
+                    "[red]Error: Failed to generate comparison prompt with 'input1' and 'input2'. Using last generated prompt.[/red]"
+                )
+            # Same two-attempt pattern for the resolution prompt, which must mention 'inputs'.
+            for attempt in range(2):  # Try up to 2 times
+                self.op_config["resolution_prompt"] = self.synthesize_resolution_prompt(
+                    map_prompt, reduce_key, self.op_config["output"]["schema"]
+                )
+                if "inputs" in self.op_config["resolution_prompt"]:
+                    break
+                elif attempt == 0:
+                    self.console.log(
+                        "[yellow]Warning: 'inputs' not found in resolution prompt. Retrying...[/yellow]"
+                    )
+            if "inputs" not in self.op_config["resolution_prompt"]:
+                self.console.log(
+                    "[red]Error: Failed to generate resolution prompt with 'inputs'. Using last generated prompt.[/red]"
+                )
+
+            # Pop off the empty flag
+            self.op_config.pop("empty")
+
+        # From here on the flow is the same for synthesized and user-provided configs.
+        embeddings, blocking_keys, embedding_cost = self._compute_embeddings(input_data)
+        self.console.log(
+            f"[bold]Cost of creating embeddings on the sample: ${embedding_cost:.4f}[/bold]"
+        )
+
+        similarities = self._calculate_cosine_similarities(embeddings)
+
+        sampled_pairs = self._sample_pairs(similarities)
+        comparison_results, comparison_cost = self._perform_comparisons_resolve(
+            input_data, sampled_pairs
+        )
+
+        self._print_similarity_histogram(similarities, comparison_results)
+
+        threshold, estimated_selectivity = self._find_optimal_threshold(
+            comparison_results, similarities
+        )
+
+        blocking_rules = self._generate_blocking_rules(
+            blocking_keys, input_data, comparison_results
+        )
+
+        # Only the first generated rule is verified. It is kept only if it causes
+        # no false negatives and is no less selective than the threshold estimate.
+        if blocking_rules:
+            false_negatives, rule_selectivity = self._verify_blocking_rule(
+                input_data,
+                blocking_rules[0],
+                blocking_keys,
+                comparison_results,
+            )
+            if not false_negatives and rule_selectivity <= estimated_selectivity:
+                self.console.log(
+                    "[green]Blocking rule verified. No false negatives detected in the sample and selectivity is within estimated selectivity.[/green]"
+                )
+            else:
+                if false_negatives:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. {len(false_negatives)} false negatives detected in the sample.[/red]"
+                    )
+                    # Examples are printed using the first blocking key only.
+                    for i, j in false_negatives[:5]:  # Show up to 5 examples
+                        self.console.log(
+                            f"  Filtered pair: {{ {blocking_keys[0]}: {input_data[i][blocking_keys[0]]} }} and {{ {blocking_keys[0]}: {input_data[j][blocking_keys[0]]} }}"
+                        )
+                    if len(false_negatives) > 5:
+                        self.console.log(f"  ... and {len(false_negatives) - 5} more.")
+                if rule_selectivity > estimated_selectivity:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. Rule selectivity ({rule_selectivity:.4f}) is higher than the estimated selectivity ({estimated_selectivity:.4f}).[/red]"
+                    )
+                blocking_rules = (
+                    []
+                )  # Clear the blocking rule if it introduces false negatives or is too selective
+
+        optimized_config = self._update_config(threshold, blocking_keys, blocking_rules)
+        return optimized_config, embedding_cost + comparison_cost
+
+    def optimize_equijoin(
+        self, left_data: List[Dict[str, Any]], right_data: List[Dict[str, Any]]
+    ) -> Tuple[Dict[str, Any], float, Dict[str, Any]]:
+        """
+        Optimize the configuration of an equijoin operation using data samples.
+
+        When no blocking keys are configured, the LLM is first asked whether one
+        dataset should be transformed by a map operation before joining. If so,
+        the method returns early so the caller can optimize that map operation.
+        Otherwise it generates blocking keys if needed, computes embeddings and
+        cross similarities, compares sampled pairs, picks a threshold, verifies a
+        generated blocking rule, and interactively asks which containment rules
+        to add.
+
+        Args:
+            left_data: Sample records from the left dataset.
+            right_data: Sample records from the right dataset.
+
+        Returns:
+            A tuple ``(config, cost, agent_info)``. On the early map-transformation
+            path this is ``(self.op_config, 0.0, {...})``, where the dict holds
+            ``optimize_map``, ``map_prompt``, ``output_key`` and
+            ``dataset_to_transform``. Otherwise it is the optimized config, the
+            total embedding plus comparison cost, and an empty dict.
+        """
+        left_keys = self.op_config.get("blocking_keys", {}).get("left", [])
+        right_keys = self.op_config.get("blocking_keys", {}).get("right", [])
+
+        if not left_keys and not right_keys:
+            # Ask the LLM agent if it would be beneficial to do a map operation on
+            # one of the datasets before doing an equijoin
+            apply_transformation, dataset_to_transform, reason = (
+                self._should_apply_map_transformation(
+                    left_keys, right_keys, left_data, right_data
+                )
+            )
+
+            if apply_transformation:
+                self.console.log(
+                    f"LLM agent suggested applying a map transformation to {dataset_to_transform} dataset because: {reason}"
+                )
+                extraction_prompt, output_key, new_comparison_prompt = (
+                    self._generate_map_and_new_join_transformation(
+                        dataset_to_transform, reason, left_data, right_data
+                    )
+                )
+                self.console.log(
+                    f"Generated map transformation prompt: {extraction_prompt}"
+                )
+                self.console.log(f"\nNew output key: {output_key}")
+                self.console.log(
+                    f"\nNew equijoin comparison prompt: {new_comparison_prompt}"
+                )
+
+                # Update the comparison prompt
+                self.op_config["comparison_prompt"] = new_comparison_prompt
+
+                # Add the output key to the left_keys or right_keys
+                # Any value other than "left" (including "none") goes to the right side.
+                if dataset_to_transform == "left":
+                    left_keys.append(output_key)
+                else:
+                    right_keys.append(output_key)
+
+                # Reset the blocking keys in the config
+                self.op_config["blocking_keys"] = {
+                    "left": left_keys,
+                    "right": right_keys,
+                }
+
+                # Bubble up this config and return the transformation prompt, so we can optimize the map operation
+                return (
+                    self.op_config,
+                    0.0,
+                    {
+                        "optimize_map": True,
+                        "map_prompt": extraction_prompt,
+                        "output_key": output_key,
+                        "dataset_to_transform": dataset_to_transform,
+                    },
+                )
+
+            # Print the reason for not applying a map transformation
+            self.console.log(
+                f"Reason for not synthesizing a map transformation for either left or right dataset: {reason}"
+            )
+
+        # If there are no blocking keys, generate them
+        if not left_keys or not right_keys:
+            generated_left_keys, generated_right_keys = (
+                self._generate_blocking_keys_equijoin(left_data, right_data)
+            )
+            left_keys.extend(generated_left_keys)
+            right_keys.extend(generated_right_keys)
+            # De-duplicate; going through a set does not preserve key order.
+            left_keys = list(set(left_keys))
+            right_keys = list(set(right_keys))
+
+            # Log the generated blocking keys
+            self.console.log(
+                f"[bold]Generated blocking keys (for embeddings-based blocking):[/bold]"
+            )
+            self.console.log(f"Left keys: {left_keys}")
+            self.console.log(f"Right keys: {right_keys}")
+
+        left_embeddings, _, left_embedding_cost = self._compute_embeddings(
+            left_data, keys=left_keys
+        )
+        right_embeddings, _, right_embedding_cost = self._compute_embeddings(
+            right_data, keys=right_keys
+        )
+        self.console.log(
+            f"[bold]Cost of creating embeddings on the sample: ${left_embedding_cost + right_embedding_cost:.4f}[/bold]"
+        )
+
+        similarities = self._calculate_cross_similarities(
+            left_embeddings, right_embeddings
+        )
+
+        sampled_pairs = self._sample_pairs(similarities)
+        comparison_results, comparison_cost = self._perform_comparisons_equijoin(
+            left_data, right_data, sampled_pairs
+        )
+        self._print_similarity_histogram(similarities, comparison_results)
+        # Resample until at least one compared pair is a match.
+        # NOTE(review): this loop has no retry limit. If the samples contain no
+        # true matches it never terminates and keeps accumulating comparison cost.
+        while not any(result[2] for result in comparison_results):
+            self.console.log(
+                "[yellow]No matches found in the current sample. Resampling pairs to compare...[/yellow]"
+            )
+            sampled_pairs = self._sample_pairs(similarities)
+            comparison_results, current_cost = self._perform_comparisons_equijoin(
+                left_data, right_data, sampled_pairs
+            )
+            comparison_cost += current_cost
+            self._print_similarity_histogram(similarities, comparison_results)
+
+        threshold, estimated_selectivity = self._find_optimal_threshold(
+            comparison_results, similarities
+        )
+        # Stored on the instance as well as used locally below.
+        self.estimated_selectivity = estimated_selectivity
+
+        blocking_rules = self._generate_blocking_rules_equijoin(
+            left_keys, right_keys, left_data, right_data, comparison_results
+        )
+
+        # Only the first generated rule is verified. It is kept only if it causes
+        # no false negatives and is no less selective than the threshold estimate.
+        if blocking_rules:
+            false_negatives, rule_selectivity = self._verify_blocking_rule_equijoin(
+                left_data,
+                right_data,
+                blocking_rules[0],
+                left_keys,
+                right_keys,
+                comparison_results,
+            )
+            if not false_negatives and rule_selectivity <= estimated_selectivity:
+                self.console.log(
+                    "[green]Blocking rule verified. No false negatives detected in the sample and selectivity is within bounds.[/green]"
+                )
+            else:
+                if false_negatives:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. {len(false_negatives)} false negatives detected in the sample.[/red]"
+                    )
+                    for i, j in false_negatives[:5]:  # Show up to 5 examples
+                        self.console.log(
+                            f"  Filtered pair: Left: {{{', '.join(f'{key}: {left_data[i][key]}' for key in left_keys)}}} and Right: {{{', '.join(f'{key}: {right_data[j][key]}' for key in right_keys)}}}"
+                        )
+                    if len(false_negatives) > 5:
+                        self.console.log(f"  ... and {len(false_negatives) - 5} more.")
+                if rule_selectivity > estimated_selectivity:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. Rule selectivity ({rule_selectivity:.4f}) is higher than the estimated selectivity ({estimated_selectivity:.4f}).[/red]"
+                    )
+                blocking_rules = (
+                    []
+                )  # Clear the blocking rule if it introduces false negatives or is too selective
+
+        containment_rules = self._generate_containment_rules_equijoin(
+            left_data, right_data
+        )
+        self.console.log(
+            f"[bold]Generated {len(containment_rules)} containment rules. Please select which ones to use as blocking conditions:[/bold]"
+        )
+        # Interactive step: each containment rule needs explicit user confirmation.
+        selected_containment_rules = []
+        for rule in containment_rules:
+            self.console.log(f"[green]{rule}[/green]")
+            # Temporarily stop the status
+            if self.status:
+                self.status.stop()
+            # Use Rich's Confirm for input
+            if Confirm.ask("Use this rule?"):
+                selected_containment_rules.append(rule)
+            # Restart the status
+            if self.status:
+                self.status.start()
+
+        if len(containment_rules) > 0:
+            self.console.log(
+                f"[bold]Selected {len(selected_containment_rules)} containment rules for blocking.[/bold]"
+            )
+        blocking_rules.extend(selected_containment_rules)
+
+        optimized_config = self._update_config_equijoin(
+            threshold, left_keys, right_keys, blocking_rules
+        )
+        return (
+            optimized_config,
+            left_embedding_cost + right_embedding_cost + comparison_cost,
+            {},
+        )
+
+    def _should_apply_map_transformation(
+        self,
+        left_keys: List[str],
+        right_keys: List[str],
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[bool, str, str]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        # Get keys and their average lengths
+        all_left_keys = {
+            k: sum(len(str(d[k])) for d in left_sample) / len(left_sample)
+            for k in left_sample[0].keys()
+        }
+        all_right_keys = {
+            k: sum(len(str(d[k])) for d in right_sample) / len(right_sample)
+            for k in right_sample[0].keys()
+        }
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Analyze the following datasets and determine if an additional LLM transformation should be applied to generate a new key-value pair for easier joining:
+
+                Comparison prompt for the join operation: {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Left dataset keys and average lengths: {json.dumps(all_left_keys, indent=2)}
+                Right dataset keys and average lengths: {json.dumps(all_right_keys, indent=2)}
+
+                Left dataset sample:
+                {json.dumps(left_sample, indent=2)}
+
+                Right dataset sample:
+                {json.dumps(right_sample, indent=2)}
+
+                Current keys used for embedding-based ranking of likely matches:
+                Left keys: {left_keys}
+                Right keys: {right_keys}
+
+                Consider the following:
+                1. Are the current keys sufficient for accurate embedding-based ranking of likely matches? We don't want to use too many keys, or keys with too much information, as this will dilute the signal in the embeddings.
+                2. Are there any keys particularly long (e.g., full text fields), containing information that is not relevant for the join operation?
+                3. Is there information spread across multiple fields that could be combined?
+                4. Would a summary or extraction of key information be beneficial?
+                5. Is there a mismatch in information representation between the datasets?
+                6. Could an additional LLM-generated field improve the accuracy of embeddings or join comparisons?
+
+                If you believe an additional LLM transformation would be beneficial, specify which dataset (left or right) should be transformed and explain why. In most cases, you should pick the dataset with the longer keys unless there is a specific reason to pick the other dataset. Otherwise, indicate that no additional transformation is needed and explain why the current blocking keys are sufficient.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an AI expert in data analysis and entity matching.",
+            {
+                "type": "object",
+                "properties": {
+                    "apply_transformation": {"type": "boolean"},
+                    "dataset_to_transform": {
+                        "type": "string",
+                        "enum": ["left", "right", "none"],
+                    },
+                    "reason": {"type": "string"},
+                },
+                "required": ["apply_transformation", "dataset_to_transform", "reason"],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+
+        return (
+            result["apply_transformation"],
+            result["dataset_to_transform"],
+            result["reason"],
+        )
+
+    def _generate_map_and_new_join_transformation(
+        self,
+        dataset_to_transform: str,
+        reason: str,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[str, str, str]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        target_data = left_sample if dataset_to_transform == "left" else right_sample
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Generate an LLM prompt to transform the {dataset_to_transform} dataset for easier joining. The transformation should create a new key-value pair.
+
+                Current comparison prompt for the join operation: {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Target ({dataset_to_transform}) dataset sample:
+                {json.dumps(target_data, indent=2)}
+
+                Other ({'left' if dataset_to_transform == "right" else "right"}) dataset sample:
+                {json.dumps(right_sample if dataset_to_transform == "left" else left_sample, indent=2)}
+
+                Reason for transforming {dataset_to_transform} dataset: {reason}
+
+                Please provide:
+                1. An LLM prompt to extract a smaller representation of what is relevant to the join task. The prompt should be a Jinja2 template, referring to any fields in the input data as {{ input.field_name }}. The prompt should instruct the LLM to return some **non-empty** string-valued output. The transformation should be tailored to the join task if possible, not just a generic summary of the data. 
+                2. A name for the new output key that will store the transformed data.
+                3. An edited comparison prompt that leverages the new attribute created by the transformation. This prompt should be a Jinja2 template, referring to any fields in the input data as {{ left.field_name }} and {{ right.field_name }}. The prompt should be the same as the current comparison prompt, but with a new instruction that leverages the new attribute created by the transformation. The prompt should instruct the LLM to return a boolean-valued output, like the current comparison prompt.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an AI expert in data analysis and decomposing complex data processing pipelines.",
+            {
+                "type": "object",
+                "properties": {
+                    "extraction_prompt": {"type": "string"},
+                    "output_key": {"type": "string"},
+                    "new_comparison_prompt": {"type": "string"},
+                },
+                "required": [
+                    "extraction_prompt",
+                    "output_key",
+                    "new_comparison_prompt",
+                ],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+
+        return (
+            result["extraction_prompt"],
+            result["output_key"],
+            result["new_comparison_prompt"],
+        )
+
+    def _generate_blocking_keys_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[List[str], List[str]]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        # Prepare sample data for LLM
+        left_keys = list(left_sample[0].keys())
+        right_keys = list(right_sample[0].keys())
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample data from two datasets, select appropriate blocking keys for an equijoin operation.
+                The blocking process works as follows:
+                1. We create embeddings for the selected keys from both datasets.
+                2. We use cosine similarity between these embeddings to filter pairs for more detailed LLM comparison.
+                3. Pairs with high similarity will be passed to the LLM for final comparison.
+
+                The blocking keys should have relatively short values and be useful for generating embeddings that capture the essence of potential matches.
+
+                Left dataset keys: {left_keys}
+                Right dataset keys: {right_keys}
+
+                Sample from left dataset:
+                {json.dumps(left_sample, indent=2)}
+
+                Sample from right dataset:
+                {json.dumps(right_sample, indent=2)}
+
+                For context, here is the comparison prompt that will be used for the more detailed LLM comparison:
+                {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Please select one or more keys from each dataset that would be suitable for blocking. The keys should contain information that's likely to be similar in matching records and align with the comparison prompt's focus.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in entity matching and database operations.",
+            {
+                "type": "object",
+                "properties": {
+                    "left_blocking_keys": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of selected blocking keys from the left dataset",
+                    },
+                    "right_blocking_keys": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of selected blocking keys from the right dataset",
+                    },
+                },
+                "required": ["left_blocking_keys", "right_blocking_keys"],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+        left_blocking_keys = result["left_blocking_keys"]
+        right_blocking_keys = result["right_blocking_keys"]
+
+        return left_blocking_keys, right_blocking_keys
+
+    def _compute_embeddings(
+        self,
+        input_data: List[Dict[str, Any]],
+        keys: List[str] = None,
+        is_join: bool = True,
+    ) -> Tuple[List[List[float]], List[str], float]:
+        if keys is None:
+            keys = self.op_config.get("blocking_keys", [])
+            if not keys:
+                prompt_template = self.op_config.get("comparison_prompt", "")
+                prompt_vars = extract_jinja_variables(prompt_template)
+                # Get rid of input, input1, input2
+                prompt_vars = [
+                    var
+                    for var in prompt_vars
+                    if var not in ["input", "input1", "input2"]
+                ]
+
+                # strip all things before . in the prompt_vars
+                keys += list(set([var.split(".")[-1] for var in prompt_vars]))
+            if not keys:
+                self.console.log(
+                    "[yellow]Warning: No blocking keys found. Using all keys for blocking.[/yellow]"
+                )
+                keys = list(input_data[0].keys())
+
+        model_input_context_length = model_cost.get(
+            self.op_config.get("embedding_model", "text-embedding-3-small"), {}
+        ).get("max_input_tokens", 8192)
+        texts = [
+            " ".join(str(item[key]) for key in keys if key in item)[
+                :model_input_context_length
+            ]
+            for item in input_data
+        ]
+
+        embeddings = []
+        total_cost = 0
+        batch_size = 2000
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i : i + batch_size]
+            self.console.log(
+                f"[cyan]Processing batch {i//batch_size + 1} of {len(texts)//batch_size + 1}[/cyan]"
+            )
+            response = gen_embedding(
+                model=self.op_config.get("embedding_model", "text-embedding-3-small"),
+                input=batch,
+            )
+            embeddings.extend([data["embedding"] for data in response["data"]])
+            total_cost += completion_cost(response)
+        embeddings = [data["embedding"] for data in response["data"]]
+        cost = completion_cost(response)
+        return embeddings, keys, cost
+
+    def _calculate_cosine_similarities(
+        self, embeddings: List[List[float]]
+    ) -> List[Tuple[int, int, float]]:
+        embeddings_array = np.array(embeddings)
+        norms = np.linalg.norm(embeddings_array, axis=1)
+        dot_products = np.dot(embeddings_array, embeddings_array.T)
+        similarities_matrix = dot_products / np.outer(norms, norms)
+        i, j = np.triu_indices(len(embeddings), k=1)
+        similarities = list(
+            zip(i.tolist(), j.tolist(), similarities_matrix[i, j].tolist())
+        )
+        return similarities
+
+    def _print_similarity_histogram(
+        self,
+        similarities: List[Tuple[int, int, float]],
+        comparison_results: List[Tuple[int, int, bool]],
+    ):
+        flat_similarities = [sim[-1] for sim in similarities if sim[-1] != 1]
+        hist, bin_edges = np.histogram(flat_similarities, bins=20)
+        max_bar_width, max_count = 50, max(hist)
+        normalized_hist = [int(count / max_count * max_bar_width) for count in hist]
+
+        # Create a dictionary to store true labels
+        true_labels = {(i, j): is_match for i, j, is_match in comparison_results}
+
+        self.console.log("\n[bold]Embedding Cosine Similarity Distribution:[/bold]")
+        for i, count in enumerate(normalized_hist):
+            bar = "█" * count
+            label = f"{bin_edges[i]:.2f}-{bin_edges[i+1]:.2f}"
+
+            # Count true matches and not matches in this bin
+            true_matches = 0
+            not_matches = 0
+            labeled_count = 0
+            for sim in similarities:
+                if bin_edges[i] <= sim[2] < bin_edges[i + 1]:
+                    if (sim[0], sim[1]) in true_labels:
+                        labeled_count += 1
+                        if true_labels[(sim[0], sim[1])]:
+                            true_matches += 1
+                        else:
+                            not_matches += 1
+
+            # Calculate percentages of labeled pairs
+            if labeled_count > 0:
+                true_match_percent = (true_matches / labeled_count) * 100
+                not_match_percent = (not_matches / labeled_count) * 100
+            else:
+                true_match_percent = 0
+                not_match_percent = 0
+
+            self.console.log(
+                f"{label}: {bar} "
+                f"(Labeled: {labeled_count}/{hist[i]}, [green]{true_match_percent:.1f}% match[/green], [red]{not_match_percent:.1f}% not match[/red])"
+            )
+        self.console.log("\n")
+
+    def _sample_pairs(
+        self, similarities: List[Tuple[int, int, float]]
+    ) -> List[Tuple[int, int]]:
+        # Sort similarities in descending order
+        sorted_similarities = sorted(similarities, key=lambda x: x[2], reverse=True)
+
+        # Calculate weights using exponential weighting with self.sampling_weight
+        similarities_array = np.array([sim[2] for sim in sorted_similarities])
+        weights = np.exp(self.sampling_weight * similarities_array)
+        weights /= weights.sum()  # Normalize weights to sum to 1
+
+        # Sample pairs based on the calculated weights
+        sampled_indices = np.random.choice(
+            len(sorted_similarities),
+            size=min(self.sample_size, len(sorted_similarities)),
+            replace=False,
+            p=weights,
+        )
+
+        sampled_pairs = [
+            (sorted_similarities[i][0], sorted_similarities[i][1])
+            for i in sampled_indices
+        ]
+        return sampled_pairs
+
+    def _calculate_cross_similarities(
+        self, left_embeddings: List[List[float]], right_embeddings: List[List[float]]
+    ) -> List[Tuple[int, int, float]]:
+        left_array = np.array(left_embeddings)
+        right_array = np.array(right_embeddings)
+        dot_product = np.dot(left_array, right_array.T)
+        norm_left = np.linalg.norm(left_array, axis=1)
+        norm_right = np.linalg.norm(right_array, axis=1)
+        similarities = dot_product / np.outer(norm_left, norm_right)
+        return [
+            (i, j, sim)
+            for i, row in enumerate(similarities)
+            for j, sim in enumerate(row)
+        ]
+
+    def _perform_comparisons_resolve(
+        self, input_data: List[Dict[str, Any]], pairs: List[Tuple[int, int]]
+    ) -> Tuple[List[Tuple[int, int, bool]], float]:
+        comparisons, total_cost = [], 0
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    compare_pair_resolve,
+                    self.op_config["comparison_prompt"],
+                    self.op_config.get(
+                        "comparison_model", self.config.get("model", "gpt-4o-mini")
+                    ),
+                    input_data[i],
+                    input_data[j],
+                )
+                for i, j in pairs
+            ]
+            for future, (i, j) in zip(futures, pairs):
+                is_match, cost = future.result()
+                comparisons.append((i, j, is_match))
+                total_cost += cost
+
+        self.console.log(
+            f"[bold]Cost of pairwise comparisons on the sample: ${total_cost:.4f}[/bold]"
+        )
+        return comparisons, total_cost
+
+    def _perform_comparisons_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        pairs: List[Tuple[int, int]],
+    ) -> Tuple[List[Tuple[int, int, bool]], float]:
+        comparisons, total_cost = [], 0
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    compare_pair_equijoin,
+                    self.op_config["comparison_prompt"],
+                    self.op_config.get(
+                        "comparison_model", self.config.get("model", "gpt-4o-mini")
+                    ),
+                    left_data[i],
+                    right_data[j] if right_data else left_data[j],
+                )
+                for i, j in pairs
+            ]
+            for future, (i, j) in zip(futures, pairs):
+                is_match, cost = future.result()
+                comparisons.append((i, j, is_match))
+                total_cost += cost
+
+        self.console.log(
+            f"[bold]Cost of pairwise comparisons on the sample: ${total_cost:.4f}[/bold]"
+        )
+        return comparisons, total_cost
+
+    def _find_optimal_threshold(
+        self,
+        comparisons: List[Tuple[int, int, bool]],
+        similarities: List[Tuple[int, int, float]],
+    ) -> Tuple[float, float, float]:
+        true_labels = np.array([comp[2] for comp in comparisons])
+        sim_dict = {(i, j): sim for i, j, sim in similarities}
+        sim_scores = np.array([sim_dict[(i, j)] for i, j, _ in comparisons])
+
+        thresholds = np.linspace(0, 1, 100)
+        precisions, recalls = [], []
+
+        for threshold in thresholds:
+            predictions = sim_scores >= threshold
+            tp = np.sum(predictions & true_labels)
+            fp = np.sum(predictions & ~true_labels)
+            fn = np.sum(~predictions & true_labels)
+
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+
+            precisions.append(precision)
+            recalls.append(recall)
+
+        valid_indices = [i for i, r in enumerate(recalls) if r >= self.target_recall]
+        if not valid_indices:
+            optimal_threshold = float(thresholds[np.argmax(recalls)])
+        else:
+            optimal_threshold = float(thresholds[max(valid_indices)])
+
+        # Improved selectivity estimation
+        all_similarities = np.array([s[2] for s in similarities])
+        sampled_similarities = sim_scores
+
+        # Calculate sampling probabilities
+        sampling_probs = np.exp(self.sampling_weight * sampled_similarities)
+        sampling_probs /= sampling_probs.sum()
+
+        # Estimate selectivity using importance sampling
+        weights = 1 / (len(all_similarities) * sampling_probs)
+        numerator = np.sum(weights * true_labels)
+        denominator = np.sum(weights)
+        selectivity_estimate = numerator / denominator
+
+        self.console.log(
+            "[bold cyan]┌─ Estimated Self-Join Selectivity ─────────────────────────┐[/bold cyan]"
+        )
+        self.console.log(
+            f"[bold cyan]│[/bold cyan] [yellow]Target Recall:[/yellow] {self.target_recall:.0%}"
+        )
+        self.console.log(
+            f"[bold cyan]│[/bold cyan] [yellow]Estimate:[/yellow] {selectivity_estimate:.4f}"
+        )
+        self.console.log(
+            "[bold cyan]└───────────────────────────────────────────────────────────┘[/bold cyan]"
+        )
+        self.console.log(
+            f"[bold]Chosen similarity threshold for blocking: {optimal_threshold:.4f}[/bold]"
+        )
+
+        return round(optimal_threshold, 4), selectivity_estimate
+
+    def _generate_blocking_rules(
+        self,
+        blocking_keys: List[str],
+        input_data: List[Dict[str, Any]],
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[str]:
+        # Sample 2 true and 2 false comparisons
+        true_comparisons = [comp for comp in comparisons if comp[2]][:2]
+        false_comparisons = [comp for comp in comparisons if not comp[2]][:2]
+        sample_datas = [
+            (
+                {key: input_data[i][key] for key in blocking_keys},
+                {key: input_data[j][key] for key in blocking_keys},
+                is_match,
+            )
+            for i, j, is_match in true_comparisons + false_comparisons
+        ]
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample comparisons between entities, generate a single-line Python statement that acts as a blocking rule for entity resolution. This rule will be used in the form: `eval(blocking_rule, {{"input1": item1, "input2": item2}})`.
+
+    Sample comparisons (note: these are just a few examples and may not represent all possible cases):
+    {json.dumps(sample_datas, indent=2)}
+
+    For context, here is the comparison prompt that will be used for the more expensive, detailed comparison:
+    {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+    Please generate ONE one-line blocking rule that adheres to the following criteria:
+    1. The rule should evaluate to True if the entities are possibly a match and require further comparison.
+    2. The rule should evaluate to False ONLY if the entities are definitely not a match.
+    3. The rule must be a single Python expression that can be evaluated using the eval() function.
+    4. The rule should be much faster to evaluate than the full comparison prompt.
+    5. The rule should capture the essence of the comparison prompt but in a simplified manner.
+    6. The rule should be general enough to work well on the entire dataset, not just these specific examples.
+    7. The rule should handle inconsistent casing by using string methods like .lower() when comparing string values.
+    8. The rule should err on the side of inclusivity - it's better to have false positives than false negatives.
+
+    Example structure of a one-line blocking rule:
+    "(condition1) or (condition2) or (condition3)"
+
+    Where conditions could be comparisons like:
+    "input1['field'].lower() == input2['field'].lower()"
+    "abs(len(input1['text']) - len(input2['text'])) <= 5"
+    "any(word in input1['description'].lower() for word in input2['description'].lower().split())"
+
+    If there's no clear rule that can be generated based on the given information, return the string "True" to ensure all pairs are compared.
+
+    Remember, the primary goal of the blocking rule is to safely reduce the number of comparisons by quickly identifying pairs that are definitely not matches, while keeping all potential matches for further evaluation.""",
+            }
+        ]
+
+        for attempt in range(self.agent_max_retries):  # Up to 3 attempts
+            # Generate blocking rule using the LLM
+            response = self.llm_client.generate(
+                messages,
+                "You are an expert in entity resolution and Python programming. Your task is to generate one efficient blocking rule based on the given sample comparisons and data structure.",
+                {
+                    "type": "object",
+                    "properties": {
+                        "blocking_rule": {
+                            "type": "string",
+                            "description": "One-line Python statement acting as a blocking rule",
+                        }
+                    },
+                    "required": ["blocking_rule"],
+                },
+            )
+
+            # Extract the blocking rule from the LLM response
+            blocking_rule = response.choices[0].message.content
+            blocking_rule = json.loads(blocking_rule).get("blocking_rule")
+
+            if blocking_rule:
+                self.console.log("")  # Print a newline
+
+                if blocking_rule.strip() == "True":
+                    self.console.log(
+                        "[yellow]No suitable blocking rule could be found. Proceeding without a blocking rule.[/yellow]"
+                    )
+                    return []
+
+                self.console.log(
+                    f"[bold]Generated blocking rule (Attempt {attempt + 1}):[/bold] {blocking_rule}"
+                )
+
+                # Test the blocking rule
+                filtered_pairs = self._test_blocking_rule(
+                    input_data, blocking_keys, blocking_rule, comparisons
+                )
+
+                if not filtered_pairs:
+                    self.console.log(
+                        "[green]Blocking rule looks good! No known matches were filtered out.[/green]"
+                    )
+                    return [blocking_rule]
+                else:
+                    feedback = f"The previous rule incorrectly filtered out {len(filtered_pairs)} known matches. "
+                    feedback += (
+                        "Here are up to 3 examples of incorrectly filtered pairs:\n"
+                    )
+                    for i, j in filtered_pairs[:3]:
+                        feedback += f"Item 1: {json.dumps({key: input_data[i][key] for key in blocking_keys})}\Item 2: {json.dumps({key: input_data[j][key] for key in blocking_keys})}\n"
+                        feedback += "These pairs are known matches but were filtered out by the rule.\n"
+                    feedback += "Please generate a new rule that doesn't filter out these matches."
+
+                    messages.append({"role": "assistant", "content": blocking_rule})
+                    messages.append({"role": "user", "content": feedback})
+            else:
+                self.console.log("[yellow]No blocking rule generated.[/yellow]")
+                return []
+
+        self.console.log(
+            f"[yellow]Failed to generate a suitable blocking rule after {self.agent_max_retries} attempts. Proceeding without a blocking rule.[/yellow]"
+        )
+        return []
+
+    def _test_blocking_rule(
+        self,
+        input_data: List[Dict[str, Any]],
+        blocking_keys: List[str],
+        blocking_rule: str,
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[Tuple[int, int]]:
+        def apply_blocking_rule(item1, item2):
+            try:
+                return eval(blocking_rule, {"input1": item1, "input2": item2})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        filtered_pairs = []
+
+        for i, j, is_match in comparisons:
+            if is_match:
+                item1 = {
+                    k: input_data[i][k] for k in blocking_keys if k in input_data[i]
+                }
+                item2 = {
+                    k: input_data[j][k] for k in blocking_keys if k in input_data[j]
+                }
+
+                if not apply_blocking_rule(item1, item2):
+                    filtered_pairs.append((i, j))
+
+        if filtered_pairs:
+            self.console.log(
+                f"[yellow italic]LLM Correction: The blocking rule incorrectly filtered out {len(filtered_pairs)} known positive matches.[/yellow italic]"
+            )
+            for i, j in filtered_pairs[:5]:  # Show up to 5 examples
+                self.console.log(
+                    f"  Incorrectly filtered pair 1: {json.dumps({key: input_data[i][key] for key in blocking_keys})}  and pair 2: {json.dumps({key: input_data[j][key] for key in blocking_keys})}"
+                )
+            if len(filtered_pairs) > 5:
+                self.console.log(
+                    f"  ... and {len(filtered_pairs) - 5} more incorrect pairs."
+                )
+
+        return filtered_pairs
+
+    def _generate_containment_rules_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+    ) -> List[str]:
+        # Get all available keys from the sample data
+        left_keys = set(left_data[0].keys())
+        right_keys = set(right_data[0].keys())
+
+        # Sample a few records from each dataset
+        sample_left = random.sample(left_data, min(3, len(left_data)))
+        sample_right = random.sample(right_data, min(3, len(right_data)))
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an AI assistant tasked with generating containment-based blocking rules for an equijoin operation.",
+            },
+            {
+                "role": "user",
+                "content": f"""Generate multiple one-line Python statements that act as containment-based blocking rules for equijoin. These rules will be used in the form: `eval(blocking_rule, {{"left": item1, "right": item2}})`.
+
+Available keys in left dataset: {', '.join(left_keys)}
+Available keys in right dataset: {', '.join(right_keys)}
+
+Sample data from left dataset:
+{json.dumps(sample_left, indent=2)}
+
+Sample data from right dataset:
+{json.dumps(sample_right, indent=2)}
+
+Comparison prompt used for detailed comparison:
+{self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+Please generate multiple one-line blocking rules that adhere to the following criteria:
+1. The rules should focus on containment relationships between fields in the left and right datasets. Containment can mean that the left field contains all the words in the right field, or the right field contains all the words in the left field.
+2. Each rule should evaluate to True if there's a potential match based on containment, False otherwise.
+3. Rules must be single Python expressions that can be evaluated using the eval() function.
+4. Rules should handle inconsistent casing by using string methods like .lower() when comparing string values.
+5. Consider the length of the fields when generating rules: for example, if the left field is much longer than the right field, it's more likely to contain all the words in the right field.
+
+Example structures of containment-based blocking rules:
+"all(word in left['{{left_key}}'].lower() for word in right['{{right_key}}'].lower().split())"
+"any(word in right['{{right_key}}'].lower().split() for word in left['{{left_key}}'].lower().split())"
+
+Please provide 3-5 different containment-based blocking rules, based on the keys and sample data provided.""",
+            },
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in data matching and Python programming.",
+            {
+                "type": "object",
+                "properties": {
+                    "containment_rules": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of containment-based blocking rules as Python expressions",
+                    }
+                },
+                "required": ["containment_rules"],
+            },
+        )
+
+        containment_rules = response.choices[0].message.content
+        containment_rules = json.loads(containment_rules).get("containment_rules")
+        return containment_rules
+
+    def _generate_blocking_rules_equijoin(
+        self,
+        left_keys: List[str],
+        right_keys: List[str],
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[str]:
+        if not left_keys or not right_keys:
+            left_keys = list(left_data[0].keys())
+            right_keys = list(right_data[0].keys())
+
+        # Sample 2 true and 2 false comparisons
+        true_comparisons = [comp for comp in comparisons if comp[2]][:2]
+        false_comparisons = [comp for comp in comparisons if not comp[2]][:2]
+        sample_datas = [
+            (
+                {key: left_data[i][key] for key in left_keys if key in left_data[i]},
+                {key: right_data[j][key] for key in right_keys if key in right_data[j]},
+                is_match,
+            )
+            for i, j, is_match in true_comparisons + false_comparisons
+        ]
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample comparisons between entities, generate a single-line Python statement that acts as a blocking rule for equijoin. This rule will be used in the form: `eval(blocking_rule, {{"left": item1, "right": item2}})`.
+
+    Sample comparisons (note: these are just a few examples and may not represent all possible cases):
+    {json.dumps(sample_datas, indent=2)}
+
+    For context, here is the comparison prompt that will be used for the more expensive, detailed comparison:
+    {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+    Please generate ONE one-line blocking rule that adheres to the following criteria:
+    1. The rule should evaluate to True if the entities are possibly a match and require further comparison.
+    2. The rule should evaluate to False ONLY if the entities are definitely not a match.
+    3. The rule must be a single Python expression that can be evaluated using the eval() function.
+    4. The rule should be much faster to evaluate than the full comparison prompt.
+    5. The rule should capture the essence of the comparison prompt but in a simplified manner.
+    6. The rule should be general enough to work well on the entire dataset, not just these specific examples.
+    7. The rule should handle inconsistent casing by using string methods like .lower() when comparing string values.
+    8. The rule should err on the side of inclusivity - it's better to have false positives than false negatives.
+
+    Example structure of a one-line blocking rule:
+    "(condition1) or (condition2) or (condition3)"
+
+    Where conditions could be comparisons like:
+    "left['{left_keys[0]}'].lower() == right['{right_keys[0]}'].lower()"
+    "abs(len(left['{left_keys[0]}']) - len(right['{right_keys[0]}'])) <= 5"
+    "any(word in left['{left_keys[0]}'].lower() for word in right['{right_keys[0]}'].lower().split())"
+
+    If there's no clear rule that can be generated based on the given information, return the string "True" to ensure all pairs are compared.
+
+    Remember, the primary goal of the blocking rule is to safely reduce the number of comparisons by quickly identifying pairs that are definitely not matches, while keeping all potential matches for further evaluation.""",
+            }
+        ]
+
+        for attempt in range(self.agent_max_retries):
+            response = self.llm_client.generate(
+                messages,
+                "You are an expert in entity resolution and Python programming. Your task is to generate one efficient blocking rule based on the given sample comparisons and data structure.",
+                {
+                    "type": "object",
+                    "properties": {
+                        "blocking_rule": {
+                            "type": "string",
+                            "description": "One-line Python statement acting as a blocking rule",
+                        }
+                    },
+                    "required": ["blocking_rule"],
+                },
+            )
+
+            blocking_rule = response.choices[0].message.content
+            blocking_rule = json.loads(blocking_rule).get("blocking_rule")
+
+            if blocking_rule:
+                self.console.log("")
+
+                if blocking_rule.strip() == "True":
+                    self.console.log(
+                        "[yellow]No suitable blocking rule could be found. Proceeding without a blocking rule.[/yellow]"
+                    )
+                    return []
+
+                self.console.log(
+                    f"[bold]Generated blocking rule (Attempt {attempt + 1}):[/bold] {blocking_rule}"
+                )
+
+                # Test the blocking rule
+                filtered_pairs = self._test_blocking_rule_equijoin(
+                    left_data,
+                    right_data,
+                    left_keys,
+                    right_keys,
+                    blocking_rule,
+                    comparisons,
+                )
+
+                if not filtered_pairs:
+                    self.console.log(
+                        "[green]Blocking rule looks good! No known matches were filtered out.[/green]"
+                    )
+                    return [blocking_rule]
+                else:
+                    feedback = f"The previous rule incorrectly filtered out {len(filtered_pairs)} known matches. "
+                    feedback += (
+                        "Here are up to 3 examples of incorrectly filtered pairs:\n"
+                    )
+                    for i, j in filtered_pairs[:3]:
+                        feedback += f"Left: {json.dumps({key: left_data[i][key] for key in left_keys})}\n"
+                        feedback += f"Right: {json.dumps({key: right_data[j][key] for key in right_keys})}\n"
+                        feedback += "These pairs are known matches but were filtered out by the rule.\n"
+                    feedback += "Please generate a new rule that doesn't filter out these matches."
+
+                    messages.append({"role": "assistant", "content": blocking_rule})
+                    messages.append({"role": "user", "content": feedback})
+            else:
+                self.console.log("[yellow]No blocking rule generated.[/yellow]")
+                return []
+
+        self.console.log(
+            f"[yellow]Failed to generate a suitable blocking rule after {self.agent_max_retries} attempts. Proceeding without a blocking rule.[/yellow]"
+        )
+        return []
+
+    def _test_blocking_rule_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        left_keys: List[str],
+        right_keys: List[str],
+        blocking_rule: str,
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[Tuple[int, int]]:
+        def apply_blocking_rule(left, right):
+            try:
+                return eval(blocking_rule, {"left": left, "right": right})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        filtered_pairs = []
+
+        for i, j, is_match in comparisons:
+            if is_match:
+                left = left_data[i]
+                right = right_data[j]
+                if not apply_blocking_rule(left, right):
+                    filtered_pairs.append((i, j))
+
+        if filtered_pairs:
+            self.console.log(
+                f"[yellow italic]LLM Correction: The blocking rule incorrectly filtered out {len(filtered_pairs)} known positive matches.[/yellow italic]"
+            )
+            for i, j in filtered_pairs[:5]:  # Show up to 5 examples
+                left_dict = {key: left_data[i][key] for key in left_keys}
+                right_dict = {key: right_data[j][key] for key in right_keys}
+                self.console.log(
+                    f"  Incorrectly filtered pair - Left: {json.dumps(left_dict)}  Right: {json.dumps(right_dict)}"
+                )
+            if len(filtered_pairs) > 5:
+                self.console.log(
+                    f"  ... and {len(filtered_pairs) - 5} more incorrect pairs."
+                )
+
+        return filtered_pairs
+
+    def _verify_blocking_rule_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        blocking_rule: str,
+        left_keys: List[str],
+        right_keys: List[str],
+        comparison_results: List[Tuple[int, int, bool]],
+    ) -> Tuple[List[Tuple[int, int]], float]:
+        def apply_blocking_rule(left, right):
+            try:
+                return eval(blocking_rule, {"left": left, "right": right})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        false_negatives = []
+        total_pairs = 0
+        blocked_pairs = 0
+
+        for i, j, is_match in comparison_results:
+            total_pairs += 1
+            left = left_data[i]
+            right = right_data[j]
+            if apply_blocking_rule(left, right):
+                blocked_pairs += 1
+                if is_match:
+                    false_negatives.append((i, j))
+
+        rule_selectivity = blocked_pairs / total_pairs if total_pairs > 0 else 0
+
+        return false_negatives, rule_selectivity
+
+    def _update_config_equijoin(
+        self,
+        threshold: float,
+        left_keys: List[str],
+        right_keys: List[str],
+        blocking_rules: List[str],
+    ) -> Dict[str, Any]:
+        optimized_config = self.op_config.copy()
+        optimized_config["blocking_keys"] = {
+            "left": left_keys,
+            "right": right_keys,
+        }
+        optimized_config["blocking_threshold"] = threshold
+        if blocking_rules:
+            optimized_config["blocking_conditions"] = blocking_rules
+        if "embedding_model" not in optimized_config:
+            optimized_config["embedding_model"] = "text-embedding-3-small"
+        return optimized_config
+
+    def _verify_blocking_rule(
+        self,
+        input_data: List[Dict[str, Any]],
+        blocking_rule: str,
+        blocking_keys: List[str],
+        comparison_results: List[Tuple[int, int, bool]],
+    ) -> Tuple[List[Tuple[int, int]], float]:
+        def apply_blocking_rule(item1, item2):
+            try:
+                return eval(blocking_rule, {"input1": item1, "input2": item2})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        false_negatives = []
+        total_pairs = 0
+        blocked_pairs = 0
+
+        for i, j, is_match in comparison_results:
+            total_pairs += 1
+            item1 = {k: input_data[i][k] for k in blocking_keys if k in input_data[i]}
+            item2 = {k: input_data[j][k] for k in blocking_keys if k in input_data[j]}
+
+            if apply_blocking_rule(item1, item2):
+                blocked_pairs += 1
+                if is_match:
+                    false_negatives.append((i, j))
+
+        rule_selectivity = blocked_pairs / total_pairs if total_pairs > 0 else 0
+
+        return false_negatives, rule_selectivity
+
+    def _update_config(
+        self, threshold: float, blocking_keys: List[str], blocking_rules: List[str]
+    ) -> Dict[str, Any]:
+        optimized_config = self.op_config.copy()
+        optimized_config["blocking_keys"] = blocking_keys
+        optimized_config["blocking_threshold"] = threshold
+        if blocking_rules:
+            optimized_config["blocking_conditions"] = blocking_rules
+        if "embedding_model" not in optimized_config:
+            optimized_config["embedding_model"] = "text-embedding-3-small"
+        return optimized_config
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/assets/_mkdocstrings.css b/assets/_mkdocstrings.css new file mode 100644 index 00000000..85449ec7 --- /dev/null +++ b/assets/_mkdocstrings.css @@ -0,0 +1,119 @@ + +/* Avoid breaking parameter names, etc. in table cells. */ +.doc-contents td code { + word-break: normal !important; +} + +/* No line break before first paragraph of descriptions. */ +.doc-md-description, +.doc-md-description>p:first-child { + display: inline; +} + +/* Max width for docstring sections tables. */ +.doc .md-typeset__table, +.doc .md-typeset__table table { + display: table !important; + width: 100%; +} + +.doc .md-typeset__table tr { + display: table-row; +} + +/* Defaults in Spacy table style. */ +.doc-param-default { + float: right; +} + +/* Backward-compatibility: docstring section titles in bold. */ +.doc-section-title { + font-weight: bold; +} + +/* Symbols in Navigation and ToC. */ +:root, +[data-md-color-scheme="default"] { + --doc-symbol-attribute-fg-color: #953800; + --doc-symbol-function-fg-color: #8250df; + --doc-symbol-method-fg-color: #8250df; + --doc-symbol-class-fg-color: #0550ae; + --doc-symbol-module-fg-color: #5cad0f; + + --doc-symbol-attribute-bg-color: #9538001a; + --doc-symbol-function-bg-color: #8250df1a; + --doc-symbol-method-bg-color: #8250df1a; + --doc-symbol-class-bg-color: #0550ae1a; + --doc-symbol-module-bg-color: #5cad0f1a; +} + +[data-md-color-scheme="slate"] { + --doc-symbol-attribute-fg-color: #ffa657; + --doc-symbol-function-fg-color: #d2a8ff; + --doc-symbol-method-fg-color: #d2a8ff; + --doc-symbol-class-fg-color: #79c0ff; + --doc-symbol-module-fg-color: #baff79; + + --doc-symbol-attribute-bg-color: #ffa6571a; + --doc-symbol-function-bg-color: #d2a8ff1a; + --doc-symbol-method-bg-color: #d2a8ff1a; + --doc-symbol-class-bg-color: #79c0ff1a; + --doc-symbol-module-bg-color: #baff791a; +} + +code.doc-symbol { + border-radius: .1rem; + font-size: .85em; + padding: 0 .3em; + font-weight: bold; +} + 
+code.doc-symbol-attribute { + color: var(--doc-symbol-attribute-fg-color); + background-color: var(--doc-symbol-attribute-bg-color); +} + +code.doc-symbol-attribute::after { + content: "attr"; +} + +code.doc-symbol-function { + color: var(--doc-symbol-function-fg-color); + background-color: var(--doc-symbol-function-bg-color); +} + +code.doc-symbol-function::after { + content: "func"; +} + +code.doc-symbol-method { + color: var(--doc-symbol-method-fg-color); + background-color: var(--doc-symbol-method-bg-color); +} + +code.doc-symbol-method::after { + content: "meth"; +} + +code.doc-symbol-class { + color: var(--doc-symbol-class-fg-color); + background-color: var(--doc-symbol-class-bg-color); +} + +code.doc-symbol-class::after { + content: "class"; +} + +code.doc-symbol-module { + color: var(--doc-symbol-module-fg-color); + background-color: var(--doc-symbol-module-bg-color); +} + +code.doc-symbol-module::after { + content: "mod"; +} + +.doc-signature .autorefs { + color: inherit; + border-bottom: 1px dotted currentcolor; +} diff --git a/assets/docetl-favicon-color.png b/assets/docetl-favicon-color.png new file mode 100644 index 00000000..51fe742f Binary files /dev/null and b/assets/docetl-favicon-color.png differ diff --git a/assets/headerdiagram.png b/assets/headerdiagram.png new file mode 100644 index 00000000..6f1e3f52 Binary files /dev/null and b/assets/headerdiagram.png differ diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.56dfad97.min.js b/assets/javascripts/bundle.56dfad97.min.js new file mode 100644 index 00000000..1df62cd7 --- /dev/null +++ b/assets/javascripts/bundle.56dfad97.min.js @@ -0,0 +1,16 @@ +"use strict";(()=>{var Fi=Object.create;var gr=Object.defineProperty;var Wi=Object.getOwnPropertyDescriptor;var 
Ui=Object.getOwnPropertyNames,Vt=Object.getOwnPropertySymbols,Di=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,io=Object.prototype.propertyIsEnumerable;var no=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,$=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&no(e,r,t[r]);if(Vt)for(var r of Vt(t))io.call(t,r)&&no(e,r,t[r]);return e};var ao=(e,t)=>{var r={};for(var o in e)yr.call(e,o)&&t.indexOf(o)<0&&(r[o]=e[o]);if(e!=null&&Vt)for(var o of Vt(e))t.indexOf(o)<0&&io.call(e,o)&&(r[o]=e[o]);return r};var xr=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Vi=(e,t,r,o)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of Ui(t))!yr.call(e,n)&&n!==r&&gr(e,n,{get:()=>t[n],enumerable:!(o=Wi(t,n))||o.enumerable});return e};var Lt=(e,t,r)=>(r=e!=null?Fi(Di(e)):{},Vi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var so=(e,t,r)=>new Promise((o,n)=>{var i=p=>{try{s(r.next(p))}catch(c){n(c)}},a=p=>{try{s(r.throw(p))}catch(c){n(c)}},s=p=>p.done?o(p.value):Promise.resolve(p.value).then(i,a);s((r=r.apply(e,t)).next())});var po=xr((Er,co)=>{(function(e,t){typeof Er=="object"&&typeof co!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(Er,function(){"use strict";function e(r){var o=!0,n=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(k){return!!(k&&k!==document&&k.nodeName!=="HTML"&&k.nodeName!=="BODY"&&"classList"in k&&"contains"in k.classList)}function p(k){var ft=k.type,qe=k.tagName;return!!(qe==="INPUT"&&a[ft]&&!k.readOnly||qe==="TEXTAREA"&&!k.readOnly||k.isContentEditable)}function c(k){k.classList.contains("focus-visible")||(k.classList.add("focus-visible"),k.setAttribute("data-focus-visible-added",""))}function l(k){k.hasAttribute("data-focus-visible-added")&&(k.classList.remove("focus-visible"),k.removeAttribute("data-focus-visible-added"))}function 
f(k){k.metaKey||k.altKey||k.ctrlKey||(s(r.activeElement)&&c(r.activeElement),o=!0)}function u(k){o=!1}function d(k){s(k.target)&&(o||p(k.target))&&c(k.target)}function y(k){s(k.target)&&(k.target.classList.contains("focus-visible")||k.target.hasAttribute("data-focus-visible-added"))&&(n=!0,window.clearTimeout(i),i=window.setTimeout(function(){n=!1},100),l(k.target))}function M(k){document.visibilityState==="hidden"&&(n&&(o=!0),X())}function X(){document.addEventListener("mousemove",J),document.addEventListener("mousedown",J),document.addEventListener("mouseup",J),document.addEventListener("pointermove",J),document.addEventListener("pointerdown",J),document.addEventListener("pointerup",J),document.addEventListener("touchmove",J),document.addEventListener("touchstart",J),document.addEventListener("touchend",J)}function te(){document.removeEventListener("mousemove",J),document.removeEventListener("mousedown",J),document.removeEventListener("mouseup",J),document.removeEventListener("pointermove",J),document.removeEventListener("pointerdown",J),document.removeEventListener("pointerup",J),document.removeEventListener("touchmove",J),document.removeEventListener("touchstart",J),document.removeEventListener("touchend",J)}function J(k){k.target.nodeName&&k.target.nodeName.toLowerCase()==="html"||(o=!1,te())}document.addEventListener("keydown",f,!0),document.addEventListener("mousedown",u,!0),document.addEventListener("pointerdown",u,!0),document.addEventListener("touchstart",u,!0),document.addEventListener("visibilitychange",M,!0),X(),r.addEventListener("focus",d,!0),r.addEventListener("blur",y,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new 
CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var qr=xr((ly,Sn)=>{"use strict";/*! + * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var ka=/["'&<>]/;Sn.exports=Ha;function Ha(e){var t=""+e,r=ka.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i{/*! + * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof It=="object"&&typeof Yr=="object"?Yr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof It=="object"?It.ClipboardJS=r():t.ClipboardJS=r()})(It,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return ji}});var a=i(279),s=i.n(a),p=i(370),c=i.n(p),l=i(817),f=i.n(l);function u(V){try{return document.execCommand(V)}catch(A){return!1}}var d=function(A){var L=f()(A);return u("cut"),L},y=d;function M(V){var A=document.documentElement.getAttribute("dir")==="rtl",L=document.createElement("textarea");L.style.fontSize="12pt",L.style.border="0",L.style.padding="0",L.style.margin="0",L.style.position="absolute",L.style[A?"right":"left"]="-9999px";var F=window.pageYOffset||document.documentElement.scrollTop;return L.style.top="".concat(F,"px"),L.setAttribute("readonly",""),L.value=V,L}var X=function(A,L){var F=M(A);L.container.appendChild(F);var D=f()(F);return u("copy"),F.remove(),D},te=function(A){var L=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},F="";return typeof A=="string"?F=X(A,L):A instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(A==null?void 0:A.type)?F=X(A.value,L):(F=f()(A),u("copy")),F},J=te;function k(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof 
Symbol.iterator=="symbol"?k=function(L){return typeof L}:k=function(L){return L&&typeof Symbol=="function"&&L.constructor===Symbol&&L!==Symbol.prototype?"symbol":typeof L},k(V)}var ft=function(){var A=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},L=A.action,F=L===void 0?"copy":L,D=A.container,Y=A.target,$e=A.text;if(F!=="copy"&&F!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Y!==void 0)if(Y&&k(Y)==="object"&&Y.nodeType===1){if(F==="copy"&&Y.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if(F==="cut"&&(Y.hasAttribute("readonly")||Y.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if($e)return J($e,{container:D});if(Y)return F==="cut"?y(Y):J(Y,{container:D})},qe=ft;function Fe(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Fe=function(L){return typeof L}:Fe=function(L){return L&&typeof Symbol=="function"&&L.constructor===Symbol&&L!==Symbol.prototype?"symbol":typeof L},Fe(V)}function Ai(V,A){if(!(V instanceof A))throw new TypeError("Cannot call a class as a function")}function oo(V,A){for(var L=0;L0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof D.action=="function"?D.action:this.defaultAction,this.target=typeof D.target=="function"?D.target:this.defaultTarget,this.text=typeof D.text=="function"?D.text:this.defaultText,this.container=Fe(D.container)==="object"?D.container:document.body}},{key:"listenClick",value:function(D){var Y=this;this.listener=c()(D,"click",function($e){return Y.onClick($e)})}},{key:"onClick",value:function(D){var 
Y=D.delegateTarget||D.currentTarget,$e=this.action(Y)||"copy",Dt=qe({action:$e,container:this.container,target:this.target(Y),text:this.text(Y)});this.emit(Dt?"success":"error",{action:$e,text:Dt,trigger:Y,clearSelection:function(){Y&&Y.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(D){return vr("action",D)}},{key:"defaultTarget",value:function(D){var Y=vr("target",D);if(Y)return document.querySelector(Y)}},{key:"defaultText",value:function(D){return vr("text",D)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(D){var Y=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return J(D,Y)}},{key:"cut",value:function(D){return y(D)}},{key:"isSupported",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Y=typeof D=="string"?[D]:D,$e=!!document.queryCommandSupported;return Y.forEach(function(Dt){$e=$e&&!!document.queryCommandSupported(Dt)}),$e}}]),L}(s()),ji=Ii},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,p){for(;s&&s.nodeType!==n;){if(typeof s.matches=="function"&&s.matches(p))return s;s=s.parentNode}}o.exports=a},438:function(o,n,i){var a=i(828);function s(l,f,u,d,y){var M=c.apply(this,arguments);return l.addEventListener(u,M,y),{destroy:function(){l.removeEventListener(u,M,y)}}}function p(l,f,u,d,y){return typeof l.addEventListener=="function"?s.apply(null,arguments):typeof u=="function"?s.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(M){return s(M,f,u,d,y)}))}function c(l,f,u,d){return function(y){y.delegateTarget=a(y.target,f),y.delegateTarget&&d.call(l,y)}}o.exports=p},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof 
HTMLElement&&i.nodeType===1},n.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(o,n,i){var a=i(879),s=i(438);function p(u,d,y){if(!u&&!d&&!y)throw new Error("Missing required arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(y))throw new TypeError("Third argument must be a Function");if(a.node(u))return c(u,d,y);if(a.nodeList(u))return l(u,d,y);if(a.string(u))return f(u,d,y);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(u,d,y){return u.addEventListener(d,y),{destroy:function(){u.removeEventListener(d,y)}}}function l(u,d,y){return Array.prototype.forEach.call(u,function(M){M.addEventListener(d,y)}),{destroy:function(){Array.prototype.forEach.call(u,function(M){M.removeEventListener(d,y)})}}}function f(u,d,y){return s(document.body,u,d,y)}o.exports=p},817:function(o){function n(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var p=window.getSelection(),c=document.createRange();c.selectNodeContents(i),p.removeAllRanges(),p.addRange(c),a=p.toString()}return a}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,a,s){var p=this.e||(this.e={});return(p[i]||(p[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var p=this;function c(){p.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var 
a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),p=0,c=s.length;for(p;p0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],a;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(s){a={error:s}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(a)throw a.error}}return i}function q(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o1||p(d,M)})},y&&(n[d]=y(n[d])))}function p(d,y){try{c(o[d](y))}catch(M){u(i[0][3],M)}}function c(d){d.value instanceof nt?Promise.resolve(d.value.v).then(l,f):u(i[0][2],d)}function l(d){p("next",d)}function f(d){p("throw",d)}function u(d,y){d(y),i.shift(),i.length&&p(i[0][0],i[0][1])}}function fo(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof he=="function"?he(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(a){return new Promise(function(s,p){a=e[i](a),n(s,p,a.done,a.value)})}}function n(i,a,s,p){Promise.resolve(p).then(function(c){i({value:c,done:s})},a)}}function H(e){return typeof e=="function"}function ut(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var zt=ut(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(o,n){return n+1+") "+o.toString()}).join(` + `):"",this.name="UnsubscriptionError",this.errors=r}});function Qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var We=function(){function 
e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=he(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(M){t={error:M}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var l=this.initialTeardown;if(H(l))try{l()}catch(M){i=M instanceof zt?M.errors:[M]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=he(f),d=u.next();!d.done;d=u.next()){var y=d.value;try{uo(y)}catch(M){i=i!=null?i:[],M instanceof zt?i=q(q([],N(i)),N(M.errors)):i.push(M)}}}catch(M){o={error:M}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)uo(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=We.EMPTY;function qt(e){return e instanceof We||e&&"closed"in e&&H(e.remove)&&H(e.add)&&H(e.unsubscribe)}function uo(e){H(e)?e():e.unsubscribe()}var Pe={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var dt={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return 
this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new We(function(){o.currentObservers=null,Qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new j;return r.source=this,r},t.create=function(r,o){return new wo(r,o)},t}(j);var wo=function(e){oe(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){oe(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var At={now:function(){return(At.delegate||Date).now()},delegate:void 0};var Ct=function(e){oe(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=At);var 
i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(gt);var Oo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(yt);var kr=new Oo(So);var Mo=function(e){oe(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=vt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(vt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(gt);var Lo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do 
if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(yt);var me=new Lo(Mo);var S=new j(function(e){return e.complete()});function Yt(e){return e&&H(e.schedule)}function Hr(e){return e[e.length-1]}function Xe(e){return H(Hr(e))?e.pop():void 0}function ke(e){return Yt(Hr(e))?e.pop():void 0}function Bt(e,t){return typeof Hr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return H(e==null?void 0:e.then)}function Jt(e){return H(e[bt])}function Xt(e){return Symbol.asyncIterator&&H(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Ji(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Ji();function tr(e){return H(e==null?void 0:e[er])}function rr(e){return mo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return H(e==null?void 0:e.getReader)}function W(e){if(e instanceof j)return e;if(e!=null){if(Jt(e))return Xi(e);if(xt(e))return Zi(e);if(Gt(e))return ea(e);if(Xt(e))return _o(e);if(tr(e))return ta(e);if(or(e))return ra(e)}throw Zt(e)}function Xi(e){return new j(function(t){var r=e[bt]();if(H(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function Zi(e){return new 
j(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?De(t):qo(function(){return new ir}))}}function jr(e){return e<=0?function(){return S}:E(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,d=0,y=!1,M=!1,X=function(){f==null||f.unsubscribe(),f=void 0},te=function(){X(),l=u=void 0,y=M=!1},J=function(){var k=l;te(),k==null||k.unsubscribe()};return E(function(k,ft){d++,!M&&!y&&X();var qe=u=u!=null?u:r();ft.add(function(){d--,d===0&&!M&&!y&&(f=Wr(J,p))}),qe.subscribe(ft),!l&&d>0&&(l=new at({next:function(Fe){return qe.next(Fe)},error:function(Fe){M=!0,X(),f=Wr(te,n,Fe),qe.error(Fe)},complete:function(){y=!0,X(),f=Wr(te,a),qe.complete()}}),W(k).subscribe(l))})(c)}}function Wr(e,t){for(var r=[],o=2;oe.next(document)),e}function P(e,t=document){return Array.from(t.querySelectorAll(e))}function R(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Ie(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var xa=O(h(document.body,"focusin"),h(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Ie()||document.body),G(1));function et(e){return xa.pipe(m(t=>e.contains(t)),K())}function $t(e,t){return C(()=>O(h(e,"mouseenter").pipe(m(()=>!0)),h(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Me(+!r*t)):le,Q(e.matches(":hover"))))}function Go(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Go(e,r)}function 
x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Go(o,n);return o}function sr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function Tt(e){let t=x("script",{src:e});return C(()=>(document.head.appendChild(t),O(h(t,"load"),h(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),_(()=>document.head.removeChild(t)),Te(1))))}var Jo=new g,Ea=C(()=>typeof ResizeObserver=="undefined"?Tt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new ResizeObserver(e=>e.forEach(t=>Jo.next(t)))),v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return Ea.pipe(w(r=>r.observe(t)),v(r=>Jo.pipe(b(o=>o.target===t),_(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function St(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Xo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ve(e){return{x:e.offsetLeft,y:e.offsetTop}}function Zo(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function en(e){return O(h(window,"load"),h(window,"resize")).pipe(Le(0,me),m(()=>Ve(e)),Q(Ve(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function Ne(e){return O(h(e,"scroll"),h(window,"scroll"),h(window,"resize")).pipe(Le(0,me),m(()=>pr(e)),Q(pr(e)))}var tn=new g,wa=C(()=>I(new IntersectionObserver(e=>{for(let t of 
e)tn.next(t)},{threshold:0}))).pipe(v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function tt(e){return wa.pipe(w(t=>t.observe(e)),v(t=>tn.pipe(b(({target:r})=>r===e),_(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function rn(e,t=16){return Ne(e).pipe(m(({y:r})=>{let o=ce(e),n=St(e);return r>=n.height-o.height-t}),K())}var lr={drawer:R("[data-md-toggle=drawer]"),search:R("[data-md-toggle=search]")};function on(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function ze(e){let t=lr[e];return h(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function Ta(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Sa(){return O(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function nn(){let e=h(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:on("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Ie();if(typeof o!="undefined")return!Ta(o,r)}return!0}),pe());return Sa().pipe(v(t=>t?S:e))}function ye(){return new URL(location.href)}function lt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function an(){return new g}function sn(){return location.hash.slice(1)}function cn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Oa(e){return O(h(window,"hashchange"),e).pipe(m(sn),Q(sn()),b(t=>t.length>0),G(1))}function pn(e){return Oa(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function Pt(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function ln(){let e=matchMedia("print");return 
O(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():S))}function zr(e,t){return new j(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function je(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function mn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function fn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function un(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function dn(){return O(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(un),Q(un()))}function hn(){return{width:innerWidth,height:innerHeight}}function bn(){return h(window,"resize",{passive:!0}).pipe(m(hn),Q(hn()))}function vn(){return z([dn(),bn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(ee("size")),n=z([o,r]).pipe(m(()=>Ve(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function Ma(e){return h(e,"message",t=>t.data)}function La(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function gn(e,t=new Worker(e)){let r=Ma(t),o=La(t),n=new g;n.subscribe(o);let i=o.pipe(Z(),ie(!0));return n.pipe(Z(),Re(r.pipe(U(i))),pe())}var 
_a=R("#__config"),Ot=JSON.parse(_a.textContent);Ot.base=`${new URL(Ot.base,ye())}`;function xe(){return Ot}function B(e){return Ot.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?Ot.translations[e].replace("#",t.toString()):Ot.translations[e]}function Se(e,t=document){return R(`[data-md-component=${e}]`,t)}function ae(e,t=document){return P(`[data-md-component=${e}]`,t)}function Aa(e){let t=R(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>R(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function yn(e){if(!B("announce.dismiss")||!e.childElementCount)return S;if(!e.hidden){let t=R(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),Aa(e).pipe(w(r=>t.next(r)),_(()=>t.complete()),m(r=>$({ref:e},r)))})}function Ca(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function xn(e,t){let r=new g;return r.subscribe(({hidden:o})=>{e.hidden=o}),Ca(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))}function Rt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function En(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function wn(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function Tn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}var 
On=Lt(qr());function Qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,(0,On.default)(c))," "],[]).slice(0,-1),i=xe(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=xe();return x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)}),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Mn(e){let t=e[0].score,r=[...e],o=xe(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.scoreQr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>Qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function Ln(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Kr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function _n(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function $a(e){var o;let t=xe(),r=new URL(`../${e.version}/`,t.base);return x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 
0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function An(e,t){var o;let r=xe();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map($a)))}var Pa=0;function Ra(e){let t=z([et(e),$t(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Xo(e)).pipe(ne(Ne),pt(1),He(t),m(()=>Zo(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function Ia(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Pa++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(Z(),ie(!1)).subscribe(a);let s=a.pipe(Ht(c=>Me(+!c*250,kr)),K(),v(c=>c?r:S),w(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>$t(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),re(s,o),m(([c,l,{size:f}])=>{let u=e.getBoundingClientRect(),d=u.width/2;if(l.role==="tooltip")return{x:d,y:8+u.height};if(u.y>=f.height/2){let{height:y}=ce(l);return{x:d,y:-16-y}}else return{x:d,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),re(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(R(":scope > 
*",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),ve(me),re(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),Ra(e).pipe(w(c=>i.next(c)),_(()=>i.complete()),m(c=>$({ref:e},c)))})}function mt(e,{viewport$:t},r=document.body){return Ia(e,{content$:new j(o=>{let n=e.title,i=En(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function ja(e,t){let r=C(()=>z([en(e),Ne(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function Cn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(Z(),ie(!0));return 
i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(U(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),O(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Le(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(U(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),h(n,"mousedown").pipe(U(a),re(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Ie())==null||c.blur()}}),r.pipe(U(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),ja(e,t).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function Fa(e){return e.tagName==="CODE"?P(".c, .c1, .cm",e):[e]}function Wa(e){let t=[];for(let r of Fa(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function kn(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Wa(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > 
li:nth-child(${p})`,e)&&(a.set(p,wn(p,i)),s.replaceWith(a.get(p)))}return a.size===0?S:C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=[];for(let[l,f]of a)c.push([R(".md-typeset",f),R(`:scope > li:nth-child(${l})`,e)]);return o.pipe(U(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?kn(f,u):kn(u,f)}),O(...[...a].map(([,l])=>Cn(l,t,{target$:r}))).pipe(_(()=>s.complete()),pe())})}function Hn(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return Hn(t)}}function $n(e,t){return C(()=>{let r=Hn(e);return typeof r!="undefined"?fr(r,e,t):S})}var Pn=Lt(Br());var Ua=0;function Rn(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return Rn(t)}}function Da(e){return ge(e).pipe(m(({width:t})=>({scrollable:St(e).width>t})),ee("scrollable"))}function In(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(jr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Pn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Ua++}`;let l=Tn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(mt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=Rn(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let l=fr(c,e,t);a.push(ge(s).pipe(U(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:S)))}}return P(":scope > span[id]",e).length&&e.classList.add("md-code__content"),Da(e).pipe(w(c=>n.next(c)),_(()=>n.complete()),m(c=>$({ref:e},c)),Re(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function Va(e,{target$:t,print$:r}){let o=!0;return 
O(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function jn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),Va(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}var Fn=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel p,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel p{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node 
.divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs 
#statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead 
path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Gr,za=0;function qa(){return typeof mermaid=="undefined"||mermaid instanceof Element?Tt("https://unpkg.com/mermaid@11/dist/mermaid.min.js"):I(void 0)}function Wn(e){return e.classList.remove("mermaid"),Gr||(Gr=qa().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Fn,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Gr.subscribe(()=>so(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${za++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Gr.pipe(m(()=>({ref:e})))}var Un=x("table");function Dn(e){return e.replaceWith(Un),Un.replaceWith(_n(e)),I({ref:e})}function Qa(e){let t=e.find(r=>r.checked)||e[0];return O(...e.map(r=>h(r,"change").pipe(m(()=>R(`label[for="${r.id}"]`))))).pipe(Q(R(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Vn(e,{viewport$:t,target$:r}){let o=R(".tabbed-labels",e),n=P(":scope > input",e),i=Kr("prev");e.append(i);let a=Kr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(Z(),ie(!0));z([s,ge(e),tt(e)]).pipe(U(p),Le(1,me)).subscribe({next([{active:c},l]){let f=Ve(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let 
d=pr(o);(f.xd.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([Ne(o),ge(o)]).pipe(U(p)).subscribe(([c,l])=>{let f=St(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),O(h(i,"click").pipe(m(()=>-1)),h(a,"click").pipe(m(()=>1))).pipe(U(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(U(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of n){let l=R(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),h(l.firstElementChild,"click").pipe(U(p),b(f=>!(f.metaKey||f.ctrlKey)),w(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),re(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let y of P("[data-tabs]"))for(let M of P(":scope > input",y)){let X=R(`label[for="${M.id}"]`);if(X!==c&&X.innerText.trim()===f){X.setAttribute("data-md-switching",""),M.click();break}}window.scrollTo({top:e.offsetTop-u});let d=__md_get("__tabs")||[];__md_set("__tabs",[...new Set([f,...d])])}}),s.pipe(U(p)).subscribe(()=>{for(let c of P("audio, video",e))c.pause()}),Qa(n).pipe(w(c=>s.next(c)),_(()=>s.complete()),m(c=>$({ref:e},c)))}).pipe(Ke(se))}function Nn(e,{viewport$:t,target$:r,print$:o}){return O(...P(".annotate:not(.highlight)",e).map(n=>$n(n,{target$:r,print$:o})),...P("pre:not(.mermaid) > 
code",e).map(n=>In(n,{target$:r,print$:o})),...P("pre.mermaid",e).map(n=>Wn(n)),...P("table:not([class])",e).map(n=>Dn(n)),...P("details",e).map(n=>jn(n,{target$:r,print$:o})),...P("[data-tabs]",e).map(n=>Vn(n,{viewport$:t,target$:r})),...P("[title]",e).filter(()=>B("content.tooltips")).map(n=>mt(n,{viewport$:t})))}function Ka(e,{alert$:t}){return t.pipe(v(r=>O(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function zn(e,t){let r=R(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Ka(e,t).pipe(w(n=>o.next(n)),_(()=>o.complete()),m(n=>$({ref:e},n)))})}var Ya=0;function Ba(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?Ne(o):I({x:0,y:0}),i=O(et(t),$t(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ve(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function qn(e){let t=e.title;if(!t.length)return S;let r=`__tooltip_${Ya++}`,o=Rt(r,"inline"),n=R(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return 
i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),O(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Le(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Ba(o,e).pipe(w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))}).pipe(Ke(se))}function Ga({viewport$:e}){if(!B("header.autohide"))return I(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Be(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=ze("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Qn(e,t){return C(()=>z([ge(e),Ga(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function Kn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(Z(),ie(!0));o.pipe(ee("active"),He(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue(P("[title]",e)).pipe(b(()=>B("content.tooltips")),ne(a=>qn(a)));return r.subscribe(o),t.pipe(U(n),m(a=>$({ref:e},a)),Re(i.pipe(U(n))))})}function Ja(e,{viewport$:t,header$:r}){return mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),ee("active"))}function Yn(e,t){return C(()=>{let r=new 
g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?S:Ja(o,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))})}function Bn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),ee("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function Xa(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return I(...e).pipe(ne(o=>h(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Gn(e){let t=P("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=Pt("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let 
s=0;sa.key==="Enter"),re(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(ve(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),Xa(t).pipe(U(n.pipe(Ce(1))),ct(),w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))})}function Jn(e,{progress$:t}){return C(()=>{let r=new g;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),_(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Jr=Lt(Br());function Za(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Xn({alert$:e}){Jr.default.isSupported()&&new j(t=>{new Jr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||Za(R(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function Zn(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function es(e,t){let r=new Map;for(let o of P("url",e)){let n=R("loc",o),i=[Zn(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of P("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(Zn(new URL(s),t))}}return r}function ur(e){return fn(new URL("sitemap.xml",e)).pipe(m(t=>es(t,new URL(e))),de(()=>I(new Map)))}function ts(e,t){if(!(e.target instanceof Element))return S;let r=e.target.closest("a");if(r===null)return S;if(r.target||e.metaKey||e.ctrlKey)return S;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):S}function ei(e){let t=new Map;for(let r of P(":scope > *",e.head))t.set(r.outerHTML,r);return 
t}function ti(e){for(let t of P("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function rs(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=ei(document);for(let[o,n]of ei(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return Ue(P("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new j(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),S}),Z(),ie(document))}function ri({location$:e,viewport$:t,progress$:r}){let o=xe();if(location.protocol==="file:")return S;let n=ur(o.base);I(document).subscribe(ti);let i=h(document.body,"click").pipe(He(n),v(([p,c])=>ts(p,c)),pe()),a=h(window,"popstate").pipe(m(ye),pe());i.pipe(re(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),O(i,a).subscribe(e);let s=e.pipe(ee("pathname"),v(p=>mn(p,{progress$:r}).pipe(de(()=>(lt(p,!0),S)))),v(ti),v(rs),pe());return O(s.pipe(re(e,(p,c)=>c)),s.pipe(v(()=>e),ee("pathname"),v(()=>e),ee("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),w(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 
0:c.y)!=null?l:0):(history.scrollRestoration="auto",cn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(ee("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var oi=Lt(qr());function ni(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}${a}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,oi.default)(a).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function jt(e){return e.type===1}function dr(e){return e.type===3}function ii(e,t){let r=gn(e);return O(I(location.protocol!=="file:"),ze("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function ai({document$:e}){let t=xe(),r=je(new URL("../versions.json",t.base)).pipe(de(()=>S)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>h(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),re(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?S:(i.preventDefault(),I(p))}}return S}),v(i=>ur(new URL(i)).pipe(m(a=>{let p=ye().href.replace(t.base,i);return a.has(p.split("#")[0])?new URL(p):new URL(i)})))))).subscribe(n=>lt(n,!0)),z([r,o]).subscribe(([n,i])=>{R(".md-header__topic").appendChild(An(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var a;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let s=((a=t.version)==null?void 0:a.default)||"latest";Array.isArray(s)||(s=[s]);e:for(let 
p of s)for(let c of n.aliases.concat(n.version))if(new RegExp(p,"i").test(c)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let s of ae("outdated"))s.hidden=!1})}function is(e,{worker$:t}){let{searchParams:r}=ye();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),ze("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=ye();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=O(t.pipe(Ae(jt)),h(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function si(e,{worker$:t}){let r=new g,o=r.pipe(Z(),ie(!0));z([t.pipe(Ae(jt)),r],(i,a)=>a).pipe(ee("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(ee("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),h(e.form,"reset").pipe(U(o)).subscribe(()=>e.focus());let n=R("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),is(e,{worker$:t}).pipe(w(i=>r.next(i)),_(()=>r.complete()),m(i=>$({ref:e},i)),G(1))}function ci(e,{worker$:t,query$:r}){let o=new g,n=rn(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=R(":scope > :first-child",e),s=R(":scope > :last-child",e);ze("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(re(r),Ur(t.pipe(Ae(jt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(w(()=>s.innerHTML=""),v(({items:l})=>O(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Be(4),Vr(n),v(([f])=>f)))),m(Mn),pe());return p.subscribe(l=>s.appendChild(l)),p.pipe(ne(l=>{let f=fe("details",l);return typeof f=="undefined"?S:h(f,"toggle").pipe(U(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),_(()=>o.complete()),m(l=>$({ref:e},l)))}function 
as(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=ye();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function pi(e,t){let r=new g,o=r.pipe(Z(),ie(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(U(o)).subscribe(n=>n.preventDefault()),as(e,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))}function li(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=O(h(n,"keydown"),h(n,"focus")).pipe(ve(se),m(()=>n.value),K());return o.pipe(He(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g," ")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(w(s=>o.next(s)),_(()=>o.complete()),m(()=>({ref:e})))}function mi(e,{index$:t,keyboard$:r}){let o=xe();try{let n=ii(o.search,t),i=Se("search-query",e),a=Se("search-result",e);h(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Ie();switch(p.type){case"Enter":if(c===i){let l=new Map;for(let f of P(":first-child [href]",a)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...P(":not(details) > [href], summary, details[open] 
[href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Ie()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=si(i,{worker$:n});return O(s,ci(a,{worker$:n,query$:s})).pipe(Re(...ae("search-share",e).map(p=>pi(p,{query$:s})),...ae("search-suggest",e).map(p=>li(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ye}}function fi(e,{index$:t,location$:r}){return z([t,r.pipe(Q(ye()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>ni(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function ss(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Xr(e,o){var n=o,{header$:t}=n,r=ao(n,["header$"]);let i=R(".md-sidebar__scrollwrap",e),{y:a}=Ve(i);return C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=s.pipe(Le(0,me));return c.pipe(re(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of P(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2})}}}),ue(P("label[tabindex]",e)).pipe(ne(l=>h(l,"click").pipe(ve(se),m(()=>l),U(p)))).subscribe(l=>{let 
f=R(`[id="${l.htmlFor}"]`);R(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),ss(e,r).pipe(w(l=>s.next(l)),_(()=>s.complete()),m(l=>$({ref:e},l)))})}function ui(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return st(je(`${r}/releases/latest`).pipe(de(()=>S),m(o=>({version:o.tag_name})),De({})),je(r).pipe(de(()=>S),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return je(r).pipe(m(o=>({repositories:o.public_repos})),De({}))}}function di(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return st(je(`${r}/releases/permalink/latest`).pipe(de(()=>S),m(({tag_name:o})=>({version:o})),De({})),je(r).pipe(de(()=>S),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}function hi(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return ui(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return di(r,o)}return S}var cs;function ps(e){return cs||(cs=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return S}return hi(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>S),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function bi(e){let t=R(":scope > :last-child",e);return C(()=>{let r=new g;return r.subscribe(({facts:o})=>{t.appendChild(Ln(o)),t.classList.add("md-source__repository--active")}),ps(e).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function ls(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),ee("hidden"))}function vi(e,t){return C(()=>{let r=new g;return 
r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):ls(e,t)).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function ms(e,{viewport$:t,header$:r}){let o=new Map,n=P(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(ee("height"),m(({height:s})=>{let p=Se("main"),c=R(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(ee("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),He(i),v(([p,c])=>t.pipe(Fr(([l,f],{offset:{y:u},size:d})=>{let y=u+d.height>=Math.floor(s.height);for(;f.length;){let[,M]=f[0];if(M-c=u&&!y)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Be(2,1),m(([s,p])=>s.prev.length{let i=new g,a=i.pipe(Z(),ie(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let s=O(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),He(o.pipe(ve(se))),re(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2,behavior:c})}}})}return 
B("navigation.tracking")&&t.pipe(U(a),ee("offset"),_e(250),Ce(1),U(n.pipe(Ce(1))),ct({delay:250}),re(i)).subscribe(([,{prev:s}])=>{let p=ye(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),ms(e,{viewport$:t,header$:r}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function fs(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Be(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),U(o.pipe(Ce(1))),ie(!0),ct({delay:250}),m(a=>({hidden:a})))}function yi(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(U(a),ee("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),h(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),fs(e,{viewport$:t,main$:o,target$:n}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))}function xi({document$:e,viewport$:t}){e.pipe(v(()=>P(".md-ellipsis")),ne(r=>tt(r).pipe(U(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?mt(n,{viewport$:t}).pipe(U(e.pipe(Ce(1))),_(()=>n.removeAttribute("title"))):S})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>P(".md-status")),ne(r=>mt(r,{viewport$:t}))).subscribe()}function Ei({document$:e,tablet$:t}){e.pipe(v(()=>P(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ne(r=>h(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),re(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function us(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function 
wi({document$:e}){e.pipe(v(()=>P("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),b(us),ne(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function Ti({viewport$:e,tablet$:t}){z([ze("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),re(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function ds(){return location.protocol==="file:"?Tt(`${new URL("search/search_index.js",Zr.base)}`).pipe(m(()=>__index),G(1)):je(new URL("search/search_index.json",Zr.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Bo(),Wt=an(),Mt=pn(Wt),eo=nn(),Oe=vn(),hr=Pt("(min-width: 960px)"),Oi=Pt("(min-width: 1220px)"),Mi=ln(),Zr=xe(),Li=document.forms.namedItem("search")?ds():Ye,to=new g;Xn({alert$:to});var ro=new g;B("navigation.instant")&&ri({location$:Wt,viewport$:Oe,progress$:ro}).subscribe(ot);var 
Si;((Si=Zr.version)==null?void 0:Si.provider)==="mike"&&ai({document$:ot});O(Wt,Mt).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});eo.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=fe("link[rel=prev]");typeof t!="undefined"&<(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&<(r);break;case"Enter":let o=Ie();o instanceof HTMLLabelElement&&o.click()}});xi({viewport$:Oe,document$:ot});Ei({document$:ot,tablet$:hr});wi({document$:ot});Ti({viewport$:Oe,tablet$:hr});var rt=Qn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Bn(e,{viewport$:Oe,header$:rt})),G(1)),hs=O(...ae("consent").map(e=>xn(e,{target$:Mt})),...ae("dialog").map(e=>zn(e,{alert$:to})),...ae("header").map(e=>Kn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("palette").map(e=>Gn(e)),...ae("progress").map(e=>Jn(e,{progress$:ro})),...ae("search").map(e=>mi(e,{index$:Li,keyboard$:eo})),...ae("source").map(e=>bi(e))),bs=C(()=>O(...ae("announce").map(e=>yn(e)),...ae("content").map(e=>Nn(e,{viewport$:Oe,target$:Mt,print$:Mi})),...ae("content").map(e=>B("search.highlight")?fi(e,{index$:Li,location$:Wt}):S),...ae("header-title").map(e=>Yn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Oi,()=>Xr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Xr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>vi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>gi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Mt})),...ae("top").map(e=>yi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Mt})))),_i=ot.pipe(v(()=>bs),Re(hs),G(1));_i.subscribe();window.document$=ot;window.location$=Wt;window.target$=Mt;window.keyboard$=eo;window.viewport$=Oe;window.tablet$=hr;window.screen$=Oi;window.print$=Mi;window.alert$=to;window.progress$=ro;window.component$=_i;})(); +//# sourceMappingURL=bundle.56dfad97.min.js.map + diff --git a/assets/javascripts/bundle.56dfad97.min.js.map 
b/assets/javascripts/bundle.56dfad97.min.js.map new file mode 100644 index 00000000..eb83bdb3 --- /dev/null +++ b/assets/javascripts/bundle.56dfad97.min.js.map @@ -0,0 +1,7 @@ +{ + "version": 3, + "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/escape-html/index.js", "node_modules/clipboard/dist/clipboard.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/tslib/tslib.es6.mjs", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", "node_modules/rxjs/src/internal/BehaviorSubject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", 
"node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/QueueAction.ts", "node_modules/rxjs/src/internal/scheduler/QueueScheduler.ts", "node_modules/rxjs/src/internal/scheduler/queue.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", 
"node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", "node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", "node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", "node_modules/rxjs/src/internal/operators/debounce.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", 
"node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", "node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/takeLast.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", "src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/hover/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", 
"src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", "src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", "src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", "src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/tooltip2/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", 
"src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", "src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/tooltip/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", "src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", 
"src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", "src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", "src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/ellipsis/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"], + "sourcesContent": ["(function (global, factory) {\n typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n typeof define === 'function' && define.amd ? 
define(factory) :\n (factory());\n}(this, (function () { 'use strict';\n\n /**\n * Applies the :focus-visible polyfill at the given scope.\n * A scope in this case is either the top-level Document or a Shadow Root.\n *\n * @param {(Document|ShadowRoot)} scope\n * @see https://github.com/WICG/focus-visible\n */\n function applyFocusVisiblePolyfill(scope) {\n var hadKeyboardEvent = true;\n var hadFocusVisibleRecently = false;\n var hadFocusVisibleRecentlyTimeout = null;\n\n var inputTypesAllowlist = {\n text: true,\n search: true,\n url: true,\n tel: true,\n email: true,\n password: true,\n number: true,\n date: true,\n month: true,\n week: true,\n time: true,\n datetime: true,\n 'datetime-local': true\n };\n\n /**\n * Helper function for legacy browsers and iframes which sometimes focus\n * elements like document, body, and non-interactive SVG.\n * @param {Element} el\n */\n function isValidFocusTarget(el) {\n if (\n el &&\n el !== document &&\n el.nodeName !== 'HTML' &&\n el.nodeName !== 'BODY' &&\n 'classList' in el &&\n 'contains' in el.classList\n ) {\n return true;\n }\n return false;\n }\n\n /**\n * Computes whether the given element should automatically trigger the\n * `focus-visible` class being added, i.e. 
whether it should always match\n * `:focus-visible` when focused.\n * @param {Element} el\n * @return {boolean}\n */\n function focusTriggersKeyboardModality(el) {\n var type = el.type;\n var tagName = el.tagName;\n\n if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n return true;\n }\n\n if (tagName === 'TEXTAREA' && !el.readOnly) {\n return true;\n }\n\n if (el.isContentEditable) {\n return true;\n }\n\n return false;\n }\n\n /**\n * Add the `focus-visible` class to the given element if it was not added by\n * the author.\n * @param {Element} el\n */\n function addFocusVisibleClass(el) {\n if (el.classList.contains('focus-visible')) {\n return;\n }\n el.classList.add('focus-visible');\n el.setAttribute('data-focus-visible-added', '');\n }\n\n /**\n * Remove the `focus-visible` class from the given element if it was not\n * originally added by the author.\n * @param {Element} el\n */\n function removeFocusVisibleClass(el) {\n if (!el.hasAttribute('data-focus-visible-added')) {\n return;\n }\n el.classList.remove('focus-visible');\n el.removeAttribute('data-focus-visible-added');\n }\n\n /**\n * If the most recent user interaction was via the keyboard;\n * and the key press did not include a meta, alt/option, or control key;\n * then the modality is keyboard. 
Otherwise, the modality is not keyboard.\n * Apply `focus-visible` to any current active element and keep track\n * of our keyboard modality state with `hadKeyboardEvent`.\n * @param {KeyboardEvent} e\n */\n function onKeyDown(e) {\n if (e.metaKey || e.altKey || e.ctrlKey) {\n return;\n }\n\n if (isValidFocusTarget(scope.activeElement)) {\n addFocusVisibleClass(scope.activeElement);\n }\n\n hadKeyboardEvent = true;\n }\n\n /**\n * If at any point a user clicks with a pointing device, ensure that we change\n * the modality away from keyboard.\n * This avoids the situation where a user presses a key on an already focused\n * element, and then clicks on a different element, focusing it with a\n * pointing device, while we still think we're in keyboard modality.\n * @param {Event} e\n */\n function onPointerDown(e) {\n hadKeyboardEvent = false;\n }\n\n /**\n * On `focus`, add the `focus-visible` class to the target if:\n * - the target received focus as a result of keyboard navigation, or\n * - the event target is an element that will likely require interaction\n * via the keyboard (e.g. 
a text box)\n * @param {Event} e\n */\n function onFocus(e) {\n // Prevent IE from focusing the document or HTML element.\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n addFocusVisibleClass(e.target);\n }\n }\n\n /**\n * On `blur`, remove the `focus-visible` class from the target.\n * @param {Event} e\n */\n function onBlur(e) {\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (\n e.target.classList.contains('focus-visible') ||\n e.target.hasAttribute('data-focus-visible-added')\n ) {\n // To detect a tab/window switch, we look for a blur event followed\n // rapidly by a visibility change.\n // If we don't see a visibility change within 100ms, it's probably a\n // regular focus change.\n hadFocusVisibleRecently = true;\n window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n hadFocusVisibleRecently = false;\n }, 100);\n removeFocusVisibleClass(e.target);\n }\n }\n\n /**\n * If the user changes tabs, keep track of whether or not the previously\n * focused element had .focus-visible.\n * @param {Event} e\n */\n function onVisibilityChange(e) {\n if (document.visibilityState === 'hidden') {\n // If the tab becomes active again, the browser will handle calling focus\n // on the element (Safari actually calls it twice).\n // If this tab change caused a blur on an element with focus-visible,\n // re-apply the class when the user switches back to the tab.\n if (hadFocusVisibleRecently) {\n hadKeyboardEvent = true;\n }\n addInitialPointerMoveListeners();\n }\n }\n\n /**\n * Add a group of listeners to detect usage of any pointing devices.\n * These listeners will be added when the polyfill first loads, and anytime\n * the window is blurred, so that they are active when the window regains\n * focus.\n */\n function addInitialPointerMoveListeners() {\n document.addEventListener('mousemove', onInitialPointerMove);\n 
document.addEventListener('mousedown', onInitialPointerMove);\n document.addEventListener('mouseup', onInitialPointerMove);\n document.addEventListener('pointermove', onInitialPointerMove);\n document.addEventListener('pointerdown', onInitialPointerMove);\n document.addEventListener('pointerup', onInitialPointerMove);\n document.addEventListener('touchmove', onInitialPointerMove);\n document.addEventListener('touchstart', onInitialPointerMove);\n document.addEventListener('touchend', onInitialPointerMove);\n }\n\n function removeInitialPointerMoveListeners() {\n document.removeEventListener('mousemove', onInitialPointerMove);\n document.removeEventListener('mousedown', onInitialPointerMove);\n document.removeEventListener('mouseup', onInitialPointerMove);\n document.removeEventListener('pointermove', onInitialPointerMove);\n document.removeEventListener('pointerdown', onInitialPointerMove);\n document.removeEventListener('pointerup', onInitialPointerMove);\n document.removeEventListener('touchmove', onInitialPointerMove);\n document.removeEventListener('touchstart', onInitialPointerMove);\n document.removeEventListener('touchend', onInitialPointerMove);\n }\n\n /**\n * When the polfyill first loads, assume the user is in keyboard modality.\n * If any event is received from a pointing device (e.g. mouse, pointer,\n * touch), turn off keyboard modality.\n * This accounts for situations where focus enters the page from the URL bar.\n * @param {Event} e\n */\n function onInitialPointerMove(e) {\n // Work around a Safari quirk that fires a mousemove on whenever the\n // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n return;\n }\n\n hadKeyboardEvent = false;\n removeInitialPointerMoveListeners();\n }\n\n // For some kinds of state, we are interested in changes at the global scope\n // only. 
For example, global pointer input, global key presses and global\n // visibility change should affect the state at every scope:\n document.addEventListener('keydown', onKeyDown, true);\n document.addEventListener('mousedown', onPointerDown, true);\n document.addEventListener('pointerdown', onPointerDown, true);\n document.addEventListener('touchstart', onPointerDown, true);\n document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n addInitialPointerMoveListeners();\n\n // For focus and blur, we specifically care about state changes in the local\n // scope. This is because focus / blur events that originate from within a\n // shadow root are not re-dispatched from the host element if it was already\n // the active element in its own scope:\n scope.addEventListener('focus', onFocus, true);\n scope.addEventListener('blur', onBlur, true);\n\n // We detect that a node is a ShadowRoot by ensuring that it is a\n // DocumentFragment and also has a host property. This check covers native\n // implementation and polyfill implementation transparently. If we only cared\n // about the native implementation, we could just check if the scope was\n // an instance of a ShadowRoot.\n if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n // have a root element to add a class to. 
So, we add this attribute to the\n // host element instead:\n scope.host.setAttribute('data-js-focus-visible', '');\n } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n document.documentElement.classList.add('js-focus-visible');\n document.documentElement.setAttribute('data-js-focus-visible', '');\n }\n }\n\n // It is important to wrap all references to global window and document in\n // these checks to support server-side rendering use cases\n // @see https://github.com/WICG/focus-visible/issues/199\n if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n // Make the polyfill helper globally available. This can be used as a signal\n // to interested libraries that wish to coordinate with the polyfill for e.g.,\n // applying the polyfill to a shadow root:\n window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n // Notify interested libraries of the polyfill's presence, in case the\n // polyfill was loaded lazily:\n var event;\n\n try {\n event = new CustomEvent('focus-visible-polyfill-ready');\n } catch (error) {\n // IE11 does not support using CustomEvent as a constructor directly:\n event = document.createEvent('CustomEvent');\n event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n }\n\n window.dispatchEvent(event);\n }\n\n if (typeof document !== 'undefined') {\n // Apply the polyfill to the global document, so that no JavaScript\n // coordination is required to use the polyfill in the top-level document:\n applyFocusVisiblePolyfill(document);\n }\n\n})));\n", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param {string} string The string to 
escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n var str = '' + string;\n var match = matchHtmlRegExp.exec(str);\n\n if (!match) {\n return str;\n }\n\n var escape;\n var html = '';\n var index = 0;\n var lastIndex = 0;\n\n for (index = match.index; index < str.length; index++) {\n switch (str.charCodeAt(index)) {\n case 34: // \"\n escape = '"';\n break;\n case 38: // &\n escape = '&';\n break;\n case 39: // '\n escape = ''';\n break;\n case 60: // <\n escape = '<';\n break;\n case 62: // >\n escape = '>';\n break;\n default:\n continue;\n }\n\n if (lastIndex !== index) {\n html += str.substring(lastIndex, index);\n }\n\n lastIndex = index + 1;\n html += escape;\n }\n\n return lastIndex !== index\n ? html + str.substring(lastIndex, index)\n : html;\n}\n", "/*!\n * clipboard.js v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = 
/*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n try {\n return document.execCommand(type);\n } catch (err) {\n return false;\n }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n var selectedText = select_default()(target);\n command('cut');\n return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n fakeElement.style.fontSize = '12pt'; // Reset box model\n\n fakeElement.style.border = '0';\n fakeElement.style.padding = '0';\n fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n fakeElement.style.position = 'absolute';\n fakeElement.style[isRTL ? 
'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n fakeElement.style.top = \"\".concat(yPosition, \"px\");\n fakeElement.setAttribute('readonly', '');\n fakeElement.value = value;\n return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n var fakeElement = createFakeElement(value);\n options.container.appendChild(fakeElement);\n var selectedText = select_default()(fakeElement);\n command('copy');\n fakeElement.remove();\n return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n var selectedText = '';\n\n if (typeof target === 'string') {\n selectedText = fakeCopyAction(target, options);\n } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n // If input type doesn't support `setSelectionRange`. Simulate it. 
https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n selectedText = fakeCopyAction(target.value, options);\n } else {\n selectedText = select_default()(target);\n command('copy');\n }\n\n return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n // Defines base properties passed from constructor.\n var _options$action = options.action,\n action = _options$action === void 0 ? 'copy' : _options$action,\n container = options.container,\n target = options.target,\n text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n if (action !== 'copy' && action !== 'cut') {\n throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n } // Sets the `target` property using an element that will be have its content copied.\n\n\n if (target !== undefined) {\n if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n if (action === 'copy' && target.hasAttribute('disabled')) {\n throw new Error('Invalid \"target\" attribute. 
Please use \"readonly\" instead of \"disabled\" attribute');\n }\n\n if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n throw new Error('Invalid \"target\" attribute. You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n }\n } else {\n throw new Error('Invalid \"target\" value, use a valid Element');\n }\n } // Define selection strategy based on `text` property.\n\n\n if (text) {\n return actions_copy(text, {\n container: container\n });\n } // Defines which selection strategy based on `target` property.\n\n\n if (target) {\n return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n container: container\n });\n }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? 
\"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if 
(self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n var attribute = \"data-clipboard-\".concat(suffix);\n\n if (!element.hasAttribute(attribute)) {\n return;\n }\n\n return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n _inherits(Clipboard, _Emitter);\n\n var _super = _createSuper(Clipboard);\n\n /**\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n * @param {Object} options\n */\n function Clipboard(trigger, options) {\n var _this;\n\n _classCallCheck(this, Clipboard);\n\n _this = _super.call(this);\n\n _this.resolveOptions(options);\n\n _this.listenClick(trigger);\n\n return _this;\n }\n /**\n * Defines if attributes would be resolved using internal setter functions\n * or custom functions that were passed in the constructor.\n * @param {Object} options\n */\n\n\n _createClass(Clipboard, [{\n key: \"resolveOptions\",\n value: function resolveOptions() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {};\n this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n }\n /**\n * Adds a click event listener to the passed trigger.\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n */\n\n }, {\n key: \"listenClick\",\n value: function listenClick(trigger) {\n var _this2 = this;\n\n this.listener = listen_default()(trigger, 'click', function (e) {\n return _this2.onClick(e);\n });\n }\n /**\n * Defines a new `ClipboardAction` on each click event.\n * @param {Event} e\n */\n\n }, {\n key: \"onClick\",\n value: function onClick(e) {\n var trigger = e.delegateTarget || e.currentTarget;\n var action = this.action(trigger) || 'copy';\n var text = actions_default({\n action: action,\n container: this.container,\n target: this.target(trigger),\n text: this.text(trigger)\n }); // Fires an event based on the copy operation result.\n\n this.emit(text ? 
'success' : 'error', {\n action: action,\n text: text,\n trigger: trigger,\n clearSelection: function clearSelection() {\n if (trigger) {\n trigger.focus();\n }\n\n window.getSelection().removeAllRanges();\n }\n });\n }\n /**\n * Default `action` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultAction\",\n value: function defaultAction(trigger) {\n return getAttributeValue('action', trigger);\n }\n /**\n * Default `target` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultTarget\",\n value: function defaultTarget(trigger) {\n var selector = getAttributeValue('target', trigger);\n\n if (selector) {\n return document.querySelector(selector);\n }\n }\n /**\n * Allow fire programmatically a copy action\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @returns Text copied.\n */\n\n }, {\n key: \"defaultText\",\n\n /**\n * Default `text` lookup function.\n * @param {Element} trigger\n */\n value: function defaultText(trigger) {\n return getAttributeValue('text', trigger);\n }\n /**\n * Destroy lifecycle.\n */\n\n }, {\n key: \"destroy\",\n value: function destroy() {\n this.listener.destroy();\n }\n }], [{\n key: \"copy\",\n value: function copy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n return actions_copy(target, options);\n }\n /**\n * Allow fire programmatically a cut action\n * @param {String|HTMLElement} target\n * @returns Text cutted.\n */\n\n }, {\n key: \"cut\",\n value: function cut(target) {\n return actions_cut(target);\n }\n /**\n * Returns the support of the given action, or all actions if no action is\n * given.\n * @param {String} [action]\n */\n\n }, {\n key: \"isSupported\",\n value: function isSupported() {\n var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n var actions = typeof action === 'string' ? 
[action] : action;\n var support = !!document.queryCommandSupported;\n actions.forEach(function (action) {\n support = support && !!document.queryCommandSupported(action);\n });\n return support;\n }\n }]);\n\n return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n var proto = Element.prototype;\n\n proto.matches = proto.matchesSelector ||\n proto.mozMatchesSelector ||\n proto.msMatchesSelector ||\n proto.oMatchesSelector ||\n proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n if (typeof element.matches === 'function' &&\n element.matches(selector)) {\n return element;\n }\n element = element.parentNode;\n }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n var listenerFn = listener.apply(this, arguments);\n\n element.addEventListener(type, listenerFn, useCapture);\n\n return {\n destroy: function() {\n element.removeEventListener(type, listenerFn, useCapture);\n }\n }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} 
useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n // Handle the regular Element usage\n if (typeof elements.addEventListener === 'function') {\n return _delegate.apply(null, arguments);\n }\n\n // Handle Element-less usage, it defaults to global delegation\n if (typeof type === 'function') {\n // Use `document` as the first parameter, then apply arguments\n // This is a short way to .unshift `arguments` without running into deoptimizations\n return _delegate.bind(null, document).apply(null, arguments);\n }\n\n // Handle Selector-based usage\n if (typeof elements === 'string') {\n elements = document.querySelectorAll(elements);\n }\n\n // Handle Array-like based usage\n return Array.prototype.map.call(elements, function (element) {\n return _delegate(element, selector, type, callback, useCapture);\n });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n return function(e) {\n e.delegateTarget = closest(e.target, selector);\n\n if (e.delegateTarget) {\n callback.call(element, e);\n }\n }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n return value !== undefined\n && value instanceof HTMLElement\n && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return value !== undefined\n && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n && ('length' in value)\n && (value.length === 0 || 
exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n return typeof value === 'string'\n || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listen(target, type, callback) {\n if (!target && !type && !callback) {\n throw new Error('Missing required arguments');\n }\n\n if (!is.string(type)) {\n throw new TypeError('Second argument must be a String');\n }\n\n if (!is.fn(callback)) {\n throw new TypeError('Third argument must be a Function');\n }\n\n if (is.node(target)) {\n return listenNode(target, type, callback);\n }\n else if (is.nodeList(target)) {\n return listenNodeList(target, type, callback);\n }\n else if (is.string(target)) {\n return listenSelector(target, type, callback);\n }\n else {\n throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n node.addEventListener(type, callback);\n\n return {\n destroy: function() {\n node.removeEventListener(type, callback);\n }\n }\n}\n\n/**\n * Add an event listener 
to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.addEventListener(type, callback);\n });\n\n return {\n destroy: function() {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.removeEventListener(type, callback);\n });\n }\n }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n var selectedText;\n\n if (element.nodeName === 'SELECT') {\n element.focus();\n\n selectedText = element.value;\n }\n else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {\n var isReadOnly = element.hasAttribute('readonly');\n\n if (!isReadOnly) {\n element.setAttribute('readonly', '');\n }\n\n element.select();\n element.setSelectionRange(0, element.value.length);\n\n if (!isReadOnly) {\n element.removeAttribute('readonly');\n }\n\n selectedText = element.value;\n }\n else {\n if (element.hasAttribute('contenteditable')) {\n element.focus();\n }\n\n var selection = window.getSelection();\n var range = document.createRange();\n\n range.selectNodeContents(element);\n selection.removeAllRanges();\n selection.addRange(range);\n\n selectedText = selection.toString();\n }\n\n return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n // Keep this empty so it's easier to inherit from\n // (via https://github.com/lipsmack from 
https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n on: function (name, callback, ctx) {\n var e = this.e || (this.e = {});\n\n (e[name] || (e[name] = [])).push({\n fn: callback,\n ctx: ctx\n });\n\n return this;\n },\n\n once: function (name, callback, ctx) {\n var self = this;\n function listener () {\n self.off(name, listener);\n callback.apply(ctx, arguments);\n };\n\n listener._ = callback\n return this.on(name, listener, ctx);\n },\n\n emit: function (name) {\n var data = [].slice.call(arguments, 1);\n var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n var i = 0;\n var len = evtArr.length;\n\n for (i; i < len; i++) {\n evtArr[i].fn.apply(evtArr[i].ctx, data);\n }\n\n return this;\n },\n\n off: function (name, callback) {\n var e = this.e || (this.e = {});\n var evts = e[name];\n var liveEvents = [];\n\n if (evts && callback) {\n for (var i = 0, len = evts.length; i < len; i++) {\n if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n liveEvents.push(evts[i]);\n }\n }\n\n // Remove event from queue to prevent memory leak\n // Suggested by https://github.com/lazd\n // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n (liveEvents.length)\n ? 
e[name] = liveEvents\n : delete e[name];\n\n return this;\n }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) 
{\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*\n * Copyright (c) 2016-2024 Martin Donath \n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n EMPTY,\n NEVER,\n Observable,\n Subject,\n defer,\n delay,\n filter,\n map,\n merge,\n mergeWith,\n shareReplay,\n switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n at,\n getActiveElement,\n getOptionalElement,\n requestJSON,\n setLocation,\n setToggle,\n watchDocument,\n watchKeyboard,\n watchLocation,\n watchLocationTarget,\n watchMedia,\n watchPrint,\n watchScript,\n watchViewport\n} from \"./browser\"\nimport {\n getComponentElement,\n getComponentElements,\n mountAnnounce,\n mountBackToTop,\n mountConsent,\n mountContent,\n mountDialog,\n mountHeader,\n mountHeaderTitle,\n mountPalette,\n mountProgress,\n mountSearch,\n mountSearchHiglight,\n mountSidebar,\n mountSource,\n mountTableOfContents,\n mountTabs,\n watchHeader,\n watchMain\n} from \"./components\"\nimport {\n SearchIndex,\n setupClipboardJS,\n setupInstantNavigation,\n setupVersionSelector\n} from \"./integrations\"\nimport {\n patchEllipsis,\n patchIndeterminate,\n patchScrollfix,\n patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable {\n if (location.protocol === \"file:\") {\n return watchScript(\n `${new URL(\"search/search_index.js\", config.base)}`\n )\n .pipe(\n // @ts-ignore - @todo fix typings\n map(() => __index),\n shareReplay(1)\n )\n } else {\n return requestJSON(\n new URL(\"search/search_index.json\", config.base)\n 
)\n }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$ = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$ = watchMedia(\"(min-width: 960px)\")\nconst screen$ = watchMedia(\"(min-width: 1220px)\")\nconst print$ = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n ? fetchSearchIndex()\n : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n setupInstantNavigation({ location$, viewport$, progress$ })\n .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n .pipe(\n delay(125)\n )\n .subscribe(() => {\n setToggle(\"drawer\", false)\n setToggle(\"search\", false)\n })\n\n/* Set up global keyboard handlers */\nkeyboard$\n .pipe(\n filter(({ mode }) => mode === \"global\")\n )\n .subscribe(key => {\n switch (key.type) {\n\n /* Go to previous page */\n case \"p\":\n case \",\":\n const prev = getOptionalElement(\"link[rel=prev]\")\n if (typeof prev !== \"undefined\")\n setLocation(prev)\n break\n\n /* Go to next page */\n case \"n\":\n case \".\":\n const next = getOptionalElement(\"link[rel=next]\")\n 
if (typeof next !== \"undefined\")\n setLocation(next)\n break\n\n /* Expand navigation, see https://bit.ly/3ZjG5io */\n case \"Enter\":\n const active = getActiveElement()\n if (active instanceof HTMLLabelElement)\n active.click()\n }\n })\n\n/* Set up patches */\npatchEllipsis({ viewport$, document$ })\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n .pipe(\n map(() => getComponentElement(\"main\")),\n switchMap(el => watchMain(el, { viewport$, header$ })),\n shareReplay(1)\n )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n /* Consent */\n ...getComponentElements(\"consent\")\n .map(el => mountConsent(el, { target$ })),\n\n /* Dialog */\n ...getComponentElements(\"dialog\")\n .map(el => mountDialog(el, { alert$ })),\n\n /* Header */\n ...getComponentElements(\"header\")\n .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n /* Color palette */\n ...getComponentElements(\"palette\")\n .map(el => mountPalette(el)),\n\n /* Progress bar */\n ...getComponentElements(\"progress\")\n .map(el => mountProgress(el, { progress$ })),\n\n /* Search */\n ...getComponentElements(\"search\")\n .map(el => mountSearch(el, { index$, keyboard$ })),\n\n /* Repository information */\n ...getComponentElements(\"source\")\n .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n /* Announcement bar */\n ...getComponentElements(\"announce\")\n .map(el => mountAnnounce(el)),\n\n /* Content */\n ...getComponentElements(\"content\")\n .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n /* Search highlighting */\n ...getComponentElements(\"content\")\n .map(el => feature(\"search.highlight\")\n ? 
mountSearchHiglight(el, { index$, location$ })\n : EMPTY\n ),\n\n /* Header title */\n ...getComponentElements(\"header-title\")\n .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n /* Sidebar */\n ...getComponentElements(\"sidebar\")\n .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n ? at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n ),\n\n /* Navigation tabs */\n ...getComponentElements(\"tabs\")\n .map(el => mountTabs(el, { viewport$, header$ })),\n\n /* Table of contents */\n ...getComponentElements(\"toc\")\n .map(el => mountTableOfContents(el, {\n viewport$, header$, main$, target$\n })),\n\n /* Back-to-top button */\n ...getComponentElements(\"top\")\n .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n .pipe(\n switchMap(() => content$),\n mergeWith(control$),\n shareReplay(1)\n )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$ = document$ /* Document observable */\nwindow.location$ = location$ /* Location subject */\nwindow.target$ = target$ /* Location target observable */\nwindow.keyboard$ = keyboard$ /* Keyboard observable */\nwindow.viewport$ = viewport$ /* Viewport observable */\nwindow.tablet$ = tablet$ /* Media tablet observable */\nwindow.screen$ = screen$ /* Media screen observable */\nwindow.print$ = print$ /* Media print observable */\nwindow.alert$ = alert$ /* Alert subject */\nwindow.progress$ = progress$ /* Progress indicator subject */\nwindow.component$ = component$ /* Component observable */\n", "/******************************************************************************\nCopyright (c) Microsoft Corporation.\n\nPermission 
to use, copy, modify, and/or distribute this software for any\npurpose with or without fee is hereby granted.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\nPERFORMANCE OF THIS SOFTWARE.\n***************************************************************************** */\n/* global Reflect, Promise, SuppressedError, Symbol, Iterator */\n\nvar extendStatics = function(d, b) {\n extendStatics = Object.setPrototypeOf ||\n ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\n function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\n return extendStatics(d, b);\n};\n\nexport function __extends(d, b) {\n if (typeof b !== \"function\" && b !== null)\n throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\n extendStatics(d, b);\n function __() { this.constructor = d; }\n d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __());\n}\n\nexport var __assign = function() {\n __assign = Object.assign || function __assign(t) {\n for (var s, i = 1, n = arguments.length; i < n; i++) {\n s = arguments[i];\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\n }\n return t;\n }\n return __assign.apply(this, arguments);\n}\n\nexport function __rest(s, e) {\n var t = {};\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\n t[p] = s[p];\n if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\n for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\n if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\n t[p[i]] = s[p[i]];\n }\n return t;\n}\n\nexport function __decorate(decorators, target, key, desc) {\n var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\n if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\n else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\n return c > 3 && r && Object.defineProperty(target, key, r), r;\n}\n\nexport function __param(paramIndex, decorator) {\n return function (target, key) { decorator(target, key, paramIndex); }\n}\n\nexport function __esDecorate(ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {\n function accept(f) { if (f !== void 0 && typeof f !== \"function\") throw new TypeError(\"Function expected\"); return f; }\n var kind = contextIn.kind, key = kind === \"getter\" ? \"get\" : kind === \"setter\" ? \"set\" : \"value\";\n var target = !descriptorIn && ctor ? contextIn[\"static\"] ? ctor : ctor.prototype : null;\n var descriptor = descriptorIn || (target ? 
Object.getOwnPropertyDescriptor(target, contextIn.name) : {});\n var _, done = false;\n for (var i = decorators.length - 1; i >= 0; i--) {\n var context = {};\n for (var p in contextIn) context[p] = p === \"access\" ? {} : contextIn[p];\n for (var p in contextIn.access) context.access[p] = contextIn.access[p];\n context.addInitializer = function (f) { if (done) throw new TypeError(\"Cannot add initializers after decoration has completed\"); extraInitializers.push(accept(f || null)); };\n var result = (0, decorators[i])(kind === \"accessor\" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);\n if (kind === \"accessor\") {\n if (result === void 0) continue;\n if (result === null || typeof result !== \"object\") throw new TypeError(\"Object expected\");\n if (_ = accept(result.get)) descriptor.get = _;\n if (_ = accept(result.set)) descriptor.set = _;\n if (_ = accept(result.init)) initializers.unshift(_);\n }\n else if (_ = accept(result)) {\n if (kind === \"field\") initializers.unshift(_);\n else descriptor[key] = _;\n }\n }\n if (target) Object.defineProperty(target, contextIn.name, descriptor);\n done = true;\n};\n\nexport function __runInitializers(thisArg, initializers, value) {\n var useValue = arguments.length > 2;\n for (var i = 0; i < initializers.length; i++) {\n value = useValue ? initializers[i].call(thisArg, value) : initializers[i].call(thisArg);\n }\n return useValue ? value : void 0;\n};\n\nexport function __propKey(x) {\n return typeof x === \"symbol\" ? x : \"\".concat(x);\n};\n\nexport function __setFunctionName(f, name, prefix) {\n if (typeof name === \"symbol\") name = name.description ? \"[\".concat(name.description, \"]\") : \"\";\n return Object.defineProperty(f, \"name\", { configurable: true, value: prefix ? 
\"\".concat(prefix, \" \", name) : name });\n};\n\nexport function __metadata(metadataKey, metadataValue) {\n if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\n}\n\nexport function __awaiter(thisArg, _arguments, P, generator) {\n function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }\n return new (P || (P = Promise))(function (resolve, reject) {\n function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\n function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\n function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\n step((generator = generator.apply(thisArg, _arguments || [])).next());\n });\n}\n\nexport function __generator(thisArg, body) {\n var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === \"function\" ? Iterator : Object).prototype);\n return g.next = verb(0), g[\"throw\"] = verb(1), g[\"return\"] = verb(2), typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\n function verb(n) { return function (v) { return step([n, v]); }; }\n function step(op) {\n if (f) throw new TypeError(\"Generator is already executing.\");\n while (g && (g = 0, op[0] && (_ = 0)), _) try {\n if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? 
y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\n if (y = 0, t) op = [op[0] & 2, t.value];\n switch (op[0]) {\n case 0: case 1: t = op; break;\n case 4: _.label++; return { value: op[1], done: false };\n case 5: _.label++; y = op[1]; op = [0]; continue;\n case 7: op = _.ops.pop(); _.trys.pop(); continue;\n default:\n if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\n if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\n if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\n if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\n if (t[2]) _.ops.pop();\n _.trys.pop(); continue;\n }\n op = body.call(thisArg, _);\n } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\n if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\n }\n}\n\nexport var __createBinding = Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? !m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n});\n\nexport function __exportStar(m, o) {\n for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\n}\n\nexport function __values(o) {\n var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\n if (m) return m.call(o);\n if (o && typeof o.length === \"number\") return {\n next: function () {\n if (o && i >= o.length) o = void 0;\n return { value: o && o[i++], done: !o };\n }\n };\n throw new TypeError(s ? 
\"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\n}\n\nexport function __read(o, n) {\n var m = typeof Symbol === \"function\" && o[Symbol.iterator];\n if (!m) return o;\n var i = m.call(o), r, ar = [], e;\n try {\n while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\n }\n catch (error) { e = { error: error }; }\n finally {\n try {\n if (r && !r.done && (m = i[\"return\"])) m.call(i);\n }\n finally { if (e) throw e.error; }\n }\n return ar;\n}\n\n/** @deprecated */\nexport function __spread() {\n for (var ar = [], i = 0; i < arguments.length; i++)\n ar = ar.concat(__read(arguments[i]));\n return ar;\n}\n\n/** @deprecated */\nexport function __spreadArrays() {\n for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\n for (var r = Array(s), k = 0, i = 0; i < il; i++)\n for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\n r[k] = a[j];\n return r;\n}\n\nexport function __spreadArray(to, from, pack) {\n if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\n if (ar || !(i in from)) {\n if (!ar) ar = Array.prototype.slice.call(from, 0, i);\n ar[i] = from[i];\n }\n }\n return to.concat(ar || Array.prototype.slice.call(from));\n}\n\nexport function __await(v) {\n return this instanceof __await ? (this.v = v, this) : new __await(v);\n}\n\nexport function __asyncGenerator(thisArg, _arguments, generator) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var g = generator.apply(thisArg, _arguments || []), i, q = [];\n return i = Object.create((typeof AsyncIterator === \"function\" ? 
AsyncIterator : Object).prototype), verb(\"next\"), verb(\"throw\"), verb(\"return\", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;\n function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }\n function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }\n function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\n function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\n function fulfill(value) { resume(\"next\", value); }\n function reject(value) { resume(\"throw\", value); }\n function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\n}\n\nexport function __asyncDelegator(o) {\n var i, p;\n return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\n function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: false } : f ? f(v) : v; } : f; }\n}\n\nexport function __asyncValues(o) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var m = o[Symbol.asyncIterator], i;\n return m ? m.call(o) : (o = typeof __values === \"function\" ? 
__values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\n function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\n function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\n}\n\nexport function __makeTemplateObject(cooked, raw) {\n if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\n return cooked;\n};\n\nvar __setModuleDefault = Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n};\n\nexport function __importStar(mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n}\n\nexport function __importDefault(mod) {\n return (mod && mod.__esModule) ? mod : { default: mod };\n}\n\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\n return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? f.value : state.get(receiver);\n}\n\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\n if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\n if (typeof state === \"function\" ? 
receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\n return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\n}\n\nexport function __classPrivateFieldIn(state, receiver) {\n if (receiver === null || (typeof receiver !== \"object\" && typeof receiver !== \"function\")) throw new TypeError(\"Cannot use 'in' operator on non-object\");\n return typeof state === \"function\" ? receiver === state : state.has(receiver);\n}\n\nexport function __addDisposableResource(env, value, async) {\n if (value !== null && value !== void 0) {\n if (typeof value !== \"object\" && typeof value !== \"function\") throw new TypeError(\"Object expected.\");\n var dispose, inner;\n if (async) {\n if (!Symbol.asyncDispose) throw new TypeError(\"Symbol.asyncDispose is not defined.\");\n dispose = value[Symbol.asyncDispose];\n }\n if (dispose === void 0) {\n if (!Symbol.dispose) throw new TypeError(\"Symbol.dispose is not defined.\");\n dispose = value[Symbol.dispose];\n if (async) inner = dispose;\n }\n if (typeof dispose !== \"function\") throw new TypeError(\"Object not disposable.\");\n if (inner) dispose = function() { try { inner.call(this); } catch (e) { return Promise.reject(e); } };\n env.stack.push({ value: value, dispose: dispose, async: async });\n }\n else if (async) {\n env.stack.push({ async: true });\n }\n return value;\n}\n\nvar _SuppressedError = typeof SuppressedError === \"function\" ? SuppressedError : function (error, suppressed, message) {\n var e = new Error(message);\n return e.name = \"SuppressedError\", e.error = error, e.suppressed = suppressed, e;\n};\n\nexport function __disposeResources(env) {\n function fail(e) {\n env.error = env.hasError ? 
new _SuppressedError(e, env.error, \"An error was suppressed during disposal.\") : e;\n env.hasError = true;\n }\n var r, s = 0;\n function next() {\n while (r = env.stack.pop()) {\n try {\n if (!r.async && s === 1) return s = 0, env.stack.push(r), Promise.resolve().then(next);\n if (r.dispose) {\n var result = r.dispose.call(r.value);\n if (r.async) return s |= 2, Promise.resolve(result).then(next, function(e) { fail(e); return next(); });\n }\n else s |= 1;\n }\n catch (e) {\n fail(e);\n }\n }\n if (s === 1) return env.hasError ? Promise.reject(env.error) : Promise.resolve();\n if (env.hasError) throw env.error;\n }\n return next();\n}\n\nexport default {\n __extends,\n __assign,\n __rest,\n __decorate,\n __param,\n __metadata,\n __awaiter,\n __generator,\n __createBinding,\n __exportStar,\n __values,\n __read,\n __spread,\n __spreadArrays,\n __spreadArray,\n __await,\n __asyncGenerator,\n __asyncDelegator,\n __asyncValues,\n __makeTemplateObject,\n __importStar,\n __importDefault,\n __classPrivateFieldGet,\n __classPrivateFieldSet,\n __classPrivateFieldIn,\n __addDisposableResource,\n __disposeResources,\n};\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. 
The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass(createImpl: (_super: any) => any): T {\n const _super = (instance: any) => {\n Error.call(instance);\n instance.stack = new Error().stack;\n };\n\n const ctorFunc = createImpl(_super);\n ctorFunc.prototype = Object.create(Error.prototype);\n ctorFunc.prototype.constructor = ctorFunc;\n return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n (_super) =>\n function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n _super(this);\n this.message = errors\n ? 
`${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n ')}`\n : '';\n this.name = 'UnsubscriptionError';\n this.errors = errors;\n }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove(arr: T[] | undefined | null, item: T) {\n if (arr) {\n const index = arr.indexOf(item);\n 0 <= index && arr.splice(index, 1);\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n /** @nocollapse */\n public static EMPTY = (() => {\n const empty = new Subscription();\n empty.closed = true;\n return empty;\n })();\n\n /**\n * A flag to indicate whether this Subscription has already been unsubscribed.\n */\n public closed = false;\n\n private _parentage: Subscription[] | Subscription | null = null;\n\n /**\n * The list of registered finalizers to execute upon unsubscription. 
Adding and removing from this\n * list occurs in the {@link #add} and {@link #remove} methods.\n */\n private _finalizers: Exclude[] | null = null;\n\n /**\n * @param initialTeardown A function executed first as part of the finalization\n * process that is kicked off when {@link #unsubscribe} is called.\n */\n constructor(private initialTeardown?: () => void) {}\n\n /**\n * Disposes the resources held by the subscription. May, for instance, cancel\n * an ongoing Observable execution or cancel any other type of work that\n * started when the Subscription was created.\n * @return {void}\n */\n unsubscribe(): void {\n let errors: any[] | undefined;\n\n if (!this.closed) {\n this.closed = true;\n\n // Remove this from it's parents.\n const { _parentage } = this;\n if (_parentage) {\n this._parentage = null;\n if (Array.isArray(_parentage)) {\n for (const parent of _parentage) {\n parent.remove(this);\n }\n } else {\n _parentage.remove(this);\n }\n }\n\n const { initialTeardown: initialFinalizer } = this;\n if (isFunction(initialFinalizer)) {\n try {\n initialFinalizer();\n } catch (e) {\n errors = e instanceof UnsubscriptionError ? e.errors : [e];\n }\n }\n\n const { _finalizers } = this;\n if (_finalizers) {\n this._finalizers = null;\n for (const finalizer of _finalizers) {\n try {\n execFinalizer(finalizer);\n } catch (err) {\n errors = errors ?? [];\n if (err instanceof UnsubscriptionError) {\n errors = [...errors, ...err.errors];\n } else {\n errors.push(err);\n }\n }\n }\n }\n\n if (errors) {\n throw new UnsubscriptionError(errors);\n }\n }\n }\n\n /**\n * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n * when this subscription is unsubscribed. 
If this subscription is already {@link #closed},\n * because it has already been unsubscribed, then whatever finalizer is passed to it\n * will automatically be executed (unless the finalizer itself is also a closed subscription).\n *\n * Closed Subscriptions cannot be added as finalizers to any subscription. Adding a closed\n * subscription to a any subscription will result in no operation. (A noop).\n *\n * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n * operation at all. (A noop).\n *\n * `Subscription` instances that are added to this instance will automatically remove themselves\n * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n * will need to be removed manually with {@link #remove}\n *\n * @param teardown The finalization logic to add to this subscription.\n */\n add(teardown: TeardownLogic): void {\n // Only add the finalizer if it's not undefined\n // and don't add a subscription to itself.\n if (teardown && teardown !== this) {\n if (this.closed) {\n // If this subscription is already closed,\n // execute whatever finalizer is handed to it automatically.\n execFinalizer(teardown);\n } else {\n if (teardown instanceof Subscription) {\n // We don't add closed subscriptions, and we don't add the same subscription\n // twice. Subscription unsubscribe is idempotent.\n if (teardown.closed || teardown._hasParent(this)) {\n return;\n }\n teardown._addParent(this);\n }\n (this._finalizers = this._finalizers ?? 
[]).push(teardown);\n }\n }\n }\n\n /**\n * Checks to see if a this subscription already has a particular parent.\n * This will signal that this subscription has already been added to the parent in question.\n * @param parent the parent to check for\n */\n private _hasParent(parent: Subscription) {\n const { _parentage } = this;\n return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n }\n\n /**\n * Adds a parent to this subscription so it can be removed from the parent if it\n * unsubscribes on it's own.\n *\n * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n * @param parent The parent subscription to add\n */\n private _addParent(parent: Subscription) {\n const { _parentage } = this;\n this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n }\n\n /**\n * Called on a child when it is removed via {@link #remove}.\n * @param parent The parent to remove\n */\n private _removeParent(parent: Subscription) {\n const { _parentage } = this;\n if (_parentage === parent) {\n this._parentage = null;\n } else if (Array.isArray(_parentage)) {\n arrRemove(_parentage, parent);\n }\n }\n\n /**\n * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n *\n * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n * from every other `Subscription` they have been added to. 
This means that using the `remove` method\n * is not a common thing and should be used thoughtfully.\n *\n * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n * more than once, you will need to call `remove` the same number of times to remove all instances.\n *\n * All finalizer instances are removed to free up memory upon unsubscription.\n *\n * @param teardown The finalizer to remove from this subscription\n */\n remove(teardown: Exclude): void {\n const { _finalizers } = this;\n _finalizers && arrRemove(_finalizers, teardown);\n\n if (teardown instanceof Subscription) {\n teardown._removeParent(this);\n }\n }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n return (\n value instanceof Subscription ||\n (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n if (isFunction(finalizer)) {\n finalizer();\n } else {\n finalizer.unsubscribe();\n }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n onUnhandledError: null,\n onStoppedNotification: null,\n Promise: undefined,\n useDeprecatedSynchronousErrorHandling: false,\n useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n /**\n * A registration point for unhandled errors from RxJS. These are errors that\n * cannot were not handled by consuming code in the usual subscription path. 
For\n * example, if you have this configured, and you subscribe to an observable without\n * providing an error handler, errors from that subscription will end up here. This\n * will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onUnhandledError: ((err: any) => void) | null;\n\n /**\n * A registration point for notifications that cannot be sent to subscribers because they\n * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n * might want a different behavior. For example, with sources that attempt to report errors\n * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n * This will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onStoppedNotification: ((notification: ObservableNotification, subscriber: Subscriber) => void) | null;\n\n /**\n * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n * methods.\n *\n * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n * Promise constructor. If you need a Promise implementation other than native promises,\n * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n */\n Promise?: PromiseConstructorLike;\n\n /**\n * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n * in v6 and higher. This behavior enables bad patterns like wrapping a subscribe\n * call in a try/catch block. 
It also enables producer interference, a nasty bug\n * where a multicast can be broken for all observers by a downstream consumer with\n * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n * FOR MIGRATION REASONS.\n *\n * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n * behaviors described above. Will be removed in v8.\n */\n useDeprecatedSynchronousErrorHandling: boolean;\n\n /**\n * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n * `unsubscribe()` via `this` context in `next` functions created in observers passed\n * to `subscribe`.\n *\n * This is being removed because the performance was severely problematic, and it could also cause\n * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n * their `this` context overwritten.\n *\n * @deprecated As of version 8, RxJS will no longer support altering the\n * context of next functions provided as part of an observer to Subscribe. Instead,\n * you will have access to a subscription or a signal or token that will allow you to do things like\n * unsubscribe and test closed status. 
Will be removed in v8.\n */\n useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n delegate:\n | {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n }\n | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setTimeout(handler: () => void, timeout?: number, ...args) {\n const { delegate } = timeoutProvider;\n if (delegate?.setTimeout) {\n return delegate.setTimeout(handler, timeout, ...args);\n }\n return setTimeout(handler, timeout, ...args);\n },\n clearTimeout(handle) {\n const { delegate } = timeoutProvider;\n return (delegate?.clearTimeout || clearTimeout)(handle as any);\n },\n delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n timeoutProvider.setTimeout(() => {\n const { onUnhandledError } = config;\n if (onUnhandledError) {\n // Execute the user-configured error handler.\n onUnhandledError(err);\n } else {\n // Throw so it is picked up by the runtime's uncaught error mechanism.\n throw err;\n }\n 
});\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification(value: T) {\n return createNotification('N', value, undefined) as NextNotification;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n return {\n kind,\n value,\n error,\n };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. 
Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n const isRoot = !context;\n if (isRoot) {\n context = { errorThrown: false, error: null };\n }\n cb();\n if (isRoot) {\n const { errorThrown, error } = context!;\n context = null;\n if (errorThrown) {\n throw error;\n }\n }\n } else {\n // This is the general non-deprecated path for everyone that\n // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n cb();\n }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n if (config.useDeprecatedSynchronousErrorHandling && context) {\n context.errorThrown = true;\n context.error = err;\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. 
Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber\n */\nexport class Subscriber extends Subscription implements Observer {\n /**\n * A static factory for a Subscriber, given a (potentially partial) definition\n * of an Observer.\n * @param next The `next` callback of an Observer.\n * @param error The `error` callback of an\n * Observer.\n * @param complete The `complete` callback of an\n * Observer.\n * @return A Subscriber wrapping the (partially defined)\n * Observer represented by the given arguments.\n * @nocollapse\n * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n * method, and there is no reason to be creating instances of `Subscriber` directly.\n * If you have a specific use case, please file an issue.\n */\n static create(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber {\n return new SafeSubscriber(next, error, complete);\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected isStopped: boolean = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected destination: Subscriber | Observer; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * There is no reason to directly create an instance of Subscriber. 
This type is exported for typings reasons.\n */\n constructor(destination?: Subscriber | Observer) {\n super();\n if (destination) {\n this.destination = destination;\n // Automatically chain subscriptions together here.\n // if destination is a Subscription, then it is a Subscriber.\n if (isSubscription(destination)) {\n destination.add(this);\n }\n } else {\n this.destination = EMPTY_OBSERVER;\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `next` from\n * the Observable, with a value. The Observable may call this method 0 or more\n * times.\n * @param {T} [value] The `next` value.\n * @return {void}\n */\n next(value?: T): void {\n if (this.isStopped) {\n handleStoppedNotification(nextNotification(value), this);\n } else {\n this._next(value!);\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `error` from\n * the Observable, with an attached `Error`. Notifies the Observer that\n * the Observable has experienced an error condition.\n * @param {any} [err] The `error` exception.\n * @return {void}\n */\n error(err?: any): void {\n if (this.isStopped) {\n handleStoppedNotification(errorNotification(err), this);\n } else {\n this.isStopped = true;\n this._error(err);\n }\n }\n\n /**\n * The {@link Observer} callback to receive a valueless notification of type\n * `complete` from the Observable. 
Notifies the Observer that the Observable\n * has finished sending push-based notifications.\n * @return {void}\n */\n complete(): void {\n if (this.isStopped) {\n handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n } else {\n this.isStopped = true;\n this._complete();\n }\n }\n\n unsubscribe(): void {\n if (!this.closed) {\n this.isStopped = true;\n super.unsubscribe();\n this.destination = null!;\n }\n }\n\n protected _next(value: T): void {\n this.destination.next(value);\n }\n\n protected _error(err: any): void {\n try {\n this.destination.error(err);\n } finally {\n this.unsubscribe();\n }\n }\n\n protected _complete(): void {\n try {\n this.destination.complete();\n } finally {\n this.unsubscribe();\n }\n }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind any>(fn: Fn, thisArg: any): Fn {\n return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver implements Observer {\n constructor(private partialObserver: Partial>) {}\n\n next(value: T): void {\n const { partialObserver } = this;\n if (partialObserver.next) {\n try {\n partialObserver.next(value);\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n\n error(err: any): void {\n const { partialObserver } = this;\n if (partialObserver.error) {\n try {\n partialObserver.error(err);\n } catch (error) {\n handleUnhandledError(error);\n }\n } else {\n handleUnhandledError(err);\n }\n }\n\n complete(): void {\n const { partialObserver } = this;\n if (partialObserver.complete) {\n try {\n partialObserver.complete();\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n}\n\nexport class SafeSubscriber extends Subscriber {\n constructor(\n observerOrNext?: Partial> | ((value: T) => void) | 
null,\n error?: ((e?: any) => void) | null,\n complete?: (() => void) | null\n ) {\n super();\n\n let partialObserver: Partial>;\n if (isFunction(observerOrNext) || !observerOrNext) {\n // The first argument is a function, not an observer. The next\n // two arguments *could* be observers, or they could be empty.\n partialObserver = {\n next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n error: error ?? undefined,\n complete: complete ?? undefined,\n };\n } else {\n // The first argument is a partial observer.\n let context: any;\n if (this && config.useDeprecatedNextContext) {\n // This is a deprecated path that made `this.unsubscribe()` available in\n // next handler functions passed to subscribe. This only exists behind a flag\n // now, as it is *very* slow.\n context = Object.create(observerOrNext);\n context.unsubscribe = () => this.unsubscribe();\n partialObserver = {\n next: observerOrNext.next && bind(observerOrNext.next, context),\n error: observerOrNext.error && bind(observerOrNext.error, context),\n complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n };\n } else {\n // The \"normal\" path. 
Just use the partial observer directly.\n partialObserver = observerOrNext;\n }\n }\n\n // Wrap the partial observer to ensure it's a full observer, and\n // make sure proper error handling is accounted for.\n this.destination = new ConsumerObserver(partialObserver);\n }\n}\n\nfunction handleUnhandledError(error: any) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n captureError(error);\n } else {\n // Ideal path, we report this as an unhandled error,\n // which is thrown on a new call stack.\n reportUnhandledError(error);\n }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification, subscriber: Subscriber) {\n const { onStoppedNotification } = config;\n onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly> & { closed: true } = {\n closed: true,\n next: noop,\n error: defaultErrorHandler,\n complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". 
Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n * map(i => range(i)),\n * mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? 
take(5) : identity);\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity(x: T): T {\n return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe(fn1: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction, fn3: UnaryFunction): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction,\n ...fns: UnaryFunction[]\n): 
UnaryFunction;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on. \n */\nexport function pipe(...fns: Array>): UnaryFunction {\n return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray(fns: Array>): UnaryFunction {\n if (fns.length === 0) {\n return identity as UnaryFunction;\n }\n\n if (fns.length === 1) {\n return fns[0];\n }\n\n return function piped(input: T): R {\n return fns.reduce((prev: any, fn: UnaryFunction) => fn(prev), input as any);\n };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable\n */\nexport class Observable implements Subscribable {\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n source: Observable | undefined;\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n operator: Operator | undefined;\n\n /**\n * @constructor\n * @param {Function} subscribe the function that is called when the Observable is\n * initially subscribed to. 
This function is given a Subscriber, to which new values\n * can be `next`ed, or an `error` method can be called to raise an error, or\n * `complete` can be called to notify of a successful completion.\n */\n constructor(subscribe?: (this: Observable, subscriber: Subscriber) => TeardownLogic) {\n if (subscribe) {\n this._subscribe = subscribe;\n }\n }\n\n // HACK: Since TypeScript inherits static properties too, we have to\n // fight against TypeScript here so Subject can have a different static create signature\n /**\n * Creates a new Observable by calling the Observable constructor\n * @owner Observable\n * @method create\n * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n * @return {Observable} a new observable\n * @nocollapse\n * @deprecated Use `new Observable()` instead. Will be removed in v8.\n */\n static create: (...args: any[]) => any = (subscribe?: (subscriber: Subscriber) => TeardownLogic) => {\n return new Observable(subscribe);\n };\n\n /**\n * Creates a new Observable, with this Observable instance as the source, and the passed\n * operator defined as the new observable's operator.\n * @method lift\n * @param operator the operator defining the operation to take on the observable\n * @return a new observable with the Operator applied\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * If you have implemented an operator using `lift`, it is recommended that you create an\n * operator by simply returning `new Observable()` directly. 
See \"Creating new operators from\n * scratch\" section here: https://rxjs.dev/guide/operators\n */\n lift(operator?: Operator): Observable {\n const observable = new Observable();\n observable.source = this;\n observable.operator = operator;\n return observable;\n }\n\n subscribe(observerOrNext?: Partial> | ((value: T) => void)): Subscription;\n /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n /**\n * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n *\n * Use it when you have all these Observables, but still nothing is happening.\n *\n * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. It\n * might be for example a function that you passed to Observable's constructor, but most of the time it is\n * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n * the thought.\n *\n * Apart from starting the execution of an Observable, this method allows you to listen for values\n * that an Observable emits, as well as for when it completes or errors. You can achieve this in two\n * of the following ways.\n *\n * The first way is creating an object that implements {@link Observer} interface. It should have methods\n * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). 
In particular, do\n * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n * `process.on('error)`) to be notified of unhandled errors. Because of this, it's recommended that you provide\n * an `error` method to avoid missing thrown errors.\n *\n * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n *\n * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n * and you also handled emissions internally by using operators (e.g. using `tap`).\n *\n * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n * up all resources that an Observable used. 
Note that cancelling a subscription will not call `complete` callback\n * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n *\n * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n * It is an Observable itself that decides when these functions will be called. For example {@link of}\n * by default emits all its values synchronously. Always check documentation for how given Observable\n * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n *\n * #### Examples\n *\n * Subscribe with an {@link guide/observer Observer}\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * const sumObserver = {\n * sum: 0,\n * next(value) {\n * console.log('Adding: ' + value);\n * this.sum = this.sum + value;\n * },\n * error() {\n * // We actually could just remove this method,\n * // since we do not really care about errors right now.\n * },\n * complete() {\n * console.log('Sum equals: ' + this.sum);\n * }\n * };\n *\n * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n * .subscribe(sumObserver);\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n *\n * ```ts\n * import { of } from 'rxjs'\n *\n * let sum = 0;\n *\n * of(1, 2, 3).subscribe(\n * value => {\n * console.log('Adding: ' + value);\n * sum = sum + value;\n * },\n * undefined,\n * () => console.log('Sum equals: ' + sum)\n * );\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Cancel a subscription\n *\n * ```ts\n * import { interval } from 'rxjs';\n *\n * const subscription = interval(1000).subscribe({\n * next(num) {\n * console.log(num)\n * },\n * complete() {\n * // Will not be called, even when cancelling subscription.\n * console.log('completed!');\n * 
}\n * });\n *\n * setTimeout(() => {\n * subscription.unsubscribe();\n * console.log('unsubscribed!');\n * }, 2500);\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 'unsubscribed!' after 2.5s\n * ```\n *\n * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n * Observable.\n * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n * the error will be thrown asynchronously as unhandled.\n * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n * @return {Subscription} a subscription reference to the registered handlers\n * @method subscribe\n */\n subscribe(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((error: any) => void) | null,\n complete?: (() => void) | null\n ): Subscription {\n const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n errorContext(() => {\n const { operator, source } = this;\n subscriber.add(\n operator\n ? // We're dealing with a subscription in the\n // operator chain to one of our lifted operators.\n operator.call(subscriber, source)\n : source\n ? // If `source` has a value, but `operator` does not, something that\n // had intimate knowledge of our API, like our `Subject`, must have\n // set it. 
We're going to just call `_subscribe` directly.\n this._subscribe(subscriber)\n : // In all other cases, we're likely wrapping a user-provided initializer\n // function, so we need to catch errors and handle them appropriately.\n this._trySubscribe(subscriber)\n );\n });\n\n return subscriber;\n }\n\n /** @internal */\n protected _trySubscribe(sink: Subscriber): TeardownLogic {\n try {\n return this._subscribe(sink);\n } catch (err) {\n // We don't need to return anything in this case,\n // because it's just going to try to `add()` to a subscription\n // above.\n sink.error(err);\n }\n }\n\n /**\n * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. 
To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * #### Example\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(4));\n *\n * async function getTotal() {\n * let total = 0;\n *\n * await source$.forEach(value => {\n * total += value;\n * console.log('observable -> ' + value);\n * });\n *\n * return total;\n * }\n *\n * getTotal().then(\n * total => console.log('Total: ' + total)\n * );\n *\n * // Expected:\n * // 'observable -> 0'\n * // 'observable -> 1'\n * // 'observable -> 2'\n * // 'observable -> 3'\n * // 'Total: 6'\n * ```\n *\n * @param next a handler for each value emitted by the observable\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n */\n forEach(next: (value: T) => void): Promise;\n\n /**\n * @param next a handler for each value emitted by the observable\n * @param promiseCtor a constructor function used to instantiate the Promise\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n * @deprecated Passing a Promise constructor will no longer be available\n * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n * little benefit. If you need this functionality, it is recommended that you either\n * polyfill Promise, or you create an adapter to convert the returned native promise\n * to whatever promise implementation you wanted. 
Will be removed in v8.\n */\n forEach(next: (value: T) => void, promiseCtor: PromiseConstructorLike): Promise;\n\n forEach(next: (value: T) => void, promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n const subscriber = new SafeSubscriber({\n next: (value) => {\n try {\n next(value);\n } catch (err) {\n reject(err);\n subscriber.unsubscribe();\n }\n },\n error: reject,\n complete: resolve,\n });\n this.subscribe(subscriber);\n }) as Promise;\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): TeardownLogic {\n return this.source?.subscribe(subscriber);\n }\n\n /**\n * An interop point defined by the es7-observable spec https://github.com/zenparsing/es-observable\n * @method Symbol.observable\n * @return {Observable} this instance of the observable\n */\n [Symbol_observable]() {\n return this;\n }\n\n /* tslint:disable:max-line-length */\n pipe(): Observable;\n pipe(op1: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction, op3: OperatorFunction): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: 
OperatorFunction,\n op8: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction,\n ...operations: OperatorFunction[]\n ): Observable;\n /* tslint:enable:max-line-length */\n\n /**\n * Used to stitch together functional operators into a chain.\n * @method pipe\n * @return {Observable} the Observable result of all of the operators having\n * been called in the order they were passed in.\n *\n * ## Example\n *\n * ```ts\n * import { interval, filter, map, scan } from 'rxjs';\n *\n * interval(1000)\n * .pipe(\n * filter(x => x % 2 === 0),\n * map(x => x + x),\n * scan((acc, x) => acc + x)\n * )\n * .subscribe(x => console.log(x));\n * ```\n */\n pipe(...operations: OperatorFunction[]): Observable {\n return pipeFromArray(operations)(this);\n }\n\n /* tslint:disable:max-line-length */\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: typeof Promise): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. 
Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: PromiseConstructorLike): Promise;\n /* tslint:enable:max-line-length */\n\n /**\n * Subscribe to this Observable and get a Promise resolving on\n * `complete` with the last emission (if any).\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * @method toPromise\n * @param [promiseCtor] a constructor function used to instantiate\n * the Promise\n * @return A Promise that resolves with the last value emit, or\n * rejects on an error. If there were no emissions, Promise\n * resolves with undefined.\n * @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise\n */\n toPromise(promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n let value: T | undefined;\n this.subscribe(\n (x: T) => (value = x),\n (err: any) => reject(err),\n () => resolve(value)\n );\n }) as Promise;\n }\n}\n\n/**\n * Decides between a passed promise constructor from consuming code,\n * A default configured promise constructor, and the native promise\n * constructor and returns it. If nothing can be found, it will throw\n * an error.\n * @param promiseCtor The optional promise constructor to passed by consuming code\n */\nfunction getPromiseCtor(promiseCtor: PromiseConstructorLike | undefined) {\n return promiseCtor ?? config.Promise ?? 
Promise;\n}\n\nfunction isObserver(value: any): value is Observer {\n return value && isFunction(value.next) && isFunction(value.error) && isFunction(value.complete);\n}\n\nfunction isSubscriber(value: any): value is Subscriber {\n return (value && value instanceof Subscriber) || (isObserver(value) && isSubscription(value));\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { OperatorFunction } from '../types';\nimport { isFunction } from './isFunction';\n\n/**\n * Used to determine if an object is an Observable with a lift function.\n */\nexport function hasLift(source: any): source is { lift: InstanceType['lift'] } {\n return isFunction(source?.lift);\n}\n\n/**\n * Creates an `OperatorFunction`. Used to define operators throughout the library in a concise way.\n * @param init The logic to connect the liftedSource to the subscriber at the moment of subscription.\n */\nexport function operate(\n init: (liftedSource: Observable, subscriber: Subscriber) => (() => void) | void\n): OperatorFunction {\n return (source: Observable) => {\n if (hasLift(source)) {\n return source.lift(function (this: Subscriber, liftedSource: Observable) {\n try {\n return init(liftedSource, this);\n } catch (err) {\n this.error(err);\n }\n });\n }\n throw new TypeError('Unable to lift unknown Observable type');\n };\n}\n", "import { Subscriber } from '../Subscriber';\n\n/**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. 
Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional teardown logic here. This will only be called on teardown if the\n * subscriber itself is not already closed. This is called after all other teardown logic is executed.\n */\nexport function createOperatorSubscriber(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n onFinalize?: () => void\n): Subscriber {\n return new OperatorSubscriber(destination, onNext, onComplete, onError, onFinalize);\n}\n\n/**\n * A generic helper for allowing operators to be created with a Subscriber and\n * use closures to capture necessary state from the operator function itself.\n */\nexport class OperatorSubscriber extends Subscriber {\n /**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional finalization logic here. This will only be called on finalization if the\n * subscriber itself is not already closed. 
This is called after all other finalization logic is executed.\n * @param shouldUnsubscribe An optional check to see if an unsubscribe call should truly unsubscribe.\n * NOTE: This currently **ONLY** exists to support the strange behavior of {@link groupBy}, where unsubscription\n * to the resulting observable does not actually disconnect from the source if there are active subscriptions\n * to any grouped observable. (DO NOT EXPOSE OR USE EXTERNALLY!!!)\n */\n constructor(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n private onFinalize?: () => void,\n private shouldUnsubscribe?: () => boolean\n ) {\n // It's important - for performance reasons - that all of this class's\n // members are initialized and that they are always initialized in the same\n // order. This will ensure that all OperatorSubscriber instances have the\n // same hidden class in V8. This, in turn, will help keep the number of\n // hidden classes involved in property accesses within the base class as\n // low as possible. If the number of hidden classes involved exceeds four,\n // the property accesses will become megamorphic and performance penalties\n // will be incurred - i.e. inline caches won't be used.\n //\n // The reasons for ensuring all instances have the same hidden class are\n // further discussed in this blog post from Benedikt Meurer:\n // https://benediktmeurer.de/2018/03/23/impact-of-polymorphism-on-component-based-frameworks-like-react/\n super(destination);\n this._next = onNext\n ? function (this: OperatorSubscriber, value: T) {\n try {\n onNext(value);\n } catch (err) {\n destination.error(err);\n }\n }\n : super._next;\n this._error = onError\n ? 
function (this: OperatorSubscriber, err: any) {\n try {\n onError(err);\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._error;\n this._complete = onComplete\n ? function (this: OperatorSubscriber) {\n try {\n onComplete();\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._complete;\n }\n\n unsubscribe() {\n if (!this.shouldUnsubscribe || this.shouldUnsubscribe()) {\n const { closed } = this;\n super.unsubscribe();\n // Execute additional teardown if we have any and we didn't already do so.\n !closed && this.onFinalize?.();\n }\n }\n}\n", "import { Subscription } from '../Subscription';\n\ninterface AnimationFrameProvider {\n schedule(callback: FrameRequestCallback): Subscription;\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n delegate:\n | {\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n }\n | undefined;\n}\n\nexport const animationFrameProvider: AnimationFrameProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n schedule(callback) {\n let request = requestAnimationFrame;\n let cancel: typeof cancelAnimationFrame | undefined = cancelAnimationFrame;\n const { delegate } = animationFrameProvider;\n if (delegate) {\n request = delegate.requestAnimationFrame;\n cancel = delegate.cancelAnimationFrame;\n }\n const handle = request((timestamp) => {\n // Clear the cancel function. 
The request has been fulfilled, so\n // attempting to cancel the request upon unsubscription would be\n // pointless.\n cancel = undefined;\n callback(timestamp);\n });\n return new Subscription(() => cancel?.(handle));\n },\n requestAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.requestAnimationFrame || requestAnimationFrame)(...args);\n },\n cancelAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.cancelAnimationFrame || cancelAnimationFrame)(...args);\n },\n delegate: undefined,\n};\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface ObjectUnsubscribedError extends Error {}\n\nexport interface ObjectUnsubscribedErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (): ObjectUnsubscribedError;\n}\n\n/**\n * An error thrown when an action is invalid because the object has been\n * unsubscribed.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n *\n * @class ObjectUnsubscribedError\n */\nexport const ObjectUnsubscribedError: ObjectUnsubscribedErrorCtor = createErrorClass(\n (_super) =>\n function ObjectUnsubscribedErrorImpl(this: any) {\n _super(this);\n this.name = 'ObjectUnsubscribedError';\n this.message = 'object unsubscribed';\n }\n);\n", "import { Operator } from './Operator';\nimport { Observable } from './Observable';\nimport { Subscriber } from './Subscriber';\nimport { Subscription, EMPTY_SUBSCRIPTION } from './Subscription';\nimport { Observer, SubscriptionLike, TeardownLogic } from './types';\nimport { ObjectUnsubscribedError } from './util/ObjectUnsubscribedError';\nimport { arrRemove } from './util/arrRemove';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A Subject is a special type of Observable that allows values to be\n * multicasted to many Observers. 
Subjects are like EventEmitters.\n *\n * Every Subject is an Observable and an Observer. You can subscribe to a\n * Subject, and you can call next to feed values as well as error and complete.\n */\nexport class Subject extends Observable implements SubscriptionLike {\n closed = false;\n\n private currentObservers: Observer[] | null = null;\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n observers: Observer[] = [];\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n isStopped = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n hasError = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n thrownError: any = null;\n\n /**\n * Creates a \"subject\" by basically gluing an observer to an observable.\n *\n * @nocollapse\n * @deprecated Recommended you do not use. Will be removed at some point in the future. Plans for replacement still under discussion.\n */\n static create: (...args: any[]) => any = (destination: Observer, source: Observable): AnonymousSubject => {\n return new AnonymousSubject(destination, source);\n };\n\n constructor() {\n // NOTE: This must be here to obscure Observable's constructor.\n super();\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. 
*/\n lift(operator: Operator): Observable {\n const subject = new AnonymousSubject(this, this);\n subject.operator = operator as any;\n return subject as any;\n }\n\n /** @internal */\n protected _throwIfClosed() {\n if (this.closed) {\n throw new ObjectUnsubscribedError();\n }\n }\n\n next(value: T) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n if (!this.currentObservers) {\n this.currentObservers = Array.from(this.observers);\n }\n for (const observer of this.currentObservers) {\n observer.next(value);\n }\n }\n });\n }\n\n error(err: any) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.hasError = this.isStopped = true;\n this.thrownError = err;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.error(err);\n }\n }\n });\n }\n\n complete() {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.isStopped = true;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.complete();\n }\n }\n });\n }\n\n unsubscribe() {\n this.isStopped = this.closed = true;\n this.observers = this.currentObservers = null!;\n }\n\n get observed() {\n return this.observers?.length > 0;\n }\n\n /** @internal */\n protected _trySubscribe(subscriber: Subscriber): TeardownLogic {\n this._throwIfClosed();\n return super._trySubscribe(subscriber);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._checkFinalizedStatuses(subscriber);\n return this._innerSubscribe(subscriber);\n }\n\n /** @internal */\n protected _innerSubscribe(subscriber: Subscriber) {\n const { hasError, isStopped, observers } = this;\n if (hasError || isStopped) {\n return EMPTY_SUBSCRIPTION;\n }\n this.currentObservers = null;\n observers.push(subscriber);\n return new Subscription(() => {\n this.currentObservers = null;\n arrRemove(observers, subscriber);\n });\n }\n\n /** @internal */\n protected 
_checkFinalizedStatuses(subscriber: Subscriber) {\n const { hasError, thrownError, isStopped } = this;\n if (hasError) {\n subscriber.error(thrownError);\n } else if (isStopped) {\n subscriber.complete();\n }\n }\n\n /**\n * Creates a new Observable with this Subject as the source. You can do this\n * to create custom Observer-side logic of the Subject and conceal it from\n * code that uses the Observable.\n * @return {Observable} Observable that the Subject casts to\n */\n asObservable(): Observable {\n const observable: any = new Observable();\n observable.source = this;\n return observable;\n }\n}\n\n/**\n * @class AnonymousSubject\n */\nexport class AnonymousSubject extends Subject {\n constructor(\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n public destination?: Observer,\n source?: Observable\n ) {\n super();\n this.source = source;\n }\n\n next(value: T) {\n this.destination?.next?.(value);\n }\n\n error(err: any) {\n this.destination?.error?.(err);\n }\n\n complete() {\n this.destination?.complete?.();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n return this.source?.subscribe(subscriber) ?? 
EMPTY_SUBSCRIPTION;\n }\n}\n", "import { Subject } from './Subject';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\n\n/**\n * A variant of Subject that requires an initial value and emits its current\n * value whenever it is subscribed to.\n *\n * @class BehaviorSubject\n */\nexport class BehaviorSubject extends Subject {\n constructor(private _value: T) {\n super();\n }\n\n get value(): T {\n return this.getValue();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n const subscription = super._subscribe(subscriber);\n !subscription.closed && subscriber.next(this._value);\n return subscription;\n }\n\n getValue(): T {\n const { hasError, thrownError, _value } = this;\n if (hasError) {\n throw thrownError;\n }\n this._throwIfClosed();\n return _value;\n }\n\n next(value: T): void {\n super.next((this._value = value));\n }\n}\n", "import { TimestampProvider } from '../types';\n\ninterface DateTimestampProvider extends TimestampProvider {\n delegate: TimestampProvider | undefined;\n}\n\nexport const dateTimestampProvider: DateTimestampProvider = {\n now() {\n // Use the variable rather than `this` so that the function can be called\n // without being bound to the provider.\n return (dateTimestampProvider.delegate || Date).now();\n },\n delegate: undefined,\n};\n", "import { Subject } from './Subject';\nimport { TimestampProvider } from './types';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * A variant of {@link Subject} that \"replays\" old values to new subscribers by emitting them when they first subscribe.\n *\n * `ReplaySubject` has an internal buffer that will store a specified number of values that it has observed. Like `Subject`,\n * `ReplaySubject` \"observes\" values by having them passed to its `next` method. 
When it observes a value, it will store that\n * value for a time determined by the configuration of the `ReplaySubject`, as passed to its constructor.\n *\n * When a new subscriber subscribes to the `ReplaySubject` instance, it will synchronously emit all values in its buffer in\n * a First-In-First-Out (FIFO) manner. The `ReplaySubject` will also complete, if it has observed completion; and it will\n * error if it has observed an error.\n *\n * There are two main configuration items to be concerned with:\n *\n * 1. `bufferSize` - This will determine how many items are stored in the buffer, defaults to infinite.\n * 2. `windowTime` - The amount of time to hold a value in the buffer before removing it from the buffer.\n *\n * Both configurations may exist simultaneously. So if you would like to buffer a maximum of 3 values, as long as the values\n * are less than 2 seconds old, you could do so with a `new ReplaySubject(3, 2000)`.\n *\n * ### Differences with BehaviorSubject\n *\n * `BehaviorSubject` is similar to `new ReplaySubject(1)`, with a couple of exceptions:\n *\n * 1. `BehaviorSubject` comes \"primed\" with a single value upon construction.\n * 2. `ReplaySubject` will replay values, even after observing an error, where `BehaviorSubject` will not.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n * @see {@link shareReplay}\n */\nexport class ReplaySubject extends Subject {\n private _buffer: (T | number)[] = [];\n private _infiniteTimeWindow = true;\n\n /**\n * @param bufferSize The size of the buffer to replay on subscription\n * @param windowTime The amount of time the buffered items will stay buffered\n * @param timestampProvider An object with a `now()` method that provides the current timestamp. 
This is used to\n * calculate the amount of time something has been buffered.\n */\n constructor(\n private _bufferSize = Infinity,\n private _windowTime = Infinity,\n private _timestampProvider: TimestampProvider = dateTimestampProvider\n ) {\n super();\n this._infiniteTimeWindow = _windowTime === Infinity;\n this._bufferSize = Math.max(1, _bufferSize);\n this._windowTime = Math.max(1, _windowTime);\n }\n\n next(value: T): void {\n const { isStopped, _buffer, _infiniteTimeWindow, _timestampProvider, _windowTime } = this;\n if (!isStopped) {\n _buffer.push(value);\n !_infiniteTimeWindow && _buffer.push(_timestampProvider.now() + _windowTime);\n }\n this._trimBuffer();\n super.next(value);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._trimBuffer();\n\n const subscription = this._innerSubscribe(subscriber);\n\n const { _infiniteTimeWindow, _buffer } = this;\n // We use a copy here, so reentrant code does not mutate our array while we're\n // emitting it to a new subscriber.\n const copy = _buffer.slice();\n for (let i = 0; i < copy.length && !subscriber.closed; i += _infiniteTimeWindow ? 1 : 2) {\n subscriber.next(copy[i] as T);\n }\n\n this._checkFinalizedStatuses(subscriber);\n\n return subscription;\n }\n\n private _trimBuffer() {\n const { _bufferSize, _timestampProvider, _buffer, _infiniteTimeWindow } = this;\n // If we don't have an infinite buffer size, and we're over the length,\n // use splice to truncate the old buffer values off. Note that we have to\n // double the size for instances where we're not using an infinite time window\n // because we're storing the values and the timestamps in the same array.\n const adjustedBufferSize = (_infiniteTimeWindow ? 
1 : 2) * _bufferSize;\n _bufferSize < Infinity && adjustedBufferSize < _buffer.length && _buffer.splice(0, _buffer.length - adjustedBufferSize);\n\n // Now, if we're not in an infinite time window, remove all values where the time is\n // older than what is allowed.\n if (!_infiniteTimeWindow) {\n const now = _timestampProvider.now();\n let last = 0;\n // Search the array for the first timestamp that isn't expired and\n // truncate the buffer up to that point.\n for (let i = 1; i < _buffer.length && (_buffer[i] as number) <= now; i += 2) {\n last = i;\n }\n last && _buffer.splice(0, last + 1);\n }\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Subscription } from '../Subscription';\nimport { SchedulerAction } from '../types';\n\n/**\n * A unit of work to be executed in a `scheduler`. An action is typically\n * created from within a {@link SchedulerLike} and an RxJS user does not need to concern\n * themselves about creating and manipulating an Action.\n *\n * ```ts\n * class Action extends Subscription {\n * new (scheduler: Scheduler, work: (state?: T) => void);\n * schedule(state?: T, delay: number = 0): Subscription;\n * }\n * ```\n *\n * @class Action\n */\nexport class Action extends Subscription {\n constructor(scheduler: Scheduler, work: (this: SchedulerAction, state?: T) => void) {\n super();\n }\n /**\n * Schedules this action on its parent {@link SchedulerLike} for execution. May be passed\n * some context object, `state`. 
May happen at some point in the future,\n * according to the `delay` parameter, if specified.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler.\n * @return {void}\n */\n public schedule(state?: T, delay: number = 0): Subscription {\n return this;\n }\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetIntervalFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearIntervalFunction = (handle: TimerHandle) => void;\n\ninterface IntervalProvider {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n delegate:\n | {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n }\n | undefined;\n}\n\nexport const intervalProvider: IntervalProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setInterval(handler: () => void, timeout?: number, ...args) {\n const { delegate } = intervalProvider;\n if (delegate?.setInterval) {\n return delegate.setInterval(handler, timeout, ...args);\n }\n return setInterval(handler, timeout, ...args);\n },\n clearInterval(handle) {\n const { delegate } = intervalProvider;\n return (delegate?.clearInterval || clearInterval)(handle as any);\n },\n delegate: undefined,\n};\n", "import { Action } from './Action';\nimport { SchedulerAction } from '../types';\nimport { Subscription } from '../Subscription';\nimport { AsyncScheduler } from './AsyncScheduler';\nimport { intervalProvider } from './intervalProvider';\nimport { arrRemove } from '../util/arrRemove';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncAction extends Action {\n public id: TimerHandle | undefined;\n public state?: T;\n // @ts-ignore: Property has no initializer and is 
not definitely assigned\n public delay: number;\n protected pending: boolean = false;\n\n constructor(protected scheduler: AsyncScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (this.closed) {\n return this;\n }\n\n // Always replace the current state with the new state.\n this.state = state;\n\n const id = this.id;\n const scheduler = this.scheduler;\n\n //\n // Important implementation note:\n //\n // Actions only execute once by default, unless rescheduled from within the\n // scheduled callback. This allows us to implement single and repeat\n // actions via the same code path, without adding API surface area, as well\n // as mimic traditional recursion but across asynchronous boundaries.\n //\n // However, JS runtimes and timers distinguish between intervals achieved by\n // serial `setTimeout` calls vs. a single `setInterval` call. An interval of\n // serial `setTimeout` calls can be individually delayed, which delays\n // scheduling the next `setTimeout`, and so on. `setInterval` attempts to\n // guarantee the interval callback will be invoked more precisely to the\n // interval period, regardless of load.\n //\n // Therefore, we use `setInterval` to schedule single and repeat actions.\n // If the action reschedules itself with the same delay, the interval is not\n // canceled. If the action doesn't reschedule, or reschedules with a\n // different delay, the interval will be canceled after scheduled callback\n // execution.\n //\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, delay);\n }\n\n // Set the pending flag indicating that this action has been scheduled, or\n // has recursively rescheduled itself.\n this.pending = true;\n\n this.delay = delay;\n // If this action has already an async Id, don't request a new one.\n this.id = this.id ?? 
this.requestAsyncId(scheduler, this.id, delay);\n\n return this;\n }\n\n protected requestAsyncId(scheduler: AsyncScheduler, _id?: TimerHandle, delay: number = 0): TimerHandle {\n return intervalProvider.setInterval(scheduler.flush.bind(scheduler, this), delay);\n }\n\n protected recycleAsyncId(_scheduler: AsyncScheduler, id?: TimerHandle, delay: number | null = 0): TimerHandle | undefined {\n // If this action is rescheduled with the same delay time, don't clear the interval id.\n if (delay != null && this.delay === delay && this.pending === false) {\n return id;\n }\n // Otherwise, if the action's delay time is different from the current delay,\n // or the action has been rescheduled before it's executed, clear the interval id\n if (id != null) {\n intervalProvider.clearInterval(id);\n }\n\n return undefined;\n }\n\n /**\n * Immediately executes this action and the `work` it contains.\n * @return {any}\n */\n public execute(state: T, delay: number): any {\n if (this.closed) {\n return new Error('executing a cancelled action');\n }\n\n this.pending = false;\n const error = this._execute(state, delay);\n if (error) {\n return error;\n } else if (this.pending === false && this.id != null) {\n // Dequeue if the action didn't reschedule itself. Don't call\n // unsubscribe(), because the action could reschedule later.\n // For example:\n // ```\n // scheduler.schedule(function doWork(counter) {\n // /* ... I'm a busy worker bee ... 
*/\n // var originalAction = this;\n // /* wait 100ms before rescheduling the action */\n // setTimeout(function () {\n // originalAction.schedule(counter + 1);\n // }, 100);\n // }, 1000);\n // ```\n this.id = this.recycleAsyncId(this.scheduler, this.id, null);\n }\n }\n\n protected _execute(state: T, _delay: number): any {\n let errored: boolean = false;\n let errorValue: any;\n try {\n this.work(state);\n } catch (e) {\n errored = true;\n // HACK: Since code elsewhere is relying on the \"truthiness\" of the\n // return here, we can't have it return \"\" or 0 or false.\n // TODO: Clean this up when we refactor schedulers mid-version-8 or so.\n errorValue = e ? e : new Error('Scheduled action threw falsy error');\n }\n if (errored) {\n this.unsubscribe();\n return errorValue;\n }\n }\n\n unsubscribe() {\n if (!this.closed) {\n const { id, scheduler } = this;\n const { actions } = scheduler;\n\n this.work = this.state = this.scheduler = null!;\n this.pending = false;\n\n arrRemove(actions, this);\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, null);\n }\n\n this.delay = null!;\n super.unsubscribe();\n }\n }\n}\n", "import { Action } from './scheduler/Action';\nimport { Subscription } from './Subscription';\nimport { SchedulerLike, SchedulerAction } from './types';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * An execution context and a data structure to order tasks and schedule their\n * execution. Provides a notion of (potentially virtual) time, through the\n * `now()` getter method.\n *\n * Each unit of work in a Scheduler is called an `Action`.\n *\n * ```ts\n * class Scheduler {\n * now(): number;\n * schedule(work, delay?, state?): Subscription;\n * }\n * ```\n *\n * @class Scheduler\n * @deprecated Scheduler is an internal implementation detail of RxJS, and\n * should not be used directly. Rather, create your own class and implement\n * {@link SchedulerLike}. 
Will be made internal in v8.\n */\nexport class Scheduler implements SchedulerLike {\n public static now: () => number = dateTimestampProvider.now;\n\n constructor(private schedulerActionCtor: typeof Action, now: () => number = Scheduler.now) {\n this.now = now;\n }\n\n /**\n * A getter method that returns a number representing the current time\n * (at the time this function was called) according to the scheduler's own\n * internal clock.\n * @return {number} A number that represents the current time. May or may not\n * have a relation to wall-clock time. May or may not refer to a time unit\n * (e.g. milliseconds).\n */\n public now: () => number;\n\n /**\n * Schedules a function, `work`, for execution. May happen at some point in\n * the future, according to the `delay` parameter, if specified. May be passed\n * some context object, `state`, which will be passed to the `work` function.\n *\n * The given arguments will be processed an stored as an Action object in a\n * queue of actions.\n *\n * @param {function(state: ?T): ?Subscription} work A function representing a\n * task, or some unit of work to be executed by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler itself.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @return {Subscription} A subscription in order to be able to unsubscribe\n * the scheduled work.\n */\n public schedule(work: (this: SchedulerAction, state?: T) => void, delay: number = 0, state?: T): Subscription {\n return new this.schedulerActionCtor(this, work).schedule(state, delay);\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Action } from './Action';\nimport { AsyncAction } from './AsyncAction';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncScheduler extends Scheduler {\n public actions: Array> = [];\n /**\n * A flag to indicate whether the 
Scheduler is currently executing a batch of\n * queued actions.\n * @type {boolean}\n * @internal\n */\n public _active: boolean = false;\n /**\n * An internal ID used to track the latest asynchronous task such as those\n * coming from `setTimeout`, `setInterval`, `requestAnimationFrame`, and\n * others.\n * @type {any}\n * @internal\n */\n public _scheduled: TimerHandle | undefined;\n\n constructor(SchedulerAction: typeof Action, now: () => number = Scheduler.now) {\n super(SchedulerAction, now);\n }\n\n public flush(action: AsyncAction): void {\n const { actions } = this;\n\n if (this._active) {\n actions.push(action);\n return;\n }\n\n let error: any;\n this._active = true;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions.shift()!)); // exhaust the scheduler queue\n\n this._active = false;\n\n if (error) {\n while ((action = actions.shift()!)) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\n/**\n *\n * Async Scheduler\n *\n * Schedule task as if you used setTimeout(task, duration)\n *\n * `async` scheduler schedules tasks asynchronously, by putting them on the JavaScript\n * event loop queue. 
It is best used to delay tasks in time or to schedule tasks repeating\n * in intervals.\n *\n * If you just want to \"defer\" task, that is to perform it right after currently\n * executing synchronous code ends (commonly achieved by `setTimeout(deferredTask, 0)`),\n * better choice will be the {@link asapScheduler} scheduler.\n *\n * ## Examples\n * Use async scheduler to delay task\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * const task = () => console.log('it works!');\n *\n * asyncScheduler.schedule(task, 2000);\n *\n * // After 2 seconds logs:\n * // \"it works!\"\n * ```\n *\n * Use async scheduler to repeat task in intervals\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * function task(state) {\n * console.log(state);\n * this.schedule(state + 1, 1000); // `this` references currently executing Action,\n * // which we reschedule with new state and delay\n * }\n *\n * asyncScheduler.schedule(task, 3000, 0);\n *\n * // Logs:\n * // 0 after 3s\n * // 1 after 4s\n * // 2 after 5s\n * // 3 after 6s\n * ```\n */\n\nexport const asyncScheduler = new AsyncScheduler(AsyncAction);\n\n/**\n * @deprecated Renamed to {@link asyncScheduler}. Will be removed in v8.\n */\nexport const async = asyncScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { Subscription } from '../Subscription';\nimport { QueueScheduler } from './QueueScheduler';\nimport { SchedulerAction } from '../types';\nimport { TimerHandle } from './timerHandle';\n\nexport class QueueAction extends AsyncAction {\n constructor(protected scheduler: QueueScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (delay > 0) {\n return super.schedule(state, delay);\n }\n this.delay = delay;\n this.state = state;\n this.scheduler.flush(this);\n return this;\n }\n\n public execute(state: T, delay: number): any {\n return delay > 0 || this.closed ? 
super.execute(state, delay) : this._execute(state, delay);\n }\n\n protected requestAsyncId(scheduler: QueueScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n\n if ((delay != null && delay > 0) || (delay == null && this.delay > 0)) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n\n // Otherwise flush the scheduler starting with this action.\n scheduler.flush(this);\n\n // HACK: In the past, this was returning `void`. However, `void` isn't a valid\n // `TimerHandle`, and generally the return value here isn't really used. So the\n // compromise is to return `0` which is both \"falsy\" and a valid `TimerHandle`,\n // as opposed to refactoring every other instanceo of `requestAsyncId`.\n return 0;\n }\n}\n", "import { AsyncScheduler } from './AsyncScheduler';\n\nexport class QueueScheduler extends AsyncScheduler {\n}\n", "import { QueueAction } from './QueueAction';\nimport { QueueScheduler } from './QueueScheduler';\n\n/**\n *\n * Queue Scheduler\n *\n * Put every next task on a queue, instead of executing it immediately\n *\n * `queue` scheduler, when used with delay, behaves the same as {@link asyncScheduler} scheduler.\n *\n * When used without delay, it schedules given task synchronously - executes it right when\n * it is scheduled. 
However when called recursively, that is when inside the scheduled task,\n * another task is scheduled with queue scheduler, instead of executing immediately as well,\n * that task will be put on a queue and wait for current one to finish.\n *\n * This means that when you execute task with `queue` scheduler, you are sure it will end\n * before any other task scheduled with that scheduler will start.\n *\n * ## Examples\n * Schedule recursively first, then do something\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(() => {\n * queueScheduler.schedule(() => console.log('second')); // will not happen now, but will be put on a queue\n *\n * console.log('first');\n * });\n *\n * // Logs:\n * // \"first\"\n * // \"second\"\n * ```\n *\n * Reschedule itself recursively\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(function(state) {\n * if (state !== 0) {\n * console.log('before', state);\n * this.schedule(state - 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * console.log('after', state);\n * }\n * }, 0, 3);\n *\n * // In scheduler that runs recursively, you would expect:\n * // \"before\", 3\n * // \"before\", 2\n * // \"before\", 1\n * // \"after\", 1\n * // \"after\", 2\n * // \"after\", 3\n *\n * // But with queue it logs:\n * // \"before\", 3\n * // \"after\", 3\n * // \"before\", 2\n * // \"after\", 2\n * // \"before\", 1\n * // \"after\", 1\n * ```\n */\n\nexport const queueScheduler = new QueueScheduler(QueueAction);\n\n/**\n * @deprecated Renamed to {@link queueScheduler}. 
Will be removed in v8.\n */\nexport const queue = queueScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\nimport { SchedulerAction } from '../types';\nimport { animationFrameProvider } from './animationFrameProvider';\nimport { TimerHandle } from './timerHandle';\n\nexport class AnimationFrameAction extends AsyncAction {\n constructor(protected scheduler: AnimationFrameScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n protected requestAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay is greater than 0, request as an async action.\n if (delay !== null && delay > 0) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n // Push the action to the end of the scheduler queue.\n scheduler.actions.push(this);\n // If an animation frame has already been requested, don't request another\n // one. If an animation frame hasn't been requested yet, request one. Return\n // the current animation frame request id.\n return scheduler._scheduled || (scheduler._scheduled = animationFrameProvider.requestAnimationFrame(() => scheduler.flush(undefined)));\n }\n\n protected recycleAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle | undefined {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n if (delay != null ? 
delay > 0 : this.delay > 0) {\n return super.recycleAsyncId(scheduler, id, delay);\n }\n // If the scheduler queue has no remaining actions with the same async id,\n // cancel the requested animation frame and set the scheduled flag to\n // undefined so the next AnimationFrameAction will request its own.\n const { actions } = scheduler;\n if (id != null && actions[actions.length - 1]?.id !== id) {\n animationFrameProvider.cancelAnimationFrame(id as number);\n scheduler._scheduled = undefined;\n }\n // Return undefined so the action knows to request a new async id if it's rescheduled.\n return undefined;\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\nexport class AnimationFrameScheduler extends AsyncScheduler {\n public flush(action?: AsyncAction): void {\n this._active = true;\n // The async id that effects a call to flush is stored in _scheduled.\n // Before executing an action, it's necessary to check the action's async\n // id to determine whether it's supposed to be executed in the current\n // flush.\n // Previous implementations of this method used a count to determine this,\n // but that was unsound, as actions that are unsubscribed - i.e. 
cancelled -\n // are removed from the actions array and that can shift actions that are\n // scheduled to be executed in a subsequent flush into positions at which\n // they are executed within the current flush.\n const flushId = this._scheduled;\n this._scheduled = undefined;\n\n const { actions } = this;\n let error: any;\n action = action || actions.shift()!;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions[0]) && action.id === flushId && actions.shift());\n\n this._active = false;\n\n if (error) {\n while ((action = actions[0]) && action.id === flushId && actions.shift()) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AnimationFrameAction } from './AnimationFrameAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\n\n/**\n *\n * Animation Frame Scheduler\n *\n * Perform task when `window.requestAnimationFrame` would fire\n *\n * When `animationFrame` scheduler is used with delay, it will fall back to {@link asyncScheduler} scheduler\n * behaviour.\n *\n * Without delay, `animationFrame` scheduler can be used to create smooth browser animations.\n * It makes sure scheduled task will happen just before next browser content repaint,\n * thus performing animations as efficiently as possible.\n *\n * ## Example\n * Schedule div height animation\n * ```ts\n * // html:
\n * import { animationFrameScheduler } from 'rxjs';\n *\n * const div = document.querySelector('div');\n *\n * animationFrameScheduler.schedule(function(height) {\n * div.style.height = height + \"px\";\n *\n * this.schedule(height + 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * }, 0, 0);\n *\n * // You will see a div element growing in height\n * ```\n */\n\nexport const animationFrameScheduler = new AnimationFrameScheduler(AnimationFrameAction);\n\n/**\n * @deprecated Renamed to {@link animationFrameScheduler}. Will be removed in v8.\n */\nexport const animationFrame = animationFrameScheduler;\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\n/**\n * A simple Observable that emits no items to the Observer and immediately\n * emits a complete notification.\n *\n * Just emits 'complete', and nothing else.\n *\n * ![](empty.png)\n *\n * A simple Observable that only emits the complete notification. It can be used\n * for composing with other Observables, such as in a {@link mergeMap}.\n *\n * ## Examples\n *\n * Log complete notification\n *\n * ```ts\n * import { EMPTY } from 'rxjs';\n *\n * EMPTY.subscribe({\n * next: () => console.log('Next'),\n * complete: () => console.log('Complete!')\n * });\n *\n * // Outputs\n * // Complete!\n * ```\n *\n * Emit the number 7, then complete\n *\n * ```ts\n * import { EMPTY, startWith } from 'rxjs';\n *\n * const result = EMPTY.pipe(startWith(7));\n * result.subscribe(x => console.log(x));\n *\n * // Outputs\n * // 7\n * ```\n *\n * Map and flatten only odd numbers to the sequence `'a'`, `'b'`, `'c'`\n *\n * ```ts\n * import { interval, mergeMap, of, EMPTY } from 'rxjs';\n *\n * const interval$ = interval(1000);\n * const result = interval$.pipe(\n * mergeMap(x => x % 2 === 1 ? 
of('a', 'b', 'c') : EMPTY),\n * );\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following to the console:\n * // x is equal to the count on the interval, e.g. (0, 1, 2, 3, ...)\n * // x will occur every 1000ms\n * // if x % 2 is equal to 1, print a, b, c (each on its own)\n * // if x % 2 is not equal to 1, nothing will be output\n * ```\n *\n * @see {@link Observable}\n * @see {@link NEVER}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const EMPTY = new Observable((subscriber) => subscriber.complete());\n\n/**\n * @param scheduler A {@link SchedulerLike} to use for scheduling\n * the emission of the complete notification.\n * @deprecated Replaced with the {@link EMPTY} constant or {@link scheduled} (e.g. `scheduled([], scheduler)`). Will be removed in v8.\n */\nexport function empty(scheduler?: SchedulerLike) {\n return scheduler ? emptyScheduled(scheduler) : EMPTY;\n}\n\nfunction emptyScheduled(scheduler: SchedulerLike) {\n return new Observable((subscriber) => scheduler.schedule(() => subscriber.complete()));\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport function isScheduler(value: any): value is SchedulerLike {\n return value && isFunction(value.schedule);\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\nimport { isScheduler } from './isScheduler';\n\nfunction last(arr: T[]): T | undefined {\n return arr[arr.length - 1];\n}\n\nexport function popResultSelector(args: any[]): ((...args: unknown[]) => unknown) | undefined {\n return isFunction(last(args)) ? args.pop() : undefined;\n}\n\nexport function popScheduler(args: any[]): SchedulerLike | undefined {\n return isScheduler(last(args)) ? args.pop() : undefined;\n}\n\nexport function popNumber(args: any[], defaultValue: number): number {\n return typeof last(args) === 'number' ? args.pop()! 
: defaultValue;\n}\n", "export const isArrayLike = ((x: any): x is ArrayLike => x && typeof x.length === 'number' && typeof x !== 'function');", "import { isFunction } from \"./isFunction\";\n\n/**\n * Tests to see if the object is \"thennable\".\n * @param value the object to test\n */\nexport function isPromise(value: any): value is PromiseLike {\n return isFunction(value?.then);\n}\n", "import { InteropObservable } from '../types';\nimport { observable as Symbol_observable } from '../symbol/observable';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being Observable (but not necessary an Rx Observable) */\nexport function isInteropObservable(input: any): input is InteropObservable {\n return isFunction(input[Symbol_observable]);\n}\n", "import { isFunction } from './isFunction';\n\nexport function isAsyncIterable(obj: any): obj is AsyncIterable {\n return Symbol.asyncIterator && isFunction(obj?.[Symbol.asyncIterator]);\n}\n", "/**\n * Creates the TypeError to throw if an invalid object is passed to `from` or `scheduled`.\n * @param input The object that was passed.\n */\nexport function createInvalidObservableTypeError(input: any) {\n // TODO: We should create error codes that can be looked up, so this can be less verbose.\n return new TypeError(\n `You provided ${\n input !== null && typeof input === 'object' ? 'an invalid object' : `'${input}'`\n } where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.`\n );\n}\n", "export function getSymbolIterator(): symbol {\n if (typeof Symbol !== 'function' || !Symbol.iterator) {\n return '@@iterator' as any;\n }\n\n return Symbol.iterator;\n}\n\nexport const iterator = getSymbolIterator();\n", "import { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being an Iterable */\nexport function isIterable(input: any): input is Iterable {\n return isFunction(input?.[Symbol_iterator]);\n}\n", "import { ReadableStreamLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport async function* readableStreamLikeToAsyncGenerator(readableStream: ReadableStreamLike): AsyncGenerator {\n const reader = readableStream.getReader();\n try {\n while (true) {\n const { value, done } = await reader.read();\n if (done) {\n return;\n }\n yield value!;\n }\n } finally {\n reader.releaseLock();\n }\n}\n\nexport function isReadableStreamLike(obj: any): obj is ReadableStreamLike {\n // We don't want to use instanceof checks because they would return\n // false for instances from another Realm, like an