Commit: Clean up readme

shreyashankar committed Sep 3, 2024
1 parent af3a1c7 commit b39e34c
Showing 40 changed files with 712 additions and 507 deletions.
8 changes: 7 additions & 1 deletion .gitignore
@@ -1,4 +1,10 @@
.env
*__pycache__*
*scratch*
*relevance_assessment*
palimpzest/*
paper_workloads/contracts/full_contract_txt*
paper_workloads/contracts/sample_contract_txt*
*.xlsx
*.csv
paper_workloads/*
146 changes: 103 additions & 43 deletions README.md
@@ -30,8 +30,23 @@ To install Motion, clone this repository and install the required dependencies:

```bash
git clone https://github.com/shreyashankar/motion-v3.git
cd motion-v3
pip install poetry
make install
```

Then set up a .env file in your repository with the following:

```bash
OPENAI_API_KEY=your_openai_api_key
```

Alternatively, you can set the `OPENAI_API_KEY` environment variable directly in your environment.
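
For example, in a POSIX-style shell:

```bash
export OPENAI_API_KEY=your_openai_api_key
```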

Then run the basic test suite to ensure everything is working:

```bash
make tests-basic
```

## Usage
@@ -57,6 +72,12 @@ The configuration file is a YAML document with the following top-level keys:

Motion supports various operation types, each designed for specific data transformation tasks. All prompt templates used in these operations are Jinja2 templates, allowing for the use of loops, conditionals, and other Jinja2 features to create dynamic prompts based on input data.
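
For instance, a prompt template can combine a conditional with a loop over an input field (a minimal sketch; the `title` and `reviews` keys here are illustrative, not part of Motion's API):

```yaml
prompt: |
  Summarize the following reviews{% if input.title %} for "{{ input.title }}"{% endif %}:
  {% for review in input.reviews %}
  - {{ review }}
  {% endfor %}
```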

All operations have the following optional parameters:

- `optimize`: Boolean flag. If true, the operation will be optimized. Default is true.
- `recursively_optimize`: Boolean flag. If true, the operation will be recursively optimized (e.g., reduce operations synthesized while optimizing a map operation will themselves be optimized). Default is false. I recommend not setting this to true unless you are willing to babysit the optimizer.
- `sample_size`: Integer. The number of samples to use for the operation, if you want to run it on only a sample of data. (Only applicable at runtime, not at optimization time.)
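
For example, any operation definition can carry these flags (a minimal sketch; the operation name, prompt, and output key are illustrative):

```yaml
extract_keywords:
  type: map
  optimize: false # skip optimization for this operation
  sample_size: 100 # run on a random sample of 100 items at runtime
  prompt: |
    List the main keywords in: {{ input.text }}
  output:
    schema:
      keywords: list[str]
```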

Here's an overview of the supported operation types:

### Map
@@ -66,7 +87,7 @@ The Map operation applies a transformation to each item in the input data.
Required parameters:

- `type`: Must be set to `"map"`.
- `prompt`: The prompt template to use for the transformation. Access variables with `input.keyname`.
- `output`: Schema definition for the output from the LLM.
- `model` (optional): The language model to use, falls back to `default_model` if not specified.
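
Putting these parameters together, a minimal map operation might look like the following sketch (the operation name and output key are illustrative):

```yaml
sentiment_analysis:
  type: map
  prompt: |
    Analyze the sentiment of the following text: {{ input.text }}
    Classify it as positive, negative, or neutral.
  output:
    schema:
      sentiment: string
  model: gpt-4o-mini
```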

@@ -397,16 +418,18 @@ Required parameters:

- `type`: Must be set to `"reduce"`.
- `reduce_key`: The key to use for grouping data. This can be a single key (string) or a list of keys.
- `prompt`: The prompt template to use for the reduction operation. This template can access the grouped values using `{{ inputs }}` (a list of dictionary objects or records) and the reduce key using `{{ reduce_key }}`.
- `output`: Schema definition for the output from the LLM.

Optional parameters:

- `synthesize_resolve`: Boolean flag. If false, we will not synthesize a resolve operation in between a map and a reduce operation. Default is true.
- `synthesize_merge`: Boolean flag. If false, we will not synthesize a merge optimization (we will only rely on folding). Default is true.
- `model`: The language model to use, falls back to `default_model` if not specified.
- `input`: Specifies the schema or keys to subselect from each item or value to pass into the prompt. If omitted, all keys from the input items will be used.
- `pass_through`: Boolean flag. If true, keys from the first item in the group that are not specified in `input` will be passed through to the output. Default is false.
- `commutative`: Boolean flag. If true, the reduce operation is commutative, meaning the order of operations doesn't matter. This can enable further optimizations. Default is true.
- `fold_prompt`: A prompt template for incremental folding. This enables processing of large groups in smaller batches. The template should access the current reduced values using `{{ output.field_name }}` and the new batch of values using `{{ inputs }}`.
- `fold_batch_size`: The number of items to process in each fold operation when using incremental folding.
- `merge_prompt`: A prompt template for merging the results of multiple fold operations. This is used when processing large groups in parallel. The template should access the list of intermediate results using `{{ outputs }}`.
- `merge_batch_size`: The number of intermediate results to merge in each merge operation. The optimizer uses a default of 2 if it can find a good merge prompt.
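
For example, a merge configuration might look like the following sketch (assuming the reduce output schema has a `summary` field; the field name is illustrative):

```yaml
merge_prompt: |
  Combine the following intermediate summaries into a single summary:
  {% for output in outputs %}
  Summary {{ loop.index }}: {{ output.summary }}
  {% endfor %}
merge_batch_size: 2
```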
@@ -430,7 +453,7 @@ reduce_operation:
  reduce_key: category
  prompt: |
    Analyze the following items in the category '{{ reduce_key.category }}':
    {% for item in inputs %}
    - {{ item.name }}: ${{ item.price }}
    {% endfor %}
@@ -465,7 +488,7 @@ reduce_operation:
      age: integer
  prompt: |
    Analyze the following group of values for the group '{{ reduce_key }}':
    {% for value in inputs %}
    - {{ value }}
    {% endfor %}
@@ -487,7 +510,7 @@ reduce_operation:
  reduce_key: group
  prompt: |
    Analyze the following group of values for the group '{{ reduce_key }}':
    {% for value in inputs %}
    - {{ value }}
    {% endfor %}
@@ -500,7 +523,7 @@ reduce_operation:
    Average: {{ output.avg }}
    New values to be folded in:
    {% for value in inputs %}
    - {{ value }}
    {% endfor %}
@@ -531,7 +554,7 @@ Required parameters:

- `type`: Must be set to `"resolve"`.
- `comparison_prompt`: The prompt template to use for comparing potential matches.
- `resolution_prompt`: The prompt template to use for reducing matched entries. The matched entries are accessed via the `inputs` variable.
- `output`: Schema definition for the output from the LLM. This should include the resolved key.

Optional parameters:
@@ -564,7 +587,7 @@ resolve_operation:
  resolution_prompt: |
    Merge the following patient records into a single, consolidated record:
    {% for entry in inputs %}
    Patient Record {{ loop.index }}:
    {{ entry | tojson }}
@@ -713,55 +736,92 @@ If any of these validation rules fail, the output will be discarded and not pass

## Example Pipeline

Here's an example of a pipeline that extracts themes from student survey responses, unnests the themes, and then summarizes the responses for each theme:

```yaml
default_model: gpt-4o-mini
datasets:
  student_submissions:
    type: file
    path: "data/student_survey_responses.json" # Assuming all items have a "survey_response" attribute
operations:
  extract_themes:
    type: map
    prompt: |
      I'm teaching a class on databases. Analyze the following student survey response:
      {{ input.survey_response }}
      Extract 2-3 main themes from this response. Return the themes as a list of strings.
    output:
      schema:
        themes: list[str]
        class_id: str
    validate:
      - len(output["themes"]) >= 2
    num_retries_on_validate_failure: 3
  unnest_themes:
    type: unnest
    unnest_key: themes
  resolve_themes:
    type: resolve
    embedding_model: text-embedding-3-small
    blocking_threshold: 0.7
    limit_comparisons: 1000 # You can change this or remove it entirely
    comparison_prompt: |
      Compare the following two themes extracted from student survey responses about a database class:
      Theme 1: {{ left.theme }}
      Theme 2: {{ right.theme }}
      Are these themes essentially the same or very closely related?
    resolution_prompt: |
      You are merging similar themes from student survey responses about a database class. Here are the themes to merge:
      {% for theme in inputs %}
      Theme {{ loop.index }}: {{ theme.theme }}
      {% endfor %}
      Create a single, concise theme that captures the essence of all these themes.
    output: # Merge prompt output. No need to define schema for comparison prompt output
      schema:
        theme: str
    model: gpt-4o-mini
  summarize_themes:
    type: reduce
    reduce_key: theme
    prompt: |
      I am teaching a class on databases. You are helping me analyze student survey responses. Summarize the responses for the theme: {{ inputs[0].theme }}
      Responses:
      {% for item in inputs %}
      Survey {{ loop.index }}:
      - {{ item.survey_response }}
      {% endfor %}
      Summarize the main points from the surveys expressed about this theme. Do not mention any names of students or any other identifying information.
    output:
      schema:
        summary: str
pipeline:
  steps:
    - name: extract_response_themes
      input: student_submissions
      operations:
        - extract_themes
        - unnest_themes
        - summarize_themes
  output:
    type: file
    path: "output/theme_summaries.json" # Your summaries will be saved to the summary key
```

To run this pipeline, save it as `pipeline.yaml` and execute:
@@ -770,4 +830,4 @@ To run this pipeline, save it as `pipeline.yaml` and execute:
motion pipeline.yaml
```

This will process the student submissions data, extract themes from each response, unnest the themes, summarize the responses for each theme, and save the theme summaries in `output/theme_summaries.json`.
4 changes: 2 additions & 2 deletions book_pipeline.yaml
@@ -99,7 +99,7 @@ operations:
      Are these genres likely to be the same or closely related?
    resolution_prompt: |
      Given the following matched genre entries:
      {% for entry in inputs %}
      Entry {{ loop.index }}:
      Genre: {{ entry.genre }}
      Example Book: {{ entry.title }}
@@ -179,7 +179,7 @@ operations:
    prompt: |
      Summarize the books in the {{ reduce_key }} genre:
      Books:
      {% for book in inputs %}
      - Title: "{{ book.title }}"
        Word Count: {{ book.word_count }}
        Theme: "{{ book.theme }}"
32 changes: 27 additions & 5 deletions motion/builder.py
@@ -135,6 +135,7 @@ def __init__(
        self.sample_size_map.update(self.config["optimizer_config"]["sample_sizes"])

        self.status = None
        self.step_op_to_optimized_ops = {}

        self.print_optimizer_config()

@@ -239,7 +240,10 @@ def compute_sample_size(
        upstream_ops = []
        for step_op in step_ops:
            if step_op != op_config.get("name"):
                if step_op in self.step_op_to_optimized_ops:
                    upstream_ops.extend(self.step_op_to_optimized_ops[step_op])
                else:
                    upstream_ops.append(step_op)
            else:
                break

@@ -291,7 +295,9 @@ def _insert_empty_resolve_operations(self):
if op_type == "map":
has_map = True
map_op = op
elif op_type == "reduce":
elif op_type == "reduce" and self.config["operations"][op].get(
"synthesize_resolve", True
):
has_reduce = True
reduce_op = op
elif op_type == "resolve":
@@ -428,7 +434,8 @@ def optimize(self):
if step["name"] != step_name
] + [optimized_step]

# Save the result to datasets using the step name
self.step_op_to_optimized_ops[step_name] = optimized_step["operations"]

step_hash = (
hashlib.md5(
json.dumps(
@@ -458,7 +465,9 @@
            ):
                # Run the entire step
                input_data = self._run_partial_step(
                    step,
                    step_operations,
                    float("inf"),  # TODO: FIX THIS
                )
                self.datasets[step_hash] = copy.deepcopy(input_data)
            else:
@@ -772,7 +781,20 @@ def _get_sample_data(
                data, op_config.get("reduce_key"), sample_size
            )

        # Take a random sample of 500 examples, or all examples if fewer than 500
        initial_data = random.sample(data, min(500, len(data)))

        # Calculate character counts for each example
        char_counts = [len(str(item)) for item in initial_data]
        total_counts = sum(char_counts)

        # Calculate weights proportional to character counts
        weights = [count / total_counts for count in char_counts]

        # Perform weighted random sampling
        return random.choices(
            initial_data, weights=weights, k=min(sample_size, len(initial_data))
        )

    def _get_reduce_sample(
        self, data: List[Dict[str, Any]], reduce_key: str, sample_size: int
6 changes: 1 addition & 5 deletions motion/cli.py
@@ -67,11 +67,7 @@ def clear_cache():
"""
Clear the LLM cache stored on disk.
"""
try:
cc()
typer.echo("Cache cleared successfully.")
except Exception as e:
typer.echo(f"An error occurred while clearing the cache: {str(e)}")
cc()


if __name__ == "__main__":
3 changes: 2 additions & 1 deletion motion/operations/equijoin.py
@@ -13,7 +13,8 @@
import numpy as np

from jinja2 import Template
from litellm import embedding, model_cost
from motion.utils import completion_cost
from sklearn.metrics.pairwise import cosine_similarity

from motion.operations.base import BaseOperation
2 changes: 1 addition & 1 deletion motion/operations/map.py
@@ -6,7 +6,7 @@
from typing import Any, Dict, List, Optional, Tuple

from jinja2 import Template
from motion.utils import completion_cost

from motion.operations.base import BaseOperation
from motion.operations.utils import (