Commit
docs: improve clarity and example
shreyashankar committed Oct 1, 2024
1 parent 2b98e9c commit b79c889
Showing 4 changed files with 29 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docs/best-practices.md
@@ -69,7 +69,7 @@ This guide outlines best practices for using DocETL effectively, focusing on the
prompt: |
Here are some transcripts of conversations between a doctor and a patient:
-{% for value in values %}
+{% for value in inputs %}
Transcript {{ loop.index }}:
{{ value.src }}
{% endfor %}
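For context, the renamed loop variable behaves identically when the template is rendered; only the name of the variable passed in changes. A minimal sketch with Jinja2 (the transcript data here is invented for illustration):

```python
from jinja2 import Template

# Same loop as in the diff above, using the new `inputs` variable name.
template = Template(
    "Here are some transcripts of conversations between a doctor and a patient:\n"
    "{% for value in inputs %}"
    "Transcript {{ loop.index }}:\n{{ value.src }}\n"
    "{% endfor %}"
)

# Jinja2 resolves `value.src` against dict keys as well as attributes.
rendered = template.render(inputs=[{"src": "Patient reports a headache."}])
print(rendered)
```

The rename is purely cosmetic to the template engine; the key point is that pipeline configs must pass the loop data under the new `inputs` name.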
16 changes: 8 additions & 8 deletions docs/optimization/example.md
@@ -2,10 +2,10 @@

!!! note "Optimizer Stability"

-The optimization process can be unstable, as well as resource-intensive (we've seen it take up to 10 minutes to optimize a single operation, spending up to ~$50 in API costs for end-to-end pipelines). We recommend optimizing one operation at a time and retrying if necessary, as results may vary between runs. This approach also allows you to confidently verify that each optimized operation is performing as expected before moving on to the next.
-See the [API](#optimizer-api) for more details on how to resume the optimizer from a failed run, by rerunning `docetl build pipeline.yaml --resume` (with the `--resume` flag).
+The optimization process can be unstable, as well as resource-intensive (we've seen it take up to 10 minutes to optimize a single operation, spending up to ~$50 in API costs for end-to-end pipelines). We recommend optimizing one operation at a time and retrying if necessary, as results may vary between runs. This approach also allows you to confidently verify that each optimized operation is performing as expected before moving on to the next.
+
+See the [API](#optimizer-api) for more details on how to resume the optimizer from a failed run, by rerunning `docetl build pipeline.yaml --resume` (with the `--resume` flag).
+
+Also, you can use gpt-4o-mini for cheaper optimizations (rather than the default gpt-4o), which you can do via `docetl build pipeline.yaml --model=gpt-4o-mini`.

To optimize your pipeline, start with your initial configuration and follow these steps:
@@ -64,7 +64,7 @@ operations:
uses: str
prompt: |
Summarize side effects and uses of {{ reduce_key }} from:
-{% for value in values %}
+{% for value in inputs %}
Transcript {{ loop.index }}: {{ value.src }}
{% endfor %}
@@ -129,7 +129,7 @@ operations:
Are these the same or closely related?
resolution_prompt: |
Standardize the name for:
-{% for entry in matched_entries %}
+{% for entry in inputs %}
- {{ entry.medication }}
{% endfor %}
@@ -143,13 +143,13 @@
uses: str
prompt: |
Summarize side effects and uses of {{ reduce_key }} from:
-{% for value in values %}
+{% for value in inputs %}
Transcript {{ loop.index }}: {{ value.src }}
{% endfor %}
fold_batch_size: 10
fold_prompt: |
Update the existing summary of side effects and uses for {{ reduce_key }} based on the following additional transcripts:
-{% for value in values %}
+{% for value in inputs %}
Transcript {{ loop.index }}: {{ value.src }}
{% endfor %}
4 changes: 2 additions & 2 deletions docs/optimization/python-api.md
@@ -30,14 +30,14 @@ operations = [
blocking_keys=["medication"],
optimize=True, # This operation will be optimized
comparison_prompt="Compare medications:\n1: {{ input1.medication }}\n2: {{ input2.medication }}\nAre these the same or closely related?",
-resolution_prompt="Standardize the name for:\n{% for entry in matched_entries %}\n- {{ entry.medication }}\n{% endfor %}"
+resolution_prompt="Standardize the name for:\n{% for entry in inputs %}\n- {{ entry.medication }}\n{% endfor %}"
),
ReduceOp(
name="summarize_prescriptions",
type="reduce",
reduce_key=["medication"],
output={"schema": {"side_effects": "str", "uses": "str"}},
-prompt="Summarize side effects and uses of {{ reduce_key }} from:\n{% for value in values %}\nTranscript {{ loop.index }}: {{ value.src }}\n{% endfor %}",
+prompt="Summarize side effects and uses of {{ reduce_key }} from:\n{% for value in inputs %}\nTranscript {{ loop.index }}: {{ value.src }}\n{% endfor %}",
optimize=True, # This operation will be optimized
)
]
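Assuming the `prompt` string in the `ReduceOp` above is a standard Jinja2 template, here is a rough sketch of what it might expand to at runtime. The sample data is invented, and we assume here that `reduce_key` resolves to the grouping key's value:

```python
from jinja2 import Template

# The updated reduce prompt from the snippet above, verbatim.
prompt = (
    "Summarize side effects and uses of {{ reduce_key }} from:\n"
    "{% for value in inputs %}\n"
    "Transcript {{ loop.index }}: {{ value.src }}\n"
    "{% endfor %}"
)

# Hypothetical render: one grouped transcript for the key "ibuprofen".
rendered = Template(prompt).render(
    reduce_key="ibuprofen",
    inputs=[{"src": "Doctor recommends ibuprofen for joint pain."}],
)
print(rendered)
```

This is only a sketch of the template semantics, not a substitute for running the pipeline; DocETL controls how `reduce_key` and `inputs` are actually populated.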
19 changes: 18 additions & 1 deletion docs/tutorial.md
@@ -104,7 +104,7 @@ Create a file named `pipeline.yaml` with the following structure:
medication: str
resolution_prompt: |
Given the following matched medication entries:
-{% for entry in matched_entries %}
+{% for entry in inputs %}
Entry {{ loop.index }}: {{ entry.medication }}
{% endfor %}
Determine the best resolved medication name for this group of entries. The resolved
@@ -156,6 +156,23 @@ Create a file named `pipeline.yaml` with the following structure:

## Running the Pipeline

!!! info "Pipeline Performance"

When running this pipeline on a sample dataset, we observed the following performance metrics using `gpt-4o-mini` as defined in the pipeline:

- Total cost: $0.10
- Total execution time: 49.13 seconds

If you want to run it on a smaller sample, set the `sample` parameter for the map operation. For example, `sample: 10` will run the pipeline on a random sample of 10 transcripts:

```yaml
operations:
- name: extract_medications
type: map
sample: 10
...
```
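Conceptually, the `sample` parameter behaves like plain random sampling over the input documents before the operation runs. A minimal sketch (the transcript list is invented; DocETL's actual sampling internals may differ):

```python
import random

# 100 stand-in transcript documents.
transcripts = [{"src": f"transcript {i}"} for i in range(100)]

# `sample: 10` conceptually picks 10 distinct documents at random.
sampled = random.sample(transcripts, 10)
print(len(sampled))
```

Sampling like this is useful for cheap, fast iteration on prompts before paying for a full run.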

To execute the pipeline, run the following command in your terminal:

```bash
docetl run pipeline.yaml
```
