feat: have global system prompt and description #210

Merged · 4 commits · Nov 25, 2024
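This PR adds a pipeline-level system_prompt block: a persona and a dataset description that are woven into the system message of every LLM call, factored into the request cache key, and wired through the website's pipeline config generator. A minimal sketch of the config shape the diff implies; the persona and dataset_description keys come from the code below, and every other value here is an illustrative placeholder:

# Sketch only: the new pipeline-level system_prompt block.
# "persona" and "dataset_description" are the keys utils.py reads via
# self.runner.config.get("system_prompt", {}); all other values are made up.
pipeline_config = {
    "system_prompt": {
        "persona": "a medical records analyst",
        "dataset_description": "a collection of doctor-patient visit transcripts",
    },
    # ... datasets, operations, pipeline steps, and output settings as usual ...
}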
10 changes: 9 additions & 1 deletion docetl/operations/code_operations.py
@@ -107,7 +107,15 @@ def get_group_key(item):
for k, v in group[0].items():
if k not in result:
result[k] = v


# Also add the reduce key
if reduce_keys != ["_all"]:
for k in reduce_keys:
if k not in result:
result[k] = group[0][k]

result[f"_counts_prereduce_{self.config['name']}"] = len(group)

results.append(result)

return results, 0.0
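For context, a self-contained sketch of what this hunk changes for one group of rows: the reduce key values now survive into the output row, alongside a per-group count named _counts_prereduce_<operation name>. The helper below is illustrative, not the docetl API; it mirrors the added lines for a single group.

from typing import Any, Dict, List

def sketch_group_output(group: List[Dict[str, Any]], reduce_keys: List[str], op_name: str) -> Dict[str, Any]:
    # Illustrative only: mirrors the behavior of the added lines for one group.
    result: Dict[str, Any] = {}
    for k, v in group[0].items():
        if k not in result:
            result[k] = v
    # Also carry the reduce key(s) into the output, unless grouping everything together
    if reduce_keys != ["_all"]:
        for k in reduce_keys:
            if k not in result:
                result[k] = group[0][k]
    result[f"_counts_prereduce_{op_name}"] = len(group)
    return result

# sketch_group_output([{"dept": "radiology"}, {"dept": "radiology"}], ["dept"], "count_by_dept")
# -> {"dept": "radiology", "_counts_prereduce_count_by_dept": 2}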
31 changes: 23 additions & 8 deletions docetl/operations/utils.py
@@ -185,6 +185,7 @@ def cache_key(
messages: List[Dict[str, str]],
output_schema: Dict[str, str],
scratchpad: Optional[str] = None,
system_prompt: Optional[Dict[str, str]] = None,
) -> str:
"""
Generate a unique cache key based on function arguments.
@@ -209,6 +210,7 @@
"messages": json.dumps(messages, sort_keys=True),
"output_schema": json.dumps(output_schema, sort_keys=True),
"scratchpad": scratchpad,
"system_prompt": json.dumps(system_prompt, sort_keys=True),
}
return hashlib.md5(json.dumps(key_dict, sort_keys=True).encode()).hexdigest()
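Because the global system prompt now shapes every completion, it is also folded into the cache key; without this, editing the persona or dataset description would keep serving stale cached responses. A trimmed-down, standalone sketch of that effect using the same md5-over-sorted-JSON scheme (the extra key fields of the real function are omitted here for brevity):

import hashlib
import json

def sketch_cache_key(messages, output_schema, system_prompt=None):
    # Illustrative only: same hashing scheme as cache_key above, fewer fields.
    key_dict = {
        "messages": json.dumps(messages, sort_keys=True),
        "output_schema": json.dumps(output_schema, sort_keys=True),
        "system_prompt": json.dumps(system_prompt, sort_keys=True),
    }
    return hashlib.md5(json.dumps(key_dict, sort_keys=True).encode()).hexdigest()

msgs = [{"role": "user", "content": "Summarize this visit."}]
schema = {"summary": "str"}
# Different personas now yield different cache keys, so no stale hits:
assert sketch_cache_key(msgs, schema, {"persona": "a nurse"}) != sketch_cache_key(
    msgs, schema, {"persona": "a billing specialist"}
)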

@@ -690,7 +692,7 @@ def call_llm(
Raises:
TimeoutError: If the call times out after retrying.
"""
key = cache_key(model, op_type, messages, output_schema, scratchpad)
key = cache_key(model, op_type, messages, output_schema, scratchpad, self.runner.config.get("system_prompt", {}))

max_retries = max_retries_per_timeout
attempt = 0
@@ -809,28 +811,41 @@ def _call_llm_with_cache(
tools = None
tool_choice = None

system_prompt = f"You are a helpful assistant, intelligently processing data. This is a {op_type} operation. You will perform the specified task on the provided data. The result should be a structured output that you will send back to the user."
persona = self.runner.config.get("system_prompt", {}).get("persona", "a helpful assistant")
dataset_description = self.runner.config.get("system_prompt", {}).get("dataset_description", "a collection of unstructured documents")
parenthetical_op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"

system_prompt = f"You are a {persona}, intelligently transforming data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parenthetical_op_instructions}). You will perform the specified task on the provided data, as accurately, precisely, and exhaustively as possible. The result should be a structured output that you will send back to the user."
if scratchpad:
system_prompt += f"""

You are incrementally processing data across multiple batches. Maintain intermediate state between batches to accomplish this task effectively.
You are incrementally processing data across multiple batches. You will see:
1. The current batch of data to process
2. The intermediate output so far (what you returned last time)
3. A scratchpad for tracking additional state: {scratchpad}

The intermediate output contains the partial result that directly answers the user's task, just on a subset of the data.
The scratchpad contains supporting information needed to process future batches correctly, but isn't part of the answer itself.

Current scratchpad: {scratchpad}
Example for counting words that appear >2 times:
- Intermediate output: {{"frequent_words": ["the", "and"]}} # Words seen 3+ times
- Scratchpad: {{"pending": {{"cat": 2, "dog": 1}}}} # Track words seen 1-2 times

As you process each batch:
1. Update the scratchpad with crucial information for subsequent batches.
2. This may include partial results, counters, or data that doesn't fit into {list(output_schema.keys())}.
3. Example: For counting elements that appear more than twice, track all occurrences in the scratchpad until an item exceeds the threshold.
1. Use both the intermediate output and scratchpad to inform your processing
2. Update the scratchpad with any new information needed for future batches
3. Return both your partial result (representing the answer on the current batch and the previous batches' intermediate output) and updated scratchpad

Keep the scratchpad concise (~500 chars) and easily parsable. Use clear structures like:
- Bullet points
- Bullet points
- Key-value pairs
- JSON-like format

Update the 'updated_scratchpad' field in your output with the new scratchpad content.

Remember: The scratchpad should contain information necessary for processing future batches, not the final result."""


# Truncate messages if they exceed the model's context length
messages = truncate_messages(messages, model)
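A compact sketch of how the new system message is assembled from the pipeline config, with the same fallbacks and interpolation as the hunk above; the helper name and example values are illustrative, not part of the PR:

def sketch_build_system_prompt(config: dict, op_type: str) -> str:
    # Illustrative only: mirrors the persona/dataset_description interpolation above.
    sp = config.get("system_prompt", {})
    persona = sp.get("persona", "a helpful assistant")
    dataset_description = sp.get("dataset_description", "a collection of unstructured documents")
    op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"
    return (
        f"You are a {persona}, intelligently transforming data. "
        f"The dataset description is: {dataset_description}. "
        f"You will be performing a {op_type} operation ({op_instructions})."
    )

# With an empty config this falls back to the defaults:
# sketch_build_system_prompt({}, "map") -> "You are a helpful assistant, ..."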

10 changes: 5 additions & 5 deletions docetl/optimizers/map_optimizer/evaluator.py
@@ -307,14 +307,14 @@ def _assess_operation(
self.llm_client.model,
)

prompt = f"""Task: Assess the performance of a data processing operation based on sample input-output pairs and a custom validator prompt.
prompt = f"""Task: Assess the performance of a data processing operation based on sample rows and a custom validator prompt. You will see the output of the operation for each row.

Operation Name: {op_config['name']}
Operation Type: {op_config['type']}
Current Task Prompt: {op_config.get('prompt', 'N/A')}

Sample Input-Output Pairs:
---Pair 1---
Sample Rows:
---Row 1---
{json.dumps({"input": input_1, "output": output_1}, indent=2)}
"""

@@ -332,7 +332,7 @@ def _assess_operation(
self.llm_client.model,
)
prompt += f"""
---Pair 2---
---Row 2---
{json.dumps({"input": input_2, "output": output_2}, indent=2)}
"""

@@ -350,7 +350,7 @@ def _assess_operation(
self.llm_client.model,
)
prompt += f"""
---Pair 3---
---Row 3---
{json.dumps({"input": input_3, "output": output_3}, indent=2)}
"""

2 changes: 1 addition & 1 deletion docetl/runner.py
@@ -102,7 +102,7 @@ def __init__(self, config: Dict, max_threads: int = None, **kwargs):

all_ops_until_and_including_current = [
op_map[prev_op] for prev_op in step["operations"][:idx]
] + [op_map[op_name]]
] + [op_map[op_name]] + [self.config.get("system_prompt", {})]
# If there's no model in the op, add the default model
for op in all_ops_until_and_including_current:
if "model" not in op:
21 changes: 19 additions & 2 deletions website/src/app/api/utils.ts
@@ -12,7 +12,11 @@ export function generatePipelineConfig(
homeDir: string,
sample_size: number | null,
optimize: boolean = false,
clear_intermediate: boolean = false
clear_intermediate: boolean = false,
system_prompt: {
datasetDescription: string | null;
persona: string | null;
} | null = null
) {
const datasets = {
input: {
@@ -156,7 +160,7 @@
{
name: "data_processing",
input: Object.keys(datasets)[0], // Assuming the first dataset is the input
operations: operationsToRun.map((op: any) => op.name),
operations: operationsToRun.map((op) => op.name),
},
],
output: {
@@ -177,8 +181,21 @@
),
},
},
system_prompt: {},
};

if (system_prompt) {
if (system_prompt.datasetDescription) {
// @ts-ignore
pipelineConfig.system_prompt!.dataset_description =
system_prompt.datasetDescription;
}
if (system_prompt.persona) {
// @ts-ignore
pipelineConfig.system_prompt!.persona = system_prompt.persona;
}
}

// Get the inputPath from the intermediate_dir
let inputPath;
let outputPath;
4 changes: 3 additions & 1 deletion website/src/app/api/writePipelineConfig/route.ts
@@ -15,6 +15,7 @@ export async function POST(request: Request) {
sample_size,
optimize = false,
clear_intermediate = false,
system_prompt,
} = await request.json();

if (!name) {
@@ -42,7 +43,8 @@
homeDir,
sample_size,
optimize,
clear_intermediate
clear_intermediate,
system_prompt
);

// Save the YAML file in the user's home directory
1 change: 1 addition & 0 deletions website/src/app/localStorageKeys.ts
@@ -16,3 +16,4 @@ export const DEFAULT_MODEL_KEY = "docetl_defaultModel";
export const OPTIMIZER_MODEL_KEY = "docetl_optimizerModel";
export const AUTO_OPTIMIZE_CHECK_KEY = "docetl_autoOptimizeCheck";
export const HIGH_LEVEL_GOAL_KEY = "docetl_highLevelGoal";
export const SYSTEM_PROMPT_KEY = "docetl_systemPrompt";
11 changes: 9 additions & 2 deletions website/src/app/playground/page.tsx
@@ -296,10 +296,17 @@ const CodeEditorPipelineApp: React.FC = () => {
</MenubarContent>
</MenubarMenu>
<MenubarMenu>
<MenubarTrigger>Assistant</MenubarTrigger>
<MenubarTrigger>Help</MenubarTrigger>
<MenubarContent>
<MenubarItem
onSelect={() =>
window.open("https://ucbepic.github.io/docetl/", "_blank")
}
>
Show Documentation
</MenubarItem>
<MenubarItem onSelect={() => setShowChat(!showChat)}>
Toggle Chat
Show Chat
</MenubarItem>
</MenubarContent>
</MenubarMenu>
73 changes: 68 additions & 5 deletions website/src/components/AIChatPanel.tsx
@@ -1,8 +1,14 @@
"use client";

import React, { useRef, useState, useEffect } from "react";
import React, {
useRef,
useState,
useEffect,
useMemo,
useCallback,
} from "react";
import { ResizableBox } from "react-resizable";
import { Eraser, RefreshCw, X, Copy } from "lucide-react";
import { RefreshCw, X, Copy } from "lucide-react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { ScrollArea } from "@/components/ui/scroll-area";
@@ -13,6 +19,13 @@ import "react-resizable/css/styles.css";
import { LLMContextPopover } from "@/components/LLMContextPopover";
import { usePipelineContext } from "@/contexts/PipelineContext";
import ReactMarkdown from "react-markdown";
import {
Popover,
PopoverContent,
PopoverTrigger,
} from "@/components/ui/popover";
import { Textarea } from "@/components/ui/textarea";
import { debounce } from "lodash";

interface AIChatPanelProps {
onClose: () => void;
@@ -45,7 +58,9 @@ const AIChatPanel: React.FC<AIChatPanelProps> = ({ onClose }) => {
initialMessages: [],
id: "persistent-chat",
});
const { serializeState } = usePipelineContext();
const { serializeState, highLevelGoal, setHighLevelGoal } =
usePipelineContext();
const [localGoal, setLocalGoal] = useState(highLevelGoal);

const handleMouseDown = (e: React.MouseEvent<HTMLDivElement>) => {
if ((e.target as HTMLElement).classList.contains("drag-handle")) {
@@ -184,6 +199,25 @@ Remember, all the output fields have been converted to strings, even if they wer
);
};

const debouncedSetHighLevelGoal = useMemo(
() => debounce((value: string) => setHighLevelGoal(value), 1000),
[setHighLevelGoal]
);

useEffect(() => {
return () => {
debouncedSetHighLevelGoal.cancel();
};
}, [debouncedSetHighLevelGoal]);

const handleGoalUpdate = useCallback(
(newGoal: string) => {
setLocalGoal(newGoal);
debouncedSetHighLevelGoal(newGoal);
},
[debouncedSetHighLevelGoal]
);

return (
<div
style={{
@@ -210,6 +244,33 @@
<LLMContextPopover />
</span>
<div className="flex items-center gap-1">
<Popover>
<PopoverTrigger asChild>
<span className="text-s text-primary font-medium flex items-center gap-2 cursor-pointer">
<Button
variant="ghost"
size="sm"
className="h-4 px-2 text-xs"
>
{highLevelGoal ? "Edit Analysis Goal" : "Set Analysis Goal"}
</Button>
</span>
</PopoverTrigger>
<PopoverContent className="w-80 z-[10000]" side="top" align="end">
<div className="space-y-2">
<h4 className="font-medium text-sm">Pipeline Goal</h4>
<Textarea
placeholder="Describe the high-level goal of your pipeline..."
className="min-h-[100px]"
value={localGoal}
onChange={(e) => handleGoalUpdate(e.target.value)}
/>
<p className="text-xs text-muted-foreground">
This helps the assistant provide more relevant suggestions.
</p>
</div>
</PopoverContent>
</Popover>
<Button
variant="ghost"
size="sm"
@@ -242,8 +303,10 @@ Remember, all the output fields have been converted to strings, even if they wer
onClick={() => {
handleInputChange({
target: { value: suggestion },
} as any);
handleMessageSubmit({ preventDefault: () => {} } as any);
} as React.ChangeEvent<HTMLInputElement>);
handleMessageSubmit({
preventDefault: () => {},
} as React.FormEvent);
}}
>
{suggestion}
Expand Down