Skip to content

Commit

Permalink
Add pre-commit hooks
Browse files Browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Aug 20, 2024
1 parent 481b6c7 commit 4965c2d
Show file tree
Hide file tree
Showing 26 changed files with 389 additions and 113 deletions.
56 changes: 56 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# pre-commit configuration: formatting and lint hooks for the Python package
# (motion/) and the front-end (ui/).
ci:
  # pre-commit.ci setting; see the pre-commit.ci docs for exact semantics.
  autofix_prs: false

# Global include pattern: hooks only see files under motion/ or ui/.
files: "^(motion|ui)/"
# Global exclude pattern: skip package __init__.py files. The dot is escaped
# so it matches a literal "." rather than any character.
exclude: '__init__\.py$'

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
        exclude: ^.*\.egg-info/
      - id: check-merge-conflict
      - id: check-case-conflict
      - id: pretty-format-json
        args: [--autofix, --no-ensure-ascii, --no-sort-keys]
      - id: check-ast
      - id: debug-statements
      - id: check-docstring-first

  - repo: https://github.com/hadialqattan/pycln
    rev: v2.4.0
    hooks:
      - id: pycln
        args: [--all]

  - repo: https://github.com/psf/black
    rev: 24.1.1
    hooks:
      - id: black

  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        name: "isort (python)"
        types: [python]
        # Use isort's black-compatible profile so the two formatters agree.
        args: [--profile, black]

  - repo: https://github.com/charliermarsh/ruff-pre-commit
    # Ruff version.
    rev: "v0.2.1"
    hooks:
      - id: ruff

  - repo: https://github.com/pre-commit/pre-commit
    rev: v3.6.0
    hooks:
      - id: validate_manifest

  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: "v4.0.0-alpha.8" # Prettier version
    hooks:
      - id: prettier
        # Prettier only formats the front-end sources.
        files: "^ui/"
19 changes: 10 additions & 9 deletions motion/builder.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from collections import Counter, defaultdict
import copy
import json
import os
import random
from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import yaml
from typing import Dict, List, Any, Optional, Tuple, Union
from rich.console import Console

from motion.operations import get_operation
from motion.operations.base import BaseOperation
from motion.optimizers.join_optimizer import JoinOptimizer
from motion.optimizers.map_optimizer import MapOptimizer
from motion.optimizers.reduce_optimizer import ReduceOptimizer
from motion.optimizers.join_optimizer import JoinOptimizer
from motion.utils import load_config
from rich.console import Console
import random
import json
import os
from motion.optimizers.utils import LLMClient

from motion.utils import load_config

SUPPORTED_OPS = ["map", "resolve", "reduce", "equijoin", "filter"]

Expand Down
7 changes: 4 additions & 3 deletions motion/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import typer
from typing import Optional
from pathlib import Path
from typing import Optional

import typer

from motion.runner import DSLRunner
from motion.builder import Optimizer
from motion.runner import DSLRunner

app = typer.Typer()

Expand Down
3 changes: 2 additions & 1 deletion motion/operations/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"""

from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Optional
from typing import Dict, List, Optional, Tuple

from rich.console import Console


Expand Down
17 changes: 10 additions & 7 deletions motion/operations/equijoin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@
The `EquijoinOperation` class is a subclass of `BaseOperation` that performs an equijoin operation on two datasets. It uses a combination of blocking techniques and LLM-based comparisons to efficiently join the datasets.
"""

from typing import Dict, List, Any, Tuple
import json
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Tuple

from jinja2 import Template
from collections import defaultdict
import json
from litellm import completion_cost
from sklearn.metrics.pairwise import cosine_similarity

from motion.operations.base import BaseOperation
from motion.operations.utils import (
call_llm,
parse_llm_response,
embedding,
parse_llm_response,
rich_as_completed,
validate_output,
)
from motion.operations.utils import validate_output, rich_as_completed
from litellm import completion_cost
from sklearn.metrics.pairwise import cosine_similarity


def compare_pair(
Expand Down
9 changes: 6 additions & 3 deletions motion/operations/filter.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""The `FilterOperation` class is a subclass of `BaseOperation` that implements a filtering operation on input data using a language model."""

from typing import Dict, List, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Tuple

from jinja2 import Template

from motion.operations.base import BaseOperation
from motion.operations.utils import (
RichLoopBar,
call_llm,
parse_llm_response,
call_llm_with_validation,
parse_llm_response,
validate_output,
)
from motion.operations.utils import validate_output, RichLoopBar


class FilterOperation(BaseOperation):
Expand Down
9 changes: 6 additions & 3 deletions motion/operations/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,20 @@
The `MapOperation` and `ParallelMapOperation` classes are subclasses of `BaseOperation` that perform mapping operations on input data. They use LLM-based processing to transform input items into output items based on specified prompts and schemas.
"""

from typing import Dict, List, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Tuple

from jinja2 import Template

from motion.operations.base import BaseOperation
from motion.operations.utils import (
RichLoopBar,
call_llm,
parse_llm_response,
call_llm_with_gleaning,
call_llm_with_validation,
parse_llm_response,
validate_output,
)
from motion.operations.utils import validate_output, RichLoopBar


class MapOperation(BaseOperation):
Expand Down
25 changes: 15 additions & 10 deletions motion/operations/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,26 @@
import math
import random
import time
from typing import Dict, List, Tuple, Optional
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, List, Optional, Tuple

import jinja2
import numpy as np
from jinja2 import Template
from motion.operations.base import BaseOperation
from motion.operations.utils import call_llm, call_llm_with_gleaning, parse_llm_response
from motion.operations.utils import validate_output, rich_as_completed
from litellm import completion_cost
import jinja2
from threading import Lock
from collections import deque
from litellm import embedding
from sklearn.metrics.pairwise import cosine_similarity
from litellm import completion_cost, embedding
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

from motion.operations.base import BaseOperation
from motion.operations.utils import (
call_llm,
call_llm_with_gleaning,
parse_llm_response,
rich_as_completed,
validate_output,
)


class ReduceOperation(BaseOperation):
Expand Down
22 changes: 15 additions & 7 deletions motion/operations/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@
The `ResolveOperation` class is a subclass of `BaseOperation` that performs a resolution operation on a dataset. It uses a combination of blocking techniques and LLM-based comparisons to efficiently identify and resolve duplicate or related entries within the dataset.
"""

from typing import Dict, List, Any, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Tuple

import jinja2
from jinja2 import Template
from motion.operations.base import BaseOperation
from motion.operations.utils import call_llm, parse_llm_response, embedding
from motion.operations.utils import validate_output, rich_as_completed, RichLoopBar
from litellm import completion_cost
from sklearn.metrics.pairwise import cosine_similarity
import jinja2

from motion.operations.base import BaseOperation
from motion.operations.utils import (
RichLoopBar,
call_llm,
embedding,
parse_llm_response,
rich_as_completed,
validate_output,
)


def compare_pair(
Expand Down Expand Up @@ -288,9 +296,9 @@ def should_compare(pair):

for future in as_completed(future_to_pair):
pair = future_to_pair[future]
is_match, cost = future.result()
is_match_result, cost = future.result()
pair_costs += cost
if is_match:
if is_match_result:
merge_clusters(pair[0], pair[1])

pbar.update(i)
Expand Down
12 changes: 7 additions & 5 deletions motion/operations/split.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import math
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Tuple
import uuid
from motion.operations.utils import call_llm, parse_llm_response

import tiktoken
from motion.operations.base import BaseOperation
import math
from litellm import completion_cost
from jinja2 import Template
from litellm import completion_cost

from motion.operations.base import BaseOperation
from motion.operations.utils import call_llm, parse_llm_response


class SplitOperation(BaseOperation):
Expand Down
1 change: 1 addition & 0 deletions motion/operations/unnest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import copy
from typing import Dict, List, Tuple

from motion.operations.base import BaseOperation


Expand Down
15 changes: 8 additions & 7 deletions motion/operations/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import functools
import hashlib
import json
import threading
from typing import Callable, Dict, List, Any, Optional, Tuple, Iterable, Union
from litellm import completion, completion_cost
from concurrent.futures import as_completed
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

from dotenv import load_dotenv
from frozendict import frozendict
from jinja2 import Template
from litellm import completion, completion_cost
from rich.console import Console
import hashlib
import functools
from concurrent.futures import as_completed
from tqdm import tqdm
from jinja2 import Template
from frozendict import frozendict

load_dotenv()
# litellm.set_verbose = True
Expand Down
16 changes: 9 additions & 7 deletions motion/optimizers/join_optimizer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from concurrent.futures import ThreadPoolExecutor
import json
import random
import re
from typing import List, Dict, Any, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from litellm import embedding, completion_cost
from litellm import completion_cost, embedding
from rich.console import Console
from motion.operations.resolve import compare_pair as compare_pair_resolve

from motion.operations.equijoin import compare_pair as compare_pair_equijoin
from motion.operations.resolve import compare_pair as compare_pair_resolve


class JoinOptimizer:
Expand Down Expand Up @@ -50,7 +52,7 @@ def _analyze_map_prompt_categorization(self, map_prompt: str) -> bool:
},
{
"role": "user",
"content": f"""Analyze the following map operation prompt and determine if it is explicitly categorical,
"content": f"""Analyze the following map operation prompt and determine if it is explicitly categorical,
meaning it details a specific set of possible outputs:
{map_prompt}
Expand Down Expand Up @@ -282,7 +284,7 @@ def synthesize_resolution_prompt(
reduce_key: List[str],
output_schema: Dict[str, str],
) -> str:
system_prompt = f"""You are an AI assistant tasked with creating a resolution prompt for LLM-assisted entity resolution.
system_prompt = f"""You are an AI assistant tasked with creating a resolution prompt for LLM-assisted entity resolution.
Your task is to create a prompt that will be used to merge multiple duplicate keys into a single, consolidated key.
The key(s) being resolved (known as the reduce_key) are {', '.join(reduce_key)}.
The duplicate keys will be provided in a list called 'matched_entries' in a Jinja2 template.
Expand Down Expand Up @@ -312,7 +314,7 @@ def synthesize_resolution_prompt(
{{% endfor %}}
Create a single, consolidated key that combines the information from all duplicate entries.
Create a single, consolidated key that combines the information from all duplicate entries.
When merging, follow these guidelines:
1. [Provide specific merging instructions relevant to the data type]
2. [Provide conflict resolution guidelines]
Expand Down
12 changes: 7 additions & 5 deletions motion/optimizers/map_optimizer/config_generators.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import copy
import random
import json
from typing import Dict, Any, List
from motion.optimizers.utils import extract_jinja_variables, LLMClient
from motion.optimizers.map_optimizer.utils import generate_and_validate_prompt
import random
from typing import Any, Dict, List

from rich.console import Console

from motion.optimizers.map_optimizer.utils import generate_and_validate_prompt
from motion.optimizers.utils import LLMClient, extract_jinja_variables


class ConfigGenerator:
def __init__(
Expand Down Expand Up @@ -78,7 +80,7 @@ def _get_split_config(
Determine the split key and subprompt for processing chunks of the input data.
The split key should be a key in the input data that contains a string to be split.
The subprompt should be designed to process individual chunks of the split data.
The subprompt should be designed to process individual chunks of the split data.
Note that the subprompt's output schema will be: {json.dumps(output_schema, indent=2)}.
Important:
Expand Down
2 changes: 1 addition & 1 deletion motion/optimizers/map_optimizer/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Any, List, Tuple, Optional, Union, Callable
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from rich.console import Console

Expand Down
Loading

0 comments on commit 4965c2d

Please sign in to comment.