Merge branch 'evidentlyai:main' into ROUGE-metric-(evidentlyai#1318)

pmittaldev authored Oct 28, 2024
2 parents ac62235 + 421630f commit 7105e1d
Showing 123 changed files with 1,390 additions and 633 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/examples.yml
@@ -42,7 +42,7 @@ jobs:
if: matrix.minimal
run: pip install -r requirements.min.txt
- name: Prepare examples dependencies
run: pip install catboost sentence-transformers
run: pip install catboost sentence-transformers openai
- name: Export examples
run: jupyter nbconvert --to python examples/*/*.ipynb --output-dir example_scripts
- name: Run examples
7 changes: 7 additions & 0 deletions docs/book/reference/all-metrics.md
@@ -274,7 +274,14 @@ Check for regular expression matches.
| **ExcludesWords()** <ul><li>Checks if the text excludes all specified words.</li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li>By default, considers inflected and variant forms of the same word. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExcludesWords(words_list=['buy', 'sell', 'bet'])`| **Required:** <br>`words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ItemMatch()** <ul><li>Checks whether the text contains **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="expected")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **ItemNoMatch()** <ul><li>Checks whether the text excludes **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemNoMatch(with_column="forbidden")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **WordMatch()** <ul><li> Checks whether the text includes **any** (default) or **all** specified words for each row (represented as tuples). </li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li> By default, considers inflected and variant forms of the same word. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `WordMatch(with_column="expected")` | **Required:** <br> `with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **WordNoMatch()** <ul><li> Checks whether the text excludes **any** (default) or **all** specified words for each row (represented as tuples). </li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li> By default, considers inflected and variant forms of the same word. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `WordNoMatch(with_column="forbidden")` | **Required:** <br> `with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ExactMatch()** <ul><li>Checks whether the text matches between two columns.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExactMatch(with_column='column_1')`| **Required:** <br>`with_column` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidJSON()** <ul><li>Checks if the text in a specified column is a valid JSON.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidJSON(column_name='column_1')`| **Required:** <br>`column_name` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact_match=True**) or minimal (**exact_match=False**) matching, with optional strict type validation (**validate_types=True**). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |
| **JSONMatch()** <ul><li>Compares two columns of a dataframe and checks whether the JSON objects in each row match. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `JSONMatch(with_column="column_2")`| **Required:** <br> `with_column: str` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **ContainsLink()** <ul><li>Checks if the text contains at least one valid URL. </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ContainsLink(column_name='column_1')`| **Required:** <br>`column_name: str`<br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidPython()** <ul><li>Checks if the text is valid Python code without syntax errors.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidPython(column_name='column_1')`| **Required:** <br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |
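
Putting a few of these together: the row-to-row match descriptors above plug into `TextEvals` like any other descriptor. A minimal sketch (the import paths, column names, and sample data below are assumptions for illustration, not taken from this page):

```python
import pandas as pd

from evidently.report import Report
from evidently.metric_preset import TextEvals           # assumed import path
from evidently.descriptors import ItemMatch, WordMatch  # assumed import path

# Hypothetical data: the expected items/words live in their own columns, one tuple per row.
current_df = pd.DataFrame({
    "response": ["Your order is confirmed", "Please buy now"],
    "expected_items": [("order", "confirmed"), ("refund",)],
    "expected_words": [("order",), ("refund",)],
})

report = Report(metrics=[
    TextEvals(column_name="response", descriptors=[
        ItemMatch(with_column="expected_items", mode="all"),
        WordMatch(with_column="expected_words"),
    ])
])
report.run(reference_data=None, current_data=current_df)
```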

## Descriptors: Text stats

4 changes: 2 additions & 2 deletions example_test.py
@@ -12,8 +12,8 @@
"comparing_custom_statest_with_classic_distributions.py",
"how_to_evaluate_llm_with_text_descriptors.py",
"how_to_run_drift_report_for_text_data.py", # too slow & torch version conflict?
"llm_evaluation_tutorial.ipynb", # cloud usage
"llm_tracing_tutorial.ipynb", # cloud usage
"llm_evaluation_tutorial.py", # cloud usage
"llm_tracing_tutorial.py", # cloud usage
]


@@ -98,6 +98,8 @@
"outputs": [],
"source": [
"class MyMetricResult(MetricResult):\n",
" class Config:\n",
" type_alias = \"evidently:metric_result:MyMetricResult\"\n",
" sum_value: float"
]
},
@@ -119,6 +121,8 @@
"outputs": [],
"source": [
"class MyMetric(Metric[MyMetricResult]):\n",
" class Config:\n",
" type_alias = \"evidently:metric:MyMetric\"\n",
" column_name: str\n",
"\n",
" def __init__(self, column_name: str):\n",
@@ -235,6 +239,8 @@
"\n",
"\n",
"class MyMetricResult(MetricResult):\n",
" class Config:\n",
" type_alias = \"evidently:metric_result:MyMetricResult\"\n",
" feature_name: str\n",
" current_sum_value: float\n",
" x_values_for_hist: list\n",
@@ -243,6 +249,8 @@
"\n",
"\n",
"class MyMetric(Metric[MyMetricResult]):\n",
" class Config:\n",
" type_alias = \"evidently:metric:MyMetric\"\n",
" column_name: str\n",
"\n",
" def __init__(self, column_name: str) -> None:\n",
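
Pieced together from the hunks above, a custom metric in this notebook now declares a `type_alias` on both the result and the metric class. A hedged sketch of the resulting shape (the constructor and `calculate` body follow the notebook's usual pattern and are assumed, not copied from this diff):

```python
from evidently.base_metric import InputData, Metric, MetricResult


class MyMetricResult(MetricResult):
    class Config:
        type_alias = "evidently:metric_result:MyMetricResult"

    sum_value: float


class MyMetric(Metric[MyMetricResult]):
    class Config:
        type_alias = "evidently:metric:MyMetric"

    column_name: str

    def __init__(self, column_name: str):
        self.column_name = column_name
        super().__init__()

    def calculate(self, data: InputData) -> MyMetricResult:
        # Assumed body: sum the chosen column over the current dataset.
        return MyMetricResult(sum_value=data.current_data[self.column_name].sum())
```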
@@ -44,7 +44,6 @@
"from evidently.metric_preset import RegressionPreset\n",
"from evidently.metric_preset import ClassificationPreset\n",
"from evidently.metric_preset import TargetDriftPreset\n",
"from evidently.metric_preset import TextOverviewPreset\n",
"\n",
"from evidently.metrics import *\n",
"\n",
@@ -360,52 +359,6 @@
"classification_report.json()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IAe0g1SWlV4L"
},
"source": [
"# Text Overview Preset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "9H8xHPN-tYY8",
"outputId": "644abbf8-d717-484f-8125-902b99288c59"
},
"outputs": [],
"source": [
"text_overview_report = Report(metrics=[\n",
" TextOverviewPreset(column_name=\"Review_Text\")\n",
"])\n",
"\n",
"text_overview_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)\n",
"text_overview_report"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"id": "msjgy3j-f-5i",
"outputId": "0e86becd-b75d-42f9-d115-72de002f8786"
},
"outputs": [],
"source": [
"text_overview_report.json()"
]
},
{
"cell_type": "markdown",
"metadata": {
11 changes: 5 additions & 6 deletions examples/how_to_questions/how_to_use_llm_judge_template.ipynb
@@ -191,9 +191,7 @@
"id": "204d90a4-694e-406b-949a-f7ba3b601eac",
"metadata": {},
"outputs": [],
"source": [
"print(ToxicityLLMEval().get_template().get_prompt_template())"
]
"source": "print(ToxicityLLMEval().get_template().get_template())"
},
{
"cell_type": "code",
@@ -308,7 +306,7 @@
"source": [
"#that's how you can see the prompt\n",
"\n",
"print(ContextQualityLLMEval(question=\"question\").get_template().get_prompt_template())"
"print(ContextQualityLLMEval(question=\"question\").get_template().get_template())"
]
},
{
@@ -414,12 +412,13 @@
" pre_messages=[(\"system\", \"You are a judge which evaluates text.\")],\n",
" ),\n",
" provider = \"openai\",\n",
" model = \"gpt-4o-mini\"\n",
" model = \"gpt-4o-mini\",\n",
" display_name=\"test\"\n",
")\n",
"\n",
"report = Report(metrics=[\n",
" TextEvals(column_name=\"response\", descriptors=[\n",
" custom_judge(display_name=\"test\")\n",
" custom_judge\n",
" ])\n",
"])\n",
"\n",
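
For context, the change above moves `display_name` into the `LLMEval(...)` constructor, so the configured judge is passed to `TextEvals` as-is rather than being called with a name. A hedged sketch of the resulting pattern (import paths are assumptions, and `my_template` is a placeholder for the `BinaryClassificationPromptTemplate` built earlier in the notebook):

```python
from evidently.report import Report
from evidently.metric_preset import TextEvals  # assumed import path
from evidently.descriptors import LLMEval      # assumed import path

custom_judge = LLMEval(
    template=my_template,    # placeholder for the prompt template from the cell above
    provider="openai",
    model="gpt-4o-mini",
    display_name="test",     # now configured here, not at call time
)

report = Report(metrics=[
    TextEvals(column_name="response", descriptors=[
        custom_judge,        # passed directly, no longer invoked as custom_judge(display_name=...)
    ])
])
```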
3 changes: 2 additions & 1 deletion requirements.dev.txt
@@ -3,7 +3,8 @@ wheel==0.38.1
setuptools==65.5.1; python_version < '3.12'
setuptools==68.2.2; python_version >= '3.12'
jupyter==1.0.0
mypy==0.981
mypy==1.1.1
pandas-stubs
pytest==7.4.4
pytest-asyncio==0.23.7
types-PyYAML==6.0.1
8 changes: 2 additions & 6 deletions setup.cfg
@@ -26,6 +26,8 @@ files = src/evidently
python_version = 3.8
disable_error_code = misc
namespace_packages = true
no_implicit_optional = False
plugins = pydantic.mypy,numpy.typing.mypy_plugin

[mypy-nltk.*]
ignore_missing_imports = True
@@ -46,12 +48,6 @@ ignore_missing_imports = True
[mypy-sentence_transformers.*]
ignore_missing_imports = True

[mypy-pandas.*]
ignore_missing_imports = True

[mypy-numpy.*]
ignore_missing_imports = True

[mypy-scipy.*]
ignore_missing_imports = True

3 changes: 2 additions & 1 deletion setup.py
@@ -85,7 +85,8 @@
"setuptools==65.5.1; python_version < '3.12'",
"setuptools==68.2.2; python_version >= '3.12'",
"jupyter==1.0.0",
"mypy==0.981",
"mypy==1.1.1",
"pandas-stubs>=1.3.5",
"pytest==7.4.4",
"types-PyYAML==6.0.1",
"types-requests==2.26.0",
12 changes: 6 additions & 6 deletions src/evidently/_pydantic_compat.py
@@ -30,13 +30,13 @@
from pydantic.v1.typing import DictStrAny

else:
from pydantic import BaseConfig
from pydantic import BaseModel
from pydantic import Extra
from pydantic import Field
from pydantic import BaseConfig # type: ignore[assignment]
from pydantic import BaseModel # type: ignore[assignment]
from pydantic import Extra # type: ignore[assignment]
from pydantic import Field # type: ignore[assignment]
from pydantic import PrivateAttr
from pydantic import SecretStr
from pydantic import ValidationError
from pydantic import SecretStr # type: ignore[assignment]
from pydantic import ValidationError # type: ignore[assignment]
from pydantic import parse_obj_as
from pydantic import validator
from pydantic.fields import SHAPE_DICT # type: ignore[attr-defined,no-redef]
11 changes: 8 additions & 3 deletions src/evidently/base_metric.py
@@ -18,6 +18,7 @@
import pandas as pd
import typing_inspect

from evidently._pydantic_compat import Field
from evidently._pydantic_compat import ModelMetaclass
from evidently._pydantic_compat import PrivateAttr
from evidently.core import BaseResult
@@ -142,12 +143,14 @@ def get_datasets(self) -> Tuple[Optional[TEngineDataType], TEngineDataType]:

class InputData(GenericInputData[pd.DataFrame]):
@staticmethod
def _get_by_column_name(dataset: pd.DataFrame, additional: pd.DataFrame, column: ColumnName) -> pd.Series:
def _get_by_column_name(dataset: pd.DataFrame, additional: Optional[pd.DataFrame], column: ColumnName) -> pd.Series:
if column.dataset == DatasetType.MAIN:
if column.name not in dataset.columns:
raise ColumnNotFound(column.name)
return dataset[column.name]
if column.dataset == DatasetType.ADDITIONAL:
if additional is None:
raise ValueError("no additional dataset is provided, but field requested")
return additional[column.name]
raise ValueError("unknown column data")

@@ -220,7 +223,9 @@ def __get__(self, instance: Optional["Metric"], type: Type["Metric"]) -> FieldPa

class WithResultFieldPathMetaclass(FrozenBaseMeta):
def result_type(cls) -> Type[MetricResult]:
return typing_inspect.get_args(next(b for b in cls.__orig_bases__ if typing_inspect.is_generic_type(b)))[0]
return typing_inspect.get_args(
next(b for b in cls.__orig_bases__ if typing_inspect.is_generic_type(b)) # type: ignore[attr-defined]
)[0]


class BasePreset(EvidentlyBaseModel):
@@ -236,7 +241,7 @@ class Config:

_context: Optional["Context"] = None

options: Options
options: Optional[Options] = Field(default=None)

fields: ClassVar[FieldsDescriptor] = FieldsDescriptor()
# resulting options will be determined via
2 changes: 0 additions & 2 deletions src/evidently/calculation_engine/engine.py
@@ -117,8 +117,6 @@ def get_additional_features(self, data_definition: DataDefinition) -> List[Gener
continue
for feature in required_features:
fp = feature.get_fingerprint()
if fp in feature:
continue
features[fp] = feature
return list(features.values())

10 changes: 4 additions & 6 deletions src/evidently/calculation_engine/python_engine.py
@@ -71,8 +71,8 @@ def calculate_additional_features(
def merge_additional_features(
self, features: Dict[GeneratedFeatures, FeatureResult[pd.DataFrame]]
) -> EngineDatasets[pd.DataFrame]:
currents: List[pd.DataFrame] = []
references: List[pd.DataFrame] = []
currents = []
references = []

for feature, result in features.items():
currents.append(result.current)
@@ -84,15 +84,13 @@ def merge_additional_features(
elif len(currents) == 1:
current = currents[0]
else:
cur, *currents = currents
current = cur.join(currents)
current = currents[0].join(currents[1:]) # type: ignore[arg-type]

if len(references) == 0:
return EngineDatasets(current=current, reference=None)
if len(references) == 1:
return EngineDatasets(current=current, reference=references[0])
ref, *references = references
return EngineDatasets(current=current, reference=ref.join(references))
return EngineDatasets(current=current, reference=references[0].join(references[1:])) # type: ignore[arg-type]

def get_metric_implementation(self, metric):
impl = super().get_metric_implementation(metric)
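
The simplification above relies on `pandas.DataFrame.join` accepting a list of frames, so the first feature frame can be joined with all the rest in one call. A small standalone illustration (toy data, not from the codebase):

```python
import pandas as pd

f1 = pd.DataFrame({"feature_a": [1, 2]})
f2 = pd.DataFrame({"feature_b": [3, 4]})
f3 = pd.DataFrame({"feature_c": [5, 6]})

frames = [f1, f2, f3]
# Equivalent to the old `head, *rest = frames; head.join(rest)` unpacking:
merged = frames[0].join(frames[1:])
print(merged.columns.tolist())  # ['feature_a', 'feature_b', 'feature_c']
```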
4 changes: 2 additions & 2 deletions src/evidently/calculations/classification_performance.py
@@ -225,7 +225,7 @@ def get_prediction_data(
return PredictionData(
predictions=data[prediction],
prediction_probas=None,
labels=data[prediction].unique().tolist(),
labels=data[prediction].unique().tolist(), # type: ignore[operator]
)


@@ -376,7 +376,7 @@ def calculate_metrics(
f1 = metrics.f1_score(target, prediction.predictions, average="macro")
if prediction.prediction_probas is not None:
binaraized_target = (
target.astype(str).values.reshape(-1, 1) == list(prediction.prediction_probas.columns.astype(str))
target.astype(str).to_numpy().reshape(-1, 1) == list(prediction.prediction_probas.columns.astype(str))
).astype(int)
prediction_probas_array = prediction.prediction_probas.to_numpy()
roc_auc = metrics.roc_auc_score(binaraized_target, prediction_probas_array, average="macro")
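
For context on the `.values` → `.to_numpy()` swap above: the expression binarizes the target by broadcasting a column vector of labels against the class names taken from the probability columns. A toy illustration (data is made up):

```python
import pandas as pd

target = pd.Series(["cat", "dog", "cat"])
prediction_probas = pd.DataFrame({"cat": [0.9, 0.2, 0.7], "dog": [0.1, 0.8, 0.3]})

binarized_target = (
    target.astype(str).to_numpy().reshape(-1, 1)
    == list(prediction_probas.columns.astype(str))
).astype(int)
# array([[1, 0],
#        [0, 1],
#        [1, 0]])
```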
14 changes: 10 additions & 4 deletions src/evidently/calculations/data_drift.py
@@ -210,7 +210,7 @@ def get_one_column_drift(
current_scatter["Timestamp"] = current_data[datetime_column_name]
x_name = "Timestamp"
else:
current_scatter["Index"] = current_data.index
current_scatter["Index"] = current_data.index.to_series()
x_name = "Index"
else:
current_scatter = {}
@@ -225,7 +225,8 @@
column_name,
datetime_column_name,
)
current_scatter["current (mean)"] = df
# TODO: assignment DataFrame to Series
current_scatter["current (mean)"] = df # type: ignore[assignment]
if prefix is None:
x_name = "Index binned"
else:
@@ -295,8 +296,13 @@ def get_one_column_drift(
if len(new_values) > 0:
raise ValueError(f"Values {new_values} not presented in 'target_names'")
else:
current_column = current_column.map(dataset_columns.target_names)
reference_column = reference_column.map(dataset_columns.target_names)
target_names_mapping = (
dataset_columns.target_names
if isinstance(dataset_columns.target_names, dict)
else {idx: value for (idx, value) in enumerate(dataset_columns.target_names)}
)
current_column = current_column.map(target_names_mapping)
reference_column = reference_column.map(target_names_mapping)
current_distribution, reference_distribution = get_distribution_for_column(
column_type=column_type.value,
current=current_column,
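
The new block above normalizes `target_names` so that both supported forms (a ready-made mapping or a plain list of labels) end up as a dict usable with `Series.map`. A standalone illustration with toy values:

```python
import pandas as pd

target_names = ["negative", "positive"]  # may also already be a dict

target_names_mapping = (
    target_names
    if isinstance(target_names, dict)
    else {idx: value for idx, value in enumerate(target_names)}
)
# {0: 'negative', 1: 'positive'}

column = pd.Series([0, 1, 1, 0])
print(column.map(target_names_mapping).tolist())
# ['negative', 'positive', 'positive', 'negative']
```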