Merge branch 'main' of github.com:Precis-Digital/pandabear

Precis-Digital · Oct 30, 2023 · 1a963b1 · 1a963b1
2 parents 5857d79 + 61fc7de
commit 1a963b1
Show file tree

Hide file tree

Showing 14 changed files with 365 additions and 128 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## v0.8.1 (2023-10-30)
+
+### Fix
+
+- remove local refs
+
 ## v0.8.0 (2023-10-26)
 
 ### Feat

diff --git a/README.md b/README.md
@@ -46,7 +46,9 @@ def foo(df: pb.DataFrame[InputDFSchema]) -> pb.DataFrame[OutputDFSchema]:
     return df
 ```
 
-Now, whenever `foo` is called, you can be sure that the data follows your predefined schemas at input and return. If it does not, an exception will be raised.
+Now, **whenever `foo` is called**, validation triggers and you can be sure that the data follows your predefined schemas at input and return. If it does not, an exception will be raised.
+
+*This package is heavily inspired by the [`pandera`](https://github.com/unionai-oss/pandera) Python package. Pandera is a fantastic Python library for statistical data testing that offers a lot more functionality than `pandabear`. Consider this a lighter, `pandas`-only version of `pandera`. If you're looking for a more comprehensive solution that supports backends other than `pandas` (like `spark`, `polars`, etc.), we highly recommend you check it out.*
 
 **See package level [README.md](src/pandabear/README.md) for documentation and usage examples**
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -59,7 +59,7 @@ fail_under = 0
 
 [tool.commitizen]
 name = "cz_conventional_commits"
-version = "0.8.0"
+version = "0.8.1"
 version_files = [
     "src/pandabear/__init__.py:__version__"
 ]

diff --git a/src/pandabear/__init__.py b/src/pandabear/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.8.0"
+__version__ = "0.8.1"
 
 
 # Set default logging handler to avoid "No handler found" warnings.

diff --git a/src/pandabear/exceptions.py b/src/pandabear/exceptions.py
@@ -1,5 +1,5 @@
 import re
-from typing import Any
+from typing import Any, Type
 
 import pandas as pd
 
@@ -42,6 +42,16 @@ def __init__(self, message):
         super().__init__(message)
 
 
+class UnsupportedTypeError(Exception):
+    """Raise when a field is defined with an unsupported type.
+
+    This may happen when the user defines types that are not supported.
+    """
+
+    def __init__(self, message):
+        super().__init__(message)
+
+
 class SchemaValidationError(Exception):
     """Raise when `df` does not match `schema`.
 
@@ -104,3 +114,31 @@ def _get_message(self) -> str:
         )
         fails_msg = fail_series.head(MAX_FAILURE_ROWS).to_string()
         return f"{text_msg}\n{fails_msg}"
+
+
+class IndexCheckError(Exception):
+    """Raise when an index fails a check defined in a `Field` variable.
+
+    Report the percentage of rows that failed the check, and display the first
+    few rows that failed the check.
+    """
+
+    def __init__(self, check_name: str, check_value: Any, index: Type[pd.Index], result: pd.Series):
+        self.check_name = check_name
+        self.check_value = check_value
+        self.series = index.to_series()
+        self.result = result
+        super().__init__(self._get_message())
+
+    def _get_message(self) -> str:
+        fail_series = self.series[~self.result]
+        total = len(self.series)
+        fails = len(fail_series)
+        fail_pc = int(round(100 * fails / total))
+        check_name = self.check_name.replace("series_", "")
+        text_msg = (
+            f"Column '{self.series.name}' failed check {check_name}({self.check_value}): "
+            f"{fails} of {total} ({fail_pc} %)"
+        )
+        fails_msg = fail_series.head(MAX_FAILURE_ROWS).to_string()
+        return f"{text_msg}\n{fails_msg}"