From 2f7e7f3d3295bb0c9e3069b6afe2dc2a37b167a1 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 18 Nov 2019 01:24:46 +1100 Subject: [PATCH 01/31] Initial groundwork for the rewrite --- pandas_schema/column.py | 63 ++++-- pandas_schema/errors.py | 6 + pandas_schema/schema.py | 2 +- pandas_schema/validation.py | 411 +++--------------------------------- 4 files changed, 85 insertions(+), 397 deletions(-) diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 199b883..cec4153 100644 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -4,24 +4,51 @@ from . import validation from .validation_warning import ValidationWarning -class Column: - def __init__(self, name: str, validations: typing.Iterable['validation._BaseValidation'] = [], allow_empty=False): - """ - Creates a new Column object - :param name: The column header that defines this column. This must be identical to the header used in the CSV/Data Frame you are validating. - :param validations: An iterable of objects implementing _BaseValidation that will generate ValidationErrors - :param allow_empty: True if an empty column is considered valid. False if we leave that logic up to the Validation - """ - self.name = name - self.validations = list(validations) - self.allow_empty = allow_empty +def _column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str] = None, + position: bool = False +): + """ + A utility method for setting the index data on a set of Validations + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + :param position: If true, these validations use positional indexing. 
+ See :py:class:`pandas_schema.validation.IndexSeriesValidation` + """ + for valid in validations: + valid.index = index + valid.position = position - def validate(self, series: pd.Series) -> typing.List[ValidationWarning]: - """ - Creates a list of validation errors using the Validation objects contained in the Column - :param series: A pandas Series to validate - :return: An iterable of ValidationError instances generated by the validation - """ - return [error for validation in self.validations for error in validation.get_errors(series, self)] +def label_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: typing.Union[int, str], +): + """ + A utility method for setting the label-based column for each validation + :param validations: A list of validations to modify + :param index: The label of the series that these validations will now consider + """ + return _column( + validations, + index, + position=False + ) + + +def positional_column( + validations: typing.Iterable[validation.IndexSeriesValidation], + index: int, +): + """ + A utility method for setting the position-based column for each validation + :param validations: A list of validations to modify + :param index: The index of the series that these validations will now consider + """ + return _column( + validations, + index, + position=True + ) diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py index a9176bf..ab5e73d 100644 --- a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -10,6 +10,12 @@ class PanSchInvalidSchemaError(PanSchError): """ +class PanSchNoIndexError(PanSchInvalidSchemaError): + """ + A validation was provided that has not specified an index + """ + + class PanSchArgumentError(PanSchError): """ An argument passed to a function has an invalid type or value diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 5c0442e..13d8158 100644 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -11,7 +11,7 @@ class Schema: A 
schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): """ :param columns: A list of column objects :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring diff --git a/pandas_schema/validation.py b/pandas_schema/validation.py index 2a3f2f8..9343d7b 100644 --- a/pandas_schema/validation.py +++ b/pandas_schema/validation.py @@ -8,412 +8,67 @@ from . import column from .validation_warning import ValidationWarning -from .errors import PanSchArgumentError +from .errors import PanSchArgumentError, PanSchNoIndexError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class _BaseValidation: - """ - The validation base class that defines any object that can create a list of errors from a Series - """ - __metaclass__ = abc.ABCMeta - +class _BaseValidation(abc.ABC): @abc.abstractmethod - def get_errors(self, series: pd.Series, column: 'column.Column') -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ - Return a list of errors in the given series - :param series: - :param column: - :return: + Validates a data frame + :param df: Data frame to validate + :return: All validation failures detected by this validation """ class _SeriesValidation(_BaseValidation): """ - Implements the _BaseValidation interface by returning a Boolean series for each element that either passes or - fails the validation + A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation + to it """ - __metaclass__ = abc.ABCMeta - - def __init__(self, **kwargs): - self._custom_message = kwargs.get('message') - - @property - def message(self): - return self._custom_message or self.default_message - - @abc.abstractproperty - def default_message(self) 
-> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ @abc.abstractmethod - def validate(self, series: pd.Series) -> pd.Series: - """ - Returns a Boolean series, where each value of False is an element in the Series that has failed the validation - :param series: - :return: - """ - - def __invert__(self): - """ - Returns a negated version of this validation - """ - return _InverseValidation(self) - - def __or__(self, other: '_SeriesValidation'): - """ - Returns a validation which is true if either this or the other validation is true - """ - return _CombinedValidation(self, other, operator.or_) - - def __and__(self, other: '_SeriesValidation'): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - Returns a validation which is true if either this or the other validation is true + Selects a series from the DataFrame that will be validated """ - return _CombinedValidation(self, other, operator.and_) - - def get_errors(self, series: pd.Series, column: 'column.Column'): - - errors = [] - - # Calculate which columns are valid using the child class's validate function, skipping empty entries if the - # column specifies to do so - simple_validation = ~self.validate(series) - if column.allow_empty: - # Failing results are those that are not empty, and fail the validation - # explicitly check to make sure the series isn't a category because issubdtype will FAIL if it is - if is_categorical_dtype(series) or is_numeric_dtype(series): - validated = ~series.isnull() & simple_validation - else: - validated = (series.str.len() > 0) & simple_validation - - else: - validated = simple_validation - - # Cut down the original series to only ones that failed the validation - indices = series.index[validated] - - # Use these indices to find the failing items. 
Also print the index which is probably a row number - for i in indices: - element = series[i] - errors.append(ValidationWarning( - message=self.message, - value=element, - row=i, - column=series.name - )) - - return errors - - -class _InverseValidation(_SeriesValidation): - """ - Negates an ElementValidation - """ - - def __init__(self, validation: _SeriesValidation): - self.negated = validation - super().__init__() - - def validate(self, series: pd.Series): - return ~ self.negated.validate(series) - - @property - def default_message(self): - return self.negated.message + ' ' - - -class _CombinedValidation(_SeriesValidation): - """ - Validates if one and/or the other validation is true for an element - """ - - def __init__(self, validation_a: _SeriesValidation, validation_b: _SeriesValidation, operator): - self.operator = operator - self.v_a = validation_a - self.v_b = validation_b - super().__init__() - - def validate(self, series: pd.Series): - return self.operator(self.v_a.validate(series), self.v_b.validate(series)) - - @property - def default_message(self): - return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) - - -class CustomSeriesValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on an entire series (for example by using one of the pandas - Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) - """ - - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): - """ - :param message: The error message to provide to the user if this validation fails. 
The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal - to True if the object passed validation, and False if it failed - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return self._validation(series) - - -class CustomElementValidation(_SeriesValidation): - """ - Validates using a user-provided function that operates on each element - """ - - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): - """ - :param message: The error message to provide to the user if this validation fails. The row and column and - failing value will automatically be prepended to this message, so you only have to provide a message that - describes what went wrong, for example 'failed my validation' will become - - {row: 1, column: "Column Name"}: "Value" failed my validation - :param validation: A function that takes the value of a data frame cell and returns True if it passes the - the validation, and false if it doesn't - """ - self._validation = validation - super().__init__(message=message) - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self._validation) - - -class InRangeValidation(_SeriesValidation): - """ - Checks that each element in the series is within a given numerical range - """ - - def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): - """ - :param min: The minimum (inclusive) value to accept - :param max: The maximum (exclusive) value to accept - """ - self.min = min - self.max = max - super().__init__(**kwargs) - - @property - def default_message(self): - return 'was not in 
the range [{}, {})'.format(self.min, self.max) - - def validate(self, series: pd.Series) -> pd.Series: - series = pd.to_numeric(series) - return (series >= self.min) & (series < self.max) - -class IsDtypeValidation(_BaseValidation): - """ - Checks that a series has a certain numpy dtype - """ - - def __init__(self, dtype: np.dtype, **kwargs): - """ - :param dtype: The numpy dtype to check the column against - """ - self.dtype = dtype - super().__init__(**kwargs) - - def get_errors(self, series: pd.Series, column: 'column.Column' = None): - if not np.issubdtype(series.dtype, self.dtype): - return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) - )] - else: - return [] - - -class CanCallValidation(_SeriesValidation): - """ - Validates if a given function can be called on each element in a column without raising an exception - """ - - def __init__(self, func: typing.Callable, **kwargs): - """ - :param func: A python function that will be called with the value of each cell in the DataFrame. If this - function throws an error, this cell is considered to have failed the validation. Otherwise it has passed. 
- """ - if callable(type): - self.callable = func - else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) - super().__init__(**kwargs) - - @property - def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) - - def can_call(self, var): - try: - self.callable(var) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.apply(self.can_call) - - -class CanConvertValidation(CanCallValidation): - """ - Checks if each element in a column can be converted to a Python object type - """ - - """ - Internally this uses the same logic as CanCallValidation since all types are callable in python. - However this class overrides the error messages to make them more directed towards types - """ - - def __init__(self, _type: type, **kwargs): + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: """ - :param _type: Any python type. Its constructor will be called with the value of the individual cell as its - only argument. 
If it throws an exception, the value is considered to fail the validation, otherwise it has passed + Validate a single series """ - if isinstance(_type, type): - super(CanConvertValidation, self).__init__(_type, **kwargs) - else: - raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): - return 'cannot be converted to type {}'.format(self.callable) + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + series = self.select_series(df) + return self.validate_series(series) -class MatchesPatternValidation(_SeriesValidation): +class IndexSeriesValidation(_SeriesValidation): """ - Validates that a string or regular expression can match somewhere in each element in this column + Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation + or later """ - def __init__(self, pattern, options={}, **kwargs): + def __init__(self, index: typing.Union[int, str] = None, position: bool = False): """ - :param kwargs: Arguments to pass to Series.str.contains - (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html) - pat is the only required argument + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
+ Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.pattern = pattern - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).str.contains(self.pattern, **self.options) - - -class TrailingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no trailing whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains trailing whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('\s+$') - - -class LeadingWhitespaceValidation(_SeriesValidation): - """ - Checks that there is no leading whitespace in this column - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains leading whitespace' - - def validate(self, series: pd.Series) -> pd.Series: - return ~series.astype(str).str.contains('^\s+') - - -class IsDistinctValidation(_SeriesValidation): - """ - Checks that every element of this column is different from each other element - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @property - def default_message(self): - return 'contains values that are not unique' + self.index = column + self.position = position - def validate(self, series: pd.Series) -> pd.Series: - return ~series.duplicated(keep='first') - - -class InListValidation(_SeriesValidation): - """ - Checks that each element in this column is contained within a list of possibilities - """ - - def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs): + def select_series(self, df: pd.DataFrame) -> pd.Series: """ - :param options: A list of values to check. 
If the value of a cell is in this list, it is considered to pass the - validation + Select a series using the data stored in this validation """ - self.case_sensitive = case_sensitive - self.options = options - super().__init__(**kwargs) - - @property - def default_message(self): - values = ', '.join(str(v) for v in self.options) - return 'is not in the list of legal options ({})'.format(values) + if self.index is None: + raise PanSchNoIndexError() - def validate(self, series: pd.Series) -> pd.Series: - if self.case_sensitive: - return series.isin(self.options) + if self.position: + return df.iloc[self.index] else: - return series.str.lower().isin([s.lower() for s in self.options]) - - -class DateFormatValidation(_SeriesValidation): - """ - Checks that each element in this column is a valid date according to a provided format string - """ - - def __init__(self, date_format: str, **kwargs): - """ - :param date_format: The date format string to validate the column against. Refer to the date format code - documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full - list of format codes - """ - self.date_format = date_format - super().__init__(**kwargs) - - @property - def default_message(self): - return 'does not match the date format string "{}"'.format(self.date_format) - - def valid_date(self, val): - try: - datetime.datetime.strptime(val, self.date_format) - return True - except: - return False - - def validate(self, series: pd.Series) -> pd.Series: - return series.astype(str).apply(self.valid_date) + return df.loc[self.index] From e92045d12b4ca8c4d0e7ea26ac94ec7f4a4cc9c0 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Wed, 25 Dec 2019 01:59:06 +1100 Subject: [PATCH 02/31] InRangeValidation working with tests --- .gitignore | 0 .travis.yml | 0 LICENSE | 0 README.rst | 0 doc/common/introduction.rst | 0 doc/readme/README.rst | 0 doc/readme/conf.py | 0 doc/site/Makefile | 0 doc/site/conf.py | 0 doc/site/index.rst | 
0 example/boolean.py | 0 example/boolean.txt | 0 example/example.py | 0 example/example.txt | 0 pandas_schema/__init__.py | 2 -- pandas_schema/column.py | 11 +++---- pandas_schema/{validation.py => core.py} | 39 ++++++++++++++++++++++-- pandas_schema/errors.py | 0 pandas_schema/schema.py | 1 - pandas_schema/validation_warning.py | 0 pandas_schema/validations.py | 27 ++++++++++++++++ pandas_schema/version.py | 0 requirements.txt | 0 test/__init__.py | 0 test/test_column.py | 0 test/test_example.py | 0 test/test_metadata.py | 0 test/test_schema.py | 0 test/test_validation.py | 8 ++--- test/test_validation_warning.py | 0 30 files changed, 72 insertions(+), 16 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .travis.yml mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.rst mode change 100644 => 100755 doc/common/introduction.rst mode change 100644 => 100755 doc/readme/README.rst mode change 100644 => 100755 doc/readme/conf.py mode change 100644 => 100755 doc/site/Makefile mode change 100644 => 100755 doc/site/conf.py mode change 100644 => 100755 doc/site/index.rst mode change 100644 => 100755 example/boolean.py mode change 100644 => 100755 example/boolean.txt mode change 100644 => 100755 example/example.py mode change 100644 => 100755 example/example.txt mode change 100644 => 100755 pandas_schema/__init__.py mode change 100644 => 100755 pandas_schema/column.py rename pandas_schema/{validation.py => core.py} (65%) mode change 100644 => 100755 mode change 100644 => 100755 pandas_schema/errors.py mode change 100644 => 100755 pandas_schema/schema.py mode change 100644 => 100755 pandas_schema/validation_warning.py create mode 100755 pandas_schema/validations.py mode change 100644 => 100755 pandas_schema/version.py mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 test/__init__.py mode change 100644 => 100755 test/test_column.py mode change 100644 => 100755 test/test_example.py mode change 
100644 => 100755 test/test_metadata.py mode change 100644 => 100755 test/test_schema.py mode change 100644 => 100755 test/test_validation.py mode change 100644 => 100755 test/test_validation_warning.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/doc/common/introduction.rst b/doc/common/introduction.rst old mode 100644 new mode 100755 diff --git a/doc/readme/README.rst b/doc/readme/README.rst old mode 100644 new mode 100755 diff --git a/doc/readme/conf.py b/doc/readme/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/Makefile b/doc/site/Makefile old mode 100644 new mode 100755 diff --git a/doc/site/conf.py b/doc/site/conf.py old mode 100644 new mode 100755 diff --git a/doc/site/index.rst b/doc/site/index.rst old mode 100644 new mode 100755 diff --git a/example/boolean.py b/example/boolean.py old mode 100644 new mode 100755 diff --git a/example/boolean.txt b/example/boolean.txt old mode 100644 new mode 100755 diff --git a/example/example.py b/example/example.py old mode 100644 new mode 100755 diff --git a/example/example.txt b/example/example.txt old mode 100644 new mode 100755 diff --git a/pandas_schema/__init__.py b/pandas_schema/__init__.py old mode 100644 new mode 100755 index 6f7ff97..fabe184 --- a/pandas_schema/__init__.py +++ b/pandas_schema/__init__.py @@ -1,4 +1,2 @@ -from .column import Column from .validation_warning import ValidationWarning -from .schema import Schema from .version import __version__ diff --git a/pandas_schema/column.py b/pandas_schema/column.py old mode 100644 new mode 100755 index cec4153..e0df39a --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,12 +1,11 @@ import typing import pandas as pd -from . 
import validation +import pandas_schema.core from .validation_warning import ValidationWarning - def _column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: typing.Union[int, str] = None, position: bool = False ): @@ -21,9 +20,8 @@ def _column( valid.index = index valid.position = position - def label_column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: typing.Union[int, str], ): """ @@ -37,9 +35,8 @@ def label_column( position=False ) - def positional_column( - validations: typing.Iterable[validation.IndexSeriesValidation], + validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: int, ): """ diff --git a/pandas_schema/validation.py b/pandas_schema/core.py old mode 100644 new mode 100755 similarity index 65% rename from pandas_schema/validation.py rename to pandas_schema/core.py index 9343d7b..9b3c2fc --- a/pandas_schema/validation.py +++ b/pandas_schema/core.py @@ -5,6 +5,7 @@ import numpy as np import typing import operator +import re from . import column from .validation_warning import ValidationWarning @@ -13,6 +14,9 @@ class _BaseValidation(abc.ABC): + """ + A validation is, broadly, just a function that maps a data frame to a list of errors + """ @abc.abstractmethod def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ @@ -51,15 +55,42 @@ class IndexSeriesValidation(_SeriesValidation): or later """ - def __init__(self, index: typing.Union[int, str] = None, position: bool = False): + def __init__(self, index: typing.Union[int, str] = None, position: bool = False, message:str=None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.index = column + self.index = index self.position = position + self.custom_message = message + + @property + def message(self): + """ + Gets a message describing how the DataFrame cell failed the validation + This shouldn't really be overridden, instead override default_message so that users can still set per-object + messages + :return: + """ + return self.custom_message or self.default_message + + @property + def readable_name(self): + """ + A readable name for this validation, to be shown in validation warnings + """ + return type(self).__name__ + + @property + def default_message(self) -> str: + """ + Create a message to be displayed whenever this validation fails + This should be a generic message for the validation type, but can be overwritten if the user provides a + message kwarg + """ + return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -72,3 +103,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return df.iloc[self.index] else: return df.loc[self.index] + + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + pass diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py old mode 100644 new mode 100755 diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py old mode 100644 new mode 100755 index 13d8158..da27d81 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -3,7 +3,6 @@ from .errors import PanSchInvalidSchemaError, PanSchArgumentError from .validation_warning import ValidationWarning -from .column import Column class Schema: diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py old mode 100644 new mode 100755 diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py new file mode 100755 index 0000000..14c3df8 --- /dev/null +++ b/pandas_schema/validations.py @@ -0,0 
+1,27 @@ +from .core import _SeriesValidation, IndexSeriesValidation +from .validation_warning import ValidationWarning +import pandas as pd +import math +import typing + +class InRangeValidation(IndexSeriesValidation): + """ + Checks that each element in the series is within a given numerical range + """ + + def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): + """ + :param min: The minimum (inclusive) value to accept + :param max: The maximum (exclusive) value to accept + """ + self.min = min + self.max = max + super().__init__(**kwargs) + + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + series = pd.to_numeric(series) + return (series >= self.min) & (series < self.max) diff --git a/pandas_schema/version.py b/pandas_schema/version.py old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/test/__init__.py b/test/__init__.py old mode 100644 new mode 100755 diff --git a/test/test_column.py b/test/test_column.py old mode 100644 new mode 100755 diff --git a/test/test_example.py b/test/test_example.py old mode 100644 new mode 100755 diff --git a/test/test_metadata.py b/test/test_metadata.py old mode 100644 new mode 100755 diff --git a/test/test_schema.py b/test/test_schema.py old mode 100644 new mode 100755 diff --git a/test/test_validation.py b/test/test_validation.py old mode 100644 new mode 100755 index 7914025..d8928dc --- a/test/test_validation.py +++ b/test/test_validation.py @@ -3,10 +3,10 @@ import re from numpy import nan, dtype +import pandas as pd -from pandas_schema import Column, Schema -from pandas_schema.validation import _BaseValidation -from pandas_schema.validation import * +from pandas_schema.validations import InRangeValidation +from pandas_schema.core import _BaseValidation from pandas_schema import 
ValidationWarning @@ -32,7 +32,7 @@ def validate_and_compare(self, series: list, expected_result: bool, msg: str = N self.addTypeEqualityFunc(pd.Series, self.seriesEquality) # Convert the input list to a series and validate it - results = self.validator.validate(pd.Series(series, dtype=series_dtype)) + results = self.validator.validate_series(pd.Series(series, dtype=series_dtype)) # Now find any items where their validation does not correspond to the expected_result for item, result in zip(series, results): diff --git a/test/test_validation_warning.py b/test/test_validation_warning.py old mode 100644 new mode 100755 From dcb04c45e0b922a7a7e7aba2d764d359e5a64ef4 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 21 Jan 2020 00:54:36 +1100 Subject: [PATCH 03/31] Clarify and cleanup Warning class, add back in the standard validations like LeadingWhitespace --- pandas_schema/core.py | 104 +++++++++++++++-- pandas_schema/validations.py | 217 ++++++++++++++++++++++++++++++++++- test/test_validation.py | 15 ++- 3 files changed, 313 insertions(+), 23 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 9b3c2fc..08807b3 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -8,30 +8,61 @@ import re from . 
import column -from .validation_warning import ValidationWarning from .errors import PanSchArgumentError, PanSchNoIndexError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class _BaseValidation(abc.ABC): +class BaseValidation(abc.ABC): """ A validation is, broadly, just a function that maps a data frame to a list of errors """ + @abc.abstractmethod - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ + class Warning: + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but + need only do that if the subclass needs to store additional data. + """ + + def __init__(self, validation: 'BaseValidation', message: str): + self.message = message -class _SeriesValidation(_BaseValidation): + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return self.message + + +class SeriesValidation(BaseValidation): """ A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation to it """ + class Warning(BaseValidation.Warning): + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + """ + + def __init__(self, validation: BaseValidation, message: str, series: pd.Series): + super().__init__(validation, message) + self.series = series + + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return '{} {}'.format(self.series.name, self.message) + @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -39,31 +70,46 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: """ @abc.abstractmethod - def 
validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: """ Validate a single series """ - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: series = self.select_series(df) return self.validate_series(series) -class IndexSeriesValidation(_SeriesValidation): +class IndexSeriesValidation(SeriesValidation): """ Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation or later """ + class Warning(SeriesValidation.Warning): + """ + Represents a difference between the schema and data frame, found during the validation of the data frame + """ + + def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional): + super().__init__(validation, message, series) + self.col_index = col_index + self.positional = positional - def __init__(self, index: typing.Union[int, str] = None, position: bool = False, message:str=None): + def __str__(self) -> str: + """ + The entire warning message as a string + """ + return 'Column {} {}'.format(self.col_index, self.message) + + def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series - :param position: If true, the index is a position along the axis (ie, index=0 indicates the first element). + :param positional: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.index = index - self.position = position + self.positional = positional self.custom_message = message @property @@ -99,11 +145,45 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: if self.index is None: raise PanSchNoIndexError() - if self.position: + if self.positional: return df.iloc[self.index] else: return df.loc[self.index] @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: pass + + +class BooleanSeriesValidation(IndexSeriesValidation): + """ + Validation is defined by the function :py:meth:~select_cells that returns a boolean series. + Each cell that has False has failed the validation. + + Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, + because the data is in the same form for each cell. You need only define a :py:meth~default_message. + """ + class Warning(IndexSeriesValidation.Warning): + def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional, row_index, value): + super().__init__(validation, message, series, col_index, positional) + self.row_index = row_index + self.value = value + + def __str__(self) -> str: + return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row_index, self.col_index, self.value, self.message) + + @abc.abstractmethod + def select_cells(self, series: pd.Series) -> pd.Series: + """ + A BooleanSeriesValidation must return a boolean series. 
Each cell that has False has failed the + validation + :param series: The series to validate + """ + pass + + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + indices = self.select_cells(series) + cells = series[indices] + return ( + Warning(self, self.message, series, self.index, self.positional, row_idx, cell) for row_idx, cell in cells.items() + ) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 14c3df8..58b9396 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -1,10 +1,35 @@ -from .core import _SeriesValidation, IndexSeriesValidation +from .core import SeriesValidation, IndexSeriesValidation, BooleanSeriesValidation from .validation_warning import ValidationWarning +from .errors import PanSchError, PanSchArgumentError +import numpy as np import pandas as pd import math import typing +import datetime -class InRangeValidation(IndexSeriesValidation): + +class IsDtypeValidation(IndexSeriesValidation): + """ + Checks that a series has a certain numpy dtype + """ + + def __init__(self, dtype: np.dtype, **kwargs): + """ + :param dtype: The numpy dtype to check the column against + """ + self.dtype = dtype + super().__init__(**kwargs) + + @property + def default_message(self): + return 'did not have the dtype "{}"'.format(self.dtype.name) + + def validate_series(self, series: pd.Series): + if not series.dtype == self.dtype: + return [self.Warning(self, self.message, series, self.index, self.positional)] + + +class InRangeValidation(BooleanSeriesValidation): """ Checks that each element in the series is within a given numerical range """ @@ -18,10 +43,192 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) + def select_cells(self, series: pd.Series) -> pd.Series: + series = pd.to_numeric(series) + return (series >= self.min) & (series < self.max) + @property def default_message(self): return 'was not in the range 
[{}, {})'.format(self.min, self.max) - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - series = pd.to_numeric(series) - return (series >= self.min) & (series < self.max) + +class CanCallValidation(BooleanSeriesValidation): + """ + Validates if a given function can be called on each element in a column without raising an exception + """ + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self.can_call) + + def __init__(self, func: typing.Callable, **kwargs): + """ + :param func: A python function that will be called with the value of each cell in the DataFrame. If this + function throws an error, this cell is considered to have failed the validation. Otherwise it has passed. + """ + if callable(type): + self.callable = func + else: + raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) + super().__init__(**kwargs) + + @property + def default_message(self): + return 'raised an exception when the callable {} was called on it'.format(self.callable) + + def can_call(self, var): + try: + self.callable(var) + return True + except: + return False + + +class CanConvertValidation(CanCallValidation): + """ + Checks if each element in a column can be converted to a Python object type + """ + + """ + Internally this uses the same logic as CanCallValidation since all types are callable in python. + However this class overrides the error messages to make them more directed towards types + """ + + def __init__(self, _type: type, **kwargs): + """ + :param _type: Any python type. Its constructor will be called with the value of the individual cell as its + only argument. 
If it throws an exception, the value is considered to fail the validation, otherwise it has passed + """ + if isinstance(_type, type): + super(CanConvertValidation, self).__init__(_type, **kwargs) + else: + raise PanSchArgumentError('{} is not a valid type'.format(_type)) + + @property + def default_message(self): + return 'cannot be converted to type {}'.format(self.callable) + + +class MatchesPatternValidation(BooleanSeriesValidation): + """ + Validates that a string or regular expression can match somewhere in each element in this column + """ + + def __init__(self, pattern, options={}, **kwargs): + """ + :param kwargs: Arguments to pass to Series.str.contains + (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html) + pat is the only required argument + """ + self.pattern = pattern + self.options = options + super().__init__(**kwargs) + + @property + def default_message(self): + return 'does not match the pattern "{}"'.format(self.pattern) + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.astype(str).str.contains(self.pattern, **self.options) + + +class TrailingWhitespaceValidation(BooleanSeriesValidation): + """ + Checks that there is no trailing whitespace in this column + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains trailing whitespace' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.astype(str).str.contains('\s+$') + + +class LeadingWhitespaceValidation(BooleanSeriesValidation): + """ + Checks that there is no leading whitespace in this column + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains leading whitespace' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.astype(str).str.contains('^\s+') + + +class IsDistinctValidation(BooleanSeriesValidation): + """ + Checks that every element 
of this column is different from each other element + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @property + def default_message(self): + return 'contains values that are not unique' + + def select_cells(self, series: pd.Series) -> pd.Series: + return ~series.duplicated(keep='first') + + +class InListValidation(BooleanSeriesValidation): + """ + Checks that each element in this column is contained within a list of possibilities + """ + + def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwargs): + """ + :param options: A list of values to check. If the value of a cell is in this list, it is considered to pass the + validation + """ + self.case_sensitive = case_sensitive + self.options = options + super().__init__(**kwargs) + + @property + def default_message(self): + values = ', '.join(str(v) for v in self.options) + return 'is not in the list of legal options ({})'.format(values) + + def select_cells(self, series: pd.Series) -> pd.Series: + if self.case_sensitive: + return series.isin(self.options) + else: + return series.str.lower().isin([s.lower() for s in self.options]) + + +class DateFormatValidation(BooleanSeriesValidation): + """ + Checks that each element in this column is a valid date according to a provided format string + """ + + def __init__(self, date_format: str, **kwargs): + """ + :param date_format: The date format string to validate the column against. 
Refer to the date format code + documentation at https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior for a full + list of format codes + """ + self.date_format = date_format + super().__init__(**kwargs) + + @property + def default_message(self): + return 'does not match the date format string "{}"'.format(self.date_format) + + def valid_date(self, val): + try: + datetime.datetime.strptime(val, self.date_format) + return True + except: + return False + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.astype(str).apply(self.valid_date) diff --git a/test/test_validation.py b/test/test_validation.py index d8928dc..fef3958 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -1,3 +1,6 @@ +""" +Tests for pandas_schema.validations +""" import json import unittest import re @@ -5,8 +8,8 @@ from numpy import nan, dtype import pandas as pd -from pandas_schema.validations import InRangeValidation -from pandas_schema.core import _BaseValidation +from pandas_schema.validations import * +from pandas_schema.core import BooleanSeriesValidation from pandas_schema import ValidationWarning @@ -15,24 +18,24 @@ def seriesEquality(self, s1: pd.Series, s2: pd.Series, msg: str = None): if not s1.equals(s2): raise self.failureException(msg) - def validate_and_compare(self, series: list, expected_result: bool, msg: str = None, series_dtype: object = None): + def validate_and_compare(self, series: list, expected_result: bool, msg: str = None): """ Checks that every element in the provided series is equal to `expected_result` after validation - :param series_dtype: Explicity specifies the dtype for the generated Series + :param series_dtype: Explicitly specifies the dtype for the generated Series :param series: The series to check :param expected_result: Whether the elements in this series should pass the validation :param msg: The message to display if this test fails """ # Check that self.validator is correct - if not 
self.validator or not isinstance(self.validator, _BaseValidation): + if not self.validator or not isinstance(self.validator, BooleanSeriesValidation): raise ValueError('The class must have the validator field set to an instance of a Validation subclass') # Ensure we're comparing series correctly self.addTypeEqualityFunc(pd.Series, self.seriesEquality) # Convert the input list to a series and validate it - results = self.validator.validate_series(pd.Series(series, dtype=series_dtype)) + results = self.validator.select_cells(pd.Series(series)) # Now find any items where their validation does not correspond to the expected_result for item, result in zip(series, results): From 8bffe9357090526aa9d424c5a44e57f744c6a401 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 24 Jan 2020 01:13:13 +1100 Subject: [PATCH 04/31] Re-use some old validations for nicer diff; fix some tests --- pandas_schema/validations.py | 100 ++++++++++++++++++++++++++--------- test/test_validation.py | 24 ++++----- 2 files changed, 88 insertions(+), 36 deletions(-) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 58b9396..ea36595 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -1,32 +1,61 @@ -from .core import SeriesValidation, IndexSeriesValidation, BooleanSeriesValidation -from .validation_warning import ValidationWarning -from .errors import PanSchError, PanSchArgumentError -import numpy as np -import pandas as pd +import abc import math -import typing import datetime +import pandas as pd +import numpy as np +import typing +import operator +from . 
import column +from .core import IndexSeriesValidation, BooleanSeriesValidation +from .validation_warning import ValidationWarning +from .errors import PanSchArgumentError +from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class IsDtypeValidation(IndexSeriesValidation): + +class CustomSeriesValidation(BooleanSeriesValidation): """ - Checks that a series has a certain numpy dtype + Validates using a user-provided function that operates on an entire series (for example by using one of the pandas + Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) """ - def __init__(self, dtype: np.dtype, **kwargs): + def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): """ - :param dtype: The numpy dtype to check the column against + :param message: The error message to provide to the user if this validation fails. The row and column and + failing value will automatically be prepended to this message, so you only have to provide a message that + describes what went wrong, for example 'failed my validation' will become + + {row: 1, column: "Column Name"}: "Value" failed my validation + :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal + to True if the object passed validation, and False if it failed """ - self.dtype = dtype - super().__init__(**kwargs) + self._validation = validation + super().__init__(message=message) - @property - def default_message(self): - return 'did not have the dtype "{}"'.format(self.dtype.name) + def select_cells(self, series: pd.Series) -> pd.Series: + return self._validation(series) + + +class CustomElementValidation(BooleanSeriesValidation): + """ + Validates using a user-provided function that operates on each element + """ + + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): + """ + :param message: The error message to provide to the user if this validation fails. 
The row and column and + failing value will automatically be prepended to this message, so you only have to provide a message that + describes what went wrong, for example 'failed my validation' will become - def validate_series(self, series: pd.Series): - if not series.dtype == self.dtype: - return [self.Warning(self, self.message, series, self.index, self.positional)] + {row: 1, column: "Column Name"}: "Value" failed my validation + :param validation: A function that takes the value of a data frame cell and returns True if it passes the + the validation, and false if it doesn't + """ + self._validation = validation + super().__init__(message=message) + + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self._validation) class InRangeValidation(BooleanSeriesValidation): @@ -43,13 +72,36 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) + @property + def default_message(self): + return 'was not in the range [{}, {})'.format(self.min, self.max) + def select_cells(self, series: pd.Series) -> pd.Series: series = pd.to_numeric(series) return (series >= self.min) & (series < self.max) - @property - def default_message(self): - return 'was not in the range [{}, {})'.format(self.min, self.max) + +class IsDtypeValidation(IndexSeriesValidation): + """ + Checks that a series has a certain numpy dtype + """ + + def __init__(self, dtype: np.dtype, **kwargs): + """ + :param dtype: The numpy dtype to check the column against + """ + self.dtype = dtype + super().__init__(**kwargs) + + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + if not np.issubdtype(series.dtype, self.dtype): + return [ValidationWarning( + 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( + column.name if column else '', series.dtype, self.dtype + ) + )] + else: + return [] class CanCallValidation(BooleanSeriesValidation): @@ -57,9 +109,6 @@ 
class CanCallValidation(BooleanSeriesValidation): Validates if a given function can be called on each element in a column without raising an exception """ - def select_cells(self, series: pd.Series) -> pd.Series: - return series.apply(self.can_call) - def __init__(self, func: typing.Callable, **kwargs): """ :param func: A python function that will be called with the value of each cell in the DataFrame. If this @@ -82,6 +131,9 @@ def can_call(self, var): except: return False + def select_cells(self, series: pd.Series) -> pd.Series: + return series.apply(self.can_call) + class CanConvertValidation(CanCallValidation): """ diff --git a/test/test_validation.py b/test/test_validation.py index fef3958..0bd6623 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -390,7 +390,7 @@ def test_valid_strings(self): ) def test_invalid_strings(self): - validation = self.validator.validate(pd.Series([ + validation = self.validator.select_cells(pd.Series([ '1', '1', '3', @@ -476,7 +476,7 @@ def setUp(self): self.validator = IsDtypeValidation(np.number) def test_valid_items(self): - errors = self.validator.get_errors(pd.Series( + errors = self.validator.validate_series(pd.Series( [ 1, 2, @@ -486,7 +486,7 @@ def test_valid_items(self): self.assertEqual(len(errors), 0) def test_invalid_items(self): - errors = self.validator.get_errors(pd.Series( + errors = self.validator.validate_series(pd.Series( [ 'a', '', @@ -600,7 +600,7 @@ def setUp(self): def test_default_message(self): validator = InRangeValidation(min=4) - for error in validator.get_errors(pd.Series( + for error in validator.validate_series(pd.Series( [ 1, 2, @@ -611,7 +611,7 @@ def test_default_message(self): def test_custom_message(self): validator = InRangeValidation(min=4, message=self.message) - for error in validator.get_errors(pd.Series( + for error in validator.validate_series(pd.Series( [ 1, 2, @@ -631,17 +631,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = 
InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.get_errors(pd.Series(self.vals), Column('', allow_empty=False)) + errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=False)) self.assertEqual(len(errors), len(self.vals)) @@ -654,21 +654,21 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) def test_valid_elements(self): - errors = self.validator.get_errors(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), Column('', allow_empty=True)) self.assertEqual(len(errors), 0) def test_invalid_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd', None], dtype='category'), + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category'), Column('', allow_empty=False)) self.assertEqual(len(errors), 4) def test_invalid_and_empty_elements(self): - errors = self.validator.get_errors(pd.Series(['a', None], dtype='category'), + errors = self.validator.validate_series(pd.Series(['a', None], dtype='category'), Column('', allow_empty=False)) self.assertEqual(len(errors), 1) def test_invalid_elements(self): - errors = self.validator.get_errors(pd.Series(['aa', 'bb', 'd'], dtype='category'), + errors = 
self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category'), Column('', allow_empty=True)) self.assertEqual(len(errors), 3) From 9c6b910bd7b7a7d27c5aa971cb7585589b57294d Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 31 Jan 2020 20:54:22 +1100 Subject: [PATCH 05/31] Some miscellaneous design docs and updates --- TODO.md | 1 + UPDATE.md | 28 +++++++++++++++++++++ pandas_schema/core.py | 47 +++++++++++++----------------------- pandas_schema/validations.py | 4 ++- setup.py | 6 ++++- test/test_validation.py | 22 +++++++---------- 6 files changed, 63 insertions(+), 45 deletions(-) create mode 100755 TODO.md create mode 100755 UPDATE.md diff --git a/TODO.md b/TODO.md new file mode 100755 index 0000000..4350b8b --- /dev/null +++ b/TODO.md @@ -0,0 +1 @@ +* Add validations that apply to every column in the DF equally \ No newline at end of file diff --git a/UPDATE.md b/UPDATE.md new file mode 100755 index 0000000..80c3562 --- /dev/null +++ b/UPDATE.md @@ -0,0 +1,28 @@ +# ValidationWarnings +## Options for the ValidationWarning data +* We keep it as is, with one single ValidationWarning class that stores a `message` and a reference to the validation +that spawned it +* PREFERRED: As above, but we add a dictionary of miscellaneous kwargs to the ValidationWarning for storing stuff like the row index that failed +* We have a dataclass for each Validation type that stores things in a more structured way + * Why bother doing this if the Validation stores its own structure for the column index etc? + +## Options for the ValidationWarning message +* It's generated from the Validation as a fixed string, as it is now +* It's generated dynamically by the VW + * This means that custom messages means overriding the VW class +* PREFERRED: It's generated dynamically in the VW by calling the parent Validation with a reference to itself, e.g. 
+ ```python + class ValidationWarning: + def __str__(self): + return self.validation.generate_message(self) + + class Validation: + def generate_message(warning: ValidationWarning) -> str: + pass + ``` + * This lets the message function use all the validation properties, and the dictionary of kwargs that it specified + * `generate_message()` will call `default_message(**kwargs)`, the dynamic class method, or `self.custom_message`, the + non-dynamic string specified by the user + * Each category of Validation will define a `create_prefix()` method, that creates the {row: 1, column: 2} prefix + that goes before each message. Thus, `generate_message()` will concatenate that with the actual message +* diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 08807b3..0a7da33 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -6,6 +6,7 @@ import typing import operator import re +from dataclasses import dataclass from . import column from .errors import PanSchArgumentError, PanSchNoIndexError @@ -25,21 +26,23 @@ def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: :return: All validation failures detected by this validation """ + def message(self, **kwargs) -> str: + pass + + @dataclass class Warning: """ Represents a difference between the schema and data frame, found during the validation of the data frame Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but need only do that if the subclass needs to store additional data. 
""" - - def __init__(self, validation: 'BaseValidation', message: str): - self.message = message + validation: 'BaseValidation' def __str__(self) -> str: """ The entire warning message as a string """ - return self.message + return self.validation.message() class SeriesValidation(BaseValidation): @@ -52,16 +55,7 @@ class Warning(BaseValidation.Warning): """ Represents a difference between the schema and data frame, found during the validation of the data frame """ - - def __init__(self, validation: BaseValidation, message: str, series: pd.Series): - super().__init__(validation, message) - self.series = series - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return '{} {}'.format(self.series.name, self.message) + series: pd.Series @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -89,17 +83,8 @@ class Warning(SeriesValidation.Warning): """ Represents a difference between the schema and data frame, found during the validation of the data frame """ - - def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional): - super().__init__(validation, message, series) - self.col_index = col_index - self.positional = positional - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return 'Column {} {}'.format(self.col_index, self.message) + col_index: int + positional: bool def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): """ @@ -113,24 +98,26 @@ def __init__(self, index: typing.Union[int, str] = None, positional: bool = Fals self.custom_message = message @property - def message(self): + def message(self, **kwargs): """ Gets a message describing how the DataFrame cell failed the validation This shouldn't really be overridden, instead override default_message so that users can still set per-object messages :return: """ - return self.custom_message or self.default_message + if 
self.custom_message: + return self.custom_message() + else: + return self.default_message(**kwargs) @property - def readable_name(self): + def readable_name(self, **kwargs): """ A readable name for this validation, to be shown in validation warnings """ return type(self).__name__ - @property - def default_message(self) -> str: + def default_message(self, **kwargs) -> str: """ Create a message to be displayed whenever this validation fails This should be a generic message for the validation type, but can be overwritten if the user provides a diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index ea36595..d2d4b72 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -85,7 +85,6 @@ class IsDtypeValidation(IndexSeriesValidation): """ Checks that a series has a certain numpy dtype """ - def __init__(self, dtype: np.dtype, **kwargs): """ :param dtype: The numpy dtype to check the column against @@ -93,6 +92,9 @@ def __init__(self, dtype: np.dtype, **kwargs): self.dtype = dtype super().__init__(**kwargs) + def default_message(self) -> str: + return 'has a dtype of {} which is not a subclass of the required type {}'.format(self.dtype,) + def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( diff --git a/setup.py b/setup.py index 8d8d0fd..2441567 100755 --- a/setup.py +++ b/setup.py @@ -81,7 +81,11 @@ def run(self): ], keywords='pandas csv verification schema', packages=find_packages(include=['pandas_schema']), - install_requires=['numpy', 'pandas>=0.19'], + install_requires=[ + 'numpy', + 'pandas>=0.19', + 'dataclasses' + ], cmdclass={ 'build_readme': BuildReadme, 'build_site': BuildHtmlDocs diff --git a/test/test_validation.py b/test/test_validation.py index 0bd6623..8efed36 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -606,7 +606,7 @@ def test_default_message(self): 2, 3 ] - ), Column('')): + )): 
self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') def test_custom_message(self): @@ -617,7 +617,7 @@ def test_custom_message(self): 2, 3 ] - ), Column('')): + )): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') @@ -631,17 +631,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=True)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals), Column('', allow_empty=False)) + errors = validator.validate_series(pd.Series(self.vals)) self.assertEqual(len(errors), len(self.vals)) @@ -654,21 +654,17 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category'), - Column('', allow_empty=True)) + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) self.assertEqual(len(errors), 0) def test_invalid_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category'), - Column('', allow_empty=False)) + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) self.assertEqual(len(errors), 4) def test_invalid_and_empty_elements(self): - errors = 
self.validator.validate_series(pd.Series(['a', None], dtype='category'), - Column('', allow_empty=False)) + errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) self.assertEqual(len(errors), 1) def test_invalid_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category'), - Column('', allow_empty=True)) + errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) self.assertEqual(len(errors), 3) From 28e2c115371bb9bef88407b0a1dd693b0c94b020 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 2 Feb 2020 00:09:40 +1100 Subject: [PATCH 06/31] Sort out new ValidationWarning structure --- pandas_schema/core.py | 60 ++++++++--------------------- pandas_schema/validation_warning.py | 37 ++++++++++-------- pandas_schema/validations.py | 23 ++++++----- test/test_validation.py | 1 + 4 files changed, 50 insertions(+), 71 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 0a7da33..76cf323 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -10,6 +10,7 @@ from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError +from pandas_schema.validation_warning import ValidationWarning from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -19,44 +20,23 @@ class BaseValidation(abc.ABC): """ @abc.abstractmethod - def validate(self, df: pd.DataFrame) -> typing.Iterable[Warning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ - def message(self, **kwargs) -> str: + def message(self, warning: ValidationWarning) -> str: pass - @dataclass - class Warning: - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - Child classes can define their own subclass of :py:class:~pandas_schema.core.BaseValidation.Warning, but - need only do that if the subclass needs to store additional data. - """ - validation: 'BaseValidation' - - def __str__(self) -> str: - """ - The entire warning message as a string - """ - return self.validation.message() - class SeriesValidation(BaseValidation): """ - A _SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation - to it + A _SeriesValidation validates a DataFrame by selecting a single series from it, and + applying some validation to it """ - class Warning(BaseValidation.Warning): - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - """ - series: pd.Series - @abc.abstractmethod def select_series(self, df: pd.DataFrame) -> pd.Series: """ @@ -64,12 +44,12 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: """ @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: """ Validate a single series """ - def validate(self, df: 
pd.DataFrame) -> typing.Iterable[Warning]: + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: series = self.select_series(df) return self.validate_series(series) @@ -79,14 +59,9 @@ class IndexSeriesValidation(SeriesValidation): Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation or later """ - class Warning(SeriesValidation.Warning): - """ - Represents a difference between the schema and data frame, found during the validation of the data frame - """ - col_index: int - positional: bool - def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, message: str = None): + def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, + message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series @@ -138,7 +113,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return df.loc[self.index] @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: pass @@ -150,14 +125,6 @@ class BooleanSeriesValidation(IndexSeriesValidation): Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, because the data is in the same form for each cell. You need only define a :py:meth~default_message. 
""" - class Warning(IndexSeriesValidation.Warning): - def __init__(self, validation: BaseValidation, message: str, series: pd.Series, col_index, positional, row_index, value): - super().__init__(validation, message, series, col_index, positional) - self.row_index = row_index - self.value = value - - def __str__(self) -> str: - return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row_index, self.col_index, self.value, self.message) @abc.abstractmethod def select_cells(self, series: pd.Series) -> pd.Series: @@ -168,9 +135,12 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: indices = self.select_cells(series) cells = series[indices] return ( - Warning(self, self.message, series, self.index, self.positional, row_idx, cell) for row_idx, cell in cells.items() + ValidationWarning(self, { + 'row': row_idx, + 'value': cell + }) for row_idx, cell in cells.items() ) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 320be65..3eec3db 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,22 +1,25 @@ +import pandas_schema +from dataclasses import dataclass, field + + +@dataclass class ValidationWarning: """ - Represents a difference between the schema and data frame, found during the validation of the data frame + Represents a difference between the schema and data frame, found during the validation + of the data frame + """ + validation: 'pandas_schema.BaseValidation' + """ + The validation that spawned this warning """ - def __init__(self, message: str, value: str = None, row: int = -1, column: str = None): - self.message = message - self.value = value - """The value of the failing cell in the DataFrame""" - self.row = row - """The row index (usually an integer starting from 0) of the cell that failed the 
validation""" - self.column = column - """The column name of the cell that failed the validation""" + props: dict = field(default_factory=dict) + """ + List of data about this warning in addition to that provided by the validation, for + example, if a cell in the DataFrame didn't match the validation, the props might + include a `value` key, for storing what the actual value was + """ - def __str__(self) -> str: - """ - The entire warning message as a string - """ - if self.row is not None and self.column is not None and self.value is not None: - return '{{row: {}, column: "{}"}}: "{}" {}'.format(self.row, self.column, self.value, self.message) - else: - return self.message + @property + def message(self): + return self.validation.message(self) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index d2d4b72..442f237 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -41,7 +41,8 @@ class CustomElementValidation(BooleanSeriesValidation): Validates using a user-provided function that operates on each element """ - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], message: str): + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], + message: str): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -85,22 +86,23 @@ class IsDtypeValidation(IndexSeriesValidation): """ Checks that a series has a certain numpy dtype """ + def __init__(self, dtype: np.dtype, **kwargs): """ :param dtype: The numpy dtype to check the column against """ - self.dtype = dtype super().__init__(**kwargs) + self.dtype = dtype - def default_message(self) -> str: - return 'has a dtype of {} which is not a subclass of the required type {}'.format(self.dtype,) + def default_message(self, validation) -> str: + return 'has a dtype of {} which is not a subclass of the required type {}'.format( + self.dtype, validation.props['dtype']) def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( - 'The column {} has a dtype of {} which is not a subclass of the required type {}'.format( - column.name if column else '', series.dtype, self.dtype - ) + self, + {'dtype': series.dtype} )] else: return [] @@ -119,12 +121,15 @@ def __init__(self, func: typing.Callable, **kwargs): if callable(type): self.callable = func else: - raise PanSchArgumentError('The object "{}" passed to CanCallValidation is not callable!'.format(type)) + raise PanSchArgumentError( + 'The object "{}" passed to CanCallValidation is not callable!'.format( + type)) super().__init__(**kwargs) @property def default_message(self): - return 'raised an exception when the callable {} was called on it'.format(self.callable) + return 'raised an exception when the callable {} was called on it'.format( + self.callable) def can_call(self, var): try: diff --git a/test/test_validation.py b/test/test_validation.py index 8efed36..dbea90f 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,6 +10,7 @@ from pandas_schema.validations import * from pandas_schema.core import BooleanSeriesValidation +from 
pandas_schema.schema import Schema from pandas_schema import ValidationWarning From 04893b3c0c39ef171b8f301f3298d76a2f20dbf0 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 4 Feb 2020 01:16:04 +1100 Subject: [PATCH 07/31] Add indexer class, solidify message format and ValidationWarning --- pandas_schema/column.py | 78 +++++++++++++++++++------------ pandas_schema/core.py | 31 ++++++------ pandas_schema/errors.py | 14 +++++- pandas_schema/index.py | 68 +++++++++++++++++++++++++++ pandas_schema/schema.py | 91 +++++++++--------------------------- pandas_schema/validations.py | 30 +++++------- test/test_validation.py | 35 +++++++------- 7 files changed, 195 insertions(+), 152 deletions(-) create mode 100755 pandas_schema/index.py diff --git a/pandas_schema/column.py b/pandas_schema/column.py index e0df39a..048907c 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,51 +1,67 @@ import typing -import pandas as pd import pandas_schema.core -from .validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer -def _column( + +def column( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: typing.Union[int, str] = None, - position: bool = False + index: PandasIndexer = None, + override: bool = False ): """ A utility method for setting the index data on a set of Validations :param validations: A list of validations to modify :param index: The index of the series that these validations will now consider - :param position: If true, these validations use positional indexing. + :param override: If true, override existing index values. 
Otherwise keep the existing ones See :py:class:`pandas_schema.validation.IndexSeriesValidation` """ for valid in validations: - valid.index = index - valid.position = position + if override or valid.index is None: + valid.index = index -def label_column( - validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: typing.Union[int, str], -): - """ - A utility method for setting the label-based column for each validation - :param validations: A list of validations to modify - :param index: The label of the series that these validations will now consider - """ - return _column( - validations, - index, - position=False - ) -def positional_column( +def column_sequence( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: int, + override: bool = False ): """ - A utility method for setting the position-based column for each validation + A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so + that the first validation gets index 0, the second gets index 1 etc. Note: this will not modify any index that + already has some kind of index :param validations: A list of validations to modify - :param index: The index of the series that these validations will now consider + :param override: If true, override existing index values. 
Otherwise keep the existing ones """ - return _column( - validations, - index, - position=True - ) + for i, valid in validations: + if override or valid.index is None: + valid.index = PandasIndexer(i, typ='positional') +# +# def label_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: typing.Union[int, str], +# ): +# """ +# A utility method for setting the label-based column for each validation +# :param validations: A list of validations to modify +# :param index: The label of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=False +# ) +# +# def positional_column( +# validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], +# index: int, +# ): +# """ +# A utility method for setting the position-based column for each validation +# :param validations: A list of validations to modify +# :param index: The index of the series that these validations will now consider +# """ +# return _column( +# validations, +# index, +# position=True + diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 76cf323..54ce0b6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -11,6 +11,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -60,30 +61,33 @@ class IndexSeriesValidation(SeriesValidation): or later """ - def __init__(self, index: typing.Union[int, str] = None, positional: bool = False, - message: str = None): + def __init__(self, index: PandasIndexer = None, message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series - :param positional: If true, the index is a position along the axis (ie, index=0 indicates the first element). 
Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.index = index - self.positional = positional self.custom_message = message - @property - def message(self, **kwargs): + def message(self, warning: ValidationWarning): """ Gets a message describing how the DataFrame cell failed the validation This shouldn't really be overridden, instead override default_message so that users can still set per-object messages :return: """ + if self.index.type == 'position': + prefix = self.index.index + else: + prefix = '"{}"'.format(self.index.index) + if self.custom_message: - return self.custom_message() + suffix = self.custom_message else: - return self.default_message(**kwargs) + suffix = self.default_message(warning) + + return "Column {} {}".format(prefix, suffix) @property def readable_name(self, **kwargs): @@ -92,7 +96,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self, **kwargs) -> str: + def default_message(self, warning: ValidationWarning) -> str: """ Create a message to be displayed whenever this validation fails This should be a generic message for the validation type, but can be overwritten if the user provides a @@ -107,10 +111,7 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: if self.index is None: raise PanSchNoIndexError() - if self.positional: - return df.iloc[self.index] - else: - return df.loc[self.index] + return self.index(df) @abc.abstractmethod def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: @@ -136,8 +137,8 @@ def select_cells(self, series: pd.Series) -> pd.Series: pass def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - indices = self.select_cells(series) - cells = series[indices] + failed = ~self.select_cells(series) + cells = series[failed] return ( ValidationWarning(self, { 'row': row_idx, diff --git a/pandas_schema/errors.py b/pandas_schema/errors.py index ab5e73d..cdc3132 100755 --- 
a/pandas_schema/errors.py +++ b/pandas_schema/errors.py @@ -1,8 +1,20 @@ -class PanSchError(BaseException): +class PanSchError(Exception): """ Base class for all pandas_schema exceptions """ + def __init__(self, message=None): + super().__init__(message) + + +class PanSchIndexError(PanSchError): + """ + Some issue with creating a PandasIndexer + """ + + def __init__(self, message): + super().__init__(message=message) + class PanSchInvalidSchemaError(PanSchError): """ diff --git a/pandas_schema/index.py b/pandas_schema/index.py new file mode 100755 index 0000000..d37cd91 --- /dev/null +++ b/pandas_schema/index.py @@ -0,0 +1,68 @@ +from pandas_schema.errors import PanSchIndexError +from dataclasses import dataclass +from typing import Union +import numpy +import pandas + +IndexValue = Union[numpy.string_, numpy.int_, str, int] +""" +A pandas index can either be an integer or string, or an array of either. This typing is a bit sketchy because really +a lot of things are accepted here +""" + + +class PandasIndexer: + """ + An index into a particular axis of a DataFrame. 
Attempts to recreate the behaviour of `df.ix[some_index]` + """ + + valid_types = {'position', 'label'} + index: IndexValue + """ + The index to use, either an integer for position-based indexing, or a string for label-based indexing + """ + type: str + """ + The type of indexing to use, either 'position' or 'label' + """ + + def __init__(self, index: IndexValue, typ: str = None): + self.index = index + + if typ is not None: + # If the type is provided, validate it + if typ not in self.valid_types: + raise PanSchIndexError('The index type was not one of {}'.format(' or '.join(self.valid_types))) + else: + self.type = typ + else: + # If the type isn't provided, guess it based on the datatype of the index + if numpy.issubdtype(type(index), numpy.character): + self.type = 'label' + elif numpy.issubdtype(type(index), numpy.int_): + self.type = 'position' + else: + raise PanSchIndexError('The index value was not either an integer or string, or an array of either of ' + 'these') + + + def __call__(self, df: pandas.DataFrame, axis: int = 0): + """ + Apply this index + :param df: The DataFrame to index + :param axis: The axis to index along. 
axis=0 will select a row, and axis=1 will select a column + """ + if self.type == 'label': + return df.loc(axis=axis)[self.index] + elif self.type == 'label': + return df.iloc(axis=axis)[self.index] + + +class RowIndexer(PandasIndexer): + def __call__(self, df: pandas.DataFrame): + return super().__call__(df, axis=0) + + +class ColumnIndexer(PandasIndexer): + def __call__(self, df: pandas.DataFrame): + return super().__call__(df, axis=1) diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index da27d81..83ad9c5 100755 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -1,8 +1,10 @@ import pandas as pd import typing -from .errors import PanSchInvalidSchemaError, PanSchArgumentError -from .validation_warning import ValidationWarning +from pandas_schema.core import BaseValidation +from pandas_schema.errors import PanSchArgumentError, PanSchInvalidSchemaError +from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.index import PandasIndexer class Schema: @@ -10,83 +12,32 @@ class Schema: A schema that defines the columns required in the target DataFrame """ - def __init__(self, columns: typing.Iterable[Column], ordered: bool = False): + def __init__(self, validations: typing.Iterable[BaseValidation]): """ - :param columns: A list of column objects - :param ordered: True if the Schema should associate its Columns with DataFrame columns by position only, ignoring - the header names. False if the columns should be associated by column header names only. 
Defaults to False + :param validations: A list of validations that will be applied to the DataFrame upon validation """ - if not columns: - raise PanSchInvalidSchemaError('An instance of the schema class must have a columns list') + if not validations: + raise PanSchInvalidSchemaError('An instance of the schema class must have a validations list') - if not isinstance(columns, typing.List): - raise PanSchInvalidSchemaError('The columns field must be a list of Column objects') + if not isinstance(validations, typing.Iterable): + raise PanSchInvalidSchemaError('The columns field must be an iterable of Validation objects') - if not isinstance(ordered, bool): - raise PanSchInvalidSchemaError('The ordered field must be a boolean') + self.validations = list(validations) - self.columns = list(columns) - self.ordered = ordered - - def validate(self, df: pd.DataFrame, columns: typing.List[str] = None) -> typing.List[ValidationWarning]: + def validate(self, df: pd.DataFrame, subset: PandasIndexer = None) -> typing.List[ValidationWarning]: """ Runs a full validation of the target DataFrame using the internal columns list :param df: A pandas DataFrame to validate - :param columns: A list of columns indicating a subset of the schema that we want to validate + :param subset: A list of columns indicating a subset of the schema that we want to validate. Can be any :return: A list of ValidationWarning objects that list the ways in which the DataFrame was invalid """ - errors = [] - df_cols = len(df.columns) - - # If no columns are passed, validate against every column in the schema. This is the default behaviour - if columns is None: - schema_cols = len(self.columns) - columns_to_pair = self.columns - if df_cols != schema_cols: - errors.append( - ValidationWarning( - 'Invalid number of columns. 
The schema specifies {}, but the data frame has {}'.format( - schema_cols, - df_cols) - ) - ) - return errors - - # If we did pass in columns, check that they are part of the current schema - else: - if set(columns).issubset(self.get_column_names()): - columns_to_pair = [column for column in self.columns if column.name in columns] - else: - raise PanSchArgumentError( - 'Columns {} passed in are not part of the schema'.format(set(columns).difference(self.columns)) - ) - - # We associate the column objects in the schema with data frame series either by name or by position, depending - # on the value of self.ordered - if self.ordered: - series = [x[1] for x in df.iteritems()] - column_pairs = zip(series, self.columns) - else: - column_pairs = [] - for column in columns_to_pair: - - # Throw an error if the schema column isn't in the data frame - if column.name not in df: - errors.append(ValidationWarning( - 'The column {} exists in the schema but not in the data frame'.format(column.name))) - return errors + # Apply the subset if we have one + if subset is not None: + df = subset(df) - column_pairs.append((df[column.name], column)) - - # Iterate over each pair of schema columns and data frame series and run validations - for series, column in column_pairs: - errors += column.validate(series) - - return sorted(errors, key=lambda e: e.row) - - def get_column_names(self): - """ - Returns the column names contained in the schema - """ - return [column.name for column in self.columns] + # Build the list of errors + errors = [] + for validation in self.validations: + errors.extend(validation.validate(df)) + return errors diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 442f237..d05e180 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -94,11 +94,11 @@ def __init__(self, dtype: np.dtype, **kwargs): super().__init__(**kwargs) self.dtype = dtype - def default_message(self, validation) -> str: + def 
default_message(self, warning: ValidationWarning) -> str: return 'has a dtype of {} which is not a subclass of the required type {}'.format( - self.dtype, validation.props['dtype']) + self.dtype, warning.props['dtype']) - def validate_series(self, series: pd.Series) -> typing.Iterable[Warning]: + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: if not np.issubdtype(series.dtype, self.dtype): return [ValidationWarning( self, @@ -126,8 +126,7 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -162,8 +161,7 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) @@ -182,8 +180,7 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the pattern "{}"'.format(self.pattern) def select_cells(self, series: pd.Series) -> pd.Series: @@ -198,8 +195,7 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -214,8 +210,7 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' def 
select_cells(self, series: pd.Series) -> pd.Series: @@ -230,8 +225,7 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -252,8 +246,7 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -278,8 +271,7 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index dbea90f..c02615e 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -6,11 +6,14 @@ import re from numpy import nan, dtype +import numpy as np import pandas as pd from pandas_schema.validations import * from pandas_schema.core import BooleanSeriesValidation +from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema +from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning @@ -510,20 +513,20 @@ def test_schema(self): }) schema = Schema([ - Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]), - Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]), - Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]), + IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype1')), + IsDtypeValidation(dtype('float64'), index=ci('wrong_dtype2')), + 
IsDtypeValidation(dtype('int64'), index=ci('wrong_dtype3')), ]) errors = schema.validate(df) self.assertEqual( - sorted([str(x) for x in errors]), - sorted([ - 'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64', - 'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64', - 'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64' - ]) + [x.props for x in errors], + [ + {'dtype': np.object}, + {'dtype': np.int64}, + {'dtype': np.float64}, + ] ) @@ -632,17 +635,17 @@ def setUp(self): def test_in_range_allow_empty_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): validator = InRangeValidation(min=0) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): validator = InRangeValidation(min=4) - errors = validator.validate_series(pd.Series(self.vals)) + errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), len(self.vals)) @@ -656,16 +659,16 @@ def setUp(self): def test_valid_elements(self): errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) - self.assertEqual(len(errors), 0) + self.assertEqual(len(list(errors)), 0) def test_invalid_empty_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) - self.assertEqual(len(errors), 4) + self.assertEqual(len(list(errors)), 4) def test_invalid_and_empty_elements(self): errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) - 
self.assertEqual(len(errors), 1) + self.assertEqual(len(list(errors)), 1) def test_invalid_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) - self.assertEqual(len(errors), 3) + self.assertEqual(len(list(errors)), 3) From 7d8aa932ea09b2be3ad758ded34f2e653a08fe0d Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 6 Feb 2020 01:09:51 +1100 Subject: [PATCH 08/31] First attempt at CombinedValidations in the new API --- TODO.md | 6 +++++- pandas_schema/column.py | 4 +++- pandas_schema/core.py | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/TODO.md b/TODO.md index 4350b8b..0a6c2c6 100755 --- a/TODO.md +++ b/TODO.md @@ -1 +1,5 @@ -* Add validations that apply to every column in the DF equally \ No newline at end of file +* [ ] Add validations that apply to every column in the DF equally +* [x] Fix CombinedValidations +* [x] Add replacement for allow_empty Columns +* [ ] New column() tests +* [ ] New CombinedValidation tests \ No newline at end of file diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 048907c..ab3b58a 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -7,13 +7,15 @@ def column( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], index: PandasIndexer = None, - override: bool = False + override: bool = False, + allow_empty=False ): """ A utility method for setting the index data on a set of Validations :param validations: A list of validations to modify :param index: The index of the series that these validations will now consider :param override: If true, override existing index values. 
Otherwise keep the existing ones + :param allow_empty: Allow empty rows (NaN) to pass the validation See :py:class:`pandas_schema.validation.IndexSeriesValidation` """ for valid in validations: diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 54ce0b6..6c4bf99 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -34,7 +34,7 @@ def message(self, warning: ValidationWarning) -> str: class SeriesValidation(BaseValidation): """ - A _SeriesValidation validates a DataFrame by selecting a single series from it, and + A SeriesValidation validates a DataFrame by selecting a single series from it, and applying some validation to it """ @@ -145,3 +145,37 @@ def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarnin 'value': cell }) for row_idx, cell in cells.items() ) + + +class CombinedValidation(BaseValidation): + """ + Validates if one and/or the other validation is true for an element + """ + + def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator, + message: str): + super().__init__(message=message) + self.operator = operator + self.v_a = validation_a + self.v_b = validation_b + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + # Let both validations separately select and filter a column + left_series = self.v_a.select_series(df) + right_series = self.v_a.select_series(df) + + left_failed = ~self.v_a.select_cells(left_series) + right_failed = ~self.v_b.select_cells(right_series) + + # Then, we combine the two resulting boolean series, and determine the row indices of the result + failed = self.operator(left_failed, right_failed) + + return ( + ValidationWarning(self, { + 'row': row_idx, + }) for row_idx in np.where(failed) + ) + + @property + def default_message(self): + return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) From c36761ae148fe90dfd900fe161e9135208541e66 Mon Sep 17 00:00:00 2001 From: Michael Milton 
Date: Sun, 16 Feb 2020 17:29:35 +1100 Subject: [PATCH 09/31] Rework CombinedValidations --- pandas_schema/core.py | 66 +++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 6c4bf99..1d21ed6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -1,6 +1,7 @@ import abc import math import datetime +from itertools import chain import pandas as pd import numpy as np import typing @@ -136,15 +137,36 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + # def generate_warnings(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + # """ + # Given a series that has been sliced down to only those that definitely failed, produce a list of + # ValidationWarnings. + # Note, this is different to validate_series, which actually calculates which rows have failed. + # Having this as a separate method allows it to be accessed by the CombinedValidation + # + # :param series: A series that has been sliced down to only those that definitely failed + # """ + # return ( + # ValidationWarning(self, { + # 'row': row_idx, + # 'value': cell + # }) for row_idx, cell in series.items() + # ) + + def warning_series(self, series): failed = ~self.select_cells(series) - cells = series[failed] - return ( - ValidationWarning(self, { - 'row': row_idx, - 'value': cell - }) for row_idx, cell in cells.items() - ) + + # Slice out the failed items, then map each into a list of validation warnings at each respective index + return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { + 'row': row.name, + 'value': row[0] + })], axis='columns') + + def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: + warnings = self.warning_series(series) + + # Remove the empty elements, split the list of warnings in each cell, and then compile that 
into a list + return warnings.dropna().explode().tolist() class CombinedValidation(BaseValidation): @@ -156,25 +178,27 @@ def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanS message: str): super().__init__(message=message) self.operator = operator - self.v_a = validation_a - self.v_b = validation_b + self.left = validation_a + self.right = validation_b def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: # Let both validations separately select and filter a column - left_series = self.v_a.select_series(df) - right_series = self.v_a.select_series(df) + left_series = self.left.select_series(df) + right_series = self.right.select_series(df) - left_failed = ~self.v_a.select_cells(left_series) - right_failed = ~self.v_b.select_cells(right_series) + left_errors = self.left.warning_series(left_series) + right_errors = self.right.warning_series(right_series) - # Then, we combine the two resulting boolean series, and determine the row indices of the result - failed = self.operator(left_failed, right_failed) + # TODO - return ( - ValidationWarning(self, { - 'row': row_idx, - }) for row_idx in np.where(failed) - ) + # Then, we combine the two resulting boolean series, and determine the row indices of the result + # failed = self.operator(left_errors, right_errors) + # + # # If they did fail, obtain warnings from the validation that caused it + # return chain( + # self.v_a.generate_warnings(left_series[left_failed & failed]), + # self.v_b.generate_warnings(right_series[right_failed & failed]), + # ) @property def default_message(self): From 9bd2704d35ba3d9ecc5f533dbd180ed5f69cb562 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 25 Feb 2020 00:36:42 +1100 Subject: [PATCH 10/31] More work --- UPDATE.md | 5 +++ pandas_schema/core.py | 88 ++++++++++++++++++++++--------------------- test/test_column.py | 68 --------------------------------- 3 files changed, 51 insertions(+), 110 deletions(-) delete mode 100755 
test/test_column.py diff --git a/UPDATE.md b/UPDATE.md index 80c3562..5bbc15f 100755 --- a/UPDATE.md +++ b/UPDATE.md @@ -26,3 +26,8 @@ that spawned it * Each category of Validation will define a `create_prefix()` method, that creates the {row: 1, column: 2} prefix that goes before each message. Thus, `generate_message()` will concatenate that with the actual message * + +## Options for placing CombinedValidation in the inheritance hierarchy +* In order to make both CombinedValidation and BooleanSeriesValidation both share a class, so they can be chained together, +either we had to make a mixin that creates a "side path" that doesn't call `validate` (in this case, `validate_with_series`), +or we diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 1d21ed6..982a480 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -119,7 +119,24 @@ def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarnin pass -class BooleanSeriesValidation(IndexSeriesValidation): +class WarningSeriesGenerator(BaseValidation, abc.ABC): + """ + Mixin class that indicates that this Validation can produce a "warning series", which is a pandas Series with one + or more warnings in each cell, corresponding to warnings detected in the DataFrame at the same index + """ + + @abc.abstractmethod + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: + """ + Return a series of ValidationWarnings, not an iterable of ValidationWarnings like the normal validate() method + """ + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + warnings = self.get_warning_series(df) + return warnings.dropna().explode().tolist() + + +class BooleanSeriesValidation(IndexSeriesValidation, WarningSeriesGenerator): """ Validation is defined by the function :py:meth:~select_cells that returns a boolean series. Each cell that has False has failed the validation. 
@@ -137,68 +154,55 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - # def generate_warnings(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - # """ - # Given a series that has been sliced down to only those that definitely failed, produce a list of - # ValidationWarnings. - # Note, this is different to validate_series, which actually calculates which rows have failed. - # Having this as a separate method allows it to be accessed by the CombinedValidation - # - # :param series: A series that has been sliced down to only those that definitely failed - # """ - # return ( - # ValidationWarning(self, { - # 'row': row_idx, - # 'value': cell - # }) for row_idx, cell in series.items() - # ) - - def warning_series(self, series): + def get_warning_series(self, series) -> pd.Series: + """ + Validates a series and returns a series of warnings. + This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series + :param series: The series to validate + """ failed = ~self.select_cells(series) # Slice out the failed items, then map each into a list of validation warnings at each respective index return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { - 'row': row.name, - 'value': row[0] - })], axis='columns') - - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - warnings = self.warning_series(series) - - # Remove the empty elements, split the list of warnings in each cell, and then compile that into a list - return warnings.dropna().explode().tolist() + 'row': row.name, + 'value': row[0] + })], axis='columns') -class CombinedValidation(BaseValidation): +class CombinedValidation(WarningSeriesGenerator): """ Validates if one and/or the other validation is true for an element """ - def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator, + def __init__(self, validation_a: 
BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator: str, message: str): super().__init__(message=message) self.operator = operator self.left = validation_a self.right = validation_b - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column left_series = self.left.select_series(df) right_series = self.right.select_series(df) - left_errors = self.left.warning_series(left_series) - right_errors = self.right.warning_series(right_series) - - # TODO + left_errors = self.left.get_warning_series(left_series) + right_errors = self.right.get_warning_series(right_series) + + if self.operator == 'and': + # If it's an "and" validation, left, right, or both failing means an error, so we can simply concatenate + # the lists of errors + combined = left_errors.combine(right_errors, func=operator.add) + elif self.operator == 'or': + # [error] and [] = [] + # [error_1] and [error_2] = [error_2] + # [] and [] = [] + # Thus, we can use the and operator to implement "or" validations + combined = left_errors.combine(right_errors, func=operator.and_)#func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + else: + raise Exception('Operator must be "and" or "or"') - # Then, we combine the two resulting boolean series, and determine the row indices of the result - # failed = self.operator(left_errors, right_errors) - # - # # If they did fail, obtain warnings from the validation that caused it - # return chain( - # self.v_a.generate_warnings(left_series[left_failed & failed]), - # self.v_b.generate_warnings(right_series[right_failed & failed]), - # ) + return combined @property def default_message(self): diff --git a/test/test_column.py b/test/test_column.py deleted file mode 100755 index 38e61f0..0000000 --- a/test/test_column.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -import pandas as pd - -from pandas_schema 
import Column -from pandas_schema.validation import CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation - - -class SingleValidationColumn(unittest.TestCase): - """ - Test a column with one single validation - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=False) - ser = pd.Series([ - 'a', - 'b', - 'c' - ]) - - def test_name(self): - self.assertEqual(self.col.name, self.NAME, 'A Column does not store its name correctly') - - def test_outputs(self): - results = self.col.validate(self.ser) - - self.assertEqual(len(results), len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - self.assertTrue(any([r.row == i for r in results]), 'A Column does not report errors for every row') - - -class DoubleValidationColumn(unittest.TestCase): - """ - Test a column with two different validations - """ - NAME = 'col1' - - col = Column(NAME, [TrailingWhitespaceValidation(), LeadingWhitespaceValidation()], allow_empty=False) - ser = pd.Series([ - ' a ', - ' b ', - ' c ' - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - - # There should be 6 errors, 2 for each row - self.assertEqual(len(results), 2 * len(self.ser), 'A Column produces the wrong number of errors') - for i in range(2): - in_row = [r for r in results if r.row == i] - self.assertEqual(len(in_row), 2, 'A Column does not report both errors for every row') - - -class AllowEmptyColumn(unittest.TestCase): - """ - Test a column with one single validation that allows empty columns - """ - NAME = 'col1' - - col = Column(NAME, [CanConvertValidation(int)], allow_empty=True) - ser = pd.Series([ - '', - ]) - - def test_outputs(self): - results = self.col.validate(self.ser) - self.assertEqual(len(results), 0, 'allow_empty is not allowing empty columns') From f502167932b39597676b5b828a36f3ebffec39d3 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 17 Mar 2020 00:05:31 +1100 Subject: [PATCH 11/31] Fix more tests 
--- pandas_schema/core.py | 176 ++++++----- pandas_schema/index.py | 41 ++- pandas_schema/validations.py | 38 ++- test/test_validation.py | 558 ++++++++++++++--------------------- 4 files changed, 387 insertions(+), 426 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 982a480..a7963a6 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer +from pandas_schema.index import PandasIndexer, IndexValue from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -29,64 +29,37 @@ def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: :return: All validation failures detected by this validation """ + @abc.abstractmethod def message(self, warning: ValidationWarning) -> str: pass -class SeriesValidation(BaseValidation): +class IndexValidation(BaseValidation): """ - A SeriesValidation validates a DataFrame by selecting a single series from it, and - applying some validation to it + Mixin for Validation classes, giving them access to an index for selecting a Series out of the DataFrame """ - @abc.abstractmethod - def select_series(self, df: pd.DataFrame) -> pd.Series: - """ - Selects a series from the DataFrame that will be validated - """ - - @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - """ - Validate a single series - """ - - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: - series = self.select_series(df) - return self.validate_series(series) - - -class IndexSeriesValidation(SeriesValidation): - """ - Selects a series from the DataFrame, using label or position-based indexes that can be provided at instantiation - or later - """ - - def __init__(self, index: PandasIndexer = None, message: str = None): + def 
__init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str = None, **kwargs): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ - self.index = index + super().__init__(**kwargs) + if isinstance(index, PandasIndexer): + self.index = index + else: + # If it isn't already an indexer object, convert it to one + self.index = PandasIndexer(index=index) self.custom_message = message - def message(self, warning: ValidationWarning): - """ - Gets a message describing how the DataFrame cell failed the validation - This shouldn't really be overridden, instead override default_message so that users can still set per-object - messages - :return: - """ - if self.index.type == 'position': - prefix = self.index.index - else: - prefix = '"{}"'.format(self.index.index) + def message(self, warning: ValidationWarning) -> str: + prefix = self.prefix() if self.custom_message: suffix = self.custom_message else: - suffix = self.default_message(warning) + suffix = self.default_message return "Column {} {}".format(prefix, suffix) @@ -97,12 +70,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self, warning: ValidationWarning) -> str: - """ - Create a message to be displayed whenever this validation fails - This should be a generic message for the validation type, but can be overwritten if the user provides a - message kwarg - """ + def default_message(self) -> str: return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -114,6 +82,53 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return self.index(df) + def prefix(self): + """ + Return a string that could be used to prefix a message that relates to this index + """ + if self.index is None: + return "" + + if self.index.type == 'position': + return self.index.index + else: + return 
'"{}"'.format(self.index.index) + + +# +# class SeriesValidation(BaseValidation): +# """ +# A SeriesValidation validates a DataFrame by selecting a single series from it, and +# applying some validation to it +# """ +# +# @abc.abstractmethod +# def select_series(self, df: pd.DataFrame) -> pd.Series: +# """ +# Selects a series from the DataFrame that will be validated +# """ +# +# @abc.abstractmethod +# def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: +# """ +# Validate a single series +# """ +# +# def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: +# series = self.select_series(df) +# return self.validate_series(series) + + +class SeriesValidation(IndexValidation): + """ + A SeriesValidation validates a DataFrame by selecting a single series from it, and + applying some validation to it + """ + + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + series = self.index(df) + return self.validate_series(series) + @abc.abstractmethod def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: pass @@ -131,12 +146,25 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: Return a series of ValidationWarnings, not an iterable of ValidationWarnings like the normal validate() method """ - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + @staticmethod + def flatten_warning_series(warnings: pd.Series): + """ + Converts a warning series into an iterable of warnings + """ + return warnings[warnings.astype(bool)].explode().tolist() + + def validate(self, df: pd.DataFrame, flatten=True) -> typing.Union[ + typing.Iterable[ValidationWarning], + pd.Series + ]: warnings = self.get_warning_series(df) - return warnings.dropna().explode().tolist() + if flatten: + return self.flatten_warning_series(warnings) + else: + return warnings -class BooleanSeriesValidation(IndexSeriesValidation, WarningSeriesGenerator): +class 
BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): """ Validation is defined by the function :py:meth:~select_cells that returns a boolean series. Each cell that has False has failed the validation. @@ -154,19 +182,35 @@ def select_cells(self, series: pd.Series) -> pd.Series: """ pass - def get_warning_series(self, series) -> pd.Series: + def validate_series(self, series, flatten=True) -> typing.Union[ + typing.Iterable[ValidationWarning], + pd.Series + ]: """ - Validates a series and returns a series of warnings. - This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series - :param series: The series to validate + Utility method for shortcutting data-frame validation and instead validating only a single series """ failed = ~self.select_cells(series) # Slice out the failed items, then map each into a list of validation warnings at each respective index - return series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { + warnings = series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { 'row': row.name, 'value': row[0] - })], axis='columns') + })], axis='columns', result_type='reduce') + # warnings = warnings.iloc[:, 0] + + if flatten: + return self.flatten_warning_series(warnings) + else: + return warnings + + def get_warning_series(self, df: pd.DataFrame) -> pd.Series: + """ + Validates a series and returns a series of warnings. 
+ This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series + :param series: The series to validate + """ + series = self.select_series(df) + return self.validate_series(series, flatten=False) class CombinedValidation(WarningSeriesGenerator): @@ -174,31 +218,31 @@ class CombinedValidation(WarningSeriesGenerator): Validates if one and/or the other validation is true for an element """ - def __init__(self, validation_a: BooleanSeriesValidation, validation_b: BooleanSeriesValidation, operator: str, - message: str): - super().__init__(message=message) + def message(self, warning: ValidationWarning) -> str: + pass + + def __init__(self, validation_a: WarningSeriesGenerator, validation_b: WarningSeriesGenerator, operator: str): + super().__init__() self.operator = operator self.left = validation_a self.right = validation_b def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column - left_series = self.left.select_series(df) - right_series = self.right.select_series(df) - - left_errors = self.left.get_warning_series(left_series) - right_errors = self.right.get_warning_series(right_series) + left_errors = self.left.validate(df, flatten=False) + right_errors = self.right.validate(df, flatten=False) if self.operator == 'and': # If it's an "and" validation, left, right, or both failing means an error, so we can simply concatenate # the lists of errors - combined = left_errors.combine(right_errors, func=operator.add) + combined = left_errors.combine(right_errors, func=operator.add, fill_value=[]) elif self.operator == 'or': # [error] and [] = [] # [error_1] and [error_2] = [error_2] # [] and [] = [] # Thus, we can use the and operator to implement "or" validations - combined = left_errors.combine(right_errors, func=operator.and_)#func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + combined = left_errors.combine(right_errors, func=lambda l, r: l + r 
if l and r else [], fill_value=[]) + # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) else: raise Exception('Operator must be "and" or "or"') diff --git a/pandas_schema/index.py b/pandas_schema/index.py index d37cd91..51f1172 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -3,6 +3,7 @@ from typing import Union import numpy import pandas +from enum import Enum IndexValue = Union[numpy.string_, numpy.int_, str, int] """ @@ -11,23 +12,34 @@ """ +class IndexType(Enum): + POSITION = 0 + LABEL = 1 + + class PandasIndexer: """ An index into a particular axis of a DataFrame. Attempts to recreate the behaviour of `df.ix[some_index]` """ - valid_types = {'position', 'label'} + # valid_types = {'position', 'label'} index: IndexValue """ The index to use, either an integer for position-based indexing, or a string for label-based indexing """ - type: str + type: IndexType """ The type of indexing to use, either 'position' or 'label' """ - def __init__(self, index: IndexValue, typ: str = None): + axis: int + """ + The axis for the indexer + """ + + def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): self.index = index + self.axis = axis if typ is not None: # If the type is provided, validate it @@ -38,31 +50,30 @@ def __init__(self, index: IndexValue, typ: str = None): else: # If the type isn't provided, guess it based on the datatype of the index if numpy.issubdtype(type(index), numpy.character): - self.type = 'label' + self.type = IndexType.LABEL elif numpy.issubdtype(type(index), numpy.int_): - self.type = 'position' + self.type = IndexType.POSITION else: raise PanSchIndexError('The index value was not either an integer or string, or an array of either of ' 'these') - - def __call__(self, df: pandas.DataFrame, axis: int = 0): + def __call__(self, df: pandas.DataFrame): """ Apply this index :param df: The DataFrame to index :param axis: The axis to index along. 
axis=0 will select a row, and axis=1 will select a column """ - if self.type == 'label': - return df.loc(axis=axis)[self.index] - elif self.type == 'label': - return df.iloc(axis=axis)[self.index] + if self.type == IndexType.LABEL: + return df.loc(axis=self.axis)[self.index] + elif self.type == IndexType.POSITION: + return df.iloc(axis=self.axis)[self.index] class RowIndexer(PandasIndexer): - def __call__(self, df: pandas.DataFrame): - return super().__call__(df, axis=0) + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=0) class ColumnIndexer(PandasIndexer): - def __call__(self, df: pandas.DataFrame): - return super().__call__(df, axis=1) + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=1) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index d05e180..b2ffae1 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -7,7 +7,7 @@ import operator from . import column -from .core import IndexSeriesValidation, BooleanSeriesValidation +from .core import SeriesValidation, BooleanSeriesValidation, IndexValidation from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -19,7 +19,7 @@ class CustomSeriesValidation(BooleanSeriesValidation): Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) """ - def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: str): + def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -29,8 +29,9 @@ def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], message: :param validation: A function that takes a pandas Series and returns a boolean Series, where each cell is equal to True if the object passed validation, and False if it failed """ + super().__init__(*args, **kwargs) self._validation = validation - super().__init__(message=message) + def select_cells(self, series: pd.Series) -> pd.Series: return self._validation(series) @@ -41,8 +42,7 @@ class CustomElementValidation(BooleanSeriesValidation): Validates using a user-provided function that operates on each element """ - def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], - message: str): + def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], *args, **kwargs): """ :param message: The error message to provide to the user if this validation fails. 
The row and column and failing value will automatically be prepended to this message, so you only have to provide a message that @@ -53,7 +53,7 @@ def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], the validation, and false if it doesn't """ self._validation = validation - super().__init__(message=message) + super().__init__(*args, **kwargs) def select_cells(self, series: pd.Series) -> pd.Series: return series.apply(self._validation) @@ -82,7 +82,7 @@ def select_cells(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) -class IsDtypeValidation(IndexSeriesValidation): +class IsDtypeValidation(SeriesValidation): """ Checks that a series has a certain numpy dtype """ @@ -126,7 +126,8 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -161,7 +162,8 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'cannot be converted to type {}'.format(self.callable) @@ -180,7 +182,8 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'does not match the pattern "{}"'.format(self.pattern) def select_cells(self, series: pd.Series) -> pd.Series: @@ -195,7 +198,8 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -210,7 
+214,8 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains leading whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -225,7 +230,8 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'contains values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -246,7 +252,8 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -271,7 +278,8 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - def default_message(self, warning: ValidationWarning): + @property + def default_message(self): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index c02615e..a0c316c 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,41 +10,43 @@ import pandas as pd from pandas_schema.validations import * -from pandas_schema.core import BooleanSeriesValidation +from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, BaseValidation from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning -class ValidationTestBase(unittest.TestCase): - def seriesEquality(self, s1: pd.Series, 
s2: pd.Series, msg: str = None): - if not s1.equals(s2): - raise self.failureException(msg) - - def validate_and_compare(self, series: list, expected_result: bool, msg: str = None): - """ - Checks that every element in the provided series is equal to `expected_result` after validation - :param series_dtype: Explicitly specifies the dtype for the generated Series - :param series: The series to check - :param expected_result: Whether the elements in this series should pass the validation - :param msg: The message to display if this test fails - """ +def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ValidationWarning]: + """ + Tests a validator by asserting that it generates the amount of warnings + :param series_dtype: Explicitly specifies the dtype for the generated Series + :param series: The series to check + :param expected_result: Whether the elements in this series should pass the validation + :param msg: The message to display if this test fails + """ - # Check that self.validator is correct - if not self.validator or not isinstance(self.validator, BooleanSeriesValidation): - raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # # Check that self.validator is correct + # if not self.validator or not isinstance(self.validator, BooleanSeriesValidation, index=0): + # raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # + # # Ensure we're comparing series correctly + # self.addTypeEqualityFunc(pd.Series, self.seriesEquality) - # Ensure we're comparing series correctly - self.addTypeEqualityFunc(pd.Series, self.seriesEquality) + df = pd.Series(series).to_frame() + warnings = validator.validate(df) + return list(warnings) + # + # # Now find any items where their validation does not correspond to the expected_result + # for item, result in zip(series, results): + # with self.subTest(value=item): + # self.assertEqual(result, 
expected_result, msg) - # Convert the input list to a series and validate it - results = self.validator.select_cells(pd.Series(series)) - # Now find any items where their validation does not correspond to the expected_result - for item, result in zip(series, results): - with self.subTest(value=item): - self.assertEqual(result, expected_result, msg) +class ValidationTestBase(unittest.TestCase): + def seriesEquality(self, s1: pd.Series, s2: pd.Series, msg: str = None): + if not s1.equals(s2): + raise self.failureException(msg) class CustomSeries(ValidationTestBase): @@ -53,13 +55,17 @@ class CustomSeries(ValidationTestBase): """ def setUp(self): - self.validator = CustomSeriesValidation(lambda s: ~s.str.contains('fail'), 'contained the word fail') + self.validator = CustomSeriesValidation( + lambda s: ~s.str.contains('fail'), + message='contained the word fail', + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['good', 'success'], True, 'did not accept valid inputs') + assert len(get_warnings(self.validator, ['good', 'success'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - self.validate_and_compare(['fail', 'failure'], False, 'accepted invalid inputs') + assert len(get_warnings(self.validator, ['fail', 'failure'])) == 2, 'accepted invalid inputs' class CustomElement(ValidationTestBase): @@ -68,13 +74,18 @@ class CustomElement(ValidationTestBase): """ def setUp(self): - self.validator = CustomElementValidation(lambda s: s.startswith('_start_'), "Didn't begin with '_start_'") + self.validator = CustomElementValidation( + lambda s: s.startswith('_start_'), + message="Didn't begin with '_start_'", + index=0 + ) def test_valid_inputs(self): - self.validate_and_compare(['_start_sdiyhsd', '_start_234fpwunxc\n'], True, 'did not accept valid inputs') + assert len( + get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - 
self.validate_and_compare(['fail', '324wfp9ni'], False, 'accepted invalid inputs') + assert len(get_warnings(self.validator, ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): @@ -83,43 +94,31 @@ class LeadingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = LeadingWhitespaceValidation() + self.validator = LeadingWhitespaceValidation(index=0) def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - True, - 'is incorrectly failing on trailing whitespace' - ) + assert len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 0, 'is incorrectly failing on trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - False, - 'does not detect leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 3, 'does not detect leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class TrailingWhitespace(ValidationTestBase): @@ -128,44 +127,32 @@ class TrailingWhitespace(ValidationTestBase): """ def setUp(self): - self.validator = TrailingWhitespaceValidation() + self.validator = TrailingWhitespaceValidation(index=0) super().setUp() def test_validate_trailing_whitespace(self): - self.validate_and_compare( - [ - 'trailing space ', - 'trailing tabs ', - '''trailing newline - ''' - ], - False, - 'is not detecting trailing whitespace' - ) 
+ assert len(get_warnings(self.validator, [ + 'trailing space ', + 'trailing tabs ', + '''trailing newline + ''' + ])) == 3, 'is not detecting trailing whitespace' def test_validate_leading_whitespace(self): - self.validate_and_compare( - [ - ' leading spaces', - ' leading tabs', - ''' - leading newline''', - ], - True, - 'is incorrectly failing on leading whitespace' - ) + assert len(get_warnings(self.validator, [ + ' leading spaces', + ' leading tabs', + ''' + leading newline''', + ])) == 0, 'is incorrectly failing on leading whitespace' def test_validate_middle_whitespace(self): - self.validate_and_compare( - [ - 'middle spaces', - 'middle tabs', - '''middle - newline''', - ], - True, - 'is incorrectly failing on central whitespace' - ) + assert len(get_warnings(self.validator, [ + 'middle spaces', + 'middle tabs', + '''middle + newline''', + ])) == 0, 'is incorrectly failing on central whitespace' class CanCallJson(ValidationTestBase): @@ -174,29 +161,21 @@ class CanCallJson(ValidationTestBase): """ def setUp(self): - self.validator = CanCallValidation(json.loads) + self.validator = CanCallValidation(json.loads, index=0) def test_validate_valid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3]', - '{"a": 1.1, "b": 2.2, "c": 3.3}', - '"string"' - ], - True, - 'is incorrectly failing on valid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3]', + '{"a": 1.1, "b": 2.2, "c": 3.3}', + '"string"' + ])) == 0, 'is incorrectly failing on valid JSON' def test_validate_invalid_json(self): - self.validate_and_compare( - [ - '[1, 2, 3', - '{a: 1.1, b: 2.2, c: 3.3}', - 'string' - ], - False, - 'is not detecting invalid JSON' - ) + assert len(get_warnings(self.validator, [ + '[1, 2, 3', + '{a: 1.1, b: 2.2, c: 3.3}', + 'string' + ])) == 3, 'is not detecting invalid JSON' class CanCallLambda(ValidationTestBase): @@ -206,29 +185,21 @@ class CanCallLambda(ValidationTestBase): def setUp(self): # Succeed if it's divisible by 2, otherwise cause an error - 
self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0) + self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, index=0) def test_validate_noerror(self): - self.validate_and_compare( - [ - 2, - 4, - 6 - ], - True, - 'is incorrectly failing on even numbers' - ) + assert len(get_warnings(self.validator, [ + 2, + 4, + 6 + ])) == 0, 'is incorrectly failing on even numbers' def test_validate_error(self): - self.validate_and_compare( - [ - 1, - 3, - 5 - ], - False, - 'should fail on odd numbers' - ) + assert len(get_warnings(self.validator, [ + 1, + 3, + 5 + ])) == 3, 'should fail on odd numbers' class CanConvertInt(ValidationTestBase): @@ -237,161 +208,117 @@ class CanConvertInt(ValidationTestBase): """ def setUp(self): - self.validator = CanConvertValidation(int) + self.validator = CanConvertValidation(int, index=0) def test_valid_int(self): - self.validate_and_compare( - [ - '1', - '10', - '999', - '99999' - ], - True, - 'does not accept valid integers' - ) + assert len(get_warnings(self.validator, [ + '1', + '10', + '999', + '99999' + ])) == 0, 'does not accept valid integers' def test_invalid_int(self): - self.validate_and_compare( - [ - '1.0', - '9.5', - 'abc', - '1e-6' - ], - False, - 'accepts invalid integers' - ) + assert len(get_warnings(self.validator, [ + '1.0', + '9.5', + 'abc', + '1e-6' + ])) == 4, 'accepts invalid integers' class InListCaseSensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c']) + self.validator = InListValidation(['a', 'b', 'c'], index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - 'A', - 'B', - 'C' - ], - False, - 
'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + 'A', + 'B', + 'C' + ])) == 6, 'accepts elements that are not in the validation list' class InListCaseInsensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): - self.validate_and_compare( - [ - 'a', - 'b', - 'c', - 'A', - 'B', - 'C' - ], - True, - 'does not accept elements that are in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'a', + 'b', + 'c', + 'A', + 'B', + 'C' + ])) == 0, 'does not accept elements that are in the validation list' def test_invalid_elements(self): - self.validate_and_compare( - [ - 'aa', - 'bb', - 'd', - ], - False, - 'accepts elements that are not in the validation list' - ) + assert len(get_warnings(self.validator, [ + 'aa', + 'bb', + 'd', + ])) == 3, 'accepts elements that are not in the validation list' class DateFormat(ValidationTestBase): def setUp(self): - self.validator = DateFormatValidation('%Y%m%d') + self.validator = DateFormatValidation('%Y%m%d', index=0) def test_valid_dates(self): - self.validate_and_compare( - [ - '20160404', - '00011212' - ], - True, - 'does not accept valid dates' - ) + assert len(get_warnings(self.validator, [ + '20160404', + '00011212' + ])) == 0, 'does not accept valid dates' def test_invalid_dates(self): - self.validate_and_compare( - [ - '1/2/3456', - 'yyyymmdd', - '11112233' - ], - False, - 'accepts invalid dates' - ) + assert len(get_warnings(self.validator, [ + '1/2/3456', + 'yyyymmdd', + '11112233' + ])) == 3, 'accepts invalid dates' class StringRegexMatch(ValidationTestBase): def setUp(self): - self.validator = MatchesPatternValidation('^.+\.txt$') + self.validator = MatchesPatternValidation('^.+\.txt$', index=0) def test_valid_strings(self): - 
self.validate_and_compare( - [ - 'pass.txt', - 'a.txt', - 'lots of words.txt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.txt', + 'lots of words.txt' + ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.TXT', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.TXT', + '.txt', + 'lots of words.tx' + ])) == 3, 'accepts strings that do not match the regex' class IsDistinct(ValidationTestBase): def setUp(self): - self.validator = IsDistinctValidation() + self.validator = IsDistinctValidation(index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - '1', - '2', - '3', - '4' - ], - True, - 'does not accept unique strings' - ) + assert len(get_warnings(self.validator, [ + '1', + '2', + '3', + '4' + ])) == 0, 'does not accept unique strings' def test_invalid_strings(self): validation = self.validator.select_cells(pd.Series([ @@ -415,29 +342,21 @@ class CompiledRegexMatch(ValidationTestBase): """ def setUp(self): - self.validator = MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE)) + self.validator = MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE), index=0) def test_valid_strings(self): - self.validate_and_compare( - [ - 'pass.txt', - 'a.TXT', - 'lots of words.tXt' - ], - True, - 'does not accept strings matching the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txt', + 'a.TXT', + 'lots of words.tXt' + ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - self.validate_and_compare( - [ - 'pass.txtt', - '.txt', - 'lots of words.tx' - ], - False, - 'accepts strings that do not match the regex' - ) + assert len(get_warnings(self.validator, [ + 'pass.txtt', + '.txt', + 'lots of words.tx' + ])) == 3, 'accepts strings that do not 
match the regex' class InRange(ValidationTestBase): @@ -446,29 +365,21 @@ class InRange(ValidationTestBase): """ def setUp(self): - self.validator = InRangeValidation(7, 9) + self.validator = InRangeValidation(7, 9, index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 7, - 8, - 7 - ], - True, - 'does not accept integers in the correct range' - ) + assert len(get_warnings(self.validator, [ + 7, + 8, + 7 + ])) == 0, 'does not accept integers in the correct range' def test_invalid_items(self): - self.validate_and_compare( - [ - 1, - 2, - 3 - ], - False, - 'Incorrectly accepts integers outside of the range' - ) + assert len(get_warnings(self.validator, [ + 1, + 2, + 3 + ])) == 3, 'Incorrectly accepts integers outside of the range' class Dtype(ValidationTestBase): @@ -477,7 +388,7 @@ class Dtype(ValidationTestBase): """ def setUp(self): - self.validator = IsDtypeValidation(np.number) + self.validator = IsDtypeValidation(np.number, index=0) def test_valid_items(self): errors = self.validator.validate_series(pd.Series( @@ -500,7 +411,6 @@ def test_invalid_items(self): self.assertEqual(len(errors), 1) self.assertEqual(type(errors[0]), ValidationWarning) - def test_schema(self): """ Test this validation inside a schema, to ensure we get helpful error messages. 
@@ -530,36 +440,27 @@ def test_schema(self): ) - class Negate(ValidationTestBase): """ Tests the ~ operator on a MatchesPatternValidation """ def setUp(self): - self.validator = ~MatchesPatternValidation('fail') + self.validator = ~MatchesPatternValidation('fail', index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 'Pass', - '1', - 'True' - ], - True, - 'Rejects values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'Pass', + '1', + 'True' + ])) == 0, 'Rejects values that should pass' def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'thisfails', - 'failure' - ], - False, - 'Accepts values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'fail', + 'thisfails', + 'failure' + ])) == 3, 'Accepts values that should pass' class Or(ValidationTestBase): @@ -568,30 +469,27 @@ class Or(ValidationTestBase): """ def setUp(self): - self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass') + self.validator = CombinedValidation( + MatchesPatternValidation('yes', index=0), + MatchesPatternValidation('pass', index=0), + 'or' + ) + # self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass', index=0) def test_valid_items(self): - self.validate_and_compare( - [ - 'pass', - 'yes', - 'passyes', - '345yes345' - ], - True, - 'Rejects values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'pass', + 'yes', + 'passyes', + '345yes345' + ])) == 0, 'rejects values that should pass' def test_invalid_items(self): - self.validate_and_compare( - [ - 'fail', - 'YES', - 'YPESS' - ], - False, - 'Accepts values that should pass' - ) + assert len(get_warnings(self.validator, [ + 'fail', + 'YES', + 'YPESS' + ])) == 6, 'accepts values that should pass' class CustomMessage(ValidationTestBase): @@ -610,7 +508,7 @@ def test_default_message(self): 2, 3 ] - )): + ), flatten=True): self.assertNotRegex(error.message, self.message, 'Validator not using the 
default warning message!') def test_custom_message(self): @@ -621,7 +519,7 @@ def test_custom_message(self): 2, 3 ] - )): + ), flatten=True): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') @@ -634,17 +532,17 @@ def setUp(self): self.vals = [1.0, None, 3] def test_in_range_allow_empty_with_error(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), sum(v is not None for v in self.vals)) def test_in_range_allow_empty_with_no_error(self): - validator = InRangeValidation(min=0) + validator = InRangeValidation(min=0, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), 0) def test_in_range_allow_empty_false_with_error(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) errors = list(validator.validate_series(pd.Series(self.vals))) self.assertEqual(len(errors), len(self.vals)) @@ -655,7 +553,7 @@ class PandasDtypeTests(ValidationTestBase): """ def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) From bc7f269a57c1a15e7ca563a121fb72dbca24b953 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 20 Mar 2020 02:19:34 +1100 Subject: [PATCH 12/31] All tests passing; fixed message generation, fixed negation --- TODO.md | 5 ++- pandas_schema/core.py | 62 +++++++++++++++++++++-------- pandas_schema/validation_warning.py | 10 ++++- pandas_schema/validations.py | 29 +++++--------- test/test_validation.py | 32 ++++++++++----- 5 files changed, 89 insertions(+), 49 deletions(-) diff --git a/TODO.md b/TODO.md index 0a6c2c6..f777a57 100755 --- a/TODO.md +++ 
b/TODO.md @@ -2,4 +2,7 @@ * [x] Fix CombinedValidations * [x] Add replacement for allow_empty Columns * [ ] New column() tests -* [ ] New CombinedValidation tests \ No newline at end of file +* [ ] New CombinedValidation tests +* [x] Fix Negate +* [ ] Add facility for allow_empty +* [x] Fix messages diff --git a/pandas_schema/core.py b/pandas_schema/core.py index a7963a6..28561dc 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer, IndexValue +from pandas_schema.index import PandasIndexer, IndexValue, IndexType from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -34,9 +34,10 @@ def message(self, warning: ValidationWarning) -> str: pass -class IndexValidation(BaseValidation): +class IndexValidation(BaseValidation, metaclass=abc.ABCMeta): """ - Mixin for Validation classes, giving them access to an index for selecting a Series out of the DataFrame + Abstract class that builds on BaseValidation to give it access to an index for selecting a Series out of the + DataFrame """ def __init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str = None, **kwargs): @@ -54,14 +55,14 @@ def __init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str self.custom_message = message def message(self, warning: ValidationWarning) -> str: - prefix = self.prefix() + prefix = self.prefix(warning) if self.custom_message: suffix = self.custom_message else: - suffix = self.default_message + suffix = self.default_message(warning) - return "Column {} {}".format(prefix, suffix) + return "{} {}".format(prefix, suffix) @property def readable_name(self, **kwargs): @@ -70,7 +71,7 @@ def readable_name(self, **kwargs): """ return type(self).__name__ - def default_message(self) -> str: + def default_message(self, warnings: 
ValidationWarning) -> str: return 'failed the {}'.format(self.readable_name) def select_series(self, df: pd.DataFrame) -> pd.Series: @@ -82,17 +83,19 @@ def select_series(self, df: pd.DataFrame) -> pd.Series: return self.index(df) - def prefix(self): + def prefix(self, warning: ValidationWarning): """ - Return a string that could be used to prefix a message that relates to this index + Return a string that can be used to prefix a message that relates to this index + + This method is safe to override """ if self.index is None: return "" - if self.index.type == 'position': - return self.index.index + if self.index.type == IndexType.POSITION: + return 'Column {}'.format(self.index.index) else: - return '"{}"'.format(self.index.index) + return 'Column "{}"'.format(self.index.index) # @@ -173,6 +176,10 @@ class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): because the data is in the same form for each cell. You need only define a :py:meth~default_message. """ + def __init__(self, *args, negated=False, **kwargs): + super().__init__(*args, **kwargs) + self.negated = negated + @abc.abstractmethod def select_cells(self, series: pd.Series) -> pd.Series: """ @@ -187,9 +194,17 @@ def validate_series(self, series, flatten=True) -> typing.Union[ pd.Series ]: """ - Utility method for shortcutting data-frame validation and instead validating only a single series + Validates a single series selected from the DataFrame """ - failed = ~self.select_cells(series) + selection = self.select_cells(series) + + if self.negated: + # If self.negated (which is not the default), then we don't need to flip the booleans + failed = selection + else: + # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass + # the validation, and we want cells that failed it + failed = ~selection # Slice out the failed items, then map each into a list of validation warnings at each respective index warnings = 
series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { @@ -198,6 +213,7 @@ def validate_series(self, series, flatten=True) -> typing.Union[ })], axis='columns', result_type='reduce') # warnings = warnings.iloc[:, 0] + # If flatten, return a list of ValidationWarning, otherwise return a series of lists of Validation Warnings if flatten: return self.flatten_warning_series(warnings) else: @@ -206,12 +222,24 @@ def validate_series(self, series, flatten=True) -> typing.Union[ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: """ Validates a series and returns a series of warnings. - This is shared by the two validation entrypoints, :py:meth:~validate_with_series, and :py:meth:`~validate_series - :param series: The series to validate """ series = self.select_series(df) return self.validate_series(series, flatten=False) + def prefix(self, warning: ValidationWarning): + parent = super().prefix(warning) + # Only in this subclass do we know the contents of the warning props, since we defined them in the + # validate_series method. 
Thus, we can now add row index information + + return parent + ', Row {row}: "{value}"'.format(**warning.props) + + def __invert__(self) -> 'BooleanSeriesValidation': + """ + If a BooleanSeriesValidation is negated, it has the opposite result + """ + self.negated = not self.negated + return self + class CombinedValidation(WarningSeriesGenerator): """ @@ -249,5 +277,5 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: return combined @property - def default_message(self): + def default_message(self, warnings: ValidationWarning) -> str: return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index 3eec3db..e6e3ddd 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,4 +1,3 @@ -import pandas_schema from dataclasses import dataclass, field @@ -8,7 +7,7 @@ class ValidationWarning: Represents a difference between the schema and data frame, found during the validation of the data frame """ - validation: 'pandas_schema.BaseValidation' + validation: 'pandas_schema.core.BaseValidation' """ The validation that spawned this warning """ @@ -22,4 +21,11 @@ class ValidationWarning: @property def message(self): + """ + Return this validation as a string + """ + # Internally, this actually asks the validator class to formulate a message return self.validation.message(self) + + def __str__(self): + return self.message diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index b2ffae1..2e803df 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -73,8 +73,7 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): self.max = max super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'was not in the range [{}, {})'.format(self.min, self.max) def select_cells(self, series: 
pd.Series) -> pd.Series: @@ -126,8 +125,7 @@ def __init__(self, func: typing.Callable, **kwargs): type)) super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'raised an exception when the callable {} was called on it'.format( self.callable) @@ -162,8 +160,7 @@ def __init__(self, _type: type, **kwargs): else: raise PanSchArgumentError('{} is not a valid type'.format(_type)) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) @@ -182,9 +179,8 @@ def __init__(self, pattern, options={}, **kwargs): self.options = options super().__init__(**kwargs) - @property - def default_message(self): - return 'does not match the pattern "{}"'.format(self.pattern) + def default_message(self, warning: ValidationWarning): + return 'does not match the pattern "{}"'.format(self.pattern.pattern) def select_cells(self, series: pd.Series) -> pd.Series: return series.astype(str).str.contains(self.pattern, **self.options) @@ -198,8 +194,7 @@ class TrailingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -214,8 +209,7 @@ class LeadingWhitespaceValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' def select_cells(self, series: pd.Series) -> pd.Series: @@ -230,8 +224,7 @@ class IsDistinctValidation(BooleanSeriesValidation): def __init__(self, **kwargs): super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'contains 
values that are not unique' def select_cells(self, series: pd.Series) -> pd.Series: @@ -252,8 +245,7 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar self.options = options super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) @@ -278,8 +270,7 @@ def __init__(self, date_format: str, **kwargs): self.date_format = date_format super().__init__(**kwargs) - @property - def default_message(self): + def default_message(self, warning: ValidationWarning): return 'does not match the date format string "{}"'.format(self.date_format) def valid_date(self, val): diff --git a/test/test_validation.py b/test/test_validation.py index a0c316c..90e6b0c 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -291,7 +291,7 @@ def test_invalid_dates(self): class StringRegexMatch(ValidationTestBase): def setUp(self): - self.validator = MatchesPatternValidation('^.+\.txt$', index=0) + self.validator = MatchesPatternValidation(r'^.+\.txt$', index=0) def test_valid_strings(self): assert len(get_warnings(self.validator, [ @@ -352,11 +352,22 @@ def test_valid_strings(self): ])) == 0, 'does not accept strings matching the regex' def test_invalid_strings(self): - assert len(get_warnings(self.validator, [ + test_data = [ 'pass.txtt', '.txt', 'lots of words.tx' - ])) == 3, 'accepts strings that do not match the regex' + ] + warnings = get_warnings(self.validator, test_data) + + # Check that every piece of data failed + assert len(warnings) == 3, 'accepts strings that do not match the regex' + + # Also test the messages + for i, (warning, data) in enumerate(zip(warnings, test_data)): + assert 'Row {}'.format(i) in warning.message + assert 'Column 0' in warning.message + assert data in warning.message + assert self.validator.pattern.pattern in warning.message class 
InRange(ValidationTestBase): @@ -501,7 +512,7 @@ def setUp(self): self.message = "UNUSUAL MESSAGE THAT WOULDN'T BE IN A NORMAL ERROR" def test_default_message(self): - validator = InRangeValidation(min=4) + validator = InRangeValidation(min=4, index=0) for error in validator.validate_series(pd.Series( [ 1, @@ -512,7 +523,7 @@ def test_default_message(self): self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') def test_custom_message(self): - validator = InRangeValidation(min=4, message=self.message) + validator = InRangeValidation(min=4, message=self.message, index=0) for error in validator.validate_series(pd.Series( [ 1, @@ -523,6 +534,7 @@ def test_custom_message(self): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') +@unittest.skip('allow_empty no longer exists') class GetErrorTests(ValidationTestBase): """ Tests for float valued columns where allow_empty=True @@ -556,17 +568,17 @@ def setUp(self): self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', None, 'A', 'B', 'C'], dtype='category')) - self.assertEqual(len(list(errors)), 0) + errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) + assert len(list(errors)) == 0 def test_invalid_empty_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) - self.assertEqual(len(list(errors)), 4) + assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) - self.assertEqual(len(list(errors)), 1) + assert len(list(errors)) == 1 def test_invalid_elements(self): errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) - self.assertEqual(len(list(errors)), 3) + assert 
len(list(errors)) == 3 From f8ce653cff196e2b58013563010a17881bcd80f2 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sat, 21 Mar 2020 16:09:48 +1100 Subject: [PATCH 13/31] Fix or operator --- TODO.md | 2 ++ UPDATE.md | 14 ++++++++++ pandas_schema/core.py | 8 ++++++ setup.py | 2 +- test/test_validation.py | 59 ++++++++++++++++++++++++++--------------- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/TODO.md b/TODO.md index f777a57..3cdd8fb 100755 --- a/TODO.md +++ b/TODO.md @@ -6,3 +6,5 @@ * [x] Fix Negate * [ ] Add facility for allow_empty * [x] Fix messages +* [x] Re-implement the or/and using operators +* [ ] Allow and/or operators between Series-level and row-level validations diff --git a/UPDATE.md b/UPDATE.md index 5bbc15f..c6c8a1d 100755 --- a/UPDATE.md +++ b/UPDATE.md @@ -31,3 +31,17 @@ that spawned it * In order to make both CombinedValidation and BooleanSeriesValidation both share a class, so they can be chained together, either we had to make a mixin that creates a "side path" that doesn't call `validate` (in this case, `validate_with_series`), or we + +# Rework of Validation Indexing +## All Indexed +* All Validations now have an index and an axis +* However, this index can be none, can be column only, row only, or both +* When combined with each other, the resulting boolean series will be broadcast using numpy broadcasting rules +* e.g. + * A per-series validation might have index 0 (column 0) and return a scalar (the whole series is okay) + * A per-cell validation might have index 0 (column 0) and return a series (True, True, False) indicating that cell 0 and 1 of column 0 are okay + * A per-frame validation would have index None, and might return True if the whole frame meets the validation, or a series indicating which columns or rows match the validation + +# Rework of combinedvalidations +## Bitwise +* Could assign each validation a bit in a large bitwise enum, and `or` together a number each time that index fails a validation. 
This lets us track the origin of each warning, allowing us to slice them out by bit and generate an appropriate list of warnings \ No newline at end of file diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 28561dc..4435d1d 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -166,6 +166,14 @@ def validate(self, df: pd.DataFrame, flatten=True) -> typing.Union[ else: return warnings + def __or__(self, other: 'WarningSeriesGenerator'): + if not isinstance(other, WarningSeriesGenerator): + raise PanSchArgumentError('The "|" operator can only be used between two' + 'Validations that subclass {}'.format(self.__class__)) + + return CombinedValidation(self, other, operator='or') + + class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): """ diff --git a/setup.py b/setup.py index 2441567..ff6d9a4 100755 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def run(self): packages=find_packages(include=['pandas_schema']), install_requires=[ 'numpy', - 'pandas>=0.19', + 'pandas>=0.23', 'dataclasses' ], cmdclass={ diff --git a/test/test_validation.py b/test/test_validation.py index 90e6b0c..2351434 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,14 +10,16 @@ import pandas as pd from pandas_schema.validations import * -from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, BaseValidation +from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, \ + BaseValidation from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning -def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ValidationWarning]: +def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ + ValidationWarning]: """ Tests a validator by asserting that it generates the amount of warnings :param series_dtype: Explicitly specifies the 
dtype for the generated Series @@ -62,10 +64,12 @@ def setUp(self): ) def test_valid_inputs(self): - assert len(get_warnings(self.validator, ['good', 'success'])) == 0, 'did not accept valid inputs' + assert len(get_warnings(self.validator, ['good', + 'success'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - assert len(get_warnings(self.validator, ['fail', 'failure'])) == 2, 'accepted invalid inputs' + assert len(get_warnings(self.validator, + ['fail', 'failure'])) == 2, 'accepted invalid inputs' class CustomElement(ValidationTestBase): @@ -82,10 +86,12 @@ def setUp(self): def test_valid_inputs(self): assert len( - get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' + get_warnings(self.validator, ['_start_sdiyhsd', + '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - assert len(get_warnings(self.validator, ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' + assert len(get_warnings(self.validator, + ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): @@ -185,7 +191,8 @@ class CanCallLambda(ValidationTestBase): def setUp(self): # Succeed if it's divisible by 2, otherwise cause an error - self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, index=0) + self.validator = CanCallValidation(lambda x: False if x % 2 == 0 else 1 / 0, + index=0) def test_validate_noerror(self): assert len(get_warnings(self.validator, [ @@ -251,7 +258,8 @@ def test_invalid_elements(self): class InListCaseInsensitive(ValidationTestBase): def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): assert len(get_warnings(self.validator, [ @@ -342,7 +350,8 @@ class CompiledRegexMatch(ValidationTestBase): """ def setUp(self): - self.validator 
= MatchesPatternValidation(re.compile('^.+\.txt$', re.IGNORECASE), index=0) + self.validator = MatchesPatternValidation( + re.compile('^.+\.txt$', re.IGNORECASE), index=0) def test_valid_strings(self): assert len(get_warnings(self.validator, [ @@ -480,12 +489,11 @@ class Or(ValidationTestBase): """ def setUp(self): - self.validator = CombinedValidation( - MatchesPatternValidation('yes', index=0), - MatchesPatternValidation('pass', index=0), - 'or' + self.validator = MatchesPatternValidation( + 'yes', index=0 + ) | MatchesPatternValidation( + 'pass', index=0 ) - # self.validator = MatchesPatternValidation('yes') | MatchesPatternValidation('pass', index=0) def test_valid_items(self): assert len(get_warnings(self.validator, [ @@ -520,7 +528,8 @@ def test_default_message(self): 3 ] ), flatten=True): - self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') + self.assertNotRegex(error.message, self.message, + 'Validator not using the default warning message!') def test_custom_message(self): validator = InRangeValidation(min=4, message=self.message, index=0) @@ -531,7 +540,8 @@ def test_custom_message(self): 3 ] ), flatten=True): - self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') + self.assertRegex(error.message, self.message, + 'Validator not using the custom warning message!') @unittest.skip('allow_empty no longer exists') @@ -565,20 +575,25 @@ class PandasDtypeTests(ValidationTestBase): """ def setUp(self): - self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, index=0) + self.validator = InListValidation(['a', 'b', 'c'], case_sensitive=False, + index=0) def test_valid_elements(self): - errors = self.validator.validate_series(pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) - assert len(list(errors)) == 0 + errors = self.validator.validate_series( + pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) + assert len(list(errors)) == 0 def 
test_invalid_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd', None], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['aa', 'bb', 'd', None], dtype='category')) assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): - errors = self.validator.validate_series(pd.Series(['a', None], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['a', None], dtype='category')) assert len(list(errors)) == 1 def test_invalid_elements(self): - errors = self.validator.validate_series(pd.Series(['aa', 'bb', 'd'], dtype='category')) + errors = self.validator.validate_series( + pd.Series(['aa', 'bb', 'd'], dtype='category')) assert len(list(errors)) == 3 From cc1e8c829a99cc6b373c11fca248ef8d18691880 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 23 Mar 2020 00:59:14 +1100 Subject: [PATCH 14/31] Initial bitwise rewrite --- pandas_schema/core.py | 371 ++++++++++++++++++----------------- pandas_schema/index.py | 32 ++- pandas_schema/validations.py | 42 ++-- test/test_validation.py | 12 +- 4 files changed, 248 insertions(+), 209 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 4435d1d..774b588 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer, IndexValue, IndexType +from pandas_schema.index import PandasIndexer, IndexValue, IndexType, RowIndexer from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -21,38 +21,53 @@ class BaseValidation(abc.ABC): A validation is, broadly, just a function that maps a data frame to a list of errors """ - @abc.abstractmethod + def __init__( + self, + message: str = None, + negated: bool = False + ): + """ + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + Otherwise it's a label (ie, index=0) indicates the column with the label of 0 + """ + self.custom_message = message + self.negated = negated + def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ + selection = self.validation_series(df) - @abc.abstractmethod - def message(self, warning: ValidationWarning) -> str: - pass - + if self.negated: + # If self.negated (which is not the default), then we don't need to flip the booleans + failed = selection + else: + # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass + # the validation, and we want cells that failed it + failed = ~selection -class IndexValidation(BaseValidation, metaclass=abc.ABCMeta): - """ - Abstract class that builds on BaseValidation to give it access to an index for selecting a Series out of the - DataFrame - """ + # Slice out the failed items, then map each into a list of validation warnings at each respective index + warnings = [] + for index, value in df[failed].iteritems(): + warnings.append(ValidationWarning( + ValidationWarning(self, { + 'row': index, + 'value': value + }) + )) + return warnings - def 
__init__(self, index: typing.Union[PandasIndexer, IndexValue], message: str = None, **kwargs): + @abc.abstractmethod + def validation_series(self, df: pd.DataFrame) -> pd.Series: """ - Creates a new IndexSeriesValidation - :param index: An index with which to select the series - Otherwise it's a label (ie, index=0) indicates the column with the label of 0 + Returns a scalar, series or DataFrame of booleans, which will then be broadcast + across the DataFrame according to broadcasting rules + :return: """ - super().__init__(**kwargs) - if isinstance(index, PandasIndexer): - self.index = index - else: - # If it isn't already an indexer object, convert it to one - self.index = PandasIndexer(index=index) - self.custom_message = message def message(self, warning: ValidationWarning) -> str: prefix = self.prefix(warning) @@ -74,182 +89,179 @@ def readable_name(self, **kwargs): def default_message(self, warnings: ValidationWarning) -> str: return 'failed the {}'.format(self.readable_name) - def select_series(self, df: pd.DataFrame) -> pd.Series: - """ - Select a series using the data stored in this validation - """ - if self.index is None: - raise PanSchNoIndexError() - - return self.index(df) - + @abc.abstractmethod def prefix(self, warning: ValidationWarning): """ Return a string that can be used to prefix a message that relates to this index This method is safe to override """ - if self.index is None: - return "" - if self.index.type == IndexType.POSITION: - return 'Column {}'.format(self.index.index) - else: - return 'Column "{}"'.format(self.index.index) - - -# -# class SeriesValidation(BaseValidation): -# """ -# A SeriesValidation validates a DataFrame by selecting a single series from it, and -# applying some validation to it -# """ -# -# @abc.abstractmethod -# def select_series(self, df: pd.DataFrame) -> pd.Series: -# """ -# Selects a series from the DataFrame that will be validated -# """ -# -# @abc.abstractmethod -# def validate_series(self, series: pd.Series) -> 
typing.Iterable[ValidationWarning]: -# """ -# Validate a single series -# """ -# -# def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: -# series = self.select_series(df) -# return self.validate_series(series) - - -class SeriesValidation(IndexValidation): - """ - A SeriesValidation validates a DataFrame by selecting a single series from it, and - applying some validation to it - """ - - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: - series = self.index(df) - return self.validate_series(series) - - @abc.abstractmethod - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - pass + def __or__(self, other: 'BaseValidation'): + if not isinstance(other, BaseValidation): + raise PanSchArgumentError('The "|" operator can only be used between two' + 'Validations that subclass {}'.format( + self.__class__)) + return CombinedValidation(self, other, operator='or') -class WarningSeriesGenerator(BaseValidation, abc.ABC): - """ - Mixin class that indicates that this Validation can produce a "warning series", which is a pandas Series with one - or more warnings in each cell, corresponding to warnings detected in the DataFrame at the same index - """ - @abc.abstractmethod - def get_warning_series(self, df: pd.DataFrame) -> pd.Series: +class IndexValidation(BaseValidation): + def __init__( + self, + row_index: typing.Union[PandasIndexer, IndexValue], + col_index: typing.Union[PandasIndexer, IndexValue], + *args, + **kwargs + ): """ - Return a series of ValidationWarnings, not an iterable of ValidationWarnings like the normal validate() method + Creates a new IndexSeriesValidation + :param index: An index with which to select the series + Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ + super().__init__(*args, **kwargs) - @staticmethod - def flatten_warning_series(warnings: pd.Series): - """ - Converts a warning series into an iterable of warnings - """ - return 
warnings[warnings.astype(bool)].explode().tolist() - - def validate(self, df: pd.DataFrame, flatten=True) -> typing.Union[ - typing.Iterable[ValidationWarning], - pd.Series - ]: - warnings = self.get_warning_series(df) - if flatten: - return self.flatten_warning_series(warnings) + if isinstance(row_index, PandasIndexer): + self.row_index = row_index else: - return warnings - - def __or__(self, other: 'WarningSeriesGenerator'): - if not isinstance(other, WarningSeriesGenerator): - raise PanSchArgumentError('The "|" operator can only be used between two' - 'Validations that subclass {}'.format(self.__class__)) - - return CombinedValidation(self, other, operator='or') - - - -class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): - """ - Validation is defined by the function :py:meth:~select_cells that returns a boolean series. - Each cell that has False has failed the validation. - - Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, - because the data is in the same form for each cell. You need only define a :py:meth~default_message. - """ + # If it isn't already an indexer object, convert it to one + self.row_index = PandasIndexer(index=row_index) - def __init__(self, *args, negated=False, **kwargs): - super().__init__(*args, **kwargs) - self.negated = negated + if isinstance(col_index, PandasIndexer): + self.col_index = col_index + else: + # If it isn't already an indexer object, convert it to one + self.col_index = PandasIndexer(index=col_index) - @abc.abstractmethod - def select_cells(self, series: pd.Series) -> pd.Series: + def apply_index(self, df: pd.DataFrame): """ - A BooleanSeriesValidation must return a boolean series. 
Each cell that has False has failed the - validation - :param series: The series to validate + Select a series using the data stored in this validation """ - pass + if self.row_index is None or self.col_index is None: + raise PanSchNoIndexError() + + return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] - def validate_series(self, series, flatten=True) -> typing.Union[ - typing.Iterable[ValidationWarning], - pd.Series - ]: + def prefix(self, warning: ValidationWarning): """ - Validates a single series selected from the DataFrame + Return a string that can be used to prefix a message that relates to this index + + This method is safe to override """ - selection = self.select_cells(series) + ret = "" - if self.negated: - # If self.negated (which is not the default), then we don't need to flip the booleans - failed = selection - else: - # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass - # the validation, and we want cells that failed it - failed = ~selection + if self.col_index is not None: + if self.col_index.type == IndexType.POSITION: + ret += 'Column {}'.format(self.col_index.index) + else: + ret += 'Column "{}"'.format(self.col_index.index) + if self.row_index is not None: + if self.row_index.type == IndexType.POSITION: + ret += 'Column {}'.format(self.row_index.index) + else: + ret += 'Column "{}"'.format(self.row_index.index) - # Slice out the failed items, then map each into a list of validation warnings at each respective index - warnings = series[failed].to_frame().apply(lambda row: [ValidationWarning(self, { - 'row': row.name, - 'value': row[0] - })], axis='columns', result_type='reduce') - # warnings = warnings.iloc[:, 0] - - # If flatten, return a list of ValidationWarning, otherwise return a series of lists of Validation Warnings - if flatten: - return self.flatten_warning_series(warnings) - else: - return warnings - def get_warning_series(self, df: pd.DataFrame) -> pd.Series: 
+class SeriesValidation(IndexValidation): + def __init__(self, index, *args, **kwargs): + super().__init__( + *args, + col_index=index, + row_index=RowIndexer(index=slice(None), typ=IndexType.POSITION), + **kwargs + ) + + def validation_series(self, df) -> pd.Series: + series = self.apply_index(df) + #TODO: Combine the index and the result series into one set of indexes + return self.validate_series(series) + + @abc.abstractmethod + def validate_series(self, series: pd.Series) -> pd.Series: """ - Validates a series and returns a series of warnings. + Given a series, return a bool Series that has values of True if the series + passes the validation, otherwise False """ - series = self.select_series(df) - return self.validate_series(series, flatten=False) - - def prefix(self, warning: ValidationWarning): - parent = super().prefix(warning) - # Only in this subclass do we know the contents of the warning props, since we defined them in the - # validate_series method. Thus, we can now add row index information + pass - return parent + ', Row {row}: "{value}"'.format(**warning.props) - def __invert__(self) -> 'BooleanSeriesValidation': - """ - If a BooleanSeriesValidation is negated, it has the opposite result - """ - self.negated = not self.negated - return self +# +# class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): +# """ +# Validation is defined by the function :py:meth:~select_cells that returns a boolean series. +# Each cell that has False has failed the validation. +# +# Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, +# because the data is in the same form for each cell. You need only define a :py:meth~default_message. 
+# """ +# +# def __init__(self, *args, negated=False, **kwargs): +# super().__init__(*args, **kwargs) +# self.negated = negated +# +# @abc.abstractmethod +# def select_cells(self, series: pd.Series) -> pd.Series: +# """ +# A BooleanSeriesValidation must return a boolean series. Each cell that has False has failed the +# validation +# :param series: The series to validate +# """ +# pass +# +# def validate_series(self, series, flatten=True) -> typing.Union[ +# typing.Iterable[ValidationWarning], +# pd.Series +# ]: +# """ +# Validates a single series selected from the DataFrame +# """ +# selection = self.select_cells(series) +# +# if self.negated: +# # If self.negated (which is not the default), then we don't need to flip the booleans +# failed = selection +# else: +# # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass +# # the validation, and we want cells that failed it +# failed = ~selection +# +# # Slice out the failed items, then map each into a list of validation warnings at each respective index +# warnings = series[failed].to_frame().apply( +# lambda row: [ValidationWarning(self, { +# 'row': row.name, +# 'value': row[0] +# })], axis='columns', result_type='reduce') +# # warnings = warnings.iloc[:, 0] +# +# # If flatten, return a list of ValidationWarning, otherwise return a series of lists of Validation Warnings +# if flatten: +# return self.flatten_warning_series(warnings) +# else: +# return warnings +# +# def get_warning_series(self, df: pd.DataFrame) -> pd.Series: +# """ +# Validates a series and returns a series of warnings. +# """ +# series = self.select_series(df) +# return self.validate_series(series, flatten=False) +# +# def prefix(self, warning: ValidationWarning): +# parent = super().prefix(warning) +# # Only in this subclass do we know the contents of the warning props, since we defined them in the +# # validate_series method. 
Thus, we can now add row index information +# +# return parent + ', Row {row}: "{value}"'.format(**warning.props) +# +# def __invert__(self) -> 'BooleanSeriesValidation': +# """ +# If a BooleanSeriesValidation is negated, it has the opposite result +# """ +# self.negated = not self.negated +# return self -class CombinedValidation(WarningSeriesGenerator): +class CombinedValidation(BaseValidation): """ Validates if one and/or the other validation is true for an element """ @@ -257,7 +269,8 @@ class CombinedValidation(WarningSeriesGenerator): def message(self, warning: ValidationWarning) -> str: pass - def __init__(self, validation_a: WarningSeriesGenerator, validation_b: WarningSeriesGenerator, operator: str): + def __init__(self, validation_a: BaseValidation, + validation_b: BaseValidation, operator: str): super().__init__() self.operator = operator self.left = validation_a @@ -265,19 +278,27 @@ def __init__(self, validation_a: WarningSeriesGenerator, validation_b: WarningSe def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column - left_errors = self.left.validate(df, flatten=False) - right_errors = self.right.validate(df, flatten=False) + left_errors = self.left.validate(df) + right_errors = self.right.validate(df) if self.operator == 'and': - # If it's an "and" validation, left, right, or both failing means an error, so we can simply concatenate - # the lists of errors - combined = left_errors.combine(right_errors, func=operator.add, fill_value=[]) + # If it's an "and" validation, left, right, or both failing means an error, + # so we can simply concatenate the lists of errors + combined = left_errors.combine( + right_errors, + func=operator.add, + fill_value=[] + ) elif self.operator == 'or': # [error] and [] = [] # [error_1] and [error_2] = [error_2] # [] and [] = [] # Thus, we can use the and operator to implement "or" validations - combined = left_errors.combine(right_errors, func=lambda l, r: l 
+ r if l and r else [], fill_value=[]) + combined = left_errors.combine( + right_errors, + func=lambda l, r: l + r if l and r else [], + fill_value=[] + ) # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) else: raise Exception('Operator must be "and" or "or"') diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 51f1172..9f777f6 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -5,7 +5,7 @@ import pandas from enum import Enum -IndexValue = Union[numpy.string_, numpy.int_, str, int] +IndexValue = Union[numpy.string_, numpy.int_, str, int, slice] """ A pandas index can either be an integer or string, or an array of either. This typing is a bit sketchy because really a lot of things are accepted here @@ -22,11 +22,11 @@ class PandasIndexer: An index into a particular axis of a DataFrame. Attempts to recreate the behaviour of `df.ix[some_index]` """ - # valid_types = {'position', 'label'} index: IndexValue """ The index to use, either an integer for position-based indexing, or a string for label-based indexing """ + type: IndexType """ The type of indexing to use, either 'position' or 'label' @@ -42,11 +42,9 @@ def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): self.axis = axis if typ is not None: - # If the type is provided, validate it - if typ not in self.valid_types: - raise PanSchIndexError('The index type was not one of {}'.format(' or '.join(self.valid_types))) - else: - self.type = typ + if not isinstance(typ, IndexType): + raise PanSchIndexError('Index must be a subclass of IndexType') + self.type = typ else: # If the type isn't provided, guess it based on the datatype of the index if numpy.issubdtype(type(index), numpy.character): @@ -68,6 +66,26 @@ def __call__(self, df: pandas.DataFrame): elif self.type == IndexType.POSITION: return df.iloc(axis=self.axis)[self.index] + def for_loc(self, df: pandas.DataFrame): + """ + Returns this index as something that could be passed into df.loc[] + 
""" + if self.type == IndexType.LABEL: + return df.axes[self.axis][self.index] + elif self.type == IndexType.POSITION: + return self.index + + def for_iloc(self, df): + """ + Returns this index as something that could be passed into df.iloc[] + """ + if self.type == IndexType.LABEL: + return df.axes[self.axis].get_indexer(self.index) + elif self.type == IndexType.POSITION: + return self.index + + + class RowIndexer(PandasIndexer): def __init__(self, index: IndexValue, typ: IndexType = None): diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 2e803df..872c3c1 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -7,13 +7,13 @@ import operator from . import column -from .core import SeriesValidation, BooleanSeriesValidation, IndexValidation +from .core import SeriesValidation, IndexValidation from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class CustomSeriesValidation(BooleanSeriesValidation): +class CustomSeriesValidation(IndexValidation): """ Validates using a user-provided function that operates on an entire series (for example by using one of the pandas Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) @@ -33,11 +33,11 @@ def __init__(self, validation: typing.Callable[[pd.Series], pd.Series], *args, * self._validation = validation - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return self._validation(series) -class CustomElementValidation(BooleanSeriesValidation): +class CustomElementValidation(IndexValidation): """ Validates using a user-provided function that operates on each element """ @@ -55,11 +55,11 @@ def __init__(self, validation: typing.Callable[[typing.Any], typing.Any], *args, self._validation = validation super().__init__(*args, **kwargs) - def select_cells(self, series: pd.Series) -> 
pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.apply(self._validation) -class InRangeValidation(BooleanSeriesValidation): +class InRangeValidation(IndexValidation): """ Checks that each element in the series is within a given numerical range """ @@ -76,7 +76,7 @@ def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs): def default_message(self, warning: ValidationWarning): return 'was not in the range [{}, {})'.format(self.min, self.max) - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: series = pd.to_numeric(series) return (series >= self.min) & (series < self.max) @@ -107,7 +107,7 @@ def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarnin return [] -class CanCallValidation(BooleanSeriesValidation): +class CanCallValidation(SeriesValidation): """ Validates if a given function can be called on each element in a column without raising an exception """ @@ -136,7 +136,7 @@ def can_call(self, var): except: return False - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.apply(self.can_call) @@ -164,7 +164,7 @@ def default_message(self, warning: ValidationWarning): return 'cannot be converted to type {}'.format(self.callable) -class MatchesPatternValidation(BooleanSeriesValidation): +class MatchesPatternValidation(SeriesValidation): """ Validates that a string or regular expression can match somewhere in each element in this column """ @@ -182,11 +182,11 @@ def __init__(self, pattern, options={}, **kwargs): def default_message(self, warning: ValidationWarning): return 'does not match the pattern "{}"'.format(self.pattern.pattern) - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.astype(str).str.contains(self.pattern, **self.options) -class 
TrailingWhitespaceValidation(BooleanSeriesValidation): +class TrailingWhitespaceValidation(SeriesValidation): """ Checks that there is no trailing whitespace in this column """ @@ -197,11 +197,11 @@ def __init__(self, **kwargs): def default_message(self, warning: ValidationWarning): return 'contains trailing whitespace' - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.astype(str).str.contains('\s+$') -class LeadingWhitespaceValidation(BooleanSeriesValidation): +class LeadingWhitespaceValidation(SeriesValidation): """ Checks that there is no leading whitespace in this column """ @@ -212,11 +212,11 @@ def __init__(self, **kwargs): def default_message(self, warning: ValidationWarning): return 'contains leading whitespace' - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.astype(str).str.contains('^\s+') -class IsDistinctValidation(BooleanSeriesValidation): +class IsDistinctValidation(SeriesValidation): """ Checks that every element of this column is different from each other element """ @@ -227,11 +227,11 @@ def __init__(self, **kwargs): def default_message(self, warning: ValidationWarning): return 'contains values that are not unique' - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return ~series.duplicated(keep='first') -class InListValidation(BooleanSeriesValidation): +class InListValidation(SeriesValidation): """ Checks that each element in this column is contained within a list of possibilities """ @@ -249,14 +249,14 @@ def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) return 'is not in the list of legal options ({})'.format(values) - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: if self.case_sensitive: 
return series.isin(self.options) else: return series.str.lower().isin([s.lower() for s in self.options]) -class DateFormatValidation(BooleanSeriesValidation): +class DateFormatValidation(SeriesValidation): """ Checks that each element in this column is a valid date according to a provided format string """ @@ -280,5 +280,5 @@ def valid_date(self, val): except: return False - def select_cells(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> pd.Series: return series.astype(str).apply(self.valid_date) diff --git a/test/test_validation.py b/test/test_validation.py index 2351434..d7a0165 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -10,8 +10,7 @@ import pandas as pd from pandas_schema.validations import * -from pandas_schema.core import BooleanSeriesValidation, CombinedValidation, \ - BaseValidation +from pandas_schema.core import CombinedValidation, BaseValidation from pandas_schema.index import ColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence @@ -86,12 +85,13 @@ def setUp(self): def test_valid_inputs(self): assert len( - get_warnings(self.validator, ['_start_sdiyhsd', - '_start_234fpwunxc\n'])) == 0, 'did not accept valid inputs' + get_warnings(self.validator, ['_start_sdiyhsd', '_start_234fpwunxc\n']) + ) == 0, 'did not accept valid inputs' def test_invalid_inputs(self): - assert len(get_warnings(self.validator, - ['fail', '324wfp9ni'])) == 2, 'accepted invalid inputs' + assert len( + get_warnings(self.validator, ['fail', '324wfp9ni']) + ) == 2, 'accepted invalid inputs' class LeadingWhitespace(ValidationTestBase): From 3115dcb6209896b0e969bc82012ee307b2e80bf1 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 23 Mar 2020 02:23:53 +1100 Subject: [PATCH 15/31] Simple use cases working --- pandas_schema/column.py | 6 +-- pandas_schema/core.py | 116 ++++++++++++++++++---------------------- pandas_schema/index.py | 60 
+++++++++++++++++++-- pandas_schema/schema.py | 4 +- 4 files changed, 112 insertions(+), 74 deletions(-) diff --git a/pandas_schema/column.py b/pandas_schema/column.py index ab3b58a..dcd363e 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,12 +1,12 @@ import typing import pandas_schema.core -from pandas_schema.index import PandasIndexer +from pandas_schema.index import AxisIndexer def column( validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], - index: PandasIndexer = None, + index: AxisIndexer = None, override: bool = False, allow_empty=False ): @@ -36,7 +36,7 @@ def column_sequence( """ for i, valid in validations: if override or valid.index is None: - valid.index = PandasIndexer(i, typ='positional') + valid.index = AxisIndexer(i, typ='positional') # # def label_column( # validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 774b588..39fa8f3 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -12,7 +12,7 @@ from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import PandasIndexer, IndexValue, IndexType, RowIndexer +from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, DualAxisIndexer from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -21,52 +21,36 @@ class BaseValidation(abc.ABC): A validation is, broadly, just a function that maps a data frame to a list of errors """ - def __init__( - self, - message: str = None, - negated: bool = False - ): + def __init__(self, message: str = None, ): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.custom_message = message - self.negated = negated - def validate(self, df: pd.DataFrame) -> typing.Iterable[ValidationWarning]: + def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: """ Validates a data frame :param df: Data frame to validate :return: All validation failures detected by this validation """ - selection = self.validation_series(df) - - if self.negated: - # If self.negated (which is not the default), then we don't need to flip the booleans - failed = selection - else: - # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass - # the validation, and we want cells that failed it - failed = ~selection + failed = self.get_failed_index(df) # Slice out the failed items, then map each into a list of validation warnings at each respective index warnings = [] - for index, value in df[failed].iteritems(): - warnings.append(ValidationWarning( - ValidationWarning(self, { - 'row': index, - 'value': value - }) - )) + for index, value in failed(df).iteritems(): + warnings.append(ValidationWarning(self, { + 'row': index, + 'value': value + })) return warnings @abc.abstractmethod - 
def validation_series(self, df: pd.DataFrame) -> pd.Series: + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: """ - Returns a scalar, series or DataFrame of booleans, which will then be broadcast - across the DataFrame according to broadcasting rules - :return: + Returns an indexer object that fully specifies which sections of the DataFrame this validation believes are + invalid (both row and column-wise) """ def message(self, warning: ValidationWarning) -> str: @@ -109,8 +93,7 @@ def __or__(self, other: 'BaseValidation'): class IndexValidation(BaseValidation): def __init__( self, - row_index: typing.Union[PandasIndexer, IndexValue], - col_index: typing.Union[PandasIndexer, IndexValue], + index: DualAxisIndexer, *args, **kwargs ): @@ -120,27 +103,13 @@ def __init__( Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ super().__init__(*args, **kwargs) - - if isinstance(row_index, PandasIndexer): - self.row_index = row_index - else: - # If it isn't already an indexer object, convert it to one - self.row_index = PandasIndexer(index=row_index) - - if isinstance(col_index, PandasIndexer): - self.col_index = col_index - else: - # If it isn't already an indexer object, convert it to one - self.col_index = PandasIndexer(index=col_index) + self.index = index def apply_index(self, df: pd.DataFrame): """ Select a series using the data stored in this validation """ - if self.row_index is None or self.col_index is None: - raise PanSchNoIndexError() - - return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] + return self.index(df) def prefix(self, warning: ValidationWarning): """ @@ -148,33 +117,49 @@ def prefix(self, warning: ValidationWarning): This method is safe to override """ - ret = "" + ret = [] - if self.col_index is not None: - if self.col_index.type == IndexType.POSITION: - ret += 'Column {}'.format(self.col_index.index) - else: - ret += 'Column "{}"'.format(self.col_index.index) - if self.row_index is not 
None: - if self.row_index.type == IndexType.POSITION: - ret += 'Column {}'.format(self.row_index.index) - else: - ret += 'Column "{}"'.format(self.row_index.index) + if self.index.col_index is not None: + col_str = self.index.col_index.for_message() + if col_str: + ret.append(col_str) + + ret.append('Row {}'.format(warning.props['row'])) + + ret.append('"{}"'.format(warning.props['value'])) + + return ' '.join(ret) class SeriesValidation(IndexValidation): - def __init__(self, index, *args, **kwargs): + def __init__(self, index, *args, negated: bool = False, **kwargs): super().__init__( *args, - col_index=index, - row_index=RowIndexer(index=slice(None), typ=IndexType.POSITION), + index=DualAxisIndexer( + col_index=index, + row_index=RowIndexer(index=slice(None), typ=IndexType.POSITION), + ), **kwargs ) + self.negated = negated - def validation_series(self, df) -> pd.Series: + def get_failed_index(self, df) -> DualAxisIndexer: series = self.apply_index(df) - #TODO: Combine the index and the result series into one set of indexes - return self.validate_series(series) + + selected = self.validate_series(series) + + # Normally, validate_series returns the indices of the cells that passed the validation, but here we want the + # cells that failed it, so invert the series (unless this is a negated validation) + if self.negated: + row_index = selected + else: + row_index = ~selected + + # Combine the index and the result series into one set of indexes + return DualAxisIndexer( + row_index=row_index, + col_index=self.index.col_index + ) @abc.abstractmethod def validate_series(self, series: pd.Series) -> pd.Series: @@ -184,6 +169,10 @@ def validate_series(self, series: pd.Series) -> pd.Series: """ pass + def __invert__(self): + self.negated = not self.negated + return self + # # class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): @@ -305,6 +294,5 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: return combined - @property def 
default_message(self, warnings: ValidationWarning) -> str: return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 9f777f6..91b3a5e 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -1,6 +1,6 @@ from pandas_schema.errors import PanSchIndexError from dataclasses import dataclass -from typing import Union +from typing import Union, Optional import numpy import pandas from enum import Enum @@ -17,7 +17,7 @@ class IndexType(Enum): LABEL = 1 -class PandasIndexer: +class AxisIndexer: """ An index into a particular axis of a DataFrame. Attempts to recreate the behaviour of `df.ix[some_index]` """ @@ -47,7 +47,10 @@ def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): self.type = typ else: # If the type isn't provided, guess it based on the datatype of the index - if numpy.issubdtype(type(index), numpy.character): + if isinstance(index, pandas.Series) and numpy.issubdtype(index.dtype, numpy.bool_): + # Boolean series can actually be used in loc or iloc, but let's assume it's only iloc for simplicity + self.type = IndexType.POSITION + elif numpy.issubdtype(type(index), numpy.character): self.type = IndexType.LABEL elif numpy.issubdtype(type(index), numpy.int_): self.type = IndexType.POSITION @@ -84,14 +87,61 @@ def for_iloc(self, df): elif self.type == IndexType.POSITION: return self.index + def for_message(self) -> Optional[str]: + """ + Returns a string that could be used to describe this indexer in a human readable way. 
However, returns None + if this indexer should not be described + """ + if self.axis == 0: + prefix = "Row" + else: + prefix = "Column" + + if isinstance(self.index, int): + idx = str(self.index) + elif isinstance(self.index, str): + idx = '"{}"'.format(self.index) + elif isinstance(self.index, slice): + if self.index == slice(None): + # If it's a slice of everything, skip this index + return None + else: + idx = str(self.index) + else: + idx = str(self.index) + return "{} {}".format(prefix, idx) -class RowIndexer(PandasIndexer): +class RowIndexer(AxisIndexer): def __init__(self, index: IndexValue, typ: IndexType = None): super().__init__(index=index, typ=typ, axis=0) -class ColumnIndexer(PandasIndexer): +class ColumnIndexer(AxisIndexer): def __init__(self, index: IndexValue, typ: IndexType = None): super().__init__(index=index, typ=typ, axis=1) + + +@dataclass +class DualAxisIndexer: + """ + Completely specifies some subset of a DataFrame, using both axes + """ + row_index: RowIndexer + col_index: ColumnIndexer + + def __init__(self, row_index: Union[RowIndexer, IndexValue], col_index: Union[ColumnIndexer, IndexValue]): + # Use the validation and automatic conversion built into the AxisIndexer class to handle these inputs + if isinstance(row_index, RowIndexer): + self.row_index = row_index + else: + self.row_index = RowIndexer(index=row_index) + + if isinstance(col_index, ColumnIndexer): + self.col_index = col_index + else: + self.col_index = ColumnIndexer(index=col_index) + + def __call__(self, df: pandas.DataFrame): + return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] diff --git a/pandas_schema/schema.py b/pandas_schema/schema.py index 83ad9c5..b777881 100755 --- a/pandas_schema/schema.py +++ b/pandas_schema/schema.py @@ -4,7 +4,7 @@ from pandas_schema.core import BaseValidation from pandas_schema.errors import PanSchArgumentError, PanSchInvalidSchemaError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index 
import PandasIndexer +from pandas_schema.index import AxisIndexer class Schema: @@ -24,7 +24,7 @@ def __init__(self, validations: typing.Iterable[BaseValidation]): self.validations = list(validations) - def validate(self, df: pd.DataFrame, subset: PandasIndexer = None) -> typing.List[ValidationWarning]: + def validate(self, df: pd.DataFrame, subset: AxisIndexer = None) -> typing.List[ValidationWarning]: """ Runs a full validation of the target DataFrame using the internal columns list From a6c98ec20d2cafaa0f3c0ed2f7f21ead8248398b Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 27 Mar 2020 18:21:23 +1100 Subject: [PATCH 16/31] Update --- pandas_schema/index.py | 10 +++++++--- pandas_schema/validations.py | 24 ++++++++++++++---------- test/test_validation.py | 28 ++++++++++++---------------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 91b3a5e..e8f108b 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -5,7 +5,8 @@ import pandas from enum import Enum -IndexValue = Union[numpy.string_, numpy.int_, str, int, slice] +# IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices +IndexValue = Union[numpy.ndarray, pandas.Series, str, int, slice] """ A pandas index can either be an integer or string, or an array of either. 
This typing is a bit sketchy because really a lot of things are accepted here @@ -47,6 +48,9 @@ def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): self.type = typ else: # If the type isn't provided, guess it based on the datatype of the index + if isinstance(index, slice): + # Slices can be used in either indexer + self.type = IndexType.POSITION if isinstance(index, pandas.Series) and numpy.issubdtype(index.dtype, numpy.bool_): # Boolean series can actually be used in loc or iloc, but let's assume it's only iloc for simplicity self.type = IndexType.POSITION @@ -74,9 +78,9 @@ def for_loc(self, df: pandas.DataFrame): Returns this index as something that could be passed into df.loc[] """ if self.type == IndexType.LABEL: - return df.axes[self.axis][self.index] - elif self.type == IndexType.POSITION: return self.index + elif self.type == IndexType.POSITION: + return df.axes[self.axis][self.index] def for_iloc(self, df): """ diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 872c3c1..f482c07 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -8,12 +8,13 @@ from . 
import column from .core import SeriesValidation, IndexValidation +from .index import DualAxisIndexer from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype -class CustomSeriesValidation(IndexValidation): +class CustomSeriesValidation(SeriesValidation): """ Validates using a user-provided function that operates on an entire series (for example by using one of the pandas Series methods: http://pandas.pydata.org/pandas-docs/stable/api.html#series) @@ -37,7 +38,7 @@ def validate_series(self, series: pd.Series) -> pd.Series: return self._validation(series) -class CustomElementValidation(IndexValidation): +class CustomElementValidation(SeriesValidation): """ Validates using a user-provided function that operates on each element """ @@ -59,7 +60,7 @@ def validate_series(self, series: pd.Series) -> pd.Series: return series.apply(self._validation) -class InRangeValidation(IndexValidation): +class InRangeValidation(SeriesValidation): """ Checks that each element in the series is within a given numerical range """ @@ -97,14 +98,17 @@ def default_message(self, warning: ValidationWarning) -> str: return 'has a dtype of {} which is not a subclass of the required type {}'.format( self.dtype, warning.props['dtype']) - def validate_series(self, series: pd.Series) -> typing.Iterable[ValidationWarning]: - if not np.issubdtype(series.dtype, self.dtype): - return [ValidationWarning( - self, - {'dtype': series.dtype} - )] + def validate_series(self, series: pd.Series): + if np.issubdtype(series.dtype, self.dtype): + return True else: - return [] + return False + # return [ValidationWarning( + # self, + # {'dtype': series.dtype} + # )] + # else: + # return [] class CanCallValidation(SeriesValidation): diff --git a/test/test_validation.py b/test/test_validation.py index d7a0165..020736a 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -329,12 +329,12 @@ def 
test_valid_strings(self): ])) == 0, 'does not accept unique strings' def test_invalid_strings(self): - validation = self.validator.select_cells(pd.Series([ + validation = get_warnings(self.validator, [ '1', '1', '3', '4' - ])) + ]) self.assertTrue((validation == pd.Series([ True, @@ -521,25 +521,21 @@ def setUp(self): def test_default_message(self): validator = InRangeValidation(min=4, index=0) - for error in validator.validate_series(pd.Series( - [ - 1, - 2, - 3 - ] - ), flatten=True): + for error in get_warnings(validator, [ + 1, + 2, + 3 + ]): self.assertNotRegex(error.message, self.message, 'Validator not using the default warning message!') def test_custom_message(self): validator = InRangeValidation(min=4, message=self.message, index=0) - for error in validator.validate_series(pd.Series( - [ - 1, - 2, - 3 - ] - ), flatten=True): + for error in get_warnings(validator, [ + 1, + 2, + 3 + ] ): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') From 73e86f161e8461802db1949780a11ce8badfdaee Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 31 Mar 2020 01:59:46 +1100 Subject: [PATCH 17/31] Most tests working with bitwise rewrite --- TODO.md | 2 +- pandas_schema/core.py | 121 ++++++++++++++++++++++------ pandas_schema/index.py | 81 +++++++++++++++++-- pandas_schema/scope.py | 12 +++ pandas_schema/validation_warning.py | 5 +- pandas_schema/validations.py | 3 +- test/test_validation.py | 34 ++++---- 7 files changed, 204 insertions(+), 54 deletions(-) create mode 100644 pandas_schema/scope.py diff --git a/TODO.md b/TODO.md index 3cdd8fb..5ec84cd 100755 --- a/TODO.md +++ b/TODO.md @@ -3,7 +3,7 @@ * [x] Add replacement for allow_empty Columns * [ ] New column() tests * [ ] New CombinedValidation tests -* [x] Fix Negate +* [ ] Implement the negate flag in the indexer * [ ] Add facility for allow_empty * [x] Fix messages * [x] Re-implement the or/and using operators diff --git a/pandas_schema/core.py 
b/pandas_schema/core.py index 39fa8f3..9636da3 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -8,20 +8,29 @@ import operator import re from dataclasses import dataclass +import enum from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, DualAxisIndexer +from pandas_schema.scope import ValidationScope from pandas.api.types import is_categorical_dtype, is_numeric_dtype +SubSelection = typing.Union[pd.Series, pd.DataFrame, object] +""" +Anything that an indexer could return from a DataFrame +""" class BaseValidation(abc.ABC): """ A validation is, broadly, just a function that maps a data frame to a list of errors """ - def __init__(self, message: str = None, ): + def __init_subclass__(cls, scope: ValidationScope=ValidationScope.CELL, **kwargs): + cls.scope = scope + + def __init__(self, message: str = None): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series @@ -35,15 +44,39 @@ def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: :param df: Data frame to validate :return: All validation failures detected by this validation """ - failed = self.get_failed_index(df) + index = self.get_failed_index(df) + failed = index(df) + + # If it's am empty series/frame then this produced no warnings + if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: + return [] - # Slice out the failed items, then map each into a list of validation warnings at each respective index + # Depending on the scope, we produce the lists of warnings in different ways warnings = [] - for index, value in failed(df).iteritems(): - warnings.append(ValidationWarning(self, { - 'row': index, - 'value': value - })) + if isinstance(failed, pd.DataFrame): + if self.scope == ValidationScope.DATA_FRAME: + warnings.append(ValidationWarning(self)) + 
elif self.scope == ValidationScope.SERIES: + for column in failed.columns: + warnings.append(ValidationWarning(self, column=column)) + elif self.scope == ValidationScope.CELL: + for column in failed.columns: + for row, value in df[column].iteritems(): + warnings.append(ValidationWarning(self, column=column, row=row, value=value)) + elif isinstance(failed, pd.Series): + if self.scope == ValidationScope.SERIES: + warnings.append(ValidationWarning(self, column=index.col_index.index)) + elif self.scope == ValidationScope.CELL: + for row, value in failed.iteritems(): + warnings.append(ValidationWarning(self, column=index.col_index.index, row=row, value=value)) + else: + warnings.append(ValidationWarning( + self, + column=index.col_index.index, + row=index.row_index.index, + value=failed + )) + return warnings @abc.abstractmethod @@ -130,8 +163,46 @@ def prefix(self, warning: ValidationWarning): return ' '.join(ret) + def get_failed_index(self, df) -> DualAxisIndexer: + selection = self.apply_index(df) + # We invert here because the validation returns True for indices that pass the validation, but we want to + # select indices that fail + return self.validate_selection(selection).invert(axis=0) + + # Normally, validate_series returns the indices of the cells that passed the validation, but here we want the + # cells that failed it, so invert the series (unless this is a negated validation) + # if self.negated: + # row_index = selected + # else: + # row_index = ~selected + + # Combine the index and the result series into one set of indexes + # return DualAxisIndexer( + # row_index=row_index + # col_index=self.index.col_index + # ) + + @abc.abstractmethod + def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: + """ + Given a series, return an indexer that + passes the validation, otherwise False + """ + pass + + def negate(self, axis: int): + """ + Returns a copy of this validation, but with an inverted indexer + """ + return 
self.__class__(index=self.index.invert(axis)) + class SeriesValidation(IndexValidation): + """ + A type of IndexValidation that operates only on a Series. This class mostly adds utility methods rather than + any particular functionality. + """ + def __init__(self, index, *args, negated: bool = False, **kwargs): super().__init__( *args, @@ -143,38 +214,36 @@ def __init__(self, index, *args, negated: bool = False, **kwargs): ) self.negated = negated - def get_failed_index(self, df) -> DualAxisIndexer: - series = self.apply_index(df) - - selected = self.validate_series(series) + def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: + """ + Since this is a SeriesValidation, we can simplify the validation. Now we only have to ask the subclass to take + a Series and return a Series (or slice) that indicates the successful cells (or series). Then we can combine + this with the current index to produce an indexer that finds all failing cells in the DF + """ + row_index = self.validate_series(selection) - # Normally, validate_series returns the indices of the cells that passed the validation, but here we want the - # cells that failed it, so invert the series (unless this is a negated validation) - if self.negated: - row_index = selected - else: - row_index = ~selected + # As a convenience, we allow validate_series to return a boolean. 
If True it indicates everything passed, so + # convert it to a None slice which returns everything, and if false convert it to an empty list, an indexer + # that returns nothing + if isinstance(row_index, bool): + if row_index: + row_index = slice(None) + else: + row_index = [] - # Combine the index and the result series into one set of indexes return DualAxisIndexer( row_index=row_index, col_index=self.index.col_index ) @abc.abstractmethod - def validate_series(self, series: pd.Series) -> pd.Series: + def validate_series(self, series: pd.Series) -> IndexValue: """ Given a series, return a bool Series that has values of True if the series passes the validation, otherwise False """ - pass - def __invert__(self): - self.negated = not self.negated - return self - -# # class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): # """ # Validation is defined by the function :py:meth:~select_cells that returns a boolean series. diff --git a/pandas_schema/index.py b/pandas_schema/index.py index e8f108b..4ba251c 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -17,7 +17,6 @@ class IndexType(Enum): POSITION = 0 LABEL = 1 - class AxisIndexer: """ An index into a particular axis of a DataFrame. 
Attempts to recreate the behaviour of `df.ix[some_index]` @@ -38,9 +37,15 @@ class AxisIndexer: The axis for the indexer """ - def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): + negate: bool = False + """ + If yes, return all values that this index does *not* select + """ + + def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1, negate=False): self.index = index self.axis = axis + self.negate = negate if typ is not None: if not isinstance(typ, IndexType): @@ -51,7 +56,9 @@ def __init__(self, index: IndexValue, typ: IndexType = None, axis: int = 1): if isinstance(index, slice): # Slices can be used in either indexer self.type = IndexType.POSITION - if isinstance(index, pandas.Series) and numpy.issubdtype(index.dtype, numpy.bool_): + elif isinstance(index, list): + self.type = IndexType.POSITION + elif isinstance(index, pandas.Series) and numpy.issubdtype(index.dtype, numpy.bool_): # Boolean series can actually be used in loc or iloc, but let's assume it's only iloc for simplicity self.type = IndexType.POSITION elif numpy.issubdtype(type(index), numpy.character): @@ -116,13 +123,60 @@ def for_message(self) -> Optional[str]: return "{} {}".format(prefix, idx) + @staticmethod + def invert_index(index: IndexValue): + if isinstance(index, slice) and index.start is None and index.stop is None: + # If this is a None slice, it would previously return everything, so make it return nothing + return [] + elif isinstance(index, list) and len(index) == 0: + # If this is an empty list, it would previously return nothing, so make it return everything + return slice(None) + elif isinstance(index, pandas.Series) and numpy.issubdtype(index.dtype, numpy.bool_): + # Boolean series have a built-in inversion + return ~index + # elif numpy.issubdtype(type(index), numpy.int_): + # # Index series can't be inverted without knowing the original DF + else: + raise PanSchIndexError('Uninvertible type') + + def __invert__(self) -> 'AxisIndexer': 
+ """ + Returns an index that is inverted (will return the opposite of what was previously specified) + """ + return AxisIndexer( + index=self.invert_index(self.index), + typ=self.type, + axis=self.axis, + ) + + +class SubIndexerMeta(type): + def __init__(cls, *args, axis: int, **kwargs): + super().__init__(*args) + cls.axis = axis + + def __new__(metacls, name, bases, namespace, **kargs): + return super().__new__(metacls, name, bases, namespace) + + @classmethod + def __prepare__(metacls, name, bases, **kwargs): + return super().__prepare__(name, bases, **kwargs) + + def __instancecheck__(self, instance): + # Any AxisIndexer can be considered a ColumnIndexer if it has axis 0 + result = super().__instancecheck__(instance) + if not result and isinstance(instance, AxisIndexer) and instance.axis == self.axis: + return True + else: + return result + -class RowIndexer(AxisIndexer): +class RowIndexer(AxisIndexer, axis=0, metaclass=SubIndexerMeta): def __init__(self, index: IndexValue, typ: IndexType = None): super().__init__(index=index, typ=typ, axis=0) -class ColumnIndexer(AxisIndexer): +class ColumnIndexer(AxisIndexer, axis=1, metaclass=SubIndexerMeta): def __init__(self, index: IndexValue, typ: IndexType = None): super().__init__(index=index, typ=typ, axis=1) @@ -149,3 +203,20 @@ def __init__(self, row_index: Union[RowIndexer, IndexValue], col_index: Union[Co def __call__(self, df: pandas.DataFrame): return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] + + def invert(self, axis) -> 'AxisIndexer': + """ + Returns an index that is inverted along the given axis. e.g. if axis=0, the column index stays the same, but + all row indices are inverted. 
+ """ + if axis == 0: + return DualAxisIndexer( + row_index=~self.row_index, + col_index=self.col_index + ) + + elif axis == 1: + return DualAxisIndexer( + row_index=self.row_index, + col_index=~self.col_index + ) diff --git a/pandas_schema/scope.py b/pandas_schema/scope.py new file mode 100644 index 0000000..e3e3eeb --- /dev/null +++ b/pandas_schema/scope.py @@ -0,0 +1,12 @@ +import enum + +class ValidationScope(enum.Enum): + """ + Defines the scope of a validation, ie DATA_FRAME scope means this validation validates the entire DataFrame is + valid or invalid, SERIES means each series can be valid/invalid, and CELL means each index anywhere in the frame + can be valid/invalid + """ + DATA_FRAME = 0 + SERIES = 1 + CELL = 2 + diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index e6e3ddd..e4b62a5 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -1,6 +1,5 @@ from dataclasses import dataclass, field - @dataclass class ValidationWarning: """ @@ -19,6 +18,10 @@ class ValidationWarning: include a `value` key, for storing what the actual value was """ + def __init__(self, validation, **props): + self.validation = validation + self.props = props + @property def message(self): """ diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index f482c07..0201f45 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -12,6 +12,7 @@ from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype +from pandas_schema.scope import ValidationScope class CustomSeriesValidation(SeriesValidation): @@ -82,7 +83,7 @@ def validate_series(self, series: pd.Series) -> pd.Series: return (series >= self.min) & (series < self.max) -class IsDtypeValidation(SeriesValidation): +class IsDtypeValidation(SeriesValidation, scope=ValidationScope.SERIES): """ Checks that a series has a 
certain numpy dtype """ diff --git a/test/test_validation.py b/test/test_validation.py index 020736a..8b7f322 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -329,19 +329,15 @@ def test_valid_strings(self): ])) == 0, 'does not accept unique strings' def test_invalid_strings(self): - validation = get_warnings(self.validator, [ + warnings = get_warnings(self.validator, [ '1', '1', '3', '4' ]) - self.assertTrue((validation == pd.Series([ - True, - False, - True, - True - ])).all(), 'did not identify the error') + assert len(warnings) == 1 + assert warnings[0].props['row'] == 1, 'did not identify the error' class CompiledRegexMatch(ValidationTestBase): @@ -411,22 +407,20 @@ def setUp(self): self.validator = IsDtypeValidation(np.number, index=0) def test_valid_items(self): - errors = self.validator.validate_series(pd.Series( - [ - 1, - 2, - 3 - ])) + errors = get_warnings(self.validator, pd.Series([ + 1, + 2, + 3 + ], dtype=np.int_)) self.assertEqual(len(errors), 0) def test_invalid_items(self): - errors = self.validator.validate_series(pd.Series( - [ - 'a', - '', - 'c' - ])) + errors = get_warnings(self.validator, [ + 'a', + '', + 'c' + ]) self.assertEqual(len(errors), 1) self.assertEqual(type(errors[0]), ValidationWarning) @@ -535,7 +529,7 @@ def test_custom_message(self): 1, 2, 3 - ] ): + ]): self.assertRegex(error.message, self.message, 'Validator not using the custom warning message!') From 8fe1c9057fa60545746e45a780241725d4375e8b Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 2 Apr 2020 01:08:57 +1100 Subject: [PATCH 18/31] Implement negation --- TODO.md | 1 + pandas_schema/core.py | 82 ++++++++++++++++++++++++++++-------- pandas_schema/validations.py | 7 +++ test/test_validation.py | 10 +---- 4 files changed, 75 insertions(+), 25 deletions(-) diff --git a/TODO.md b/TODO.md index 5ec84cd..add03b7 100755 --- a/TODO.md +++ b/TODO.md @@ -8,3 +8,4 @@ * [x] Fix messages * [x] Re-implement the or/and using operators * [ ] Allow and/or 
operators between Series-level and row-level validations +* [ ] Separate ValidationClasses for each scope diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 9636da3..6fe8a58 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -9,6 +9,7 @@ import re from dataclasses import dataclass import enum +import copy from . import column from .errors import PanSchArgumentError, PanSchNoIndexError @@ -22,21 +23,50 @@ Anything that an indexer could return from a DataFrame """ + class BaseValidation(abc.ABC): """ A validation is, broadly, just a function that maps a data frame to a list of errors """ - def __init_subclass__(cls, scope: ValidationScope=ValidationScope.CELL, **kwargs): + def __init_subclass__(cls, scope: ValidationScope = ValidationScope.CELL, **kwargs): cls.scope = scope - def __init__(self, message: str = None): + def __init__(self, message: str = None, negated: bool = False): """ Creates a new IndexSeriesValidation :param index: An index with which to select the series Otherwise it's a label (ie, index=0) indicates the column with the label of 0 """ self.custom_message = message + self.negated = negated + + def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: + """ + Creates a DF-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self) + + def make_series_warning(self, df: pd.DataFrame, column: str, series: pd.Series) -> ValidationWarning: + """ + Creates a series-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self, column=column) + + def make_cell_warning(self, df: pd.DataFrame, column: str, row: int, value, + series: pd.Series = None) -> ValidationWarning: + """ + Creates a cell-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self, column=column, row=row, value=value) + + def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: + """ + Can be implemented by sub-classes to provide negation behaviour. 
If implemented, this should return a new + indexer that returns the opposite of what it normally would. The definition of opposite may vary from validation + to validation + """ + raise NotImplementedError() def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: """ @@ -45,6 +75,8 @@ def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: :return: All validation failures detected by this validation """ index = self.get_failed_index(df) + if self.negated: + index = self.apply_negation(index) failed = index(df) # If it's am empty series/frame then this produced no warnings @@ -55,27 +87,27 @@ def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: warnings = [] if isinstance(failed, pd.DataFrame): if self.scope == ValidationScope.DATA_FRAME: - warnings.append(ValidationWarning(self)) + warnings.append(self.make_df_warning(df)) elif self.scope == ValidationScope.SERIES: - for column in failed.columns: - warnings.append(ValidationWarning(self, column=column)) + for column, series in failed.iteritems(): + warnings.append(self.make_series_warning(df=df, column=column, series=series)) elif self.scope == ValidationScope.CELL: - for column in failed.columns: + for column, series in failed.iteritems(): for row, value in df[column].iteritems(): - warnings.append(ValidationWarning(self, column=column, row=row, value=value)) + warnings.append( + self.make_cell_warning(df=df, column=column, series=series, row=row, value=value)) elif isinstance(failed, pd.Series): if self.scope == ValidationScope.SERIES: - warnings.append(ValidationWarning(self, column=index.col_index.index)) + warnings.append(self.make_series_warning(df=df, column=index.col_index.index, series=failed)) elif self.scope == ValidationScope.CELL: for row, value in failed.iteritems(): - warnings.append(ValidationWarning(self, column=index.col_index.index, row=row, value=value)) + warnings.append( + self.make_cell_warning(df=df, 
column=index.col_index.index, series=failed, row=row, + value=value)) else: - warnings.append(ValidationWarning( - self, - column=index.col_index.index, - row=index.row_index.index, - value=failed - )) + warnings.append( + self.make_cell_warning(df=df, column=index.col_index.index, row=index.row_index.index, value=failed) + ) return warnings @@ -122,6 +154,14 @@ def __or__(self, other: 'BaseValidation'): return CombinedValidation(self, other, operator='or') + def __invert__(self): + """ + Return a copy of this, except that it will return indices of those that would normally pass this validation, + in the same series + """ + clone = copy.copy(self) + clone.negated = True + return clone class IndexValidation(BaseValidation): def __init__( @@ -203,7 +243,7 @@ class SeriesValidation(IndexValidation): any particular functionality. """ - def __init__(self, index, *args, negated: bool = False, **kwargs): + def __init__(self, index, *args, **kwargs): super().__init__( *args, index=DualAxisIndexer( @@ -212,7 +252,14 @@ def __init__(self, index, *args, negated: bool = False, **kwargs): ), **kwargs ) - self.negated = negated + + def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: + """ + When a SeriesValidation is negated, it means that we should invert only the row indices returned by the + validation. This makes the validation return warnings from the same subset of the DataFrame, but makes cells + pass if they would fail, and fail if they would pass + """ + return index.invert(axis=0) def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ @@ -244,6 +291,7 @@ def validate_series(self, series: pd.Series) -> IndexValue: """ + # class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): # """ # Validation is defined by the function :py:meth:~select_cells that returns a boolean series. 
diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 0201f45..1e362d3 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -99,6 +99,13 @@ def default_message(self, warning: ValidationWarning) -> str: return 'has a dtype of {} which is not a subclass of the required type {}'.format( self.dtype, warning.props['dtype']) + def make_series_warning(self, df: pd.DataFrame, column: str, series: pd.Series) -> ValidationWarning: + return ValidationWarning( + self, + column=column, + dtype=series.dtype + ) + def validate_series(self, series: pd.Series): if np.issubdtype(series.dtype, self.dtype): return True diff --git a/test/test_validation.py b/test/test_validation.py index 8b7f322..cfa6462 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -444,14 +444,8 @@ def test_schema(self): errors = schema.validate(df) - self.assertEqual( - [x.props for x in errors], - [ - {'dtype': np.object}, - {'dtype': np.int64}, - {'dtype': np.float64}, - ] - ) + for error, correct_dtype in zip(errors, [np.object, np.int64, np.float64]): + assert error.props['dtype'] == correct_dtype class Negate(ValidationTestBase): From bc6d0deb706637f31cfdc65784ba2ab81d70987c Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 5 Apr 2020 02:01:18 +1100 Subject: [PATCH 19/31] First attempt at combined validations --- pandas_schema/core.py | 156 +++++++++++++++++++++++++++++++---------- pandas_schema/scope.py | 1 + 2 files changed, 119 insertions(+), 38 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 6fe8a58..c88f13b 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -6,7 +6,6 @@ import numpy as np import typing import operator -import re from dataclasses import dataclass import enum import copy @@ -14,7 +13,8 @@ from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning -from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, DualAxisIndexer +from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, \ + DualAxisIndexer from pandas_schema.scope import ValidationScope from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -47,7 +47,8 @@ def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: """ return ValidationWarning(self) - def make_series_warning(self, df: pd.DataFrame, column: str, series: pd.Series) -> ValidationWarning: + def make_series_warning(self, df: pd.DataFrame, column: str, + series: pd.Series) -> ValidationWarning: """ Creates a series-scope warning. Can be overridden by child classes """ @@ -68,48 +69,88 @@ def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: """ raise NotImplementedError() - def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: - """ - Validates a data frame - :param df: Data frame to validate - :return: All validation failures detected by this validation - """ - index = self.get_failed_index(df) - if self.negated: - index = self.apply_negation(index) - failed = index(df) - + def index_to_warnings_series(self, df: pd.DataFrame, failed: DualAxisIndexer): # If it's am empty series/frame then this produced no warnings if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: return [] # Depending on the scope, we produce the lists of warnings in different ways - warnings = [] if isinstance(failed, pd.DataFrame): if self.scope == ValidationScope.DATA_FRAME: - warnings.append(self.make_df_warning(df)) + return [self.make_df_warning(df)] elif self.scope == ValidationScope.SERIES: - for column, series in failed.iteritems(): - warnings.append(self.make_series_warning(df=df, column=column, series=series)) + return df.apply(lambda series: 
self.make_series_warning( + df=df, + column=series.name, + series=series + ), axis=0) elif self.scope == ValidationScope.CELL: - for column, series in failed.iteritems(): - for row, value in df[column].iteritems(): - warnings.append( - self.make_cell_warning(df=df, column=column, series=series, row=row, value=value)) + return df.apply(lambda series: series.to_frame().apply( + lambda cell: self.make_cell_warning( + df=df, + column=series.name, + series=series, + row=cell.name, + value=cell + ))) elif isinstance(failed, pd.Series): if self.scope == ValidationScope.SERIES: - warnings.append(self.make_series_warning(df=df, column=index.col_index.index, series=failed)) + return [self.make_series_warning( + df=df, + column=index.col_index.index, + series=failed + )] elif self.scope == ValidationScope.CELL: - for row, value in failed.iteritems(): - warnings.append( - self.make_cell_warning(df=df, column=index.col_index.index, series=failed, row=row, - value=value)) + return failed.to_frame().apply(lambda cell: self.make_cell_warning( + df=df, + column=failed.name, + series=failed, + row=cell.name, + value=cell[0] + ), axis=1) else: - warnings.append( - self.make_cell_warning(df=df, column=index.col_index.index, row=index.row_index.index, value=failed) - ) + return [self.make_cell_warning( + df=df, + column=index.col_index.index, + row=index.row_index.index, + value=failed) + ] + + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: + """ + Converts an index into a series of warnings each corresponding to an issue + with the DataFrame at the same index. 
+ """ + index = self.get_failed_index(df) + if self.negated: + index = self.apply_negation(index) + failed = index(df) + + return self.index_to_warnings_series(df, failed) + + @staticmethod + def to_warning_list(failed): + if isinstance(failed, pd.DataFrame): + return failed.to_numpy().tolist() + elif isinstance(failed, pd.Series): + return failed.tolist() + else: + return failed + + def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: + """ + Validates a data frame + :param df: Data frame to validate + :return: All validation failures detected by this validation + """ + # index = self.get_failed_index(df) + # if self.negated: + # index = self.apply_negation(index) + # failed = index(df) + + failed = self.get_warnings_series(df) + return self.to_warning_list(failed) - return warnings @abc.abstractmethod def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: @@ -163,6 +204,7 @@ def __invert__(self): clone.negated = True return clone + class IndexValidation(BaseValidation): def __init__( self, @@ -291,7 +333,6 @@ def validate_series(self, series: pd.Series) -> IndexValue: """ - # class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): # """ # Validation is defined by the function :py:meth:~select_cells that returns a boolean series. 
@@ -372,15 +413,50 @@ class CombinedValidation(BaseValidation): Validates if one and/or the other validation is true for an element """ - def message(self, warning: ValidationWarning) -> str: - pass - - def __init__(self, validation_a: BaseValidation, - validation_b: BaseValidation, operator: str): + def __init__( + self, + validation_a: BaseValidation, + validation_b: BaseValidation, + operator: typing.Callable, + axis='rows' + ): super().__init__() self.operator = operator self.left = validation_a self.right = validation_b + self.axis = 1 + + def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: + pass + + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + left_failed = self.left.get_failed_index(df) + right_failed = self.right.get_failed_index(df) + + if self.axis == 'rows': + assert left_failed.col_index == right_failed.col_index + assert isinstance(left_failed.col_index.index, pd.Series) + return DualAxisIndexer( + row_index=self.operator(left_failed.row_index, right_failed.row_index), + col_index=left_failed.col_index + ) + + elif self.axis == 'columns': + assert left_failed.row_index == right_failed.row_index + assert isinstance(left_failed.row_index.index, pd.Series) + return DualAxisIndexer( + row_index=left_failed.row_index, + col_index=self.operator(left_failed.col_index, right_failed.col_index) + ) + + else: + raise Exception() + + def prefix(self, warning: ValidationWarning): + pass + + def message(self, warning: ValidationWarning) -> str: + pass def get_warning_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column @@ -409,7 +485,11 @@ def get_warning_series(self, df: pd.DataFrame) -> pd.Series: else: raise Exception('Operator must be "and" or "or"') - return combined + warnings = ( + self.index_to_warnings_series(df, left_errors) + + self.index_to_warnings_series(df, right_errors) + ) + return combined(warnings) def default_message(self, warnings: ValidationWarning) -> 
str: return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/pandas_schema/scope.py b/pandas_schema/scope.py index e3e3eeb..ccc66b6 100644 --- a/pandas_schema/scope.py +++ b/pandas_schema/scope.py @@ -9,4 +9,5 @@ class ValidationScope(enum.Enum): DATA_FRAME = 0 SERIES = 1 CELL = 2 + ROW = 3 From f3dee89cf20917d82ae77f39b032d25a9205a8a3 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Wed, 8 Apr 2020 00:24:54 +1000 Subject: [PATCH 20/31] Update --- pandas_schema/core.py | 116 ++++++++++++++++++++++++++-------------- pandas_schema/index.py | 4 +- test/test_validation.py | 11 ++-- 3 files changed, 87 insertions(+), 44 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index c88f13b..9d8af9e 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -69,7 +69,8 @@ def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: """ raise NotImplementedError() - def index_to_warnings_series(self, df: pd.DataFrame, failed: DualAxisIndexer): + def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed): + # If it's am empty series/frame then this produced no warnings if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: return [] @@ -97,7 +98,7 @@ def index_to_warnings_series(self, df: pd.DataFrame, failed: DualAxisIndexer): if self.scope == ValidationScope.SERIES: return [self.make_series_warning( df=df, - column=index.col_index.index, + column=failed.col_index.index, series=failed )] elif self.scope == ValidationScope.CELL: @@ -111,8 +112,8 @@ def index_to_warnings_series(self, df: pd.DataFrame, failed: DualAxisIndexer): else: return [self.make_cell_warning( df=df, - column=index.col_index.index, - row=index.row_index.index, + column=self.index.col_index.index, + row=self.index.row_index.index, value=failed) ] @@ -126,7 +127,7 @@ def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: index = self.apply_negation(index) failed = index(df) - return 
self.index_to_warnings_series(df, failed) + return self.index_to_warnings_series(df, index, failed) @staticmethod def to_warning_list(failed): @@ -151,7 +152,6 @@ def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: failed = self.get_warnings_series(df) return self.to_warning_list(failed) - @abc.abstractmethod def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: """ @@ -193,7 +193,7 @@ def __or__(self, other: 'BaseValidation'): 'Validations that subclass {}'.format( self.__class__)) - return CombinedValidation(self, other, operator='or') + return CombinedValidation(self, other, operator=operator.or_) def __invert__(self): """ @@ -424,7 +424,7 @@ def __init__( self.operator = operator self.left = validation_a self.right = validation_b - self.axis = 1 + self.axis = axis def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: pass @@ -435,18 +435,24 @@ def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: if self.axis == 'rows': assert left_failed.col_index == right_failed.col_index - assert isinstance(left_failed.col_index.index, pd.Series) + assert isinstance(left_failed.row_index.index, pd.Series) return DualAxisIndexer( - row_index=self.operator(left_failed.row_index, right_failed.row_index), + row_index=self.operator( + left_failed.row_index.index, + right_failed.row_index.index + ), col_index=left_failed.col_index ) elif self.axis == 'columns': assert left_failed.row_index == right_failed.row_index - assert isinstance(left_failed.row_index.index, pd.Series) + assert isinstance(left_failed.col_index.index, pd.Series) return DualAxisIndexer( row_index=left_failed.row_index, - col_index=self.operator(left_failed.col_index, right_failed.col_index) + col_index=self.operator( + left_failed.col_index.index, + right_failed.col_index.index + ) ) else: @@ -458,38 +464,70 @@ def prefix(self, warning: ValidationWarning): def message(self, warning: ValidationWarning) -> str: pass - def get_warning_series(self, df: 
pd.DataFrame) -> pd.Series: + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column - left_errors = self.left.validate(df) - right_errors = self.right.validate(df) - - if self.operator == 'and': - # If it's an "and" validation, left, right, or both failing means an error, - # so we can simply concatenate the lists of errors - combined = left_errors.combine( - right_errors, - func=operator.add, - fill_value=[] + left_index = self.left.get_failed_index(df) + right_index = self.right.get_failed_index(df) + + if self.axis == 'rows': + assert left_index.col_index == right_index.col_index + assert isinstance(left_index.row_index.index, pd.Series) + combined = DualAxisIndexer( + row_index=self.operator( + left_index.row_index.index, + right_index.row_index.index + ), + col_index=left_index.col_index ) - elif self.operator == 'or': - # [error] and [] = [] - # [error_1] and [error_2] = [error_2] - # [] and [] = [] - # Thus, we can use the and operator to implement "or" validations - combined = left_errors.combine( - right_errors, - func=lambda l, r: l + r if l and r else [], - fill_value=[] + + elif self.axis == 'columns': + assert left_index.row_index == right_index.row_index + assert isinstance(left_index.col_index.index, pd.Series) + combined = DualAxisIndexer( + row_index=left_index.row_index, + col_index=self.operator( + left_index.col_index.index, + right_index.col_index.index + ) ) - # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + else: - raise Exception('Operator must be "and" or "or"') + raise Exception() - warnings = ( - self.index_to_warnings_series(df, left_errors) + - self.index_to_warnings_series(df, right_errors) - ) - return combined(warnings) + # if self.operator == 'and': + # # If it's an "and" validation, left, right, or both failing means an error, + # # so we can simply concatenate the lists of errors + # combined = left_errors.combine( + # right_errors, + # 
func=operator.add, + # fill_value=[] + # ) + # elif self.operator == 'or': + # # [error] and [] = [] + # # [error_1] and [error_2] = [error_2] + # # [] and [] = [] + # # Thus, we can use the and operator to implement "or" validations + # combined = left_errors.combine( + # right_errors, + # func=lambda l, r: l + r if l and r else [], + # fill_value=[] + # ) + # # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) + # else: + # raise Exception('Operator must be "and" or "or"') + # + left_failed = left_index(df) + right_failed = right_index(df) + + warnings = pd.concat([ + self.left.index_to_warnings_series(df, left_index, left_failed), + self.right.index_to_warnings_series(df, right_index, right_failed) + ])#, join='inner', keys=['inner', 'outer']) + + if self.axis == 'rows': + return warnings[combined.row_index.index] + else: + return warnings[combined.col_index.index] def default_message(self, warnings: ValidationWarning) -> str: return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 4ba251c..70e2ee8 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -17,6 +17,7 @@ class IndexType(Enum): POSITION = 0 LABEL = 1 +@dataclass(init=False) class AxisIndexer: """ An index into a particular axis of a DataFrame. 
Attempts to recreate the behaviour of `df.ix[some_index]` @@ -181,7 +182,7 @@ def __init__(self, index: IndexValue, typ: IndexType = None): super().__init__(index=index, typ=typ, axis=1) -@dataclass +@dataclass(init=False) class DualAxisIndexer: """ Completely specifies some subset of a DataFrame, using both axes @@ -201,6 +202,7 @@ def __init__(self, row_index: Union[RowIndexer, IndexValue], col_index: Union[Co else: self.col_index = ColumnIndexer(index=col_index) + def __call__(self, df: pandas.DataFrame): return df.loc[self.row_index.for_loc(df), self.col_index.for_loc(df)] diff --git a/test/test_validation.py b/test/test_validation.py index cfa6462..2e2aeb5 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -484,19 +484,22 @@ def setUp(self): ) def test_valid_items(self): - assert len(get_warnings(self.validator, [ + warnings = get_warnings(self.validator, [ 'pass', 'yes', 'passyes', '345yes345' - ])) == 0, 'rejects values that should pass' + ]) + assert len(warnings) == 0, 'rejects values that should pass' def test_invalid_items(self): - assert len(get_warnings(self.validator, [ + warnings = get_warnings(self.validator, [ 'fail', 'YES', 'YPESS' - ])) == 6, 'accepts values that should pass' + ]) + + assert len(warnings) == 6, 'accepts values that should pass' class CustomMessage(ValidationTestBase): From b0105ca0bd53a1310a8c5856c3e4730825764cec Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Wed, 8 Apr 2020 01:21:53 +1000 Subject: [PATCH 21/31] All tests passing --- pandas_schema/core.py | 125 +++++++++++++++-------------------- pandas_schema/validations.py | 2 +- test/test_validation.py | 52 +++++++-------- 3 files changed, 80 insertions(+), 99 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 9d8af9e..1ca22da 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -98,13 +98,13 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai if self.scope == 
ValidationScope.SERIES: return [self.make_series_warning( df=df, - column=failed.col_index.index, + column=index.col_index.index, series=failed )] elif self.scope == ValidationScope.CELL: return failed.to_frame().apply(lambda cell: self.make_cell_warning( df=df, - column=failed.name, + column=index.col_index.index, series=failed, row=cell.name, value=cell[0] @@ -153,12 +153,19 @@ def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: return self.to_warning_list(failed) @abc.abstractmethod - def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: """ Returns an indexer object that fully specifies which sections of the DataFrame this validation believes are invalid (both row and column-wise) """ + @abc.abstractmethod + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + """ + Returns an indexer object that fully specifies which sections of the DataFrame this validation believes are + valid (both row and column-wise) + """ + def message(self, warning: ValidationWarning) -> str: prefix = self.prefix(warning) @@ -245,11 +252,12 @@ def prefix(self, warning: ValidationWarning): return ' '.join(ret) - def get_failed_index(self, df) -> DualAxisIndexer: + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: selection = self.apply_index(df) - # We invert here because the validation returns True for indices that pass the validation, but we want to - # select indices that fail - return self.validate_selection(selection).invert(axis=0) + return self.validate_selection(selection) + + def get_failed_index(self, df) -> DualAxisIndexer: + return self.get_passed_index(df).invert(axis=0) # Normally, validate_series returns the indices of the cells that passed the validation, but here we want the # cells that failed it, so invert the series (unless this is a negated validation) @@ -428,36 +436,48 @@ def __init__( def apply_negation(self, index: DualAxisIndexer) 
-> DualAxisIndexer: pass - - def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: - left_failed = self.left.get_failed_index(df) - right_failed = self.right.get_failed_index(df) - + + def combine_indices(self, left: DualAxisIndexer, right: DualAxisIndexer) -> DualAxisIndexer: + """ + Utility method for combining the indexers using boolean logic + :param left: + :param right: + :return: + """ + # TODO: convert axis into an integer and apply proper panas logic if self.axis == 'rows': - assert left_failed.col_index == right_failed.col_index - assert isinstance(left_failed.row_index.index, pd.Series) + assert left.col_index == right.col_index + assert isinstance(left.row_index.index, pd.Series) return DualAxisIndexer( row_index=self.operator( - left_failed.row_index.index, - right_failed.row_index.index + left.row_index.index, + right.row_index.index ), - col_index=left_failed.col_index + col_index=left.col_index ) elif self.axis == 'columns': - assert left_failed.row_index == right_failed.row_index - assert isinstance(left_failed.col_index.index, pd.Series) + assert left.row_index == right.row_index + assert isinstance(left.col_index.index, pd.Series) return DualAxisIndexer( - row_index=left_failed.row_index, + row_index=left.row_index, col_index=self.operator( - left_failed.col_index.index, - right_failed.col_index.index + left.col_index.index, + right.col_index.index ) ) else: raise Exception() + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + left_passed = self.left.get_passed_index(df) + right_passed = self.right.get_passed_index(df) + return self.combine_indices(left_passed, right_passed) + + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + return self.get_passed_index(df).invert(self.axis) + def prefix(self, warning: ValidationWarning): pass @@ -466,64 +486,25 @@ def message(self, warning: ValidationWarning) -> str: def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately 
select and filter a column - left_index = self.left.get_failed_index(df) - right_index = self.right.get_failed_index(df) + left_index = self.left.get_passed_index(df) + right_index = self.right.get_passed_index(df) - if self.axis == 'rows': - assert left_index.col_index == right_index.col_index - assert isinstance(left_index.row_index.index, pd.Series) - combined = DualAxisIndexer( - row_index=self.operator( - left_index.row_index.index, - right_index.row_index.index - ), - col_index=left_index.col_index - ) + # Combine them with boolean logic + # We have to invert the combined index because left and right are *passed* indices not failed ones + combined = self.combine_indices(left_index, right_index).invert(axis=0) - elif self.axis == 'columns': - assert left_index.row_index == right_index.row_index - assert isinstance(left_index.col_index.index, pd.Series) - combined = DualAxisIndexer( - row_index=left_index.row_index, - col_index=self.operator( - left_index.col_index.index, - right_index.col_index.index - ) - ) - - else: - raise Exception() - - # if self.operator == 'and': - # # If it's an "and" validation, left, right, or both failing means an error, - # # so we can simply concatenate the lists of errors - # combined = left_errors.combine( - # right_errors, - # func=operator.add, - # fill_value=[] - # ) - # elif self.operator == 'or': - # # [error] and [] = [] - # # [error_1] and [error_2] = [error_2] - # # [] and [] = [] - # # Thus, we can use the and operator to implement "or" validations - # combined = left_errors.combine( - # right_errors, - # func=lambda l, r: l + r if l and r else [], - # fill_value=[] - # ) - # # func=lambda a, b: [] if len(a) == 0 or len(b) == 0 else a + b) - # else: - # raise Exception('Operator must be "and" or "or"') - # - left_failed = left_index(df) - right_failed = right_index(df) + # Slice out the failed data + # We have to invert these because left_index and right_index are passed indices + left_failed = 
left_index.invert(axis=0)(df) + right_failed = right_index.invert(axis=0)(df) + # Convert the data into warnings, and then join together the warnings from both validations warnings = pd.concat([ self.left.index_to_warnings_series(df, left_index, left_failed), self.right.index_to_warnings_series(df, right_index, right_failed) ])#, join='inner', keys=['inner', 'outer']) + # Finally, apply the combined index from above to the warnings series if self.axis == 'rows': return warnings[combined.row_index.index] else: diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 1914e1d..eac6f71 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -78,7 +78,7 @@ def default_message(self, warning: ValidationWarning): return 'was not in the range [{}, {})'.format(self.min, self.max) def validate_series(self, series: pd.Series) -> pd.Series: - series = pd.to_numeric(series) + series = pd.to_numeric(series, errors="coerce") return (series >= self.min) & (series < self.max) diff --git a/test/test_validation.py b/test/test_validation.py index b9ef11f..00e93d4 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -17,7 +17,7 @@ from pandas_schema import ValidationWarning -def get_warnings(validator: BaseValidation, series: list) -> typing.Collection[ +def get_warnings(validator: BaseValidation, series: typing.Union[list, pd.Series]) -> typing.Collection[ ValidationWarning]: """ Tests a validator by asserting that it generates the amount of warnings @@ -398,25 +398,17 @@ def test_invalid_items(self): ])) == 3, 'Incorrectly accepts integers outside of the range' def test_valid_character_items(self): - self.validate_and_compare( - [ - 7, - "8", - 8 - ], - True, - "Does not accept integers provided as a string" - ) + assert len(get_warnings(self.validator, [ + 7, + "8", + 8 + ])) == 0, "Does not accept integers provided as a string" def test_invalid_character_items(self): - self.validate_and_compare( - [ - "seven", - "eight", 
- ], - False, - "Incorrectly accepts items with non numerical text" - ) + assert len(get_warnings(self.validator, [ + "seven", + "eight", + ])) == 2, "Incorrectly accepts items with non numerical text" class Dtype(ValidationTestBase): @@ -586,21 +578,29 @@ def setUp(self): index=0) def test_valid_elements(self): - errors = self.validator.validate_series( - pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category')) + errors = get_warnings( + self.validator, + pd.Series(['a', 'b', 'c', 'A', 'B', 'C'], dtype='category') + ) assert len(list(errors)) == 0 def test_invalid_empty_elements(self): - errors = self.validator.validate_series( - pd.Series(['aa', 'bb', 'd', None], dtype='category')) + errors = get_warnings( + self.validator, + pd.Series(['aa', 'bb', 'd', None], dtype='category') + ) assert len(list(errors)) == 4 def test_invalid_and_empty_elements(self): - errors = self.validator.validate_series( - pd.Series(['a', None], dtype='category')) + errors = get_warnings( + self.validator, + pd.Series(['a', None], dtype='category') + ) assert len(list(errors)) == 1 def test_invalid_elements(self): - errors = self.validator.validate_series( - pd.Series(['aa', 'bb', 'd'], dtype='category')) + errors = get_warnings( + self.validator, + pd.Series(['aa', 'bb', 'd'], dtype='category') + ) assert len(list(errors)) == 3 From 53067283a92ae7a0cfadda76ff7c84d7bf82d91e Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 13 Apr 2020 23:52:28 +1000 Subject: [PATCH 22/31] Restructure test --- pandas_schema/core.py | 10 ++++++- test/__init__.py | 2 +- test/test_schema.py | 20 +++++++------- test/test_validation.py | 58 +---------------------------------------- 4 files changed, 21 insertions(+), 69 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 1ca22da..a74308d 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -202,6 +202,14 @@ def __or__(self, other: 'BaseValidation'): return CombinedValidation(self, other, operator=operator.or_) + 
def __and__(self, other: 'BaseValidation'): + if not isinstance(other, BaseValidation): + raise PanSchArgumentError('The "|" operator can only be used between two' + 'Validations that subclass {}'.format( + self.__class__)) + + return CombinedValidation(self, other, operator=operator.and_) + def __invert__(self): """ Return a copy of this, except that it will return indices of those that would normally pass this validation, @@ -425,7 +433,7 @@ def __init__( self, validation_a: BaseValidation, validation_b: BaseValidation, - operator: typing.Callable, + operator: typing.Callable[[pd.Series, pd.Series], pd.Series], axis='rows' ): super().__init__() diff --git a/test/__init__.py b/test/__init__.py index 27a4caa..d59a7f6 100755 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,5 +1,5 @@ from test.test_example import * -from test.test_column import * +# from test.test_column import * from test.test_schema import * from test.test_validation import * from test.test_validation_warning import * diff --git a/test/test_schema.py b/test/test_schema.py index ae7b337..4461c62 100755 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -3,16 +3,16 @@ import pandas as pd from numpy.core.multiarray import dtype -from pandas_schema import Schema, Column -from pandas_schema.validation import LeadingWhitespaceValidation, IsDtypeValidation +from pandas_schema.schema import Schema +from pandas_schema.validations import LeadingWhitespaceValidation, IsDtypeValidation from pandas_schema.errors import PanSchArgumentError class UnorderedSchema(unittest.TestCase): - schema = Schema([ - Column('a'), - Column('b', [LeadingWhitespaceValidation()]) - ], ordered=False) + # schema = Schema([ + # Column('a'), + # Column('b', [LeadingWhitespaceValidation()]) + # ], ordered=False) def test_fields(self): self.assertEqual(len(self.schema.columns), 2, 'The schema is not storing all of its columns') @@ -138,10 +138,10 @@ def test_column_subset_error(self): class OrderedSchema(unittest.TestCase): - 
schema = Schema([ - Column('a', [LeadingWhitespaceValidation()]), - Column('b') - ], ordered=True) + # schema = Schema([ + # Column('a', [LeadingWhitespaceValidation()]), + # Column('b') + # ], ordered=True) def test_mixed_columns(self): """ diff --git a/test/test_validation.py b/test/test_validation.py index 00e93d4..e618b27 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -16,32 +16,7 @@ from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning - -def get_warnings(validator: BaseValidation, series: typing.Union[list, pd.Series]) -> typing.Collection[ - ValidationWarning]: - """ - Tests a validator by asserting that it generates the amount of warnings - :param series_dtype: Explicitly specifies the dtype for the generated Series - :param series: The series to check - :param expected_result: Whether the elements in this series should pass the validation - :param msg: The message to display if this test fails - """ - - # # Check that self.validator is correct - # if not self.validator or not isinstance(self.validator, BooleanSeriesValidation, index=0): - # raise ValueError('The class must have the validator field set to an instance of a Validation subclass') - # - # # Ensure we're comparing series correctly - # self.addTypeEqualityFunc(pd.Series, self.seriesEquality) - - df = pd.Series(series).to_frame() - warnings = validator.validate(df) - return list(warnings) - # - # # Now find any items where their validation does not correspond to the expected_result - # for item, result in zip(series, results): - # with self.subTest(value=item): - # self.assertEqual(result, expected_result, msg) +from .util import get_warnings class ValidationTestBase(unittest.TestCase): @@ -483,37 +458,6 @@ def test_invalid_items(self): ])) == 3, 'Accepts values that should pass' -class Or(ValidationTestBase): - """ - Tests the | operator on two MatchesPatternValidations - """ - - def setUp(self): - self.validator = 
MatchesPatternValidation( - 'yes', index=0 - ) | MatchesPatternValidation( - 'pass', index=0 - ) - - def test_valid_items(self): - warnings = get_warnings(self.validator, [ - 'pass', - 'yes', - 'passyes', - '345yes345' - ]) - assert len(warnings) == 0, 'rejects values that should pass' - - def test_invalid_items(self): - warnings = get_warnings(self.validator, [ - 'fail', - 'YES', - 'YPESS' - ]) - - assert len(warnings) == 6, 'accepts values that should pass' - - class CustomMessage(ValidationTestBase): """ Tests that custom error messages work as expected From a8fa041a05d53cea97c1fd24a2a1d78e6e96a510 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 13 Apr 2020 23:52:49 +1000 Subject: [PATCH 23/31] Restructure tests --- test/test_combined_validation.py | 54 ++++++++++++++++++++++++++++++++ test/util.py | 32 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 test/test_combined_validation.py create mode 100644 test/util.py diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py new file mode 100644 index 0000000..9f7094b --- /dev/null +++ b/test/test_combined_validation.py @@ -0,0 +1,54 @@ +import json +import unittest +import re + +from numpy import nan, dtype +import numpy as np +import pandas as pd + +from pandas_schema.validations import * +from pandas_schema.core import CombinedValidation, BaseValidation +from pandas_schema.index import ColumnIndexer as ci +from pandas_schema.schema import Schema +from pandas_schema.column import column, column_sequence +from pandas_schema import ValidationWarning + +from .util import get_warnings + +class Or(unittest.TestCase): + """ + Tests the | operator on two MatchesPatternValidations + """ + + def setUp(self): + self.validator = MatchesPatternValidation( + 'yes', index=0 + ) | MatchesPatternValidation( + 'pass', index=0 + ) + + def test_valid_items(self): + warnings = get_warnings(self.validator, [ + 'pass', + 'yes', + 'passyes', + '345yes345' + ]) + assert 
len(warnings) == 0, 'rejects values that should pass' + + def test_invalid_items(self): + warnings = get_warnings(self.validator, [ + 'fail', + 'YES', + 'YPESS' + ]) + + assert len(warnings) == 6, 'accepts values that should pass' + +class AndOr(unittest.TestCase): + validator = InListValidation(['one', 'two', 'three']) | ( + IsDtypeValidation(int) & InRangeValidation(1, 3) + ) + def test_and_or(self): + pass + diff --git a/test/util.py b/test/util.py new file mode 100644 index 0000000..bab2b77 --- /dev/null +++ b/test/util.py @@ -0,0 +1,32 @@ +import pandas as pd +from pandas_schema.core import BaseValidation +from pandas_schema.validation_warning import ValidationWarning +import typing + + +def get_warnings(validator: BaseValidation, series: typing.Union[list, pd.Series]) -> typing.Collection[ + ValidationWarning]: + """ + Tests a validator by asserting that it generates the amount of warnings + :param series_dtype: Explicitly specifies the dtype for the generated Series + :param series: The series to check + :param expected_result: Whether the elements in this series should pass the validation + :param msg: The message to display if this test fails + """ + + # # Check that self.validator is correct + # if not self.validator or not isinstance(self.validator, BooleanSeriesValidation, index=0): + # raise ValueError('The class must have the validator field set to an instance of a Validation subclass') + # + # # Ensure we're comparing series correctly + # self.addTypeEqualityFunc(pd.Series, self.seriesEquality) + + df = pd.Series(series).to_frame() + warnings = validator.validate(df) + return list(warnings) + # + # # Now find any items where their validation does not correspond to the expected_result + # for item, result in zip(series, results): + # with self.subTest(value=item): + # self.assertEqual(result, expected_result, msg) + From d216e48bfa082b7b906bb801ab8b41d342c7293f Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 13 Apr 2020 23:54:37 +1000 
Subject: [PATCH 24/31] Update docstrings --- TODO.md | 5 +- pandas_schema/column.py | 6 +- pandas_schema/core.py | 236 ++++++++++++++-------------------------- test/test_validation.py | 6 +- 4 files changed, 93 insertions(+), 160 deletions(-) diff --git a/TODO.md b/TODO.md index add03b7..0033105 100755 --- a/TODO.md +++ b/TODO.md @@ -2,10 +2,11 @@ * [x] Fix CombinedValidations * [x] Add replacement for allow_empty Columns * [ ] New column() tests -* [ ] New CombinedValidation tests -* [ ] Implement the negate flag in the indexer +* [x] New CombinedValidation tests +* [x] Implement the negate flag in the indexer * [ ] Add facility for allow_empty * [x] Fix messages * [x] Re-implement the or/and using operators * [ ] Allow and/or operators between Series-level and row-level validations * [ ] Separate ValidationClasses for each scope +* [ ] Add row-level validations \ No newline at end of file diff --git a/pandas_schema/column.py b/pandas_schema/column.py index dcd363e..74de643 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -5,7 +5,7 @@ def column( - validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], + validations: typing.Iterable['pandas_schema.core.IndexValidation'], index: AxisIndexer = None, override: bool = False, allow_empty=False @@ -24,13 +24,13 @@ def column( def column_sequence( - validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], + validations: typing.Iterable['pandas_schema.core.IndexValidation'], override: bool = False ): """ A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so that the first validation gets index 0, the second gets index 1 etc. Note: this will not modify any index that - already has some kind of index + already has some kind of index unless you set override=True :param validations: A list of validations to modify :param override: If true, override existing index values. 
Otherwise keep the existing ones """ diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 1ca22da..23d35c0 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -30,16 +30,15 @@ class BaseValidation(abc.ABC): """ def __init_subclass__(cls, scope: ValidationScope = ValidationScope.CELL, **kwargs): + # We override this so that you can set the scope at the time you declare the validation class, not the instance cls.scope = scope - def __init__(self, message: str = None, negated: bool = False): + def __init__(self, message: str = None): """ Creates a new IndexSeriesValidation - :param index: An index with which to select the series - Otherwise it's a label (ie, index=0) indicates the column with the label of 0 + :param message: A custom message to use for ValidationWarnings generated by this validation """ self.custom_message = message - self.negated = negated def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: """ @@ -69,13 +68,20 @@ def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: """ raise NotImplementedError() - def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed): + def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): + """ + Takes an index that points to parts of the DF that have *failed* validation, and returns a Series (or similar) + that has ValidationWarning instances at each index that has failed + :param df: The DataFrame we're validating + :param index: The index pointing to the failed parts of the DF + :param failed: The result of applying index to the DF + """ # If it's am empty series/frame then this produced no warnings if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: return [] - # Depending on the scope, we produce the lists of warnings in different ways + # Depending on the scope, we produce the lists of warnings in different ways (ideally the most efficient ways) if isinstance(failed, 
pd.DataFrame): if self.scope == ValidationScope.DATA_FRAME: return [self.make_df_warning(df)] @@ -112,8 +118,8 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai else: return [self.make_cell_warning( df=df, - column=self.index.col_index.index, - row=self.index.row_index.index, + column=index.col_index.index, + row=index.row_index.index, value=failed) ] @@ -123,14 +129,15 @@ def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: with the DataFrame at the same index. """ index = self.get_failed_index(df) - if self.negated: - index = self.apply_negation(index) failed = index(df) return self.index_to_warnings_series(df, index, failed) @staticmethod - def to_warning_list(failed): + def to_warning_list(failed: SubSelection): + """ + Converts a Series/DF of warnings to a list of warnings + """ if isinstance(failed, pd.DataFrame): return failed.to_numpy().tolist() elif isinstance(failed, pd.Series): @@ -140,15 +147,10 @@ def to_warning_list(failed): def validate(self, df: pd.DataFrame) -> typing.Collection[ValidationWarning]: """ - Validates a data frame + Validates a data frame and returns a list of issues with it :param df: Data frame to validate :return: All validation failures detected by this validation """ - # index = self.get_failed_index(df) - # if self.negated: - # index = self.apply_negation(index) - # failed = index(df) - failed = self.get_warnings_series(df) return self.to_warning_list(failed) @@ -167,8 +169,17 @@ def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: """ def message(self, warning: ValidationWarning) -> str: + """ + Get a string that fully describes the provided warning, given that the warning was generating by this validation + """ + # The message is made up of a prefix (which describes the index that failed), and a suffix (which describes + # the validation that it failed) + + # The prefix can be overridden prefix = self.prefix(warning) + # The suffix can be overridden in two ways, either 
using a custom message (the most common), or with a custom + # default_message() function if self.custom_message: suffix = self.custom_message else: @@ -177,24 +188,31 @@ def message(self, warning: ValidationWarning) -> str: return "{} {}".format(prefix, suffix) @property - def readable_name(self, **kwargs): + def readable_name(self): """ A readable name for this validation, to be shown in validation warnings """ return type(self).__name__ - def default_message(self, warnings: ValidationWarning) -> str: + def default_message(self, warning: ValidationWarning) -> str: + """ + Returns a description of this validation, to be included in the py:meth:~message as the suffix`` + """ return 'failed the {}'.format(self.readable_name) - @abc.abstractmethod def prefix(self, warning: ValidationWarning): """ Return a string that can be used to prefix a message that relates to this index This method is safe to override """ + return "" def __or__(self, other: 'BaseValidation'): + """ + Returns a validation that will only return an error if both validations fail at the same place + :param other: Another validation to combine with this + """ if not isinstance(other, BaseValidation): raise PanSchArgumentError('The "|" operator can only be used between two' 'Validations that subclass {}'.format( @@ -202,17 +220,26 @@ def __or__(self, other: 'BaseValidation'): return CombinedValidation(self, other, operator=operator.or_) - def __invert__(self): + def __and__(self, other: 'BaseValidation'): """ - Return a copy of this, except that it will return indices of those that would normally pass this validation, - in the same series + Returns a validation that will only return an error if both validations fail at the same place + :param other: Another validation to combine with this """ - clone = copy.copy(self) - clone.negated = True - return clone + if not isinstance(other, BaseValidation): + raise PanSchArgumentError('The "&" operator can only be used between two' + 'Validations that subclass 
{}'.format( + self.__class__)) + + return CombinedValidation(self, other, operator=operator.and_) + + class IndexValidation(BaseValidation): + """ + An IndexValidation expands upon a BaseValidation by adding an index (in Pandas co-ordinates) that points to the + Series/DF sub-selection/row/cell that it validates + """ def __init__( self, index: DualAxisIndexer, @@ -234,11 +261,6 @@ def apply_index(self, df: pd.DataFrame): return self.index(df) def prefix(self, warning: ValidationWarning): - """ - Return a string that can be used to prefix a message that relates to this index - - This method is safe to override - """ ret = [] if self.index.col_index is not None: @@ -259,41 +281,33 @@ def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: def get_failed_index(self, df) -> DualAxisIndexer: return self.get_passed_index(df).invert(axis=0) - # Normally, validate_series returns the indices of the cells that passed the validation, but here we want the - # cells that failed it, so invert the series (unless this is a negated validation) - # if self.negated: - # row_index = selected - # else: - # row_index = ~selected - - # Combine the index and the result series into one set of indexes - # return DualAxisIndexer( - # row_index=row_index - # col_index=self.index.col_index - # ) - @abc.abstractmethod def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ - Given a series, return an indexer that - passes the validation, otherwise False + Given a selection, return an indexer that points to elements that passed the validation """ pass - def negate(self, axis: int): + def invert(self, axis: int): """ Returns a copy of this validation, but with an inverted indexer """ - return self.__class__(index=self.index.invert(axis)) - + clone = copy.copy(self) + clone.index = self.index.invert(axis) + return clone class SeriesValidation(IndexValidation): """ - A type of IndexValidation that operates only on a Series. 
This class mostly adds utility methods rather than - any particular functionality. + A type of IndexValidation that expands IndexValidation with the knowledge that it will validate a single Series """ - def __init__(self, index, *args, **kwargs): + def __init__(self, index:typing.Union[RowIndexer, IndexValue], *args, **kwargs): + """ + Create a new SeriesValidation + :param index: The index pointing to the Series to validate. For example, this might be 2 to validate Series + with index 2, or "first_name" to validate a Series named "first_name". For more advanced indexing, you may + pass in an instance of the RowIndexer class + """ super().__init__( *args, index=DualAxisIndexer( @@ -303,14 +317,6 @@ def __init__(self, index, *args, **kwargs): **kwargs ) - def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: - """ - When a SeriesValidation is negated, it means that we should invert only the row indices returned by the - validation. This makes the validation return warnings from the same subset of the DataFrame, but makes cells - pass if they would fail, and fail if they would pass - """ - return index.invert(axis=0) - def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ Since this is a SeriesValidation, we can simplify the validation. Now we only have to ask the subclass to take @@ -339,81 +345,12 @@ def validate_series(self, series: pd.Series) -> IndexValue: Given a series, return a bool Series that has values of True if the series passes the validation, otherwise False """ - - -# class BooleanSeriesValidation(IndexValidation, WarningSeriesGenerator): -# """ -# Validation is defined by the function :py:meth:~select_cells that returns a boolean series. -# Each cell that has False has failed the validation. -# -# Child classes need not create their own :py:class:~pandas_schema.core.BooleanSeriesValidation.Warning subclass, -# because the data is in the same form for each cell. You need only define a :py:meth~default_message. 
-# """ -# -# def __init__(self, *args, negated=False, **kwargs): -# super().__init__(*args, **kwargs) -# self.negated = negated -# -# @abc.abstractmethod -# def select_cells(self, series: pd.Series) -> pd.Series: -# """ -# A BooleanSeriesValidation must return a boolean series. Each cell that has False has failed the -# validation -# :param series: The series to validate -# """ -# pass -# -# def validate_series(self, series, flatten=True) -> typing.Union[ -# typing.Iterable[ValidationWarning], -# pd.Series -# ]: -# """ -# Validates a single series selected from the DataFrame -# """ -# selection = self.select_cells(series) -# -# if self.negated: -# # If self.negated (which is not the default), then we don't need to flip the booleans -# failed = selection -# else: -# # In the normal case we do need to flip the booleans, since select_cells returns True for cells that pass -# # the validation, and we want cells that failed it -# failed = ~selection -# -# # Slice out the failed items, then map each into a list of validation warnings at each respective index -# warnings = series[failed].to_frame().apply( -# lambda row: [ValidationWarning(self, { -# 'row': row.name, -# 'value': row[0] -# })], axis='columns', result_type='reduce') -# # warnings = warnings.iloc[:, 0] -# -# # If flatten, return a list of ValidationWarning, otherwise return a series of lists of Validation Warnings -# if flatten: -# return self.flatten_warning_series(warnings) -# else: -# return warnings -# -# def get_warning_series(self, df: pd.DataFrame) -> pd.Series: -# """ -# Validates a series and returns a series of warnings. -# """ -# series = self.select_series(df) -# return self.validate_series(series, flatten=False) -# -# def prefix(self, warning: ValidationWarning): -# parent = super().prefix(warning) -# # Only in this subclass do we know the contents of the warning props, since we defined them in the -# # validate_series method. 
Thus, we can now add row index information -# -# return parent + ', Row {row}: "{value}"'.format(**warning.props) -# -# def __invert__(self) -> 'BooleanSeriesValidation': -# """ -# If a BooleanSeriesValidation is negated, it has the opposite result -# """ -# self.negated = not self.negated -# return self + def __invert__(self) -> 'SeriesValidation': + """ + Return a Validation that returns the opposite cells it used to, but in the same column + """ + # We can only do this now that we are a SeriesValidation and we know on which axis to invert + return self.invert(axis=0) class CombinedValidation(BaseValidation): @@ -425,26 +362,28 @@ def __init__( self, validation_a: BaseValidation, validation_b: BaseValidation, - operator: typing.Callable, + operator: typing.Callable[[pd.Series, pd.Series], pd.Series], axis='rows' ): + """ + Creates a new CombinedValidation + :param validation_a: The first validation to combine + :param validation_b: The second validation to combine + :param operator: An operator, likely operator.or_ or operator.and_ that we should use to combine Validations + :param axis: The axis across which to combine validations. If this is "rows", then we keep the column indices + of each result, and combine the row indices (the most common option). 
If this is "columns", do the opposite + """ super().__init__() self.operator = operator self.left = validation_a self.right = validation_b self.axis = axis - def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: - pass - def combine_indices(self, left: DualAxisIndexer, right: DualAxisIndexer) -> DualAxisIndexer: """ - Utility method for combining the indexers using boolean logic - :param left: - :param right: - :return: + Utility method for combining two indexers using boolean logic """ - # TODO: convert axis into an integer and apply proper panas logic + # TODO: convert axis into an integer and apply proper pandas logic if self.axis == 'rows': assert left.col_index == right.col_index assert isinstance(left.row_index.index, pd.Series) @@ -478,12 +417,6 @@ def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: return self.get_passed_index(df).invert(self.axis) - def prefix(self, warning: ValidationWarning): - pass - - def message(self, warning: ValidationWarning) -> str: - pass - def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column left_index = self.left.get_passed_index(df) @@ -509,6 +442,3 @@ def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: return warnings[combined.row_index.index] else: return warnings[combined.col_index.index] - - def default_message(self, warnings: ValidationWarning) -> str: - return '({}) {} ({})'.format(self.v_a.message, self.operator, self.v_b.message) diff --git a/test/test_validation.py b/test/test_validation.py index 00e93d4..6939b01 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -476,11 +476,13 @@ def test_valid_items(self): ])) == 0, 'Rejects values that should pass' def test_invalid_items(self): - assert len(get_warnings(self.validator, [ + warnings = get_warnings(self.validator, [ 'fail', 'thisfails', 'failure' - ])) == 3, 'Accepts values 
that should pass' + ]) + + assert len(warnings) == 3, 'Accepts values that should pass' class Or(ValidationTestBase): From aae44a7af5d055d21afa24ad3014c3e4b14c54b5 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sun, 26 Apr 2020 00:34:11 +1000 Subject: [PATCH 25/31] Update --- pandas_schema/core.py | 52 +++++++++++++++++++++++++++++ pandas_schema/validation_warning.py | 17 ++++++++++ 2 files changed, 69 insertions(+) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index deb1493..5368e8b 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -429,6 +429,58 @@ def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: return self.get_passed_index(df).invert(self.axis) + def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): + # If it's am empty series/frame then this produced no warnings + if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: + return pd.Series() + + # Depending on the scope, we produce the lists of warnings in different ways (ideally the most efficient ways) + if isinstance(failed, pd.DataFrame): + if self.scope == ValidationScope.DATA_FRAME: + return [self.make_df_warning(df)] + elif self.scope == ValidationScope.SERIES: + return df.apply(lambda series: self.make_series_warning( + df=df, + column=series.name, + series=series + ), axis=0) + elif self.scope == ValidationScope.CELL: + return df.apply(lambda series: series.to_frame().apply( + lambda cell: self.make_cell_warning( + df=df, + column=series.name, + series=series, + row=cell.name, + value=cell + ))).squeeze() + elif isinstance(failed, pd.Series): + if self.scope == ValidationScope.SERIES: + return df.apply(lambda series: self.make_series_warning( + df=df, + column=series.name, + series=series + ), axis=0) + # return [self.make_series_warning( + # df=df, + # column=index.col_index.index, + # series=failed + # )] + elif self.scope == 
ValidationScope.CELL: + return failed.to_frame().apply(lambda cell: self.make_cell_warning( + df=df, + column=index.col_index.index, + series=failed, + row=cell.name, + value=cell[0] + ), axis=1).squeeze() + else: + return [self.make_cell_warning( + df=df, + column=index.col_index.index, + row=index.row_index.index, + value=failed) + ] + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column left_index = self.left.get_passed_index(df) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index e4b62a5..1f86832 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -32,3 +32,20 @@ def message(self): def __str__(self): return self.message + + +class CombinedValidationWarning(ValidationWarning): + """ + Warning for a CombinedValidation, which itself wraps 2 other Warnings from child Validations + """ + left: ValidationWarning + right: ValidationWarning + + @property + def message(self): + """ + Return this validation as a string + """ + # Unlike a normal ValidationWarning, this doesn't ask CombinedValidation for a message, it just combines + # existing messages + return '{} and {}'.format(self.left.message, self.right.message) From 3a8e43792a090b4ba6cf62e4900352bf6ad4e8c6 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 5 May 2020 02:57:55 +1000 Subject: [PATCH 26/31] Nested validations seem to be working --- pandas_schema/core.py | 108 +++++++++------------------- pandas_schema/index.py | 4 +- pandas_schema/validation_warning.py | 17 ++++- pandas_schema/validations.py | 2 +- test/test_combined_validation.py | 4 +- 5 files changed, 57 insertions(+), 78 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 5368e8b..059af31 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -9,10 +9,11 @@ from dataclasses import dataclass import enum import copy +from math import isnan from . 
import column from .errors import PanSchArgumentError, PanSchNoIndexError -from pandas_schema.validation_warning import ValidationWarning +from pandas_schema.validation_warning import ValidationWarning, CombinedValidationWarning from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, \ DualAxisIndexer, BooleanIndexer from pandas_schema.scope import ValidationScope @@ -177,20 +178,24 @@ def message(self, warning: ValidationWarning) -> str: """ Get a string that fully describes the provided warning, given that the warning was generating by this validation """ - # The message is made up of a prefix (which describes the index that failed), and a suffix (which describes - # the validation that it failed) + return "{} {}".format(self.prefix(warning), self.suffix(warning)) - # The prefix can be overridden - prefix = self.prefix(warning) + def prefix(self, warning: ValidationWarning): + """ + Return a string that can be used to prefix a message that relates to this index + This method is safe to override + """ + return "" + + + def suffix(self, warning: ValidationWarning): # The suffix can be overridden in two ways, either using a custom message (the most common), or with a custom # default_message() function if self.custom_message: - suffix = self.custom_message + return self.custom_message else: - suffix = self.default_message(warning) - - return "{} {}".format(prefix, suffix) + return self.default_message(warning) @property def readable_name(self): @@ -205,14 +210,6 @@ def default_message(self, warning: ValidationWarning) -> str: """ return 'failed the {}'.format(self.readable_name) - def prefix(self, warning: ValidationWarning): - """ - Return a string that can be used to prefix a message that relates to this index - - This method is safe to override - """ - return "" - def __or__(self, other: 'BaseValidation'): """ Returns a validation that will only return an error if both validations fail at the same place @@ -273,9 +270,9 @@ def prefix(self, 
warning: ValidationWarning): ret.append('Row {}'.format(warning.props['row'])) - ret.append('"{}"'.format(warning.props['value'])) + ret.append('Value "{}"'.format(warning.props['value'])) - return ' '.join(ret) + return '{' + ', '.join(ret) + '}' def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: selection = self.apply_index(df) @@ -430,56 +427,9 @@ def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: return self.get_passed_index(df).invert(self.axis) def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): - # If it's am empty series/frame then this produced no warnings - if isinstance(failed, (pd.DataFrame, pd.Series)) and failed.empty: - return pd.Series() - - # Depending on the scope, we produce the lists of warnings in different ways (ideally the most efficient ways) - if isinstance(failed, pd.DataFrame): - if self.scope == ValidationScope.DATA_FRAME: - return [self.make_df_warning(df)] - elif self.scope == ValidationScope.SERIES: - return df.apply(lambda series: self.make_series_warning( - df=df, - column=series.name, - series=series - ), axis=0) - elif self.scope == ValidationScope.CELL: - return df.apply(lambda series: series.to_frame().apply( - lambda cell: self.make_cell_warning( - df=df, - column=series.name, - series=series, - row=cell.name, - value=cell - ))).squeeze() - elif isinstance(failed, pd.Series): - if self.scope == ValidationScope.SERIES: - return df.apply(lambda series: self.make_series_warning( - df=df, - column=series.name, - series=series - ), axis=0) - # return [self.make_series_warning( - # df=df, - # column=index.col_index.index, - # series=failed - # )] - elif self.scope == ValidationScope.CELL: - return failed.to_frame().apply(lambda cell: self.make_cell_warning( - df=df, - column=index.col_index.index, - series=failed, - row=cell.name, - value=cell[0] - ), axis=1).squeeze() - else: - return [self.make_cell_warning( - df=df, - column=index.col_index.index, - 
row=index.row_index.index, - value=failed) - ] + # In a normal validation this method would create new Validatations, and use the index, but we don't actually + # need either here + return self.get_warnings_series(df) def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column @@ -496,10 +446,22 @@ def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: right_failed = right_index.invert(axis=0)(df) # Convert the data into warnings, and then join together the warnings from both validations - warnings = pd.concat([ - self.left.index_to_warnings_series(df, left_index, left_failed), - self.right.index_to_warnings_series(df, right_index, right_failed) - ])#, join='inner', keys=['inner', 'outer']) + def combine(left, right): + # Make a CombinedValidationWarning if it failed both validations, otherwise return the single failure + if left: + if right: + return CombinedValidationWarning(left, right, validation=self) + else: + return left + else: + return right + + + warnings = self.left.index_to_warnings_series(df, left_index, left_failed).combine( + self.right.index_to_warnings_series(df, right_index, right_failed), + func=combine, + fill_value=False + ) # Finally, apply the combined index from above to the warnings series if self.axis == 'rows': diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 69b4165..75ca90b 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -219,12 +219,14 @@ def direct_index(self): """ Converts this indexer's self.index into a value that can be passed directly into iloc[] """ - if isinstance(self.index, bool): + # If it's a scalar boolean, we need special values + if np.issubdtype(type(self.index), np.bool_) and np.ndim(self.index) == 0: if self.index: return slice(None) else: return [] + # If it's a vector, pandas can deal with it return self.index def __call__(self, df: pd.DataFrame): diff --git a/pandas_schema/validation_warning.py 
b/pandas_schema/validation_warning.py index 1f86832..cc3923a 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -23,13 +23,21 @@ def __init__(self, validation, **props): self.props = props @property - def message(self): + def message(self) -> str: """ Return this validation as a string """ # Internally, this actually asks the validator class to formulate a message return self.validation.message(self) + @property + def prefix(self) -> str: + return self.validation.prefix(self) + + @property + def suffix(self) -> str: + return self.validation.suffix(self) + def __str__(self): return self.message @@ -41,6 +49,11 @@ class CombinedValidationWarning(ValidationWarning): left: ValidationWarning right: ValidationWarning + def __init__(self, left: ValidationWarning, right: ValidationWarning, **kwargs): + super().__init__(**kwargs) + self.left = left + self.right = right + @property def message(self): """ @@ -48,4 +61,4 @@ def message(self): """ # Unlike a normal ValidationWarning, this doesn't ask CombinedValidation for a message, it just combines # existing messages - return '{} and {}'.format(self.left.message, self.right.message) + return '{} {} and {}'.format(self.left.prefix, self.left.suffix, self.right.suffix) diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 8978a14..71a4ed9 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -258,7 +258,7 @@ def __init__(self, options: typing.Iterable, case_sensitive: bool = True, **kwar def default_message(self, warning: ValidationWarning): values = ', '.join(str(v) for v in self.options) - return 'must be in the list of legal options ({})'.format(values) + return 'was not in the list of legal options [{}]'.format(values) def validate_series(self, series: pd.Series) -> pd.Series: if self.case_sensitive: diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py index 0deed60..f0ded19 100644 --- 
a/test/test_combined_validation.py +++ b/test/test_combined_validation.py @@ -49,7 +49,7 @@ def test_invalid_items(self): class AndOr(unittest.TestCase): validator = InListValidation(['one', 'two', 'three'], index=0) | ( - IsDtypeValidation(int, index=0) & InRangeValidation(1, 3, index=0) + IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 3, index=0) ) def test_passing_words(self): @@ -83,3 +83,5 @@ def test_failing_numbers(self): 6 ], dtype=np.int_)) assert len(warnings) == 3 + for warning in warnings: + print(warning.message) From b6627598bdb319c4133bc47387d3f17c5587066b Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Fri, 15 May 2020 02:36:04 +1000 Subject: [PATCH 27/31] Implement .optional() method, which works a bit like allow_empty --- TODO.md | 2 +- pandas_schema/column.py | 54 ++++++++++++++++++++--------- pandas_schema/core.py | 58 ++++++++++++++++++++------------ pandas_schema/validations.py | 6 ++-- test/test_combined_validation.py | 32 ++++++++++++++++++ test/test_validation.py | 43 +++++++++++++++++++++++ 6 files changed, 155 insertions(+), 40 deletions(-) diff --git a/TODO.md b/TODO.md index 0033105..68d2f86 100755 --- a/TODO.md +++ b/TODO.md @@ -4,7 +4,7 @@ * [ ] New column() tests * [x] New CombinedValidation tests * [x] Implement the negate flag in the indexer -* [ ] Add facility for allow_empty +* [x] Add facility for allow_empty * [x] Fix messages * [x] Re-implement the or/and using operators * [ ] Allow and/or operators between Series-level and row-level validations diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 74de643..36eba59 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,22 +1,25 @@ import typing -import pandas_schema.core -from pandas_schema.index import AxisIndexer +from pandas_schema.core import IndexValidation +from pandas_schema.index import AxisIndexer, IndexValue def column( - validations: typing.Iterable['pandas_schema.core.IndexValidation'], + validations: 
typing.Iterable['IndexValidation'], index: AxisIndexer = None, override: bool = False, allow_empty=False ): - """ - A utility method for setting the index data on a set of Validations - :param validations: A list of validations to modify - :param index: The index of the series that these validations will now consider - :param override: If true, override existing index values. Otherwise keep the existing ones - :param allow_empty: Allow empty rows (NaN) to pass the validation - See :py:class:`pandas_schema.validation.IndexSeriesValidation` + """A utility method for setting the index data on a set of Validations + + Args: + validations: A list of validations to modify + index: The index of the series that these validations will now consider + override: If true, override existing index values. Otherwise keep the existing ones + allow_empty: Allow empty rows (NaN) to pass the validation + See :py:class:`pandas_schema.validation.IndexSeriesValidation` (Default value = False) + Returns: + """ for valid in validations: if override or valid.index is None: @@ -24,19 +27,40 @@ def column( def column_sequence( - validations: typing.Iterable['pandas_schema.core.IndexValidation'], + validations: typing.Iterable['IndexValidation'], override: bool = False ): - """ - A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so + """A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so that the first validation gets index 0, the second gets index 1 etc. Note: this will not modify any index that already has some kind of index unless you set override=True - :param validations: A list of validations to modify - :param override: If true, override existing index values. Otherwise keep the existing ones + + Args: + validations: A list of validations to modify + override: If true, override existing index values. 
Otherwise keep the existing ones + validations: typing.Iterable['pandas_schema.core.IndexValidation']: + override: bool: (Default value = False) + + Returns: + """ for i, valid in validations: if override or valid.index is None: valid.index = AxisIndexer(i, typ='positional') + + +def each_column(validations: typing.Iterable[IndexValidation], columns: IndexValue): + """Duplicates a validation and applies it to each column specified + + Args: + validations: A list of validations to apply to each column + columns: An index that should, when applied to the column index, should return all columns you want this to + validations: typing.Iterable[pandas_schema.core.IndexValidation]: + columns: IndexValue: + + Returns: + + """ + # # def label_column( # validations: typing.Iterable['pandas_schema.core.IndexSeriesValidation'], diff --git a/pandas_schema/core.py b/pandas_schema/core.py index 059af31..d6e3c70 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -11,7 +11,6 @@ import copy from math import isnan -from . import column from .errors import PanSchArgumentError, PanSchNoIndexError from pandas_schema.validation_warning import ValidationWarning, CombinedValidationWarning from pandas_schema.index import AxisIndexer, IndexValue, IndexType, RowIndexer, \ @@ -61,14 +60,6 @@ def make_cell_warning(self, df: pd.DataFrame, column: str, row: int, value, """ return ValidationWarning(self, column=column, row=row, value=value) - def apply_negation(self, index: DualAxisIndexer) -> DualAxisIndexer: - """ - Can be implemented by sub-classes to provide negation behaviour. If implemented, this should return a new - indexer that returns the opposite of what it normally would. 
The definition of opposite may vary from validation - to validation - """ - raise NotImplementedError() - def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): """ Takes an index that points to parts of the DF that have *failed* validation, and returns a Series (or similar) @@ -108,11 +99,6 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai column=series.name, series=series ), axis=0) - # return [self.make_series_warning( - # df=df, - # column=index.col_index.index, - # series=failed - # )] elif self.scope == ValidationScope.CELL: return failed.to_frame().apply(lambda cell: self.make_cell_warning( df=df, @@ -188,7 +174,6 @@ def prefix(self, warning: ValidationWarning): """ return "" - def suffix(self, warning: ValidationWarning): # The suffix can be overridden in two ways, either using a custom message (the most common), or with a custom # default_message() function @@ -240,6 +225,7 @@ class IndexValidation(BaseValidation): An IndexValidation expands upon a BaseValidation by adding an index (in Pandas co-ordinates) that points to the Series/DF sub-selection/row/cell that it validates """ + def __init__( self, index: DualAxisIndexer, @@ -296,24 +282,41 @@ def invert(self, axis: int): clone.index = self.index.invert(axis) return clone + def optional(self) -> 'CombinedValidation': + """ + Makes this Validation optional, by returning a CombinedValidation that accepts empty cells + """ + return CombinedValidation( + self, + IsEmptyValidation(index=self.index), + operator=operator.or_ + ) + + class SeriesValidation(IndexValidation): """ A type of IndexValidation that expands IndexValidation with the knowledge that it will validate a single Series """ - def __init__(self, index:typing.Union[RowIndexer, IndexValue], *args, **kwargs): + def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], *args, **kwargs): """ Create a new SeriesValidation :param index: The index 
pointing to the Series to validate. For example, this might be 2 to validate Series with index 2, or "first_name" to validate a Series named "first_name". For more advanced indexing, you may pass in an instance of the RowIndexer class """ - super().__init__( - *args, - index=DualAxisIndexer( + # We have to convert a single-axis index into a dual-axis index + if isinstance(index, DualAxisIndexer): + dual = index + else: + dual = DualAxisIndexer( col_index=index, row_index=BooleanIndexer(index=True, axis=0), - ), + ) + + super().__init__( + *args, + index=dual, **kwargs ) @@ -345,6 +348,7 @@ def validate_series(self, series: pd.Series) -> IndexValue: Given a series, return a bool Series that has values of True if the series passes the validation, otherwise False """ + def __invert__(self) -> 'SeriesValidation': """ Return a Validation that returns the opposite cells it used to, but in the same column @@ -386,7 +390,6 @@ def message(self, warning: ValidationWarning) -> str: # def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): # # We handle this method by deferring to the children - def combine_indices(self, left: DualAxisIndexer, right: DualAxisIndexer) -> DualAxisIndexer: """ @@ -456,7 +459,6 @@ def combine(left, right): else: return right - warnings = self.left.index_to_warnings_series(df, left_index, left_failed).combine( self.right.index_to_warnings_series(df, right_index, right_failed), func=combine, @@ -468,3 +470,15 @@ def combine(left, right): return warnings[combined.row_index.index] else: return warnings[combined.col_index.index] + + +class IsEmptyValidation(SeriesValidation): + """ + Validates that each element in the Series is "empty". 
For most dtypes, this means each element contains null, + but for strings we consider 0-length strings to be empty + """ + def validate_series(self, series: pd.Series) -> IndexValue: + if is_categorical_dtype(series) or is_numeric_dtype(series): + return series.isnull() + else: + return series.str.len() == 0 diff --git a/pandas_schema/validations.py b/pandas_schema/validations.py index 71a4ed9..27792b3 100755 --- a/pandas_schema/validations.py +++ b/pandas_schema/validations.py @@ -7,8 +7,8 @@ import operator from . import column -from .core import SeriesValidation, IndexValidation -from .index import DualAxisIndexer +from .core import SeriesValidation, IndexValidation, IsEmptyValidation +from .index import DualAxisIndexer, IndexValue from .validation_warning import ValidationWarning from .errors import PanSchArgumentError from pandas.api.types import is_categorical_dtype, is_numeric_dtype @@ -293,3 +293,5 @@ def valid_date(self, val): def validate_series(self, series: pd.Series) -> pd.Series: return series.astype(str).apply(self.valid_date) + + diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py index f0ded19..a97b4c3 100644 --- a/test/test_combined_validation.py +++ b/test/test_combined_validation.py @@ -1,6 +1,7 @@ import json import unittest import re +import math from numpy import nan, dtype import numpy as np @@ -48,6 +49,9 @@ def test_invalid_items(self): class AndOr(unittest.TestCase): + """ + Tests a more complex case where we have an "or" and then an "and" + """ validator = InListValidation(['one', 'two', 'three'], index=0) | ( IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 3, index=0) ) @@ -85,3 +89,31 @@ def test_failing_numbers(self): assert len(warnings) == 3 for warning in warnings: print(warning.message) + +class Optional(unittest.TestCase): + """ + Tests the "optional" method, which Ors the validation with an IsEmptyValidation + """ + validator = InRangeValidation(5, 10, index=0).optional() + + def 
test_passing(self): + warnings = get_warnings(self.validator, [ + 5, + None, + 6, + None, + 7, + None + ]) + + assert warnings == [], 'is not accepting null values' + + def test_failing(self): + assert len(get_warnings(self.validator, [ + 0, + math.inf, + -1, + 10 + ])) == 4, 'is accepting invalid values' + + diff --git a/test/test_validation.py b/test/test_validation.py index 671dc9a..3206cac 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -435,6 +435,49 @@ def test_schema(self): for error, correct_dtype in zip(errors, [np.object, np.int64, np.float64]): assert error.props['dtype'] == correct_dtype + +class IsEmpty(ValidationTestBase): + def setUp(self): + self.validator = IsEmptyValidation(index=0) + + def test_valid_items_float(self): + errors = get_warnings(self.validator, pd.Series([ + np.nan, + np.nan + ], dtype=np.float_)) + + self.assertEqual(len(errors), 0) + + def test_valid_items_str(self): + errors = get_warnings(self.validator, pd.Series([ + '', + '', + '' + ], dtype=np.str_)) + + self.assertEqual(len(errors), 0) + + def test_invalid_items_int(self): + errors = get_warnings(self.validator, pd.Series([ + 0, + 1, + -1 + ], dtype=np.int_)) + + self.assertEqual(len(errors), 3) + self.assertEqual(type(errors[0]), ValidationWarning) + + + def test_invalid_items_str(self): + errors = get_warnings(self.validator, pd.Series([ + 'a', + ' ' + ], dtype=np.str_)) + + self.assertEqual(len(errors), 2) + self.assertEqual(type(errors[0]), ValidationWarning) + + class Negate(ValidationTestBase): """ Tests the ~ operator on a MatchesPatternValidation From 24f3bb2e6cefd3ddc2e1cac7f725c8dbde18cc45 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Tue, 26 May 2020 01:27:32 +1000 Subject: [PATCH 28/31] Rework negation (again), to fix tests --- pandas_schema/core.py | 114 ++++++++++++++++++++----------- pandas_schema/index.py | 13 ++++ setup.py | 3 + test/test_combined_validation.py | 4 +- test/test_validation.py | 4 +- 5 files changed, 94 
insertions(+), 44 deletions(-) diff --git a/pandas_schema/core.py b/pandas_schema/core.py index d6e3c70..b06baa1 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -40,6 +40,7 @@ def __init__(self, message: str = None): """ self.custom_message = message + def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: """ Creates a DF-scope warning. Can be overridden by child classes @@ -78,35 +79,36 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai if self.scope == ValidationScope.DATA_FRAME: return [self.make_df_warning(df)] elif self.scope == ValidationScope.SERIES: - return df.apply(lambda series: self.make_series_warning( + return failed.apply(lambda series: self.make_series_warning( df=df, column=series.name, series=series - ), axis=0) + ), axis='columns') elif self.scope == ValidationScope.CELL: - return df.apply(lambda series: series.to_frame().apply( + return failed.apply(lambda series: series.to_frame().apply( lambda cell: self.make_cell_warning( df=df, column=series.name, series=series, row=cell.name, value=cell - ))).squeeze() + ))) elif isinstance(failed, pd.Series): if self.scope == ValidationScope.SERIES: - return df.apply(lambda series: self.make_series_warning( + return [self.make_series_warning( df=df, - column=series.name, - series=series - ), axis=0) + column=failed.name, + series=failed + )] elif self.scope == ValidationScope.CELL: + # DataFrame.apply returns a series if the function returns a scalar, as it does here return failed.to_frame().apply(lambda cell: self.make_cell_warning( df=df, column=index.col_index.index, series=failed, row=cell.name, value=cell[0] - ), axis=1).squeeze() + ), axis='columns') else: return [self.make_cell_warning( df=df, @@ -240,7 +242,7 @@ def __init__( super().__init__(*args, **kwargs) self.index = index - def apply_index(self, df: pd.DataFrame): + def apply_index(self, df: pd.DataFrame) -> SubSelection: """ Select a series using the data stored in this 
validation """ @@ -264,9 +266,6 @@ def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: selection = self.apply_index(df) return self.validate_selection(selection) - def get_failed_index(self, df) -> DualAxisIndexer: - return self.get_passed_index(df).invert(axis=0) - @abc.abstractmethod def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ @@ -274,14 +273,6 @@ def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ pass - def invert(self, axis: int): - """ - Returns a copy of this validation, but with an inverted indexer - """ - clone = copy.copy(self) - clone.index = self.index.invert(axis) - return clone - def optional(self) -> 'CombinedValidation': """ Makes this Validation optional, by returning a CombinedValidation that accepts empty cells @@ -298,7 +289,7 @@ class SeriesValidation(IndexValidation): A type of IndexValidation that expands IndexValidation with the knowledge that it will validate a single Series """ - def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], *args, **kwargs): + def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], negated: bool=False, *args, **kwargs): """ Create a new SeriesValidation :param index: The index pointing to the Series to validate. For example, this might be 2 to validate Series @@ -320,6 +311,28 @@ def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], **kwargs ) + self.negated = negated + """ + This broadly means that this validation will do the opposite of what it normally does. The actual implementation + depends on the subclass checking for this field whenever it needs to. Even for IndexValidations, we can't invert + the actual index, because it doesn't exist yet. 
It's only created after we run the actual validation + """ + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + index = super().get_passed_index(df) + if self.negated: + return index.invert(axis=0) + else: + return index + + def get_failed_index(self, df) -> DualAxisIndexer: + # This is the opposite of get_passed_index, so we just flip the conditional + index = super().get_passed_index(df) + if self.negated: + return index + else: + return index.invert(axis=0) + def validate_selection(self, selection: SubSelection) -> DualAxisIndexer: """ Since this is a SeriesValidation, we can simplify the validation. Now we only have to ask the subclass to take @@ -349,12 +362,13 @@ def validate_series(self, series: pd.Series) -> IndexValue: passes the validation, otherwise False """ - def __invert__(self) -> 'SeriesValidation': + def __invert__(self) -> 'BaseValidation': """ - Return a Validation that returns the opposite cells it used to, but in the same column + Returns: A copy of this validation, but that validates the opposite of what it normally would """ - # We can only do this now that we are a SeriesValidation and we know on which axis to invert - return self.invert(axis=0) + clone = copy.copy(self) + clone.negated = True + return clone class CombinedValidation(BaseValidation): @@ -434,6 +448,30 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai # need either here return self.get_warnings_series(df) + def combine(self, left: SubSelection, right: SubSelection): + """ + Combine two subsections of the DataFrame, each containing :py:class:`pandas_schema.validation_warning.ValidationWarning` + instances + """ + + # Convert the data into warnings, and then join together the warnings from both validations + def combine_index(left, right): + # Make a CombinedValidationWarning if it failed both validations, otherwise return the single failure + if left: + if right: + return CombinedValidationWarning(left, right, validation=self) 
+ else: + return left + else: + return right + + if isinstance(left, (pd.Series, pd.DataFrame)): + return left.combine(right, combine_index, fill_value=False) + elif isinstance(right, (pd.Series, pd.DataFrame)): + return right.combine(left, combine_index, fill_value=False) + else: + return combine_index(left, right) + def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: # Let both validations separately select and filter a column left_index = self.left.get_passed_index(df) @@ -448,22 +486,15 @@ def get_warnings_series(self, df: pd.DataFrame) -> pd.Series: left_failed = left_index.invert(axis=0)(df) right_failed = right_index.invert(axis=0)(df) - # Convert the data into warnings, and then join together the warnings from both validations - def combine(left, right): - # Make a CombinedValidationWarning if it failed both validations, otherwise return the single failure - if left: - if right: - return CombinedValidationWarning(left, right, validation=self) - else: - return left - else: - return right - - warnings = self.left.index_to_warnings_series(df, left_index, left_failed).combine( - self.right.index_to_warnings_series(df, right_index, right_failed), - func=combine, - fill_value=False + warnings = self.combine( + self.left.index_to_warnings_series(df, left_index, left_failed), + self.right.index_to_warnings_series(df, right_index, right_failed) ) + # warnings = self.left.index_to_warnings_series(df, left_index, left_failed).combine( + # self.right.index_to_warnings_series(df, right_index, right_failed), + # func=combine, + # fill_value=False + # ) # Finally, apply the combined index from above to the warnings series if self.axis == 'rows': @@ -477,6 +508,7 @@ class IsEmptyValidation(SeriesValidation): Validates that each element in the Series is "empty". 
For most dtypes, this means each element contains null, but for strings we consider 0-length strings to be empty """ + def validate_series(self, series: pd.Series) -> IndexValue: if is_categorical_dtype(series) or is_numeric_dtype(series): return series.isnull() diff --git a/pandas_schema/index.py b/pandas_schema/index.py index 75ca90b..8015ae1 100755 --- a/pandas_schema/index.py +++ b/pandas_schema/index.py @@ -18,6 +18,7 @@ class IndexType(Enum): POSITION = 0 LABEL = 1 + @dataclass class AxisIndexer(ABC): """ @@ -200,6 +201,7 @@ def __invert__(self) -> 'AxisIndexer': negate=not self.negate ) + BooleanIndexType = Union[pd.Series, bool] @@ -266,6 +268,7 @@ class SubIndexerMeta(ABCMeta): """ Metaclass for RowIndexer and ColumnIndexer, allowing then to do instance checks in a more flexible way """ + def __init__(cls, *args, axis: int, **kwargs): super().__init__(*args) cls.axis = axis @@ -294,6 +297,16 @@ class ColumnIndexer(AxisIndexer, axis=1, metaclass=SubIndexerMeta): pass +class DirectRowIndexer(DirectIndexer): + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=0) + + +class DirectColumnIndexer(DirectIndexer): + def __init__(self, index: IndexValue, typ: IndexType = None): + super().__init__(index=index, typ=typ, axis=1) + + @dataclass(init=False) class DualAxisIndexer: """ diff --git a/setup.py b/setup.py index a5a77fb..e78c8c3 100755 --- a/setup.py +++ b/setup.py @@ -87,6 +87,9 @@ def run(self): 'pandas>=0.23', 'dataclasses' ], + extras_requires={ + 'dev': ['pytest'], + }, cmdclass={ 'build_readme': BuildReadme, 'build_site': BuildHtmlDocs diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py index a97b4c3..f4cb1b0 100644 --- a/test/test_combined_validation.py +++ b/test/test_combined_validation.py @@ -45,7 +45,7 @@ def test_invalid_items(self): 'YPESS' ]) - assert len(warnings) == 6, 'accepts values that should pass' + assert len(warnings) == 3, 'accepts values that should 
pass' class AndOr(unittest.TestCase): @@ -53,7 +53,7 @@ class AndOr(unittest.TestCase): Tests a more complex case where we have an "or" and then an "and" """ validator = InListValidation(['one', 'two', 'three'], index=0) | ( - IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 3, index=0) + IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 4, index=0) ) def test_passing_words(self): diff --git a/test/test_validation.py b/test/test_validation.py index 3206cac..a496b63 100755 --- a/test/test_validation.py +++ b/test/test_validation.py @@ -11,7 +11,7 @@ from pandas_schema.validations import * from pandas_schema.core import CombinedValidation, BaseValidation -from pandas_schema.index import ColumnIndexer as ci +from pandas_schema.index import DirectColumnIndexer as ci from pandas_schema.schema import Schema from pandas_schema.column import column, column_sequence from pandas_schema import ValidationWarning @@ -432,6 +432,8 @@ def test_schema(self): errors = schema.validate(df) + assert len(errors) == 3 + for error, correct_dtype in zip(errors, [np.object, np.int64, np.float64]): assert error.props['dtype'] == correct_dtype From 36f6ee61f18dd257b3624cda27cdf10303b50a0e Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 28 May 2020 14:27:12 +1000 Subject: [PATCH 29/31] Add row-uniqueness validation --- TODO.md | 3 ++- pandas_schema/core.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index 68d2f86..6825f7e 100755 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,5 @@ -* [ ] Add validations that apply to every column in the DF equally +* [ ] Add validations that apply to every column in the DF equally (for the moment, users can just duplicate their validations) +* [x] Add validations that use the entire DF like, uniqueness * [x] Fix CombinedValidations * [x] Add replacement for allow_empty Columns * [ ] New column() tests diff --git a/pandas_schema/core.py b/pandas_schema/core.py index b06baa1..e5e714e 100755 
--- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -47,6 +47,12 @@ def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: """ return ValidationWarning(self) + def make_row_warning(self, df: pd.DataFrame, row_index: IndexValue) -> ValidationWarning: + """ + Creates a row-scope warning. Can be overridden by child classes + """ + return ValidationWarning(self, row=row_index) + def make_series_warning(self, df: pd.DataFrame, column: str, series: pd.Series) -> ValidationWarning: """ @@ -83,6 +89,11 @@ def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, fai df=df, column=series.name, series=series + ), axis='rows') + elif self.scope == ValidationScope.ROW: + return failed.apply(lambda row: self.make_row_warning( + df=df, + row_index=row.name ), axis='columns') elif self.scope == ValidationScope.CELL: return failed.apply(lambda series: series.to_frame().apply( From 9452513fbd2f58acc6ca8c3ff94062b07f3f7ffd Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Jul 2020 02:13:53 +1000 Subject: [PATCH 30/31] Improve column functions; add recurse; improve CombinedValidation messages when they're nested --- TODO.md | 3 +- pandas_schema/column.py | 51 ++++++++++++++------ pandas_schema/core.py | 75 +++++++++++++++++++++++------ pandas_schema/df_validations.py | 69 ++++++++++++++++++++++++++ pandas_schema/validation_warning.py | 8 +++ test/test_combined_validation.py | 34 +++++++++++-- test/test_df_validations.py | 62 ++++++++++++++++++++++++ 7 files changed, 269 insertions(+), 33 deletions(-) create mode 100644 pandas_schema/df_validations.py create mode 100644 test/test_df_validations.py diff --git a/TODO.md b/TODO.md index 6825f7e..eca42d5 100755 --- a/TODO.md +++ b/TODO.md @@ -10,4 +10,5 @@ * [x] Re-implement the or/and using operators * [ ] Allow and/or operators between Series-level and row-level validations * [ ] Separate ValidationClasses for each scope -* [ ]
Add row-level validations +* [x] Fix message for DateAndOr test \ No newline at end of file diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 36eba59..7d12fe7 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -1,35 +1,58 @@ -import typing +from typing import Union, Iterable -from pandas_schema.core import IndexValidation +from pandas_schema.core import IndexValidation, BaseValidation from pandas_schema.index import AxisIndexer, IndexValue def column( - validations: typing.Iterable['IndexValidation'], - index: AxisIndexer = None, + validations: Union[Iterable['IndexValidation'], 'IndexValidation'], + index = None, override: bool = False, - allow_empty=False -): + recurse: bool = True, + allow_empty: bool = False +) -> Union[Iterable['IndexValidation'], 'IndexValidation']: """A utility method for setting the index data on a set of Validations Args: validations: A list of validations to modify index: The index of the series that these validations will now consider override: If true, override existing index values. 
Otherwise keep the existing ones + recurse: If true, recurse into child validations allow_empty: Allow empty rows (NaN) to pass the validation See :py:class:`pandas_schema.validation.IndexSeriesValidation` (Default value = False) Returns: - """ - for valid in validations: - if override or valid.index is None: - valid.index = index + def update_validation(validation: BaseValidation): + if isinstance(validation, IndexValidation): + if override or validation.index is None: + validation.index = index + + if allow_empty: + return validation.optional() + else: + return validation + + if isinstance(validations, Iterable): + ret = [] + for valid in validations: + if recurse: + ret.append(valid.map(update_validation)) + else: + ret.append(update_validation(valid)) + return ret + else: + if recurse: + return validations.map(update_validation) + else: + return update_validation(validations) + + return validations def column_sequence( - validations: typing.Iterable['IndexValidation'], + validations: Iterable['IndexValidation'], override: bool = False -): +) -> Iterable['IndexValidation']: """A utility method for setting the index data on a set of Validations. Applies a sequential position based index, so that the first validation gets index 0, the second gets index 1 etc. 
Note: this will not modify any index that already has some kind of index unless you set override=True @@ -46,9 +69,10 @@ def column_sequence( for i, valid in validations: if override or valid.index is None: valid.index = AxisIndexer(i, typ='positional') + return validations -def each_column(validations: typing.Iterable[IndexValidation], columns: IndexValue): +def each_column(validations: Iterable[IndexValidation], columns: IndexValue): """Duplicates a validation and applies it to each column specified Args: @@ -90,4 +114,3 @@ def each_column(validations: typing.Iterable[IndexValidation], columns: IndexVal # validations, # index, # position=True - diff --git a/pandas_schema/core.py b/pandas_schema/core.py index e5e714e..7f6898a 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -40,6 +40,32 @@ def __init__(self, message: str = None): """ self.custom_message = message + def recurse(self, func: typing.Callable[['BaseValidation'], typing.Any]) -> list: + """ + Calls a function on this validation and all of its children (if this is a compound validation) + Args: + func: A function whose only argument is a single validation. The function might change the validation, or + it can return a value, in which case the value will be included in the final result + + Returns: + A list of result values + + """ + return [func(self)] + + def map(self, func: typing.Callable[['BaseValidation'], 'BaseValidation']) -> 'BaseValidation': + """ + Calls a function on this validation and all of its children (if this is a compound validation) + This function returns a validation that will replace the validation it receives as an argument. + Args: + func: A function whose only argument is a single validation.
The function might change the validation, or + it can return a different validation, in which case that validation will replace the original + + Returns: + The validation returned by func + + """ + return func(self) def make_df_warning(self, df: pd.DataFrame) -> ValidationWarning: """ @@ -299,26 +325,22 @@ class SeriesValidation(IndexValidation): """ A type of IndexValidation that expands IndexValidation with the knowledge that it will validate a single Series """ + _index: typing.Optional[DualAxisIndexer] - def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], negated: bool=False, *args, **kwargs): + def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer] = None, negated: bool=False, *args, **kwargs): """ Create a new SeriesValidation :param index: The index pointing to the Series to validate. For example, this might be 2 to validate Series with index 2, or "first_name" to validate a Series named "first_name". For more advanced indexing, you may pass in an instance of the RowIndexer class """ - # We have to convert a single-axis index into a dual-axis index - if isinstance(index, DualAxisIndexer): - dual = index - else: - dual = DualAxisIndexer( - col_index=index, - row_index=BooleanIndexer(index=True, axis=0), - ) + # This converts the index from primitive numbers into a data structure + self._index = None + self.index = index super().__init__( *args, - index=dual, + index=self.index, **kwargs ) @@ -329,6 +351,22 @@ def __init__(self, index: typing.Union[RowIndexer, IndexValue, DualAxisIndexer], the actual index, because it doesn't exist yet.
It's only created after we run the actual validation """ + @property + def index(self): + return self._index + + @index.setter + def index(self, val): + # We have to convert a single-axis index into a dual-axis index + if val is not None: + if isinstance(val, DualAxisIndexer): + self._index = val + else: + self._index = DualAxisIndexer( + col_index=val, + row_index=BooleanIndexer(index=True, axis=0), + ) + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: index = super().get_passed_index(df) if self.negated: @@ -408,10 +446,19 @@ def __init__( self.right = validation_b self.axis = axis - def message(self, warning: ValidationWarning) -> str: - # Nothing should ever try to create a ValidationWarning directly from a CombinedValidation, - # it should always use the original warnings from the child Validations - raise NotImplementedError() + def recurse(self, func: typing.Callable[['BaseValidation'], typing.Any]) -> list: + return [*super().recurse(func), *self.left.recurse(func), *self.right.recurse(func)] + + def map(self, func): + new = func(self) + new.left = new.left.map(func) + new.right = new.right.map(func) + return new + + # def message(self, warning: ValidationWarning) -> str: + # # Nothing should ever try to create a ValidationWarning directly from a CombinedValidation, + # # it should always use the original warnings from the child Validations + # raise NotImplementedError() # def index_to_warnings_series(self, df: pd.DataFrame, index: DualAxisIndexer, failed: SubSelection): # # We handle this method by deferring to the children diff --git a/pandas_schema/df_validations.py b/pandas_schema/df_validations.py new file mode 100644 index 0000000..be372bc --- /dev/null +++ b/pandas_schema/df_validations.py @@ -0,0 +1,69 @@ +from abc import abstractmethod +from typing import Union + +import pandas as pd + +from . 
import ValidationWarning +from .core import BaseValidation, ValidationScope +from .index import DualAxisIndexer, BooleanIndexer + + +class DfRowValidation(BaseValidation): + """ + Validates the entire DF at once, by returning a boolean Series corresponding to row indices that pass or fail + """ + def get_failed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + passed = self.get_passed_index(df) + return passed.invert(axis=0) + + def get_passed_index(self, df: pd.DataFrame) -> DualAxisIndexer: + series = self.validate_df(df) + return DualAxisIndexer( + row_index=BooleanIndexer(series, axis=0), + col_index=BooleanIndexer(True, axis=1) + ) + + @abstractmethod + def validate_df(self, df: pd.DataFrame) -> pd.Series: + """ + Validate the DF by returning a boolean series + Args: + df: The DF to validate + + Returns: A boolean Series whose indices correspond to the row indices of the DF. If the Series has the value + True, this means the corresponding row passed the validation + + Example: + If we were for some reason validating that each row contains values higher than any element in the previous + row:: + + 1 2 3 + 4 5 6 + 1 1 1 + + The correct boolean Series to return here would be:: + + True + True + False + """ + + +class DistinctRowValidation(DfRowValidation, scope=ValidationScope.ROW): + def __init__(self, keep: Union[bool, str] = False, **kwargs): + """ + Args: + keep: Refer to the pandas docs: + "first" indicates that duplicates fail the validation except for the first occurrence. + "last" indicates that duplicates fail the validation except for the last occurrence. 
+ False indicates that all duplicates fail the validation + """ + super().__init__(**kwargs) + self.keep = keep + + def prefix(self, warning: ValidationWarning): + return '{{Row {row}}}'.format(**warning.props) + + def validate_df(self, df: pd.DataFrame) -> pd.Series: + # We invert here because pandas gives duplicates a True value but we want them to be False as in "invalid" + return ~df.duplicated(keep=self.keep) diff --git a/pandas_schema/validation_warning.py b/pandas_schema/validation_warning.py index cc3923a..55fad7e 100755 --- a/pandas_schema/validation_warning.py +++ b/pandas_schema/validation_warning.py @@ -62,3 +62,11 @@ def message(self): # Unlike a normal ValidationWarning, this doesn't ask CombinedValidation for a message, it just combines # existing messages return '{} {} and {}'.format(self.left.prefix, self.left.suffix, self.right.suffix) + + @property + def suffix(self) -> str: + return '{} and {}'.format(self.left.suffix, self.right.suffix) + + @property + def prefix(self) -> str: + return self.left.prefix diff --git a/test/test_combined_validation.py b/test/test_combined_validation.py index f4cb1b0..4c20f66 100644 --- a/test/test_combined_validation.py +++ b/test/test_combined_validation.py @@ -48,9 +48,10 @@ def test_invalid_items(self): assert len(warnings) == 3, 'accepts values that should pass' -class AndOr(unittest.TestCase): +class NumericAndOr(unittest.TestCase): """ - Tests a more complex case where we have an "or" and then an "and" + Tests a more complex case where we have an "or" and then an "and". 
This schema allows either numbers + represented as either digits or words """ validator = InListValidation(['one', 'two', 'three'], index=0) | ( IsDtypeValidation(np.int_, index=0) & InRangeValidation(1, 4, index=0) @@ -90,6 +91,33 @@ def test_failing_numbers(self): for warning in warnings: print(warning.message) + +class DateAndOr(unittest.TestCase): + """ + Allows days of the week as either numbers or short words, or long words + """ + # Note: this isn't an actually well-designed validation; the two InLists should really be one validation. + # But here we're testing a somewhat complex validation + validator = column(( + CanConvertValidation(int) & InRangeValidation(min=1, max=8) + ) | ( + CanConvertValidation(str) & InListValidation(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']) + ) | ( + CanConvertValidation(str) & InListValidation([ + 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' + ]) + ), index=0) + + def test_correct(self): + warnings = get_warnings(self.validator, ['Mon', 3, 'Thursday', 1, 'Fri', 6, 7]) + assert len(warnings) == 0, warnings + + def test_incorrect(self): + warnings = get_warnings(self.validator, [0, 8, 'Mondesday', 'Frisday', 'Sund', 'Frid']) + assert len(warnings) == 6, warnings + for warning in warnings: + assert 'CombinedValidation' not in warning.message + class Optional(unittest.TestCase): """ Tests the "optional" method, which Ors the validation with an IsEmptyValidation @@ -115,5 +143,3 @@ def test_failing(self): -1, 10 ])) == 4, 'is accepting invalid values' - - diff --git a/test/test_df_validations.py b/test/test_df_validations.py new file mode 100644 index 0000000..67db474 --- /dev/null +++ b/test/test_df_validations.py @@ -0,0 +1,62 @@ +import pandas as pd +import pytest + +from pandas_schema import ValidationWarning +from pandas_schema.df_validations import DistinctRowValidation +from pandas.testing import assert_series_equal + + +@pytest.mark.parametrize(['df', 'result', 'kwargs'], [ + [ + # By 
default, all duplicates should be marked + pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + False, False, False, False + ], + dict() + ], + [ + # With keep='first', the first duplicates are okay + pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + True, True, False, False + ], + dict(keep='first') + ], + [ + # With keep='last', the last duplicates are okay + pd.DataFrame([ + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ['a', 'a', 'a'], + ['a', 'b', 'c'], + ]), + [ + False, False, True, True + ], + dict(keep='last') + ] +]) +def test_distinct_row_validation(df, result, kwargs): + validator = DistinctRowValidation(**kwargs) + + # Test the internal validation that produces a Series + series = validator.validate_df(df) + assert_series_equal(series, pd.Series(result)) + + # Test the public method that returns warnings + # The number of warnings should be equal to the number of failures + warnings = validator.validate(df) + assert len(warnings) == result.count(False) + assert isinstance(warnings[0], ValidationWarning) + From 02e2a341d280e37a3804e4bfb85a340231e1b7ae Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Sat, 7 Nov 2020 15:27:41 +1100 Subject: [PATCH 31/31] Some notes about a better indexer interface --- pandas_schema/column.py | 1 + pandas_schema/core.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pandas_schema/column.py b/pandas_schema/column.py index 7d12fe7..828d6a0 100755 --- a/pandas_schema/column.py +++ b/pandas_schema/column.py @@ -22,6 +22,7 @@ def column( See :py:class:`pandas_schema.validation.IndexSeriesValidation` (Default value = False) Returns: """ + # TODO: Abolish this, and instead propagate the individual validator indexes when we And/Or them together def update_validation(validation: BaseValidation): if isinstance(validation, IndexValidation): if override or validation.index is None: diff --git a/pandas_schema/core.py 
b/pandas_schema/core.py index 7f6898a..dfd5af1 100755 --- a/pandas_schema/core.py +++ b/pandas_schema/core.py @@ -244,6 +244,7 @@ def __or__(self, other: 'BaseValidation'): 'Validations that subclass {}'.format( self.__class__)) + # TODO: Propagate the individual validator indexes when we And/Or them together return CombinedValidation(self, other, operator=operator.or_) def __and__(self, other: 'BaseValidation'): @@ -256,6 +257,7 @@ def __and__(self, other: 'BaseValidation'): 'Validations that subclass {}'.format( self.__class__)) + # TODO: Propagate the individual validator indexes when we And/Or them together return CombinedValidation(self, other, operator=operator.and_)