From e10fca3c9ca6e9de049a216fca93094b6e164a3c Mon Sep 17 00:00:00 2001 From: Igor Date: Mon, 2 Nov 2020 19:36:01 +0000 Subject: [PATCH 1/2] Allow typecasting selected value arrays --- .../test_pandas_feature_selector.py | 19 +++++++++++++++--- timeserio/preprocessing/pandas.py | 20 +++++++++++++------ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/tests/test_preprocessing/test_pandas_feature_selector.py b/tests/test_preprocessing/test_pandas_feature_selector.py index d6195e0..3504ae2 100644 --- a/tests/test_preprocessing/test_pandas_feature_selector.py +++ b/tests/test_preprocessing/test_pandas_feature_selector.py @@ -5,13 +5,13 @@ import timeserio.ini as ini from timeserio.data.mock import mock_fit_data from timeserio.preprocessing import ( - PandasColumnSelector, PandasValueSelector, - PandasIndexValueSelector, PandasSequenceSplitter + PandasColumnSelector, PandasValueSelector, PandasIndexValueSelector, + PandasSequenceSplitter ) - datetime_column = ini.Columns.datetime usage_column = ini.Columns.target +id_column = ini.Columns.id @pytest.fixture @@ -66,6 +66,12 @@ def test_value_selector(df, columns, shape1): assert subarray.shape == expected_shape +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +def test_value_selector_dtype(df, dtype): + subarray = PandasValueSelector(columns="id", dtype=dtype).transform(df) + assert subarray.dtype == dtype + + @pytest.mark.parametrize( 'levels, shape1', [ (None, 0), @@ -83,6 +89,13 @@ def test_index_value_selector(indexed_df, levels, shape1): assert subarray.shape == expected_shape +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +def test_index_value_selector_dtype(indexed_df, dtype): + subarray = PandasIndexValueSelector(levels="id", + dtype=dtype).transform(indexed_df) + assert subarray.dtype == dtype + + @pytest.mark.parametrize( 'transformer, required_columns', [ (PandasColumnSelector('col1'), {'col1'}), diff --git a/timeserio/preprocessing/pandas.py b/timeserio/preprocessing/pandas.py index 
e9fd786..357cd00 100644 --- a/timeserio/preprocessing/pandas.py +++ b/timeserio/preprocessing/pandas.py @@ -50,7 +50,6 @@ def _join_multilevel_dataframes(df_list): class PandasColumnSelector(BaseEstimator, TransformerMixin): """Select a sub-set of columns from a pandas DataFrame.""" - def __init__(self, columns=None): self.columns = columns @@ -82,10 +81,13 @@ def _get_column_as_tensor(s: pd.Series): class PandasValueSelector(BaseEstimator, TransformerMixin): - """Select scalar - or vector-valued feature cols, and return np.array.""" + """Select scalar - or vector-valued feature cols, and return np.array. - def __init__(self, columns=None): + Optionally, cast the resulting arry to dtype. + """ + def __init__(self, columns=None, dtype=None): self.columns = columns + self.dtype = dtype def fit(self, df, y=None, **fit_params): return self @@ -98,6 +100,8 @@ def transform(self, df): else: # support a mix of compatible tensors and regular columns blocks = [_get_column_as_tensor(df[col]) for col in columns] subarray = np.hstack(blocks) + if self.dtype: + subarray = subarray.astype(self.dtype) return subarray @property @@ -112,10 +116,13 @@ def transformed_columns(self, input_columns): class PandasIndexValueSelector(BaseEstimator, TransformerMixin): - """Select index levels as feature cols, and return np.array.""" + """Select index levels as feature cols, and return np.array. - def __init__(self, levels=None): + Optionally, cast the resulting arry to dtype. 
+ """ + def __init__(self, levels=None, dtype=None): self.levels = levels + self.dtype = dtype def fit(self, df, y=None, **fit_params): return self @@ -133,12 +140,13 @@ def transform(self, df): for level in levels ] subarray = np.hstack(blocks) if blocks else np.empty((len(df), 0)) + if self.dtype: + subarray = subarray.astype(self.dtype) return subarray class PandasSequenceSplitter(BaseEstimator, TransformerMixin): """Split sequence columns in two.""" - def __init__(self, columns=None, index=0): self.columns = columns self.index = index From 0c983a76a9651c93de0b19dff735e7d3f6895f2f Mon Sep 17 00:00:00 2001 From: Igor Date: Mon, 2 Nov 2020 19:52:41 +0000 Subject: [PATCH 2/2] Fix docstring format --- timeserio/preprocessing/pandas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/timeserio/preprocessing/pandas.py b/timeserio/preprocessing/pandas.py index 357cd00..f7eab49 100644 --- a/timeserio/preprocessing/pandas.py +++ b/timeserio/preprocessing/pandas.py @@ -50,6 +50,7 @@ def _join_multilevel_dataframes(df_list): class PandasColumnSelector(BaseEstimator, TransformerMixin): """Select a sub-set of columns from a pandas DataFrame.""" + def __init__(self, columns=None): self.columns = columns @@ -85,6 +86,7 @@ class PandasValueSelector(BaseEstimator, TransformerMixin): Optionally, cast the resulting arry to dtype. """ + def __init__(self, columns=None, dtype=None): self.columns = columns self.dtype = dtype @@ -120,6 +122,7 @@ class PandasIndexValueSelector(BaseEstimator, TransformerMixin): Optionally, cast the resulting arry to dtype. """ + def __init__(self, levels=None, dtype=None): self.levels = levels self.dtype = dtype @@ -147,6 +150,7 @@ def transform(self, df): class PandasSequenceSplitter(BaseEstimator, TransformerMixin): """Split sequence columns in two.""" + def __init__(self, columns=None, index=0): self.columns = columns self.index = index