From 33d3d5c1c9173a03a3fa326adde6443982695dc0 Mon Sep 17 00:00:00 2001 From: Rohan Bopardikar Date: Wed, 9 Jun 2021 13:21:03 -0700 Subject: [PATCH] Detector Param Tuning Summary: Minor changes to support metalearning for detection Reviewed By: yangbk560 Differential Revision: D28313594 fbshipit-source-id: 9e21ed88b40ed4a9153508a9548652a0638903b9 --- kats/detectors/bocpd_model.py | 100 ++++++++++++------ kats/detectors/changepoint_evaluator.py | 2 - kats/detectors/cusum_model.py | 23 +++- kats/models/metalearner/metalearner_hpt.py | 24 ++++- .../metalearner/metalearner_modelselect.py | 11 +- 5 files changed, 113 insertions(+), 47 deletions(-) diff --git a/kats/detectors/bocpd_model.py b/kats/detectors/bocpd_model.py index 7b4f39d23..d3720931c 100644 --- a/kats/detectors/bocpd_model.py +++ b/kats/detectors/bocpd_model.py @@ -9,24 +9,22 @@ algorithm as a DetectorModel, to provide a common interface. """ -from typing import Optional -import pandas as pd import json +from typing import Optional -from kats.detectors.detector import DetectorModel - +import pandas as pd from kats.consts import TimeSeriesData - from kats.detectors.bocpd import ( BOCPDetector, BOCPDModelType, ) - +from kats.detectors.detector import DetectorModel from kats.detectors.detector_consts import ( AnomalyResponse, ConfidenceBand, ) + class BocpdDetectorModel(DetectorModel): """Implements the Bayesian Online Changepoint Detection as a DetectorModel. @@ -43,16 +41,25 @@ class BocpdDetectorModel(DetectorModel): >>> anom = bocpd_detector.fit_predict(data=level_ts) """ - def __init__(self, serialized_model: Optional[bytes] = None, slow_drift: bool = False): + def __init__( + self, + serialized_model: Optional[bytes] = None, + slow_drift: bool = False, + threshold: Optional[float] = None, + ): if serialized_model is None: self.slow_drift = slow_drift + self.threshold = threshold else: model_dict = json.loads(serialized_model) - if 'slow_drift' in model_dict: - self.slow_drift = model_dict['slow_drift'] + if "slow_drift" in model_dict: + self.slow_drift = model_dict["slow_drift"] else: self.slow_drift = slow_drift - + if "threshold" in model_dict: + self.threshold = model_dict["threshold"] + else: + self.threshold = threshold def serialize(self) -> bytes: """Returns the serialzed model. @@ -64,7 +71,7 @@ def serialize(self) -> bytes: json containing information about serialized model. """ - model_dict = {'slow_drift': self.slow_drift} + model_dict = {"slow_drift": self.slow_drift} return json.dumps(model_dict).encode("utf-8") def _handle_missing_data_extend( @@ -76,10 +83,7 @@ def _handle_missing_data_extend( # but we will remove the interpolated data when we # evaluate, to make sure that the anomaly score is # the same length as data - original_time_list = ( - list(historical_data.time) - + list(data.time) - ) + original_time_list = list(historical_data.time) + list(data.time) if historical_data.is_data_missing(): historical_data = historical_data.interpolate() @@ -90,20 +94,27 @@ def _handle_missing_data_extend( # extend has been done, now remove the interpolated data data = TimeSeriesData( - pd.DataFrame({ - 'time':[ - historical_data.time.iloc[i] for i in range(len(historical_data)) - if historical_data.time.iloc[i] in original_time_list], - 'value':[ - historical_data.value.iloc[i] for i in range(len(historical_data)) - if historical_data.time.iloc[i] in original_time_list] - }), - use_unix_time=True, unix_time_units="s", tz="US/Pacific" + pd.DataFrame( + { + "time": [ + historical_data.time.iloc[i] + for i in range(len(historical_data)) + if historical_data.time.iloc[i] in original_time_list + ], + "value": [ + historical_data.value.iloc[i] + for i in range(len(historical_data)) + if historical_data.time.iloc[i] in original_time_list + ], + } + ), + use_unix_time=True, + unix_time_units="s", + tz="US/Pacific", ) return data - # pyre-fixme[14]: `fit_predict` overrides method defined in `DetectorModel` # inconsistently. def fit_predict( @@ -129,7 +140,7 @@ def fit_predict( # pyre-fixme[16]: `BocpdDetectorModel` has no attribute `last_N`. self.last_N = len(data) - #if there is historical data + # if there is historical data # we prepend it to data, and run # the detector as if we only saw data if historical_data is not None: @@ -138,20 +149,39 @@ def fit_predict( bocpd_model = BOCPDetector(data=data) if not self.slow_drift: - _ = bocpd_model.detector( - model=BOCPDModelType.NORMAL_KNOWN_MODEL, choose_priors=True, - agg_cp=True - ) + + if self.threshold is not None: + _ = bocpd_model.detector( + model=BOCPDModelType.NORMAL_KNOWN_MODEL, + choose_priors=True, + agg_cp=True, + threshold=self.threshold, + ) + else: + _ = bocpd_model.detector( + model=BOCPDModelType.NORMAL_KNOWN_MODEL, + choose_priors=True, + agg_cp=True, + ) else: - _ = bocpd_model.detector( - model=BOCPDModelType.TREND_CHANGE_MODEL, choose_priors=False, - agg_cp=True - ) + if self.threshold is not None: + _ = bocpd_model.detector( + model=BOCPDModelType.NORMAL_KNOWN_MODEL, + choose_priors=True, + agg_cp=True, + threshold=self.threshold, + ) + else: + _ = bocpd_model.detector( + model=BOCPDModelType.TREND_CHANGE_MODEL, + choose_priors=False, + agg_cp=True, + ) change_prob_dict = bocpd_model.get_change_prob() change_prob = list(change_prob_dict.values())[0] - #construct the object + # construct the object N = len(data) default_ts = TimeSeriesData(time=data.time, value=pd.Series(N * [0.0])) score_ts = TimeSeriesData(time=data.time, value=pd.Series(change_prob)) diff --git a/kats/detectors/changepoint_evaluator.py b/kats/detectors/changepoint_evaluator.py index a618de29f..d491320c4 100644 --- a/kats/detectors/changepoint_evaluator.py +++ b/kats/detectors/changepoint_evaluator.py @@ -416,8 +416,6 @@ def _parse_data(self, df_row: Any): this_ts = df_row['time_series'] this_anno = df_row['annotation'] - print(this_dataset) - this_anno_json_acc = this_anno.replace("'", "\"") this_anno_dict = json.loads(this_anno_json_acc) diff --git a/kats/detectors/cusum_model.py b/kats/detectors/cusum_model.py index 32603edd7..364729a2c 100644 --- a/kats/detectors/cusum_model.py +++ b/kats/detectors/cusum_model.py @@ -31,7 +31,7 @@ import logging from datetime import datetime from enum import Enum -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import numpy as np import pandas as pd @@ -49,7 +49,6 @@ CHANGEPOINT_RETENTION = 7 * 24 * 60 * 60 # in seconds MAX_CHANGEPOINT = 10 - def percentage_change( data: TimeSeriesData, pre_mean: float, **kwargs: Any ) -> TimeSeriesData: @@ -94,12 +93,18 @@ class CusumScoreFunction(Enum): percentage_change = "percentage_change" z_score = "z_score" - +# Score Function Constants SCORE_FUNC_DICT = { CusumScoreFunction.change.value: change, CusumScoreFunction.percentage_change.value: percentage_change, CusumScoreFunction.z_score.value: z_score, } +DEFAULT_SCORE_FUNCTION = CusumScoreFunction.change +STR_TO_SCORE_FUNC = { # Used for param tuning + "change": CusumScoreFunction.change, + "percentage_change": CusumScoreFunction.percentage_change, + "z_score": CusumScoreFunction.z_score, +} class CUSUMDetectorModel(DetectorModel): @@ -139,7 +144,7 @@ def __init__( magnitude_quantile: float = CUSUM_DEFAULT_ARGS["magnitude_quantile"], magnitude_ratio: float = CUSUM_DEFAULT_ARGS["magnitude_ratio"], change_directions: List[str] = CUSUM_DEFAULT_ARGS["change_directions"], - score_func: CusumScoreFunction = CusumScoreFunction.change, + score_func: Union[str, CusumScoreFunction] = DEFAULT_SCORE_FUNCTION, remove_seasonality: bool = CUSUM_DEFAULT_ARGS["remove_seasonality"], ): if serialized_model: @@ -178,8 +183,16 @@ def __init__( self.magnitude_quantile = magnitude_quantile self.magnitude_ratio = magnitude_ratio self.change_directions = change_directions - self.score_func = score_func.value self.remove_seasonality = remove_seasonality + + # We allow score_function to be a str for compatibility with param tuning + if isinstance(score_func, str): + if score_func in STR_TO_SCORE_FUNC: + score_func = STR_TO_SCORE_FUNC[score_func] + else: + score_func = DEFAULT_SCORE_FUNCTION + self.score_func = score_func.value + else: raise ValueError( """ diff --git a/kats/models/metalearner/metalearner_hpt.py b/kats/models/metalearner/metalearner_hpt.py index 9d5b25b5e..6ff6604b4 100644 --- a/kats/models/metalearner/metalearner_hpt.py +++ b/kats/models/metalearner/metalearner_hpt.py @@ -49,6 +49,14 @@ ], "numerical_idx": [], }, + "cusum": { + "categorical_idx": ["score_func"], + "numerical_idx": ["delta_std_ratio", "scan_window", "historical_window"], + }, + "statsig": { + "categorical_idx": [], + "numerical_idx": ["n_control", "n_test"], + } } default_model_networks = { @@ -78,6 +86,16 @@ "n_hidden_cat_combo": [[5], [5], [2], [3], [5], [5], [5]], "n_hidden_num": [], }, + "cusum": { + "n_hidden_shared": [20], + "n_hidden_cat_combo": [[3]], + "n_hidden_num": [5, 5, 5], + }, + "statsig": { + "n_hidden_shared": [20], + "n_hidden_cat_combo": [], + "n_hidden_num": [5, 5], + }, } @@ -163,7 +181,7 @@ def __init__( numerical_idx = default_model_params[default_model]["numerical_idx"] else: - msg = f"default_model={default_model} is not available! Please choose one from 'prophet', 'arima', 'sarima', 'holtwinters', stlf, 'theta'" + msg = f"default_model={default_model} is not available! Please choose one from 'prophet', 'arima', 'sarima', 'holtwinters', 'stlf', 'theta', 'cusum', 'statsig'" logging.error(msg) raise ValueError(msg) @@ -385,7 +403,7 @@ def _prepare_data( else None ) y_num = ( - torch.from_numpy(self._target_num[train_idx, :]).float() + torch.from_numpy(self._target_num[train_idx, :].astype('float')).float() if self.numerical_idx else None ) @@ -398,7 +416,7 @@ def _prepare_data( else None ) y_num_val = ( - torch.from_numpy(self._target_num[val_idx, :]).float() + torch.from_numpy(self._target_num[val_idx, :].astype('float')).float() if self.numerical_idx else None ) diff --git a/kats/models/metalearner/metalearner_modelselect.py b/kats/models/metalearner/metalearner_modelselect.py index 3b165828a..c9050382c 100644 --- a/kats/models/metalearner/metalearner_modelselect.py +++ b/kats/models/metalearner/metalearner_modelselect.py @@ -60,9 +60,16 @@ class MetaLearnModelSelect: def __init__( self, metadata: Optional[List[Dict[str, Any]]] = None, load_model: bool = False ) -> None: - if not load_model and metadata is not None: + if not load_model: + # pyre-fixme[6]: Expected `Sized` for 1st param but got + # `Optional[List[typing.Any]]`. if len(metadata) <= 30: - msg = f"metadata size should be greater than 30 but receives {len(metadata)}." + msg = "Dataset is too small to train a meta learner!" + logging.error(msg) + raise ValueError(msg) + + if metadata is None: + msg = "Missing metadata!" logging.error(msg) raise ValueError(msg)