diff --git a/lookout/style/typos/config.py b/lookout/style/typos/config.py index 9d9316a07..7fe922444 100644 --- a/lookout/style/typos/config.py +++ b/lookout/style/typos/config.py @@ -50,10 +50,11 @@ "chunksize": 256, }, "ranking": { - "train_rounds": 4000, - "early_stopping": 200, + "train_rounds": 1000, + "early_stopping": 100, + "verbose_eval": False, "boost_param": { - "max_depth": 6, + "max_depth": 5, "eta": 0.03, "min_child_weight": 2, "silent": 1, diff --git a/lookout/style/typos/ranking.py b/lookout/style/typos/ranking.py index db3e661de..a4f06cfae 100644 --- a/lookout/style/typos/ranking.py +++ b/lookout/style/typos/ranking.py @@ -37,6 +37,7 @@ def __init__(self, config: Optional[Mapping[str, Any]] = None, **kwargs): :param kwargs: Extra keyword arguments which are consumed by Model. """ super().__init__(**kwargs) + self.config = DEFAULT_CORRECTOR_CONFIG["ranking"] self.set_config(config) self.bst = None # type: xgb.Booster @@ -51,7 +52,7 @@ def set_config(self, config: Optional[Mapping[str, Any]] = None) -> None: """ if config is None: config = {} - self.config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["ranking"], config) + self.config = merge_dicts(self.config, config) def fit(self, identifiers: pandas.Series, candidates: pandas.DataFrame, features: numpy.ndarray, val_part: float = 0.1) -> None: @@ -70,15 +71,23 @@ def fit(self, identifiers: pandas.Series, candidates: pandas.DataFrame, self._log.info("candidates shape %s", candidates.shape) self._log.info("features shape %s", features.shape) labels = self._create_labels(identifiers, candidates) - edge = int(features.shape[0] * (1 - val_part)) - data_train = xgb.DMatrix(features[:edge, :], label=labels[:edge]) - data_val = xgb.DMatrix(features[edge:, :], label=labels[edge:]) + all_tokens = numpy.array(list(set(candidates[Columns.Token]))) + indices = numpy.zeros(len(all_tokens), dtype=bool) + indices[numpy.random.choice(len(all_tokens), + int((1 - val_part) * len(all_tokens)), + replace=False)] = True + train_token = {all_tokens[i]: indices[i] for i in range(len(all_tokens))} + in_train = numpy.array( + [train_token[row[Columns.Token]] for _, row in candidates.iterrows()], dtype=bool) + data_train = xgb.DMatrix(features[in_train], label=labels[in_train]) + data_val = xgb.DMatrix(features[~in_train], label=labels[~in_train]) self.config["boost_param"]["scale_pos_weight"] = float( - 1.0 * (edge - numpy.sum(labels[:edge])) / numpy.sum(labels[:edge])) + 1.0 * (numpy.sum(in_train) - numpy.sum(labels[in_train])) / numpy.sum( + labels[in_train])) evallist = [(data_train, "train"), (data_val, "validation")] self.bst = xgb.train(self.config["boost_param"], data_train, self.config["train_rounds"], evallist, early_stopping_rounds=self.config["early_stopping"], - verbose_eval=False) + verbose_eval=self.config["verbose_eval"]) self._log.debug("successfully fitted") def rank(self, candidates: pandas.DataFrame, features: numpy.ndarray, n_candidates: int = 3,