Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT add extend by mini-batch #22

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ API Reference

FastCan
refine
extend
ssc
ols

Expand Down
2 changes: 2 additions & 0 deletions fastcan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
The :mod:`fastcan` module implements algorithms, including
"""

from ._extend import extend
from ._fastcan import FastCan
from ._refine import refine
from ._utils import ols, ssc
Expand All @@ -11,4 +12,5 @@
"ssc",
"ols",
"refine",
"extend",
]
4 changes: 4 additions & 0 deletions fastcan/_cancorr_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ cpdef int _forward_search(

# Find max scores and update indices, X, mask, and scores
index = _iamax(n_features, &r2[0], 1)
if r2[index] == 0:
raise RuntimeError(
f"No improvement can be found when selecting the {i}th feature."
)
indices[i] = index
scores[i] = r2[index]

Expand Down
120 changes: 120 additions & 0 deletions fastcan/_extend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Extend feature selection
"""

import math
from copy import deepcopy
from numbers import Integral

import numpy as np
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.utils._param_validation import Interval, validate_params
from sklearn.utils.validation import check_is_fitted

from ._cancorr_fast import _forward_search # type: ignore
from ._fastcan import FastCan, _prepare_search


@validate_params(
    {
        "selector": [FastCan],
        "n_features_to_select": [
            Interval(Integral, 1, None, closed="left"),
        ],
        "batch_size": [
            Interval(Integral, 1, None, closed="left"),
        ],
    },
    prefer_skip_nested_validation=False,
)
def extend(selector, n_features_to_select=1, batch_size=1):
    """Extend FastCan with mini batches.

    It is suitable for selecting a very large number of features
    even larger than the number of samples.

    Similar to the correlation filter which selects each feature without considering
    the redundancy, the function selects features in mini-batch and the
    redundancy between the two mini-batches will be ignored.

    Parameters
    ----------
    selector : FastCan
        FastCan selector. It must already be fitted.

    n_features_to_select : int, default=1
        The parameter is the absolute number of features to select, i.e. the
        total size of the returned index array. It must be larger than
        ``selector.n_features_to_select`` and no larger than the number of
        features seen during fit.

    batch_size : int, default=1
        The number of features in a mini-batch. The included (prerequisite)
        indices of the selector count towards every mini-batch, so the value
        must be larger than the number of included indices.

    Returns
    -------
    indices : ndarray of shape (n_features_to_select,), dtype=int
        The indices of the selected features.

    Raises
    ------
    ValueError
        If `n_features_to_select` exceeds the number of features, if it does
        not exceed the number of features the selector has already selected,
        or if `batch_size` leaves no room for new features once the included
        indices are accounted for.

    Examples
    --------
    >>> from fastcan import FastCan, extend
    >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]]
    >>> y = [1, 0, -1, 0]
    >>> selector = FastCan(1, verbose=0).fit(X, y)
    >>> print(f"Indices: {selector.indices_}")
    Indices: [0]
    >>> indices = extend(selector, 3, batch_size=2)
    >>> print(f"Indices: {indices}")
    Indices: [0 2 1]
    """
    check_is_fitted(selector)
    n_inclusions = selector.indices_include_.size
    n_features = selector.n_features_in_
    # Number of features still to be added on top of the fitted selection.
    n_to_select = n_features_to_select - selector.n_features_to_select
    # Free slots per mini-batch once the included indices are accounted for.
    batch_size_to_select = batch_size - n_inclusions

    if n_features_to_select > n_features:
        raise ValueError(
            f"n_features_to_select {n_features_to_select} "
            f"must be <= n_features {n_features}."
        )
    if n_to_select <= 0:
        # Build ONE message string; passing the pieces as separate arguments
        # would make str(exc) render as a tuple.
        raise ValueError(
            f"The number of features to select ({n_to_select}) "
            "must be greater than 0."
        )
    if batch_size_to_select <= 0:
        raise ValueError(
            "The size of mini batch without included indices "
            f"({batch_size_to_select}) must be greater than 0."
        )

    # Work on a copy so the arrays stored on the fitted selector are untouched;
    # the forward search is given this array to operate on.
    X_transformed_ = deepcopy(selector.X_transformed_)

    indices_include = selector.indices_include_
    indices_exclude = selector.indices_exclude_
    indices_select = selector.indices_[n_inclusions:]

    n_threads = _openmp_effective_n_threads()

    n_batches = math.ceil(n_to_select / batch_size_to_select)
    for i in range(n_batches):
        if i == 0:
            # The first batch absorbs the remainder so that every later batch
            # is a full one.
            batch_size_i = (n_to_select - 1) % batch_size_to_select + 1 + n_inclusions
        else:
            batch_size_i = batch_size
        # Features selected so far are passed together with the excluded ones
        # so that later batches do not pick them again.
        indices, scores, mask = _prepare_search(
            n_features,
            batch_size_i,
            indices_include,
            np.r_[indices_exclude, indices_select],
        )
        _forward_search(
            X=X_transformed_,
            V=selector.y_transformed_,
            t=batch_size_i,
            tol=selector.tol,
            num_threads=n_threads,
            verbose=0,
            mask=mask,
            indices=indices,
            scores=scores,
        )
        # Drop the leading included indices; keep only the new picks.
        indices_select = np.r_[indices_select, indices[n_inclusions:]]
    return np.r_[indices_include, indices_select]
6 changes: 6 additions & 0 deletions fastcan/_fastcan.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ class FastCan(SelectorMixin, BaseEstimator):
When h-correlation method is used, `n_samples_` = n_samples.
When eta-cosine method is used, `n_samples_` = n_features+n_outputs.

indices_include_ : ndarray of shape (n_inclusions,), dtype=int
The indices of the prerequisite features.

indices_exclude_ : array-like of shape (n_exclusions,), dtype=int
The indices of the excluded features.

References
----------
* Zhang, S., & Lang, Z. Q. (2022).
Expand Down
2 changes: 1 addition & 1 deletion fastcan/_refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def refine(selector, drop=1, max_iter=None, verbose=1):

n_inclusions = indices_include.size
n_selections = n_features_to_select - n_inclusions
n_threads = _openmp_effective_n_threads()

if drop == "all":
drop = np.arange(1, n_selections)
Expand Down Expand Up @@ -126,7 +127,6 @@ def refine(selector, drop=1, max_iter=None, verbose=1):
rolled_indices[:-drop_n],
indices_exclude,
)
n_threads = _openmp_effective_n_threads()
_forward_search(
X=X_transformed_,
V=selector.y_transformed_,
Expand Down
86 changes: 86 additions & 0 deletions tests/test_extend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Test feature selection extend"""
import numpy as np
import pytest
from numpy.testing import (
assert_array_equal,
)
from sklearn.datasets import make_classification

from fastcan import FastCan, extend


def test_select_extend_cls():
    # Test whether extend works correctly with random samples, for a plain
    # selector, a selector with included indices, and a selector with both
    # included and excluded indices.
    n_samples = 200
    n_features = 30
    n_informative = 20
    n_classes = 8
    n_repeated = 5
    n_to_select = 18

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_repeated=n_repeated,
        n_classes=n_classes,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    n_features_to_select = 2
    selector = FastCan(n_features_to_select).fit(X, y)
    indices = extend(selector, n_to_select, batch_size=3)
    selector_inc = FastCan(n_features_to_select, indices_include=[10]).fit(X, y)
    indices_inc = extend(selector_inc, n_to_select, batch_size=3)
    selector_exc = FastCan(
        n_features_to_select, indices_include=[10], indices_exclude=[0]
    ).fit(X, y)
    indices_exc = extend(selector_exc, n_to_select, batch_size=3)

    # Every extended selection must contain n_to_select distinct features and
    # start with the features the fitted selector had already chosen.
    assert np.unique(indices).size == n_to_select
    assert_array_equal(indices[:n_features_to_select], selector.indices_)
    assert np.unique(indices_inc).size == n_to_select
    assert_array_equal(indices_inc[:n_features_to_select], selector_inc.indices_)
    assert np.unique(indices_exc).size == n_to_select
    assert_array_equal(indices_exc[:n_features_to_select], selector_exc.indices_)
    # The excluded feature must never be selected. A membership test is used
    # instead of bitwise `~`, which is only a logical "not" for NumPy booleans.
    assert 0 not in indices_exc


def test_extend_error():
    # Check that extend rejects invalid arguments with a ValueError.
    n_features = 20
    n_features_to_select = 2

    X, y = make_classification(
        n_samples=200,
        n_features=n_features,
        n_informative=10,
        n_repeated=5,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    selector = FastCan(n_features_to_select, indices_include=[0]).fit(X, y)

    # Asking for more features than exist.
    with pytest.raises(ValueError, match=r"n_features_to_select .*"):
        extend(selector, n_features+1, batch_size=3)

    # Asking for no more features than the selector already holds.
    with pytest.raises(ValueError, match=r"The number of features to select .*"):
        extend(selector, n_features_to_select, batch_size=3)

    # A mini-batch fully occupied by the included index.
    with pytest.raises(ValueError, match=r"The size of mini batch without .*"):
        extend(selector, n_features, batch_size=1)
2 changes: 1 addition & 1 deletion tests/test_refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from fastcan import FastCan, refine


def test_select_refine_random_cls():
def test_select_refine_cls():
# Test whether refine works correctly with random samples.
n_samples = 200
n_features = 20
Expand Down