Skip to content

Commit

Permalink
Fixing FAISS integration and performance test (#102)
Browse files Browse the repository at this point in the history
* Working FAISS KNN

* Handling custom KNN, add test for the handling custom knn and faiss

* Add docstring for knn_classifier

* Remove trailing whitespace, remove unused variables

* - Updating docstring for the knn_classifier parameter;
- Setting default value of knn_classifier to 'knn' (standard scikit-learn implementation) ;

* Adding documentation to methods

* Handle contiguous array for faiss and add predict_proba test with IH

* Fix error in predict and predict_proba, add installation guide, add
faiss test

* Add performance comparison between faiss vs sklearn

* Add performance comparison for FAISS

* Improved code quality

* Update travis for faiss

* Fix bug and remove mock

* Update travis for anaconda

* Fix minlength=2 in faiss wrapper

* Better travis by removing if for conda

* Fix anaconda travis

* Add bash install for anaconda

* Change anaconda to miniconda, fix conda to always yes

* Fix old anaconda environment handling

* Add source bashrc

* Fix travis

* Fix travis

* Fix travis

* Handle skipping faiss test

* Fix unused variable

* Fix unused variable

* Remove unused import

* Update code quality
  • Loading branch information
Natlem authored and Luiz Gustavo Hafemann committed Sep 25, 2018
1 parent ec39641 commit de1d444
Show file tree
Hide file tree
Showing 11 changed files with 176 additions and 24 deletions.
11 changes: 9 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@ language: python
python:
- "3.5"
- "3.6"
before_install:
- pip install -U pip
install:
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda create -n test_env python="$TRAVIS_PYTHON_VERSION"
- echo ". $HOME/miniconda/etc/profile.d/conda.sh" >> "$HOME/.bashrc"
- source activate test_env
- travis_wait travis_retry pip install -r requirements-dev.txt
- travis_retry pip install codecov
- travis_retry python setup.py build
- travis_retry python setup.py install
- travis_retry conda install faiss-cpu -c pytorch
script: coverage run -m py.test
after_success:
- codecov
Expand Down
17 changes: 11 additions & 6 deletions deslib/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,24 +51,29 @@ def __init__(self, pool_classifiers, k=7, DFP=False, with_IH=False, safe_k=None,
self.n_classes = None
self.n_samples = None
self.n_features = None
self.knn_class = None

if knn_classifier is None:
self.roc_algorithm = functools.partial(KNeighborsClassifier, n_jobs=-1, algorithm="auto")
self.knn_class = functools.partial(KNeighborsClassifier, n_jobs=-1, algorithm="auto")
elif isinstance(knn_classifier, str):
if knn_classifier == "faiss":
from deslib.util.faiss_knn_wrapper import FaissKNNClassifier
self.roc_algorithm = functools.partial(FaissKNNClassifier, n_jobs=-1, algorithm="auto")
try:
from deslib.util.faiss_knn_wrapper import FaissKNNClassifier
except ImportError:
raise ImportError("FAISS library needs to be manually installed, please check the Installation Guide")
self.knn_class = functools.partial(FaissKNNClassifier, n_jobs=-1, algorithm="auto")
elif knn_classifier == "knn":
self.roc_algorithm = functools.partial(KNeighborsClassifier, n_jobs=-1, algorithm="auto")
self.knn_class = functools.partial(KNeighborsClassifier, n_jobs=-1, algorithm="auto")
else:
raise ValueError('"knn_classifier" should be one of the following '
'["knn", "faiss"] or an estimator class')
elif callable(knn_classifier):
self.roc_algorithm = knn_classifier
self.knn_class = knn_classifier
else:
raise ValueError('"knn_classifier" should be one of the following '
'["knn", "faiss"] or an estimator class')

self.roc_algorithm = self.roc_algorithm(self.k)
self.roc_algorithm = self.knn_class(self.k)

# TODO: remove these as class variables
self.neighbors = None
Expand Down
3 changes: 1 addition & 2 deletions deslib/des/knop.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# License: BSD 3 clause

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from deslib.des.base import DES

Expand Down Expand Up @@ -130,7 +129,7 @@ def _fit_OP(self, X_op, y_op, k):
Number of output profiles used in the region of competence estimation.
"""
self.op_knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1, algorithm='auto')
self.op_knn = self.knn_class(k)

if self.n_classes == 2:
# Get only the scores for one class since they are complementary
Expand Down
3 changes: 1 addition & 2 deletions deslib/des/meta_des.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.validation import check_is_fitted

from deslib.des.base import DES
Expand Down Expand Up @@ -178,7 +177,7 @@ class labels of each sample in X_op.
Number of output profiles used in the estimation.
"""
self.op_knn = KNeighborsClassifier(n_neighbors=kp, n_jobs=-1, algorithm='auto')
self.op_knn = self.knn_class(kp)

if self.n_classes == 2:
# Get only the scores for one class since they are complementary
Expand Down
Binary file not shown.
72 changes: 72 additions & 0 deletions deslib/tests/performance/compare_performance_faiss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np
import faiss
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import threading
import os
import urllib.request
import gzip
import shutil

def sk_knn(Xtrain, Y, k, Xtest):
    """Time one fit + predict cycle of scikit-learn's KNeighborsClassifier.

    Parameters
    ----------
    Xtrain : array-like
        Training samples handed to ``fit``.
    Y : array-like
        Class labels for ``Xtrain``.
    k : int
        Number of neighbors.
    Xtest : array-like
        Query samples handed to ``predict``.

    Returns
    -------
    None
        The elapsed time is printed to stdout.
    """
    # time.clock() was removed in Python 3.8 and reported CPU time on Unix;
    # perf_counter() gives wall-clock time, which is what a benchmark that
    # uses several worker processes/threads needs.
    start = time.perf_counter()
    # Original author's note: 4 jobs == half of the cores of the benchmark machine.
    s_knn = KNeighborsClassifier(k, n_jobs=4)
    s_knn.fit(Xtrain, Y)
    s_knn.predict(Xtest)
    print("sklearn_knn run_time: {}".format(time.perf_counter() - start))

def faiss_knn(Xtrain, Y, k, Xtest):
    """Time building a flat L2 FAISS index and searching it.

    Parameters
    ----------
    Xtrain : numpy array
        Samples added to the index; ``Xtrain.shape[1]`` sets the index dimension.
    Y : array-like
        Unused; kept so the signature mirrors ``sk_knn``.
    k : int
        Number of neighbors to retrieve per query.
    Xtest : numpy array
        Query samples.

    Returns
    -------
    None
        The elapsed time is printed to stdout.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # wall-clock timer intended for benchmarking.
    start = time.perf_counter()
    index = faiss.IndexFlatL2(Xtrain.shape[1])
    # Both the indexed data and the queries are converted to C-contiguous
    # float32 in a single step, so the two inputs are treated identically.
    index.add(np.ascontiguousarray(Xtrain, dtype=np.float32))
    index.search(np.ascontiguousarray(Xtest, dtype=np.float32), k)
    print("faiss_knn run_time: {}".format(time.perf_counter() - start))


if __name__ == "__main__":
    # Benchmark driver: fetch the HIGGS dataset if needed, then time FAISS
    # against scikit-learn KNN for several values of k and query-set sizes.

    gz_path = '../../HIGGS.gz'
    csv_path = '../../HIGGS.csv'
    sklearn_timeout_s = 600

    if not os.path.exists(csv_path):
        print("Downloading HIGGS dataset from https://archive.ics.uci.edu/ml/datasets/HIGGS")
        if not os.path.exists(gz_path):
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
            # Stream the archive to disk instead of holding it all in memory.
            # The target is the same path the extraction step reads from
            # (the previous code wrote to a misspelled 'HIGSS.gz').
            with urllib.request.urlopen(url) as response, open(gz_path, 'wb') as f:
                shutil.copyfileobj(response, f)
        print("Finished downloading")
        print("Extracting HIGGS.gz")
        if not os.path.exists(csv_path):
            with gzip.open(gz_path, 'rb') as f:
                with open(csv_path, 'wb') as csv_out:
                    shutil.copyfileobj(f, csv_out)
        print("Extracted csv")

    df = pd.read_csv(csv_path, header=None)
    data = df.values
    # Column 0 is used as the label; the remaining columns are the features.
    X = data[:, 1:]
    Y = data[:, 0]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
    num_samples_list = [1000000]
    num_of_k_list = [1, 2, 5, 7, 10]
    num_of_test_inputs = [100, 1000]

    for nsamples in num_samples_list:
        for n_k in num_of_k_list:
            for n_t in num_of_test_inputs:
                print("running experiment: num_of_train_samples: {}, num_of_k: {}, num_of_tests: {}".format(
                    nsamples,
                    n_k,
                    n_t))
                faiss_knn(X_train[:nsamples], Y_train[:nsamples], n_k, X_test[:n_t])
                # scikit-learn runs in a helper thread so the driver can stop
                # waiting after the timeout. The thread itself is not
                # cancelled; it keeps running in the background.
                t = threading.Thread(target=sk_knn,
                                     args=(X_train[:nsamples], Y_train[:nsamples], n_k, X_test[:n_t]))
                t.start()
                t.join(timeout=sklearn_timeout_s)
                if t.is_alive():
                    # Report the timeout that was actually applied.
                    print("sklearn_knn, num_of_train_samples: {}, num_of_k: {}, num_of_tests: {}, run_time: {}".format(
                        nsamples,
                        n_k,
                        n_t,
                        "timeout after {}s".format(sklearn_timeout_s)))
21 changes: 20 additions & 1 deletion deslib/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import pytest
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import KNeighborsClassifier

from deslib.base import DS
from deslib.tests.examples_test import *

import unittest.mock

def test_all_classifiers_agree():
# 10 classifiers that return 1
Expand Down Expand Up @@ -80,6 +81,24 @@ def test_valid_selection_mode(knn_method):
with pytest.raises(ValueError):
DS(create_pool_classifiers(), knn_classifier=knn_method)

def test_import_faiss_mode():
    """DS must raise ImportError for knn_classifier="faiss" when faiss is unavailable."""
    import sys
    # Drop a previously imported wrapper so that constructing DS re-imports
    # it. pop() with a default replaces the old blanket
    # "except Exception: pass", which could hide unrelated errors.
    sys.modules.pop('deslib.util.faiss_knn_wrapper', None)
    # A None entry in sys.modules makes "import faiss" raise ImportError.
    with unittest.mock.patch.dict('sys.modules', {'faiss': None}):
        with pytest.raises(ImportError):
            DS(create_pool_classifiers(), knn_classifier="faiss")

def test_none_selection_mode():
    """knn_classifier=None must yield a scikit-learn KNeighborsClassifier."""
    selector = DS(create_pool_classifiers(), knn_classifier=None)
    region_estimator = selector.roc_algorithm
    assert isinstance(region_estimator, KNeighborsClassifier)

def test_string_selection_mode():
    """knn_classifier="knn" must yield a scikit-learn KNeighborsClassifier."""
    selector = DS(create_pool_classifiers(), knn_classifier="knn")
    region_estimator = selector.roc_algorithm
    assert isinstance(region_estimator, KNeighborsClassifier)

# In this test the system was trained for a sample containing 2 features and we are passing a sample with 3 as argument.
# So it should raise a value error.
def test_different_input_shape():
Expand Down
19 changes: 15 additions & 4 deletions deslib/tests/test_des_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
Expand Down Expand Up @@ -33,12 +32,12 @@
from deslib.static.static_selection import StaticSelection
import pytest
import warnings
import sys

knn_methods = [None, "knn", KNeighborsClassifier]
knn_methods = [None]

try:
from deslib.util.faiss_knn_wrapper import FaissKNNClassifier
knn_methods.append("faiss")
knn_methods.append(FaissKNNClassifier)
except ImportError:
warnings.warn("Not testing FAISS for KNN")
Expand Down Expand Up @@ -281,10 +280,22 @@ def test_kne_proba(knn_methods):
expected = np.load('deslib/tests/expected_values/kne_proba_integration.npy')
assert np.allclose(probas, expected)

# ------------------------------------------ Testing predict_proba -----------------------------------

@pytest.mark.skipif('faiss' not in sys.modules,
                    reason="requires the faiss library")
def test_compare_faiss_predict_proba_IH():
    """KNORA-E with the FAISS backend and instance hardness enabled must
    reproduce the stored reference probabilities."""
    pool, dsel_X, dsel_y, test_X, _ = setup_classifiers()

    knorae = KNORAE(pool, knn_classifier="faiss", with_IH=True, IH_rate=0.1)
    knorae.fit(dsel_X, dsel_y)
    predicted = knorae.predict_proba(test_X)

    reference = np.load('deslib/tests/expected_values/kne_knn_proba_integration.npy')
    assert np.allclose(predicted, reference)


@pytest.mark.parametrize('knn_methods', knn_methods)
def test_desp_proba(knn_methods):
pool_classifiers, X_dsel, y_dsel, X_test, y_test = setup_classifiers()

desp = DESP(pool_classifiers, knn_classifier=knn_methods)
desp.fit(X_dsel, y_dsel)
probas = desp.predict_proba(X_test)
Expand Down
30 changes: 30 additions & 0 deletions deslib/tests/test_faiss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

import pytest
import sys
from sklearn.neighbors import KNeighborsClassifier
from deslib.tests.examples_test import *
from deslib.tests.test_des_integration import load_dataset

try:
from deslib.util.faiss_knn_wrapper import FaissKNNClassifier
except ImportError:
pass


@pytest.mark.skipif('faiss' not in sys.modules,
                    reason="requires the faiss library")
def test_faiss_predict():
    """FaissKNNClassifier.predict must agree with scikit-learn's KNN on every sample."""
    rng = np.random.RandomState(123456)
    _, X_test, X_train, _, _, y_train = load_dataset(None, rng)
    k = 7
    # Both classifiers receive the same float32 inputs.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    f_knn_test = FaissKNNClassifier(n_neighbors=k)
    f_knn_test.fit(X_train, y_train)
    f_knn_preds = f_knn_test.predict(X_test)

    knn_test = KNeighborsClassifier(n_neighbors=k)
    knn_test.fit(X_train, y_train)
    knn_preds = knn_test.predict(X_test)

    # Compare element-wise: summing the differences would let opposite
    # mismatches (+1 and -1) cancel out and hide real disagreements.
    assert np.array_equal(f_knn_preds, knn_preds)
15 changes: 9 additions & 6 deletions deslib/util/faiss_knn_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def predict(self, X):
"""
_, idx = self.kneighbors(X, self.n_neighbors)
class_idx = self.y[idx]
preds = np.amax(class_idx, axis=1)
counts = np.apply_along_axis(lambda x: np.bincount(x, minlength=self.num_of_classes), axis=1, arr=class_idx.astype(np.int64))
preds = np.argmax(counts, axis=1)
return preds

def kneighbors(self, X, n_neighbors, return_distance=True):
Expand All @@ -70,12 +71,13 @@ def predict_proba(self, X):
"""
_, idx = self.kneighbors(X, self.n_neighbors)
class_idx = self.y[idx]
preds = np.amax(class_idx, axis=1)
counts = np.apply_along_axis(lambda x: np.bincount(x, minlength=self.num_of_classes), axis=1, arr=class_idx.astype(np.int64))
preds = np.argmax(counts, axis=1)

#FIXME: can probably be improved for a vectorized version
preds_proba = np.zeros(X.shape[0], self.num_of_classes)
for i in range(preds):
preds_proba[i] = np.bincount(class_idx[i, :]) / self.n_neighbors
#TODO: can probably be improved for a vectorized version
preds_proba = np.zeros((X.shape[0], self.num_of_classes))
for i in range(preds.shape[0]):
preds_proba[i] = counts[i] / self.n_neighbors

return preds_proba

Expand All @@ -91,6 +93,7 @@ def fit(self, X, y):
class labels of each example in X.
"""
X = np.atleast_2d(X).astype(np.float32)
X = np.ascontiguousarray(X)
self.index = faiss.IndexFlatL2(X.shape[1])
self.index.add(X)
self.y = y
Expand Down
9 changes: 8 additions & 1 deletion docs/user_guide/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,11 @@ DESlib is tested to work with Python 3.5, and 3.6. The dependency requirements a
* numpy(>=1.10.4)
* scikit-learn(>=0.19.0)

These dependencies are automatically installed using the pip commands above.
These dependencies are automatically installed using the pip commands above.

Optional dependencies
=====================

To use Faiss (Facebook AI Similarity Search), a fast implementation of KNN that can use GPUs, follow the instructions below:
https://github.com/facebookresearch/faiss/blob/master/INSTALL.md

0 comments on commit de1d444

Please sign in to comment.