Skip to content

Commit

Permalink
Merge branch 'master' of github.com:mwydmuch/napkinXC
Browse files Browse the repository at this point in the history
  • Loading branch information
mwydmuch committed Feb 2, 2021
2 parents a239c2c + 9a52b49 commit dc31543
Show file tree
Hide file tree
Showing 34 changed files with 708 additions and 553 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ nxc
# Docs
docs/*/*

# Experiments
/data
# Experiments and examples
data
/models*
/results*
eurlex-model

# Misc
.idea
Expand Down
4 changes: 1 addition & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

cmake_minimum_required(VERSION 3.12)
project(napkinXC
VERSION 0.4.3
VERSION 0.5.0
DESCRIPTION "Extremely simple and fast extreme multi-class and multi-label classifiers"
HOMEPAGE_URL https://github.com/mwydmuch/napkinXC
LANGUAGES C CXX)
Expand Down Expand Up @@ -42,8 +42,6 @@ configure_file(
${SRC_DIR}/version.h
)



# Gather napkinXC source files
file(GLOB SOURCES
${SRC_DIR}/*.cpp
Expand Down
43 changes: 43 additions & 0 deletions experiments/calculate_inv_ps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python

import sys
import os

napkinxc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../python")
sys.path.append(napkinxc_path)

from napkinxc.measures import *


def load_true_file(filepath):
    """Load the true labels of every example from a LIBSVM-style file.

    Each data line is expected to look like ``l1,l2,... f1:v1 f2:v2 ...``.
    Only the comma-separated labels before the first space are read; the
    features are ignored.

    :param filepath: Path to the data file.
    :return: List with one list of int labels per data line. Lines without
        labels (including blank lines) yield an empty list, so that row
        indices stay aligned with the lines of the file.
    """
    Y = []
    with open(filepath) as file:
        for i, line in enumerate(file):
            if i == 0:
                # An optional header consists of exactly three integers.
                # A data line with one label and two features also has three
                # tokens, so the token count alone is not enough to tell them
                # apart -- the tokens must be plain numbers as well.
                fields = line.split()
                if len(fields) == 3 and all(f.isdigit() for f in fields):
                    continue

            labels_field = line.strip().split(' ', 1)[0]
            # When a line has no labels the first token is a "index:value"
            # feature (skipped by the ':' check) or an empty string.
            Y.append([int(y) for y in labels_field.split(',') if y and ':' not in y])
    return Y


if __name__ == "__main__":
    # Usage: calculate_inv_ps.py <true_file> <output_file>
    # Writes one value returned by inverse_propensity per line of <output_file>.
    if len(sys.argv) < 3:
        print("Requires true file and output as arguments!")
        # sys.exit instead of the bare exit(), which is only injected by the
        # site module and is not guaranteed to exist in every interpreter setup.
        sys.exit(1)

    true_file = sys.argv[1]
    output_file = sys.argv[2]
    true = load_true_file(true_file)

    # Parameters passed to inverse_propensity; these values are used unless
    # the dataset is recognized below.
    A = 0.55
    B = 1.5

    # Dataset-specific parameters, selected by the directory name in the path.
    if '/wikiLSHTC/' in true_file:
        A = 0.5
        B = 0.4
    elif '/amazon/' in true_file:
        A = 0.6
        B = 2.6

    inv_ps = inverse_propensity(true, A=A, B=B)
    with open(output_file, "w") as out:
        out.writelines("{}\n".format(ip) for ip in inv_ps)
79 changes: 79 additions & 0 deletions experiments/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python

import sys
import os

napkinxc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../python")
sys.path.append(napkinxc_path)

from napkinxc.measures import *


def load_true_file(filepath):
    """Load the true labels of every example from a LIBSVM-style file.

    Each data line is expected to look like ``l1,l2,... f1:v1 f2:v2 ...``.
    Only the comma-separated labels before the first space are read; the
    features are ignored.

    :param filepath: Path to the data file.
    :return: List with one list of int labels per data line. Lines without
        labels (including blank lines) yield an empty list, so that row
        indices stay aligned with the lines of the file.
    """
    Y = []
    with open(filepath) as file:
        for i, line in enumerate(file):
            if i == 0:
                # An optional header consists of exactly three integers.
                # A data line with one label and two features also has three
                # tokens, so the token count alone is not enough to tell them
                # apart -- the tokens must be plain numbers as well.
                fields = line.split()
                if len(fields) == 3 and all(f.isdigit() for f in fields):
                    continue

            labels_field = line.strip().split(' ', 1)[0]
            # When a line has no labels the first token is a "index:value"
            # feature (skipped by the ':' check) or an empty string.
            Y.append([int(y) for y in labels_field.split(',') if y and ':' not in y])
    return Y


def load_pred_file(filepath):
    """Load predictions from a file with one example per line.

    Every line holds whitespace-separated predictions, each either a bare
    label (``12``) or a label with a score (``12:0.75``).

    :param filepath: Path to the prediction file.
    :return: List with one list per line; its items are ints for bare labels
        and ``(int, float)`` tuples for labels with scores. A blank line
        yields an empty list, so rows stay aligned with the lines of the file.
    """

    def convert_y(y):
        label, sep, score = y.partition(':')
        if sep:
            return (int(label), float(score))
        # Bare label without a score. Convert the label string itself, not
        # the list produced by splitting on ':'.
        return int(label)

    Y = []
    with open(filepath) as file:
        for line in file:
            # split() without arguments drops the trailing newline and returns
            # no tokens for a blank line (an example with no predictions).
            Y.append([convert_y(y) for y in line.split()])
    return Y


def load_inv_ps_file(filepath):
    """Load a vector of floats stored one value per line.

    :param filepath: Path to the file with one float per line.
    :return: List of floats in file order.
    :raises ValueError: If a line other than trailing blank lines cannot be
        parsed as a float.
    """
    with open(filepath) as file:
        lines = [line.strip() for line in file]

    # Tolerate blank lines at the end of the file only. A blank line in the
    # middle would shift the positions of all following values, so it is
    # still reported as an error by float().
    while lines and not lines[-1]:
        lines.pop()

    return [float(line) for line in lines]


if __name__ == "__main__":
    # Usage: evaluate.py <true_file> <pred_file> [<inv_ps_file>]
    # Prints each measure at places 1..max_k, one "NAME@k: value" line each.
    if len(sys.argv) < 3:
        print("Requires true and prediction files as arguments!")
        # sys.exit instead of the bare exit(), which is only injected by the
        # site module and is not guaranteed to exist in every interpreter setup.
        sys.exit(1)

    true = load_true_file(sys.argv[1])
    pred = load_pred_file(sys.argv[2])

    inv_ps = None
    if len(sys.argv) > 3:
        inv_ps = load_inv_ps_file(sys.argv[3])

    max_k = 10

    # (printed name, measure function, extra keyword arguments), in print order.
    reports = [
        ("P", precision_at_k, {}),
        ("R", recall_at_k, {}),
        ("C", coverage_at_k, {}),
        ("nDCG", ndcg_at_k, {}),
    ]
    # PSP is reported only when the inv_ps file was given.
    if inv_ps is not None:
        reports.append(("PSP", psprecision_at_k, {"inv_ps": inv_ps}))

    for name, measure, extra_args in reports:
        r = measure(true, pred, k=max_k, **extra_args)
        for k in range(max_k):
            print("{}@{}: {}".format(name, k + 1, r[k]))
14 changes: 7 additions & 7 deletions experiments/remap_libsvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ def load_libsvm(file):
Y = []
with open(file) as f:
for row in f:
yi, xi = row.split(' ', 1)
X.append(xi)
if len(yi):
Y.append(yi.split(','))
y, x = row.split(' ', 1)
X.append(x)
if len(y):
Y.append(y.split(','))
else:
Y.append([])

Expand All @@ -23,10 +23,10 @@ def load_libsvm(file):

def save_libsvm(X, Y, file):
with open(file, "w") as f:
for xi, yi in zip(X, Y):
f.write(','.join([str(y) for y in sorted(yi)]))
for x, y in zip(X, Y):
f.write(','.join([str(y_i) for y_i in sorted(y)]))
f.write(' ')
f.write(xi)
f.write(x)


def remap_files(files, mapping):
Expand Down
48 changes: 44 additions & 4 deletions experiments/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ elif [[ -e "${DATASET_FILE}_train.txt" ]]; then
elif [[ -e "${DATASET_FILE}.train" ]]; then
TRAIN_FILE="${DATASET_FILE}.train"
TEST_FILE="${DATASET_FILE}.test"
elif [[ -e "${DATASET_FILE}_train.svm" ]]; then
TRAIN_FILE="${DATASET_FILE}_train.svm"
TEST_FILE="${DATASET_FILE}_test.svm"
elif [[ -e "${DATASET_FILE}_train.libsvm" ]]; then
TRAIN_FILE="${DATASET_FILE}_train.libsvm"
TEST_FILE="${DATASET_FILE}_test.libsvm"
fi

# Build nxc
Expand All @@ -65,12 +65,23 @@ if [[ ! -e ${ROOT_DIR}/nxc ]]; then
cd ${ROOT_DIR}/experiments
fi

# Calculate inverse propensity
INV_PS_FILE="${DATASET_FILE}.inv_ps"
if [[ ! -e $INV_PS_FILE ]]; then
python3 ${SCRIPT_DIR}/calculate_inv_ps.py $TRAIN_FILE $INV_PS_FILE
fi

# Train model
TRAIN_RESULT_FILE=${MODEL}/train_results
TRAIN_LOCK_FILE=${MODEL}/.train_lock
if [[ ! -e $MODEL ]] || [[ -e $TRAIN_LOCK_FILE ]]; then
mkdir -p $MODEL
touch $TRAIN_LOCK_FILE

if [[ $TRAIN_ARGS == *"--labelsWeights"* ]]; then
TRAIN_ARGS="${TRAIN_ARGS} --labelsWeights ${INV_PS_FILE}"
fi

(time ${ROOT_DIR}/nxc train -i $TRAIN_FILE -o $MODEL $TRAIN_ARGS | tee $TRAIN_RESULT_FILE)
echo
echo "Train date: $(date)" | tee -a $TRAIN_RESULT_FILE
Expand All @@ -86,7 +97,36 @@ if [[ ! -e $TEST_RESULT_FILE ]] || [[ -e $TEST_LOCK_FILE ]]; then
if [ -e $TRAIN_RESULT_FILE ]; then
cat $TRAIN_RESULT_FILE > $TEST_RESULT_FILE
fi
(time ${ROOT_DIR}/nxc test -i $TEST_FILE -o $MODEL $TEST_ARGS | tee -a $TEST_RESULT_FILE)
#(time ${ROOT_DIR}/nxc test -i $TEST_FILE -o $MODEL $TEST_ARGS | tee -a $TEST_RESULT_FILE)

if [[ $TEST_ARGS == *"--labelsWeights"* ]]; then
TEST_ARGS="${TEST_ARGS} --labelsWeights ${INV_PS_FILE}"
fi

PRED_FILE=${MODEL}/test_pred_$(echo "${TEST_ARGS}" | tr " /" "__")
PRED_LOCK_FILE=${MODEL}/.test_pred_lock_$(echo "${TEST_ARGS}" | tr " /" "__")
if [[ ! -e $PRED_FILE ]] || [[ -e $PRED_LOCK_FILE ]]; then
touch $PRED_LOCK_FILE
${ROOT_DIR}/nxc predict -i $TEST_FILE -o $MODEL $TEST_ARGS > $PRED_FILE
rm -rf $PRED_LOCK_FILE
fi

echo "Test file results:" | tee -a $TEST_RESULT_FILE
python3 ${SCRIPT_DIR}/evaluate.py $TEST_FILE $PRED_FILE $INV_PS_FILE | tee -a $TEST_RESULT_FILE

TEST_ON_TRAIN=0
if [[ "${TEST_ON_TRAIN}" != "0" ]]; then
PRED_FILE=${MODEL}/train_pred_$(echo "${TEST_ARGS}" | tr " /" "__")
PRED_LOCK_FILE=${MODEL}/.train_pred_lock_$(echo "${TEST_ARGS}" | tr " /" "__")
if [[ ! -e $PRED_FILE ]] || [[ -e $PRED_LOCK_FILE ]]; then
touch $PRED_LOCK_FILE
${ROOT_DIR}/nxc predict -i $TRAIN_FILE -o $MODEL $TEST_ARGS > $PRED_FILE
rm -rf $PRED_LOCK_FILE
fi

echo "Train results file:" | tee -a $TEST_RESULT_FILE
python3 ${SCRIPT_DIR}/evaluate.py $TEST_FILE $PRED_FILE $INV_PS_FILE | tee -a $TEST_RESULT_FILE
fi

echo
echo "Model file size: $(du -ch ${MODEL} | tail -n 1 | grep -E '[0-9\.,]+[BMG]' -o)" | tee -a $TEST_RESULT_FILE
Expand Down
Empty file added experiments/utils.py
Empty file.
38 changes: 13 additions & 25 deletions python/napkinxc/_napkinxc/_napkinxc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,17 @@ class CPPModel {
}

void unload(){
if(model->isLoaded()) model->unload();
if(model != nullptr && model->isLoaded()) model->unload();
}

void setThresholds(std::vector<double> thresholds){
load();
model->setThresholds(thresholds);
}

void setLabelsWeights(std::vector<double> weights){
load();
model->setLabelsWeights(weights);
}

std::vector<std::vector<int>> predict(py::object inputFeatures, int featuresDataType, int topK, double threshold){
Expand All @@ -209,28 +219,6 @@ class CPPModel {
return pred;
}

std::vector<std::vector<int>> predictWithThresholds(py::object inputFeatures, int featuresDataType, int topK, std::vector<double> thresholds){
auto predWithProba = predictProbaWithThresholds(inputFeatures, featuresDataType, topK, thresholds);
return dropProbaHelper(predWithProba);
}

std::vector<std::vector<std::pair<int, double>>> predictProbaWithThresholds(py::object inputFeatures, int featuresDataType, int topK, std::vector<double> thresholds){
std::vector<std::vector<Prediction>> pred;
runAsInterruptable([&] {
load();
SRMatrix<Feature> features;
readFeatureMatrix(features, inputFeatures, (InputDataType)featuresDataType);
args.printArgs("predict");

args.topK = topK;
model->setThresholds(thresholds);
pred = model->predictBatchWithThresholds(features, args);
});

// This is only safe because it's struct with two fields casted to pair, don't do this with tuples!
return reinterpret_cast<std::vector<std::vector<std::pair<int, double>>>&>(pred);
}

std::vector<double> ofo(py::object inputFeatures, py::object inputLabels, int featuresDataType, int labelsDataType) {
std::vector<double> thresholds;
runAsInterruptable([&] {
Expand Down Expand Up @@ -501,12 +489,12 @@ PYBIND11_MODULE(_napkinxc, n) {
.def("fit_on_file", &CPPModel::fitOnFile)
.def("load", &CPPModel::load)
.def("unload", &CPPModel::unload)
.def("set_thresholds", &CPPModel::setThresholds)
.def("set_labels_weights", &CPPModel::setLabelsWeights)
.def("predict", &CPPModel::predict)
.def("predict_proba", &CPPModel::predictProba)
.def("predict_for_file", &CPPModel::predictForFile)
.def("predict_proba_for_file", &CPPModel::predictProbaForFile)
.def("predict_with_thresholds", &CPPModel::predictWithThresholds)
.def("predict_proba_with_thresholds", &CPPModel::predictProbaWithThresholds)
.def("ofo", &CPPModel::ofo)
.def("test", &CPPModel::test)
.def("test_on_file", &CPPModel::testOnFile)
Expand Down
31 changes: 23 additions & 8 deletions python/napkinxc/measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,11 @@ def inverse_propensity(Y, A=0.55, B=1.5):
:return: ndarray with propensity scores for each label
"""
if isinstance(Y, np.ndarray) or isinstance(Y, csr_matrix):
m = Y.shape[0]
n, m = Y.shape
freqs = np.sum(Y, axis=0)

elif all((isinstance(y, list) or isinstance(y, tuple)) for y in Y):
n = len(Y)
m = max([max(y) for y in Y if len(y)])
freqs = np.zeros(m + 1)
for y in Y:
Expand All @@ -240,8 +241,8 @@ def inverse_propensity(Y, A=0.55, B=1.5):
else:
raise TypeError("Unsupported data type, should be Numpy matrix, Scipy sparse matrix or list of list of ints")

C = (log(m) - 1.0) * (B + 1) ** A
inv_ps = 1.0 + C * (freqs + B) ** -A
C = (log(n) - 1) * (B + 1) ** A
inv_ps = 1 + C * (freqs + B) ** -A
return inv_ps


Expand All @@ -255,8 +256,8 @@ def psprecision_at_k(Y_true, Y_pred, inv_ps, k=5):
Predicted labels provided as a matrix with scores or list of rankings as a list of labels or tuples of labels with scores (idx, score).
In the case of the matrix, the ranking will be calculated by sorting scores in descending order.
:type Y_pred: ndarray, csr_matrix, list[list[int|str]], list[list[tuple[int|str, float]]]
:param inv_ps: Propensity scores for each label.
:type inv_ps: ndarray, list
:param inv_ps: Propensity scores for each label. For text labels, this needs to be a dict.
:type inv_ps: ndarray, list, dict
:param k: Calculate at places from 1 to k, defaults to 5
:type k: int, optional
:return: ndarray with values of PSP at 1-k places.
Expand All @@ -265,13 +266,27 @@ def psprecision_at_k(Y_true, Y_pred, inv_ps, k=5):
Y_true = _get_Y_iterator(Y_true)
Y_pred = _get_Y_iterator(Y_pred, ranking=True)

if not isinstance(inv_ps, np.ndarray):
def _get_top_ps_dict(t):
return np.array(sorted([inv_ps.get(t_i, 0) for t_i in t], reverse=True))

def _get_top_ps_np(t):
t = [t_i for t_i in t if t_i < inv_ps.shape[0]]
return -np.sort(-inv_ps[t])

if isinstance(inv_ps, dict):
_get_top_ps = _get_top_ps_dict
elif isinstance(inv_ps, list):
inv_ps = np.array(inv_ps)
_get_top_ps = _get_top_ps_np
elif isinstance(inv_ps, np.ndarray):
_get_top_ps = _get_top_ps_np
else:
raise TypeError("Unsupported data type for inv_ps, should be Numpy vector (1d array), or list, or dict")

sum = np.zeros(k)
best_sum = np.zeros(k)
for t, p in zip(Y_true, Y_pred):
top_ps = -np.sort(-inv_ps[t])
top_ps = _get_top_ps(t)
psp_at_i = 0
best_psp_at_i = 0
for i, p_i in enumerate(p[:k]):
Expand Down Expand Up @@ -402,4 +417,4 @@ def _get_Y_iterator(Y, ranking=False):
return _Y_list_iterator(Y)

else:
raise TypeError("Unsupported data type, should be Numpy matrix, Scipy sparse matrix or list of list of ints")
raise TypeError("Unsupported data type, should be Numpy matrix (2d array), or Scipy CSR matrix, or list of list of ints")
Loading

0 comments on commit dc31543

Please sign in to comment.