Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve efficiency of sparse queries #94

Merged
merged 22 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.16)
project(pp_sketchlib)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)

if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
cmake_policy(SET CMP0104 OLD) # Can't get CUDA_ARCHITECTURES to work with NEW
Expand Down
22 changes: 22 additions & 0 deletions LICENSE_unordered_dense
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2022 Martin Leitner-Ankerl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ You can set an environment variable `SKETCHLIB_INSTALL` to affect `python setup.
- Empty: uses cmake
- `conda`: sets the library location to the conda environment, and uses `src/Makefile` (previously used in the conda-forge recipe)
- `azure`: Uses `src/Makefile`
- `local`: Uses `src/Makefile_fedora38`

johnlees marked this conversation as resolved.
Show resolved Hide resolved
### cmake
Now requires v3.19. If nvcc version is 11.0 or higher, sm8.6 with device link time optimisation will be used.
Expand Down
7 changes: 7 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def build_extension(self, ext):
env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
self.distribution.get_version())


if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)

Expand All @@ -88,6 +89,12 @@ def build_extension(self, ext):
elif target == 'azure':
subprocess.check_call(['make', 'python'], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', 'install_python', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env)
elif target == 'local':
debug = "DEBUG="
johnlees marked this conversation as resolved.
Show resolved Hide resolved
if cfg == 'Debug':
debug = "DEBUG=1"
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'python', debug], cwd=ext.sourcedir + '/src', env=env)
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'install_python', 'PYTHON_LIB_PATH=' + extdir, debug], cwd=ext.sourcedir + '/src', env=env)
else:
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
Expand Down
5 changes: 3 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
ifdef DEBUG
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
Expand All @@ -7,6 +7,7 @@ else ifdef PROFILE
CUDAFLAGS = -O2 -pg -lineinfo
else
CXXFLAGS+= -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O3
endif

UNAME_S := $(shell uname -s)
Expand All @@ -29,7 +30,7 @@ LDFLAGS+= -L$(LIBLOC)/lib
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
CUDAFLAGS +=-std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
Expand Down
142 changes: 142 additions & 0 deletions src/Makefile_fedora38
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Toolchain selection: both the C and C++ compiler variables point at gcc-11.
# NOTE(review): CXX is the `gcc` driver rather than `g++`; targets linked via
# $(LINK.cpp) therefore do not get libstdc++ added implicitly — confirm this
# is intentional for this machine-specific Makefile.
CXX=gcc-11
CC=gcc-11
# Flags common to every build mode (appended to anything set in the environment)
CFLAGS+=-Wall -Wextra -fPIC
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
# Build mode is chosen by the DEBUG / PROFILE make variables.
# `ifdef` is true only when the variable has a non-empty value, so invoking
# make with an empty `DEBUG=` falls through to the optimised branch below.
ifdef DEBUG
# Debug: no optimisation, host debug symbols, device debug info (-G)
CXXFLAGS+= -O0 -g
CUDAFLAGS = -g -G
else ifdef PROFILE
# Profile: optimised with LTO but keeping symbols; nvcc emits line info
CXXFLAGS+= -O2 -g -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS = -O2 -pg -lineinfo
else
# Release (default): tuned for the build host's CPU (-march=native), with LTO.
# This is the only branch that adds optimisation flags to CFLAGS, and the only
# one that appends to (rather than resets) CUDAFLAGS.
CXXFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
CUDAFLAGS+= -O3
endif

UNAME_S := $(shell uname -s)
LIBLOC = ${CONDA_PREFIX}
LDLIBS = -lz -lhdf5_cpp -lhdf5 -lopenblas -lgomp
ifeq ($(UNAME_S),Linux)
CXXFLAGS+= -m64
ifdef PROFILE
CXXFLAGS+= -Wl,--compress-debug-sections=none
endif
LDLIBS+= -lpthread -lgfortran -lm -ldl -lrt
LDFLAGS=-Wl,-as-needed
endif
ifeq ($(UNAME_S),Darwin)
LDLIBS+= -pthread
endif

CPPFLAGS+=-I"/home/linuxbrew/.linuxbrew/include" -I"." -I"../vendor/highfive/include" -I$(LIBLOC)/include -I$(LIBLOC)/include/eigen3
LDFLAGS+= -L$(LIBLOC)/lib -L"/home/linuxbrew/.linuxbrew/lib" -L/usr/local/cuda-12.3/lib64
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)

CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
CUDAFLAGS +=-ccbin /home/linuxbrew/.linuxbrew/bin/g++-11 -std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
ifdef GPU
CXXFLAGS += -DGPU_AVAILABLE
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
CUDA_LDFLAGS += -L/usr/local/cuda-12.3/lib64
endif

PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix)

# python specific options
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes)

PROGRAMS=sketch_test matrix_test read_test gpu_dist_test

SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o random/kmeans/KMeansRexCore.o random/kmeans/mersenneTwister2002.o
GPU_SKETCH_OBJS=gpu/gpu_api.o
CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o

# web specific options
web: CXX = em++
# optimised compile options
# NB turn exceptions back on for testing
# NB `--closure 1` can be used to reduce size of js file (this minifies variable names!)
web: CXXFLAGS = -O3 -s ASSERTIONS=1 \
-DNOEXCEPT \
-DJSON_NOEXCEPTION \
-s DISABLE_EXCEPTION_CATCHING=1 \
-fno-exceptions \
-flto --bind -s STRICT=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s USE_ZLIB=1 \
-s MODULARIZE=1 \
-s "EXPORTED_FUNCTIONS=['_malloc']" \
-s 'EXPORTED_RUNTIME_METHODS=["FS"]' \
-s EXPORT_NAME=WebSketch \
-Wall -Wextra -std=c++14
web: CPPFLAGS += -DWEB_SKETCH
web: LDFLAGS = -lnodefs.js -lworkerfs.js

WEB_OUT=web/web_sketch
WEB_OBJS=${WEB_OUT}.js ${WEB_OUT}.html ${WEB_OUT}.wasm

web: web/web_sketch.o sketch/seqio.o sketch/sketch.o sketch/countmin.o
$(LINK.cpp) $^ -o ${WEB_OUT}.js
sed -i.old '1s;^;\/* eslint-disable *\/;' ${WEB_OUT}.js

all: $(PROGRAMS)

clean:
$(RM) $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) $(WEB_OBJS) *.o *.so version.h ~* $(PROGRAMS)

install: all
install -d $(BINDIR)
install $(PROGRAMS) $(BINDIR)

sketch_test: $(SKETCH_OBJS) test/main.o
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)

matrix_test: $(SKETCH_OBJS) test/matrix_test.o
$(LINK.cpp) $^ -o $@ $(LDLIBS)

read_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/read_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

gpu_dist_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/gpu_dist_test.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)

# Generate version.h, which defines SKETCH_VERSION as the SHA1 of the
# sketching sources. The hashed files are listed as prerequisites so the header
# is regenerated whenever any of them changes; without prerequisites an
# existing version.h was never refreshed and the hash could go stale.
version.h: $(wildcard sketch/*.cpp sketch/*.hpp) gpu/sketch.cu
	cat sketch/*.cpp sketch/*.hpp gpu/sketch.cu | openssl sha1 | awk '{print "#define SKETCH_VERSION \"" $$2 "\""}' > version.h

database/database.o: version.h

web/web_sketch.o: version.h

python: $(PYTHON_LIB)

$(PYTHON_LIB): $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) sketchlib_bindings.o
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) -shared $^ device_link.o -o $(PYTHON_LIB) $(CUDA_LDLIBS)

install_python: python
install -d $(PYTHON_LIB_PATH)
install $(PYTHON_LIB) $(PYTHON_LIB_PATH)

# Compile every CUDA translation unit listed in $(CUDA_OBJS).
# Static pattern rule: gpu/<name>.cu.o is built from gpu/<name>.cu, replacing
# five copy-pasted rules. Naming the .cu file as a prerequisite means an edited
# source is recompiled; the previous rules had no prerequisites, so an existing
# object was never rebuilt. The debugging `echo` of the flag variables has been
# dropped; make already prints the fully expanded nvcc command line.
$(CUDA_OBJS): gpu/%.cu.o: gpu/%.cu
	nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c $< -o $@

.PHONY: all clean install python install_python web
54 changes: 39 additions & 15 deletions src/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <algorithm>
#include <limits>
#include <queue>

#include <H5Cpp.h>
#include <omp.h>
Expand Down Expand Up @@ -314,6 +315,20 @@ void check_sparse_inputs(const std::vector<Reference> &ref_sketches,
}
}

// Pairs a distance with the index it belongs to, so entries can be ordered
// by distance (e.g. in a std::priority_queue) without losing the index.
// Both comparisons below look at `dist` only; `j` never takes part.
struct SparseDist {
  float dist; // value used for ordering and equality
  long j;     // index carried alongside the distance
};

// Strict ordering by distance: true when lhs has the smaller dist
bool operator<(SparseDist const &lhs, SparseDist const &rhs) {
  return rhs.dist > lhs.dist;
}

// Two entries are equal when their distances are equal, whatever their index
bool operator==(SparseDist const &lhs, SparseDist const &rhs) {
  return rhs.dist == lhs.dist;
}

sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
const std::vector<size_t> &kmer_lengths,
RandomMC &random_chance, const bool jaccard,
Expand Down Expand Up @@ -344,27 +359,35 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
Eigen::MatrixXf kmer_mat = kmer2mat(kmer_lengths);
#pragma omp parallel for schedule(static) num_threads(num_threads) shared(progress)
for (size_t i = 0; i < ref_sketches.size(); i++) {
std::vector<float> row_dists(ref_sketches.size());
// Use a priority queue to efficiently track the smallest N dists
std::priority_queue<SparseDist> min_dists;
if (!interrupt) {
for (size_t j = 0; j < ref_sketches.size(); j++) {
float row_dist = std::numeric_limits<float>::infinity();
if (i != j) {
if (jaccard) {
// Need 1-J here to sort correctly
row_dists[j] = 1.0f - ref_sketches[i].jaccard_dist(
row_dist = 1.0f - ref_sketches[i].jaccard_dist(
ref_sketches[j], kmer_lengths[dist_col], random_chance);
} else {
float core, acc;
std::tie(core, acc) =
ref_sketches[i].core_acc_dist<RandomMC>(
ref_sketches[j], kmer_mat, random_chance);
if (dist_col == 0) {
row_dists[j] = core;
row_dist = core;
} else {
row_dists[j] = acc;
row_dist = acc;
}
}
} else {
row_dists[j] = std::numeric_limits<float>::infinity();
}
// Add dist if it is in the smallest k
if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
SparseDist new_min = {row_dist, j};
min_dists.push(new_min);
if (min_dists.size() > kNN) {
min_dists.pop();
}
}
johnlees marked this conversation as resolved.
Show resolved Hide resolved
if ((i * ref_sketches.size() + j) % update_every == 0) {
#pragma omp critical
Expand All @@ -376,16 +399,17 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
}
}
}
long offset = i * kNN;
std::vector<long> ordered_dists = sort_indexes(row_dists);
std::fill_n(i_vec.begin() + offset, kNN, i);
// std::copy_n(ordered_dists.begin(), kNN, j_vec.begin() + offset);

for (int k = 0; k < kNN; ++k) {
j_vec[offset + k] = ordered_dists[k];
dists[offset + k] = row_dists[ordered_dists[k]];
}
}

// For each sample/row/i, fill the ijk vectors
// This goes 'backwards' for compatibility with numpy (so dists are ascending)
long offset = i * kNN;
std::fill_n(i_vec.begin() + offset, kNN, i);
for (int k = kNN - 1; k >= 0; --k) {
SparseDist entry = min_dists.top();
j_vec[offset + k] = entry.j;
dists[offset + k] = entry.dist;
min_dists.pop();
}
}
}
Expand Down
6 changes: 2 additions & 4 deletions src/database/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
#include "hdf5_funcs.hpp"
#include "random/random_match.hpp"

#include "robin_hood.h"

// const int deflate_level = 9;

// Helper function prototypes
Expand Down Expand Up @@ -200,9 +198,9 @@ RandomMC Database::load_random(const bool use_rc_default) {
HighFive::Group random_group = _h5_file.getGroup("/random");

// Flattened hashes
robin_hood::unordered_node_map<std::string, uint16_t> cluster_table =
ankerl::unordered_dense::map<std::string, uint16_t> cluster_table =
load_hash<std::string, uint16_t>(random_group, "table");
robin_hood::unordered_node_map<size_t, NumpyMatrix> matches =
ankerl::unordered_dense::map<size_t, NumpyMatrix> matches =
load_hash<size_t, NumpyMatrix>(random_group, "matches");

// Centroid matrix
Expand Down
2 changes: 1 addition & 1 deletion src/database/database.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cstring>
#include <vector>
#include <string>
#include "robin_hood.h"
#include "unordered_dense.hpp"

#include <highfive/H5File.hpp>

Expand Down
Loading
Loading