Skip to content

Commit

Permalink
Merge pull request #8 from koide3/cuda12.5
Browse files Browse the repository at this point in the history
cuda 12.5
  • Loading branch information
koide3 authored Jul 22, 2024
2 parents 97563dc + bb03636 commit b3a7d04
Show file tree
Hide file tree
Showing 13 changed files with 127 additions and 83 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ jobs:
DISTRO: [
{ "ubuntu": "noble", "suffix": "gcc" },
{ "ubuntu": "noble", "suffix": "llvm" },
{ "ubuntu": "noble_cuda12.5", "suffix": "gcc.cuda" },
{ "ubuntu": "noble_cuda12.5", "suffix": "llvm.cuda" },
{ "ubuntu": "jammy", "suffix": "gcc" },
{ "ubuntu": "jammy", "suffix": "llvm" },
{ "ubuntu": "jammy_cuda12.2", "suffix": "gcc.cuda" },
{ "ubuntu": "jammy_cuda12.2", "suffix": "llvm.cuda" },
{ "ubuntu": "jammy_cuda12.5", "suffix": "gcc.cuda" },
{ "ubuntu": "focal", "suffix": "focal" },
{ "ubuntu": "focal_cuda12.2", "suffix": "focal.cuda" },
{ "ubuntu": "focal_cuda12.5", "suffix": "focal.cuda" },
]

steps:
Expand Down
3 changes: 2 additions & 1 deletion include/gtsam_points/cuda/kernels/lookup_voxels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <Eigen/Core>
#include <Eigen/Geometry>

#include <thrust/pair.h>
#include <thrust/count.h>
#include <thrust/device_new.h>
#include <thrust/device_vector.h>
Expand Down Expand Up @@ -86,7 +87,7 @@ struct lookup_voxels_kernel {
thrust::device_ptr<const Eigen::Isometry3f> x_ptr;

thrust::device_ptr<const VoxelMapInfo> voxelmap_info_ptr;
thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>> buckets_ptr;
thrust::device_ptr<const VoxelBucket> buckets_ptr;

thrust::device_ptr<const Eigen::Vector3f> points_ptr;
thrust::device_ptr<const Eigen::Vector3f> normals_ptr;
Expand Down
2 changes: 1 addition & 1 deletion include/gtsam_points/cuda/kernels/vector3_hash.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ inline __host__ __device__ Eigen::Vector3i calc_voxel_coord(const Eigen::Vector3
inline __host__ __device__ int lookup_voxel(
const int max_bucket_scan_count,
const int num_buckets,
const thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>>& buckets_ptr,
const thrust::device_ptr<const VoxelBucket>& buckets_ptr,
const float resolution,
const Eigen::Vector3f& x) {
Eigen::Vector3i coord = calc_voxel_coord(x, resolution);
Expand Down
40 changes: 26 additions & 14 deletions include/gtsam_points/cuda/nonlinear_factor_set_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <memory>
#include <vector>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <boost/utility/in_place_factory.hpp>
#include <boost/utility/typed_in_place_factory.hpp>

#include <gtsam/nonlinear/NonlinearFactorGraph.h>
#include <gtsam_points/factors/nonlinear_factor_gpu.hpp>
#include <gtsam_points/optimizers/linearization_hook.hpp>
Expand Down Expand Up @@ -88,6 +82,24 @@ class NonlinearFactorSetGPU : public NonlinearFactorSet {
*/
std::vector<gtsam::GaussianFactor::shared_ptr> calc_linear_factors(const gtsam::Values& linearization_point) override;

private:
/// @brief Simple buffer class for device memory
struct DeviceBuffer {
DeviceBuffer();
~DeviceBuffer();

DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete;

void resize(size_t size, CUstream_st* stream);
unsigned char* data() { return buffer; }
const unsigned char* data() const { return buffer; }

size_t size;
unsigned char* buffer;
};


private:
CUstream_st* stream;

Expand All @@ -96,15 +108,15 @@ class NonlinearFactorSetGPU : public NonlinearFactorSet {

std::vector<boost::shared_ptr<NonlinearFactorGPU>> factors;

thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_input_buffer_cpu;
thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_output_buffer_cpu;
thrust::device_vector<unsigned char> linearization_input_buffer_gpu;
thrust::device_vector<unsigned char> linearization_output_buffer_gpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_input_buffer_cpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_output_buffer_cpu;
std::unique_ptr<DeviceBuffer> linearization_input_buffer_gpu;
std::unique_ptr<DeviceBuffer> linearization_output_buffer_gpu;

thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_input_buffer_cpu;
thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_output_buffer_cpu;
thrust::device_vector<unsigned char> evaluation_input_buffer_gpu;
thrust::device_vector<unsigned char> evaluation_output_buffer_gpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_input_buffer_cpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_output_buffer_cpu;
std::unique_ptr<DeviceBuffer> evaluation_input_buffer_gpu;
std::unique_ptr<DeviceBuffer> evaluation_output_buffer_gpu;
};

} // namespace gtsam_points
24 changes: 12 additions & 12 deletions include/gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,6 @@

#include <gtsam_points/cuda/stream_roundrobin.hpp>

// forward declaration
namespace thrust {

template <typename T>
class device_allocator;

template <typename T, typename Alloc>
class device_vector;

} // namespace thrust

namespace gtsam_points {

/**
Expand All @@ -29,6 +18,17 @@ namespace gtsam_points {
*/
class TempBufferManager {
public:
struct Buffer {
Buffer(size_t size);
~Buffer();

Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;

size_t size;
char* buffer;
};

using Ptr = std::shared_ptr<TempBufferManager>;

TempBufferManager(size_t init_buffer_size = 0);
Expand All @@ -40,7 +40,7 @@ class TempBufferManager {
void clear_all();

private:
std::vector<std::shared_ptr<thrust::device_vector<char, thrust::device_allocator<char>>>> buffers;
std::vector<std::shared_ptr<Buffer>> buffers;
};

/**
Expand Down
10 changes: 5 additions & 5 deletions include/gtsam_points/factors/integrated_vgicp_factor_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,17 @@ class IntegratedVGICPFactorGPU : public gtsam_points::NonlinearFactorGPU {
virtual void set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) override;
virtual void issue_linearize(
const void* lin_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<void>& lin_output_gpu) override;
const void* lin_input_gpu,
void* lin_output_gpu) override;
virtual void store_linearized(const void* lin_output_cpu) override;

virtual void set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) override;
virtual void issue_compute_error(
const void* lin_input_cpu,
const void* eval_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<const void>& eval_input_gpu,
const thrust::device_ptr<void>& eval_output_gpu) override;
const void* lin_input_gpu,
const void* eval_input_gpu,
void* eval_output_gpu) override;
virtual void store_computed_error(const void* eval_output_cpu) override;

virtual void sync() override;
Expand Down
14 changes: 4 additions & 10 deletions include/gtsam_points/factors/nonlinear_factor_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@
#include <boost/utility/typed_in_place_factory.hpp>
#include <gtsam/nonlinear/NonlinearFactor.h>

namespace thrust {

template <typename T>
class device_ptr;
}

namespace gtsam_points {

/**
Expand Down Expand Up @@ -83,7 +77,7 @@ class NonlinearFactorGPU : public gtsam::NonlinearFactor {
* @param lin_output_gpu Output data destination on the GPU memory (size == linearization_output_size)
*/
virtual void
issue_linearize(const void* lin_input_cpu, const thrust::device_ptr<const void>& lin_input_gpu, const thrust::device_ptr<void>& lin_output_gpu) = 0;
issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) = 0;

/**
* @brief Read linearization output data from the download buffer
Expand All @@ -109,9 +103,9 @@ class NonlinearFactorGPU : public gtsam::NonlinearFactor {
virtual void issue_compute_error(
const void* lin_input_cpu,
const void* eval_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<const void>& eval_input_gpu,
const thrust::device_ptr<void>& eval_output_gpu) = 0;
const void* lin_input_gpu,
const void* eval_input_gpu,
void* eval_output_gpu) = 0;

/**
* @brief Read cost evaluation output data from the download buffer
Expand Down
16 changes: 9 additions & 7 deletions include/gtsam_points/types/gaussian_voxelmap_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
// forward declaration
struct CUstream_st;

namespace thrust {
template <typename T1, typename T2>
class pair;

} // namespace thrust

namespace gtsam_points {

/**
Expand All @@ -29,6 +23,14 @@ struct VoxelMapInfo {
float voxel_resolution; ///< Voxel resolution
};

/**
* @brief Voxel bucket (avoid using thrust::pair for CUDA compatibility)
*/
struct VoxelBucket {
Eigen::Vector3i first;
int second;
};

/**
* @brief Gaussian distribution voxelmap on GPU
*/
Expand Down Expand Up @@ -71,7 +73,7 @@ class GaussianVoxelMapGPU : public GaussianVoxelMap {
VoxelMapInfo voxelmap_info; ///< Voxelmap information
VoxelMapInfo* voxelmap_info_ptr; ///< Voxelmap information on GPU memory

thrust::pair<Eigen::Vector3i, int>* buckets; ///< Voxel buckets for hashing
VoxelBucket* buckets; ///< Voxel buckets for hashing

// voxel data
int* num_points; ///< Number of points in eac voxel
Expand Down
50 changes: 37 additions & 13 deletions src/gtsam_points/cuda/nonlinear_factor_set_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,31 @@

namespace gtsam_points {

NonlinearFactorSetGPU::DeviceBuffer::DeviceBuffer() : size(0), buffer(nullptr) {}

NonlinearFactorSetGPU::DeviceBuffer::~DeviceBuffer() {
if(buffer) {
check_error << cudaFreeAsync(buffer, 0);
}
}

void NonlinearFactorSetGPU::DeviceBuffer::resize(size_t size, CUstream_st* stream) {
if(this->size < size) {
if(buffer) {
check_error << cudaFreeAsync(buffer, stream);
}
check_error << cudaMallocAsync(&buffer, size, stream);
this->size = size;
}
}

NonlinearFactorSetGPU::NonlinearFactorSetGPU() {
check_error << cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

linearization_input_buffer_gpu.reset(new DeviceBuffer);
linearization_output_buffer_gpu.reset(new DeviceBuffer);
evaluation_input_buffer_gpu.reset(new DeviceBuffer);
evaluation_output_buffer_gpu.reset(new DeviceBuffer);
}

NonlinearFactorSetGPU::~NonlinearFactorSetGPU() {
Expand Down Expand Up @@ -52,9 +75,9 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
}

linearization_input_buffer_cpu.resize(input_buffer_size);
linearization_input_buffer_gpu.resize(input_buffer_size);
linearization_input_buffer_gpu->resize(input_buffer_size, stream);
linearization_output_buffer_cpu.resize(output_buffer_size);
linearization_output_buffer_gpu.resize(output_buffer_size);
linearization_output_buffer_gpu->resize(output_buffer_size, stream);

// set linearization point
size_t input_cursor = 0;
Expand All @@ -67,7 +90,7 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)

// copy input buffer from cpu to gpu
check_error << cudaMemcpyAsync(
thrust::raw_pointer_cast(linearization_input_buffer_gpu.data()),
linearization_input_buffer_gpu->data(),
linearization_input_buffer_cpu.data(),
input_buffer_size,
cudaMemcpyHostToDevice,
Expand All @@ -79,8 +102,8 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
output_cursor = 0;
for (auto& factor : factors) {
auto input_cpu = linearization_input_buffer_cpu.data() + input_cursor;
auto input_gpu = linearization_input_buffer_gpu.data() + input_cursor;
auto output_gpu = linearization_output_buffer_gpu.data() + output_cursor;
auto input_gpu = linearization_input_buffer_gpu->data() + input_cursor;
auto output_gpu = linearization_output_buffer_gpu->data() + output_cursor;
factor->issue_linearize(input_cpu, input_gpu, output_gpu);
input_cursor += factor->linearization_input_size();
output_cursor += factor->linearization_output_size();
Expand All @@ -94,7 +117,7 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
// copy output buffer from gpu to cpu
check_error << cudaMemcpyAsync(
linearization_output_buffer_cpu.data(),
thrust::raw_pointer_cast(linearization_output_buffer_gpu.data()),
linearization_output_buffer_gpu->data(),
output_buffer_size,
cudaMemcpyDeviceToHost,
stream);
Expand Down Expand Up @@ -124,9 +147,9 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
output_buffer_size += factor->evaluation_output_size();
}
evaluation_input_buffer_cpu.resize(input_buffer_size);
evaluation_input_buffer_gpu.resize(input_buffer_size);
evaluation_input_buffer_gpu->resize(input_buffer_size, stream);
evaluation_output_buffer_cpu.resize(output_buffer_size);
evaluation_output_buffer_gpu.resize(output_buffer_size);
evaluation_output_buffer_gpu->resize(output_buffer_size, stream);

// set evaluation point
size_t lin_input_cursor = 0;
Expand All @@ -141,7 +164,7 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {

// copy input buffer from cpu to gpu
check_error << cudaMemcpyAsync(
thrust::raw_pointer_cast(evaluation_input_buffer_gpu.data()),
evaluation_input_buffer_gpu->data(),
evaluation_input_buffer_cpu.data(),
input_buffer_size,
cudaMemcpyHostToDevice,
Expand All @@ -154,10 +177,10 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
eval_output_cursor = 0;
for (auto& factor : factors) {
auto lin_input_cpu = linearization_input_buffer_cpu.data() + lin_input_cursor;
auto lin_input_gpu = linearization_input_buffer_gpu.data() + lin_input_cursor;
auto lin_input_gpu = linearization_input_buffer_gpu->data() + lin_input_cursor;
auto eval_input_cpu = evaluation_input_buffer_cpu.data() + eval_input_cursor;
auto eval_input_gpu = evaluation_input_buffer_gpu.data() + eval_input_cursor;
auto eval_output_gpu = evaluation_output_buffer_gpu.data() + eval_output_cursor;
auto eval_input_gpu = evaluation_input_buffer_gpu->data() + eval_input_cursor;
auto eval_output_gpu = evaluation_output_buffer_gpu->data() + eval_output_cursor;

factor->issue_compute_error(lin_input_cpu, eval_input_cpu, lin_input_gpu, eval_input_gpu, eval_output_gpu);

Expand All @@ -174,7 +197,7 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
// copy output buffer from gpu to cpu
check_error << cudaMemcpyAsync(
evaluation_output_buffer_cpu.data(),
thrust::raw_pointer_cast(evaluation_output_buffer_gpu.data()),
evaluation_output_buffer_gpu->data(),
output_buffer_size,
cudaMemcpyDeviceToHost,
stream);
Expand All @@ -201,4 +224,5 @@ std::vector<gtsam::GaussianFactor::shared_ptr> NonlinearFactorSetGPU::calc_linea

return linear_factors;
}

} // namespace gtsam_points
Loading

0 comments on commit b3a7d04

Please sign in to comment.