CudaIPCTypes.h

#pragma once
#ifdef USE_CUDA
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Logging.h>
#include <cuda_runtime_api.h>
#include <torch/csrc/Export.h>
#include <cstddef>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace torch {
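
// Frees producer-side IPC blocks whose consumers have already dropped their
// references (see CudaIPCSentDataLimbo below).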
TORCH_CUDA_CU_API bool CudaIPCCollect();
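
// Consumer-side wrapper: keeps the mapped IPC memory alive for as long as the
// receiving process holds this object.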
struct CudaIPCReceivedData final {
  CudaIPCReceivedData() = default;
  explicit CudaIPCReceivedData(std::shared_ptr<void> shared_ptr)
      : shared_ptr_(std::move(shared_ptr)) {}
  std::shared_ptr<void> shared_ptr_;
};
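
// Producer-side record for one storage shared over CUDA IPC: the shared-memory
// handle and offset of its reference-counter slot, an optional CUDA event used
// to synchronize with the receiver, and the original allocation, which must
// stay alive until every receiver is done with it.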
struct CudaIPCSentData final {
  std::string handle_;
  int64_t offset_;
  int64_t* counter_ptr_; // Reference counter shared memory block
  at::DataPtr original_ptr_; // Original mem allocation
  cudaEvent_t event_; // Sync cuEventDestroy
  bool event_sync_required_;
  at::Device device_;

  CudaIPCSentData(
      std::string handle,
      int64_t offset,
      int64_t* counter_ptr,
      at::Device device);
  ~CudaIPCSentData();

  int64_t counter_value();
  std::string handle() {
    return handle_;
  }
  int64_t offset() {
    return offset_;
  }
  void set_original_ptr(at::DataPtr data_ptr) {
    original_ptr_ = std::move(data_ptr);
  }
};
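
// Wraps `data` in an at::DataPtr whose deleter defers the actual free until
// all receiving processes have released their references to the block.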
TORCH_CUDA_CU_API at::DataPtr GetNewRefCountedSentData(
    void* data,
    at::Device device);

namespace {
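
// Number of int64 reference-counter slots in a single shared-memory file, and
// the number of blocks waiting in limbo after which a warning is emitted.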
constexpr int64_t CUDA_IPC_REF_COUNTER_FILE_SIZE = 10000;
constexpr int64_t CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000;
// It was determined empirically that CUDA (v10.1 and below) has a limit on the
// number of recorded blocking interprocess events, around ~22,000. To give
// ourselves leeway, we picked 1000, which is enough events to share tensors
// effectively.
constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;

// Data blocks scheduled for deletion that still have a non-zero reference
// count wait here until collect() can safely free them.
struct CudaIPCSentDataLimbo final {
  ~CudaIPCSentDataLimbo();
  bool collect();
  void add(std::unique_ptr<CudaIPCSentData> shared_block);
  uint64_t size();

 private:
  // TODO: Can be changed to FIFO in order to avoid full traverse on every
  // collect()
  std::vector<std::unique_ptr<CudaIPCSentData>> shared_blocks_;
  std::mutex limbo_mutex_;
};
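
// A block of shared memory holding int64 reference counters for sent storages.
// Slots are handed out sequentially (get_offset() / rotate_offset()) and
// returned via return_offset(); handle_ identifies the file to the receiver.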
struct CudaIPCRefCountersFile final {
  CudaIPCRefCountersFile(
      std::string handle,
      uint64_t size,
      at::DataPtr data_ptr)
      : next_offset_(0),
        size_(size),
        used_slots_(0),
        handle_(std::move(handle)),
        refcounted_shared_mem_(std::move(data_ptr)) {}

  int64_t* counter_ptr() {
    return static_cast<int64_t*>(refcounted_shared_mem_.get()) + next_offset_;
  }

  void set_counter(uint64_t value) {
    *counter_ptr() = value;
  }

  bool have_offsets() {
    return next_offset_ < size_;
  }

  bool offsets_in_use() {
    return used_slots_;
  }

  int64_t get_offset() {
    return next_offset_;
  }

  void rotate_offset() {
    next_offset_++;
    used_slots_++;
  }

  void return_offset(uint64_t offset /* unused */) {
    used_slots_--;
  }

  std::string handle() {
    return handle_;
  }

 private:
  uint64_t next_offset_;
  uint64_t size_;
  uint64_t used_slots_;
  std::string handle_;
  at::DataPtr refcounted_shared_mem_;
};
} // namespace
} // namespace torch
namespace c10 {
namespace {
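
// Intended to be registered as a free-memory callback with the CUDA caching
// allocator, so a failed allocation can trigger CudaIPCCollect() to reclaim
// blocks waiting in the limbo.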
class CudaIPCCollectCallback : public FreeMemoryCallback {
 public:
  bool Execute() override {
    return torch::CudaIPCCollect();
  }
};
} // namespace
} // namespace c10
#endif