require "torch"
paths.require("libcutorch")
torch.CudaByteStorage.__tostring__ = torch.ByteStorage.__tostring__
torch.CudaByteTensor.__tostring__ = torch.ByteTensor.__tostring__
torch.CudaCharStorage.__tostring__ = torch.CharStorage.__tostring__
torch.CudaCharTensor.__tostring__ = torch.CharTensor.__tostring__
torch.CudaShortStorage.__tostring__ = torch.ShortStorage.__tostring__
torch.CudaShortTensor.__tostring__ = torch.ShortTensor.__tostring__
torch.CudaIntStorage.__tostring__ = torch.IntStorage.__tostring__
torch.CudaIntTensor.__tostring__ = torch.IntTensor.__tostring__
torch.CudaLongStorage.__tostring__ = torch.LongStorage.__tostring__
torch.CudaLongTensor.__tostring__ = torch.LongTensor.__tostring__
torch.CudaStorage.__tostring__ = torch.FloatStorage.__tostring__
torch.CudaTensor.__tostring__ = torch.FloatTensor.__tostring__
torch.CudaDoubleStorage.__tostring__ = torch.DoubleStorage.__tostring__
torch.CudaDoubleTensor.__tostring__ = torch.DoubleTensor.__tostring__
if cutorch.hasHalf then
   torch.CudaHalfStorage.__tostring__ = torch.HalfStorage.__tostring__
   torch.CudaHalfTensor.__tostring__ = torch.HalfTensor.__tostring__
end
require('cutorch.Tensor')
require('cutorch.FFI')
require('cutorch.test')
-- Lua 5.2+ moved the global unpack() into table.unpack.
local unpack = unpack or table.unpack
-- Runs `closure` with device `newDeviceID` active, then restores the
-- previously active device, even if the closure raises an error.
function cutorch.withDevice(newDeviceID, closure)
   local curDeviceID = cutorch.getDevice()
   cutorch.setDevice(newDeviceID)
   local vals = {pcall(closure)}
   cutorch.setDevice(curDeviceID)
   if vals[1] then
      return unpack(vals, 2)
   end
   error(unpack(vals, 2))
end
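
-- A minimal usage sketch (the device ID and workload below are
-- illustrative and assume at least two GPUs):
--   local sum = cutorch.withDevice(2, function()
--      return torch.CudaTensor(1000):fill(1):sum()
--   end)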
-- Normalizes size arguments: accepts a LongStorage or a sequence of
-- numbers and returns the sizes as a LongTensor.
local function longTensorSize(...)
   local size
   if not ... then
      size = torch.LongTensor{0}
   elseif torch.isStorage(...) then
      size = torch.LongTensor(...)
   else
      size = torch.LongTensor{...}
   end
   return size
end
local hostTypes = {'Float', 'Double', 'Int', 'Long', 'Byte'}
if cutorch.hasHalf then
   table.insert(hostTypes, 'Half')
end
for _, ty in ipairs(hostTypes) do
   -- Creates torch Tensors using the CudaHostAllocator (pinned host memory).
   -- Accepts either a LongStorage or a sequence of numbers.
   cutorch['createCudaHost' .. ty .. 'Tensor'] = function(...)
      local size = longTensorSize(...)
      local storage = torch[ty .. 'Storage'](cutorch.CudaHostAllocator, size:prod())
      return torch[ty .. 'Tensor'](storage, 1, size:storage())
   end
end
-- Default alias: the unqualified name creates a Float host tensor.
cutorch.createCudaHostTensor = cutorch.createCudaHostFloatTensor
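
-- A minimal usage sketch (sizes are illustrative):
--   local pinned = cutorch.createCudaHostFloatTensor(4, 4)
--   local pinnedInt = cutorch.createCudaHostIntTensor(torch.LongStorage{2, 3})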
-- Creates a FloatTensor using the CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
local function _createUVATensor(...)
   local size = longTensorSize(...)
   -- See CUDA_C_Programming_guide.pdf for a detailed explanation of the
   -- synchronization requirements.
   -- Section J:
   -- "It is worth a comment on the synchronization between host and device. Notice how in
   -- the non-managed example, the synchronous cudaMemcpy() routine is used both to
   -- synchronize the kernel (that is, to wait for it to finish running), and to transfer the data
   -- to the host. The Unified Memory examples do not call cudaMemcpy() and so require an
   -- explicit cudaDeviceSynchronize() before the host program can safely use the output
   -- from the GPU."
   -- Section J.2.2.1:
   -- "Note that if memory is dynamically allocated with cudaMallocManaged() or
   -- cuMemAllocManaged() while the GPU is active, the behavior of the memory is
   -- unspecified until additional work is launched or the GPU is synchronized. Attempting
   -- to access the memory on the CPU during this time may or may not cause a segmentation
   -- fault."
   cutorch.synchronize()
   local storage = torch.FloatStorage(cutorch.CudaUVAAllocator, size:prod())
   return torch.FloatTensor(storage)
end
function cutorch.createFloatUVATensor(...)
   return _createUVATensor(...)
end
-- Creates a CudaTensor using the CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
-- First creates a UVA-backed FloatTensor and takes its pointer.
function cutorch.createCudaUVATensor(...)
   -- Delegate allocation and synchronization to the CPU tensor,
   -- then wrap its raw data pointer in a CudaStorage.
   local ft = _createUVATensor(...)
   local storage = torch.CudaStorage(
      ft:storage():size(),
      tonumber(torch.data(ft:storage(), true))
   )
   return torch.CudaTensor(storage)
end
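
-- A minimal usage sketch (assumes a device that supports managed memory;
-- sizes are illustrative):
--   local uvaFloat = cutorch.createFloatUVATensor(16)
--   local uvaCuda = cutorch.createCudaUVATensor(16)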
-- UVA storage is a single memory allocation visible from both host and
-- device through unified virtual addressing, so converting between CPU and
-- GPU tensor types only requires passing the raw pointer. At the moment
-- only FloatTensor <-> CudaTensor conversions are supported.
function cutorch.toFloatUVATensor(t)
   if not torch.isTensor(t) then
      error('Must use a tensor, got ' .. torch.type(t))
   end
   local storage = torch.FloatStorage(
      t:storage():size(),
      tonumber(torch.data(t:storage(), true))
   )
   assert(cutorch.isManaged(storage))
   return torch.FloatTensor(storage)
end
function cutorch.toCudaUVATensor(t)
   if not torch.isTensor(t) then
      error('Must use a tensor, got ' .. torch.type(t))
   end
   local storage = torch.CudaStorage(
      t:storage():size(),
      tonumber(torch.data(t:storage(), true))
   )
   assert(cutorch.isManaged(storage))
   return torch.CudaTensor(storage)
end
-- Returns true if the tensor's or storage's memory is managed (UVA)
-- memory, e.g. allocated via cudaMallocManaged().
function cutorch.isManaged(t)
   if not torch.isTensor(t) and not torch.isStorage(t) then
      error('Usage: cutorch.isManaged(Tensor|Storage), got ' .. torch.type(t))
   end
   return cutorch.isManagedPtr(tonumber(torch.data(t, true)))
end
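
-- A minimal usage sketch (assumes managed-memory support; both views
-- share one allocation, so no data is copied):
--   local ft = cutorch.createFloatUVATensor(8):fill(0.5)
--   local ct = cutorch.toCudaUVATensor(ft)
--   assert(cutorch.isManaged(ft))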
-- Enable automatic cutorch heap-tracking for garbage collection.
-- Remove this call to disable it.
cutorch.setHeapTracking(true)
-- Builds (or, when `state` is passed back in, updates) the probability and
-- alias tables used by the alias method for multinomial sampling.
function torch.multinomialAliasSetup(probs, state)
   if torch.type(state) == 'table' then
      state[1], state[2] = torch.multinomialAliasSetup_(probs, state[1], state[2])
   else
      state = {}
      state[1], state[2] = torch.multinomialAliasSetup_(probs)
   end
   return state
end
-- Draws samples into `output` using the state returned by
-- torch.multinomialAliasSetup.
function torch.multinomialAlias(output, state)
   torch.CudaTensor.multinomialAlias_(output, state[1], state[2])
   return output
end
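
-- A minimal usage sketch (shapes and the output tensor type below are
-- illustrative):
--   local probs = torch.CudaTensor{0.1, 0.2, 0.3, 0.4}
--   local state = torch.multinomialAliasSetup(probs)
--   local samples = torch.multinomialAlias(torch.CudaTensor(100), state)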
return cutorch