forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CopyKernel.cpp
72 lines (66 loc) · 2.61 KB
/
CopyKernel.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/native/Copy.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <c10/util/TypeCast.h>
namespace at {
namespace native {
namespace {
// Element-wise CPU copy driven by a TensorIterator.
//
// The dtype of operand 0 is used as the destination element type and the
// dtype of operand 1 as the source element type.
//   * Different dtypes: each element is converted with
//     c10::static_cast_with_inter_type<dest_t, src_t>.
//   * Equal dtypes: an identity kernel is selected per dtype family
//     (Half, BFloat16, quantized ints, complex, then all remaining types
//     plus Bool).
//
// `non_blocking` is not read anywhere in this function.
static void copy_kernel(TensorIterator& iter, bool non_blocking) {
  const ScalarType dst_dtype = iter.dtype(0);
  const ScalarType src_dtype = iter.dtype(1);

  // Converting copy: dispatch on the destination type first, then on the
  // source type, so that both element types are known at compile time.
  if (dst_dtype != src_dtype) {
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
        ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16,
        dst_dtype, "copy_", [&] {
          using dest_t = scalar_t;
          AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
              ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16,
              src_dtype, "copy_", [&] {
                using src_t = scalar_t;
                // Note (@zasdfgbnm):
                //
                // Do not simplify the call below to
                //   cpu_kernel(iter, c10::static_cast_with_inter_type<dest_t, scalar_t>::apply);
                //
                // Passing the function directly would force the compiler to
                // instantiate the inline function and emit a function call
                // inside the loop instead of inlining it, which makes
                // optimizations such as vectorization impossible.
                // You can verify this by looking at the symbols of
                // `libtorch_cpu.so`:
                //
                //   readelf -Ws libtorch_cpu.so | grep static_cast_with_inter_type
                //
                // If done correctly, the above command should have no output.
                //
                // See: https://github.com/pytorch/pytorch/issues/31271
                cpu_kernel(iter, [](src_t src_value) -> dest_t {
                  return c10::static_cast_with_inter_type<dest_t, src_t>::apply(src_value);
                });
              });
        });
    return;
  }

  // Same-dtype copy from here on: every kernel is an identity function.

  if (dst_dtype == ScalarType::Half) {
    cpu_kernel(iter, [=](at::Half value) -> at::Half { return value; });
    return;
  }

  if (dst_dtype == ScalarType::BFloat16) {
    cpu_kernel(iter, [=](at::BFloat16 value) -> at::BFloat16 { return value; });
    return;
  }

  if (isQIntType(dst_dtype)) {
    // Quantized integer types: scalar and Vec256 identity kernels.
    AT_DISPATCH_QINT_TYPES(dst_dtype, "copy_kernel", [&] {
      cpu_kernel_vec(
          iter,
          [=](scalar_t value) -> scalar_t { return value; },
          [=](Vec256<scalar_t> lanes) -> Vec256<scalar_t> { return lanes; });
    });
    return;
  }

  if (isComplexType(dst_dtype)) {
    // Complex types: scalar identity kernel only.
    AT_DISPATCH_COMPLEX_TYPES(dst_dtype, "copy_kernel", [&] {
      cpu_kernel(iter, [=](scalar_t value) -> scalar_t { return value; });
    });
    return;
  }

  // Everything else (the "all types" set plus Bool): scalar and Vec256
  // identity kernels.
  AT_DISPATCH_ALL_TYPES_AND(ScalarType::Bool, dst_dtype, "copy_kernel", [&] {
    cpu_kernel_vec(
        iter,
        [=](scalar_t value) -> scalar_t { return value; },
        [=](Vec256<scalar_t> lanes) -> Vec256<scalar_t> { return lanes; });
  });
}
} // anonymous namespace
// Register copy_kernel (by address) as the implementation for copy_stub.
REGISTER_DISPATCH(copy_stub, &copy_kernel);
} // namespace native
} // namespace at