-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Boolean Indexing is slow #18
Comments
Unfortunately, Matft is 5 times slower than numpy... Matft; do{
let a = Matft.arange(start: -10*10*10*10*10*5, to: 10*10*10*10*10*5, by: 1, shape: [10,10,10,10,10,10])
self.measure {
let _ = a[a>0]
}
/*
'-[PerformanceTests.IndexingPefTests testPeformanceBooleanIndexing1]' measured [Time, seconds] average: 0.007, relative standard deviation: 17.050%, values: [0.010224, 0.007128, 0.006454, 0.007535, 0.006929, 0.006481, 0.006221, 0.006312, 0.006142, 0.006018]
7ms
*/
} Numpy; import numpy as np
#import timeit
a = np.arange(-10**6/2,10**6/2).reshape((10,10,10,10,10,10))
#timeit.timeit("b+c", repeat=10, globals=globals())
%timeit -n 10 a[a>0]
1.36 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) |
I think this part causes the slowness above |
Should use |
The commit 394d3a4 is invalid. public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{
switch mfarray.storedType {
case .Float:
return _clip(vDSP_vclipc)
case .Double:
return _clip(vDSP_vclipcD)
}
}
fileprivate func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
let min = min == nil ? -T.infinity : T.from(min!)
let max = max == nil ? T.infinity : T.from(max!)
return clip_by_vDSP(mfarray, min, max, vDSP_func)
} instead of public static func clip<T: MfTypable>(_ mfarray: MfArray, min: T? = nil, max: T? = nil) -> MfArray{
func _clip<T: MfStorable>(_ vDSP_func: vDSP_clip_func<T>) -> MfArray{
let min = min == nil ? -T.infinity : T.from(min!)
let max = max == nil ? T.infinity : T.from(max!)
return clip_by_vDSP(mfarray, min, max, vDSP_func)
}
switch mfarray.storedType {
case .Float:
return _clip(vDSP_vclipc)
case .Double:
return _clip(vDSP_vclipcD)
}
} |
Use this commit’s function and the stored bool: bool -> UInt8 (cast only) -> Float (conversion). However, vDSP can handle only floating-point types, which do not include UInt8. |
https://gain-performance.com/ume/ |
stride simd sample; // GATHERU
// Strided gather: loads baseAddr[i * stride] for i = 0..15 into this vector,
// which is held as two 8-lane halves (mVec[0] for lanes 0-7, mVec[1] for
// lanes 8-15), and returns *this.
// `stride` is counted in 64-bit elements, not bytes: the gathers below pass a
// scale of 8, which converts each element offset into a byte offset.
UME_FORCE_INLINE SIMDVec_u & gatheru(uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
// Vectorized offset computation: broadcast the stride and multiply it by the
// lane indices 0..7 and 8..15.
__m512i t0 = _mm512_set1_epi64(stride);
__m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
__m512i t3 = _mm512_mullo_epi64(t0, t1);
__m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
// Fallback: build the same per-lane offsets with scalar multiplications.
__m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
__m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
// Both branches below issue the same two gathers; they differ only in the
// pointer type baseAddr is cast to.
#if defined(WA_GCC_INTR_SUPPORT_6_2)
// g++ has some interface issues.
mVec[0] = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
mVec[1] = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
mVec[0] = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
mVec[1] = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
return *this;
}
// MGATHERU
// Masked strided gather: for each lane i in 0..15 whose bit in `mask` is set,
// replaces the lane with baseAddr[i * stride]; lanes whose bit is clear keep
// their current value. Returns *this.
// `stride` is counted in 64-bit elements, not bytes (the gathers use scale 8).
// NOTE(review): the gathers themselves are unmasked, so all 16 strided
// addresses are read regardless of `mask`; the mask is applied only by the
// final blend. Confirm callers guarantee every strided address is readable.
UME_FORCE_INLINE SIMDVec_u & gatheru(SIMDVecMask<16> const & mask, uint64_t const * baseAddr, uint64_t stride) {
#if defined (__AVX512DQ__)
// Vectorized offset computation: broadcast the stride and multiply it by the
// lane indices 0..7 and 8..15.
__m512i t0 = _mm512_set1_epi64(stride);
__m512i t1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i t2 = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
__m512i t3 = _mm512_mullo_epi64(t0, t1);
__m512i t4 = _mm512_mullo_epi64(t0, t2);
#else
// Fallback: build the same per-lane offsets with scalar multiplications.
__m512i t3 = _mm512_setr_epi64(0, stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride);
__m512i t4 = _mm512_setr_epi64(8*stride, 9*stride, 10*stride, 11*stride, 12*stride, 13*stride, 14*stride, 15*stride);
#endif
// Split the 16-bit lane mask: low byte selects lanes 0-7 (mVec[0]),
// high byte selects lanes 8-15 (mVec[1]).
__mmask8 m0 = mask.mMask & 0x00FF;
__mmask8 m1 = (mask.mMask & 0xFF00) >> 8;
// Both branches below issue the same two gathers; they differ only in the
// pointer type baseAddr is cast to.
#if defined(WA_GCC_INTR_SUPPORT_6_2)
// g++ has some interface issues.
__m512i t5 = _mm512_i64gather_epi64(t3, (const long long int*)baseAddr, 8);
__m512i t6 = _mm512_i64gather_epi64(t4, (const long long int*)baseAddr, 8);
#else
__m512i t5 = _mm512_i64gather_epi64(t3, (int64_t const*)baseAddr, 8);
__m512i t6 = _mm512_i64gather_epi64(t4, (int64_t const*)baseAddr, 8);
#endif
// Blend: take the gathered value where the mask bit is set, otherwise keep
// the lane's existing contents.
mVec[0] = _mm512_mask_mov_epi64(mVec[0], m0, t5);
mVec[1] = _mm512_mask_mov_epi64(mVec[1], m1, t6);
return *this;
}
|
Try ‘withMemoryRebound’ in evdsp_sign like this |
The bottleneck is the comparison operators. Use a specific vDSP function for the comparison, such that positive is true and negative is false |
May use this function |
How about the following calculation?
|
I may have found the solution... https://developer.apple.com/forums/thread/719117?answerId=734789022#734789022 |
I confirmed BNNS usage. I can implement the comparison operators like this! let a: [Float] = [1,2,3,4,5]
// Sample: element-wise equality of `a` and `b` via BNNS.compare, followed by
// a vDSP conversion of the UInt8 result into Float values.
let b: [Float] = [1,-2,3,-4,5]
//var c: [Bool] = [0,0,0,0,0]
// `c` seeds the output descriptor; the result is read back from `cDescriptor`
// below, not from `c` itself.
var c: [Bool] = [false,false,false,false,false]
// Wrap each array in a 1-D (vector-shaped) BNNS descriptor.
// NOTE(review): these descriptors are never deallocated in this snippet —
// confirm whether deallocate() is required after allocate(initializingFrom:).
let aDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: a, shape: .vector(a.count))
let bDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: b, shape: .vector(b.count))
let cDescriptor = BNNSNDArrayDescriptor.allocate(initializingFrom: c, shape: .vector(c.count))
// Compare a and b with the .equal relation, writing into cDescriptor.
// try! traps if the comparison throws.
try! BNNS.compare(aDescriptor, bDescriptor, using: .equal, output: cDescriptor)
// Read the comparison output back as UInt8 values (force-unwrapped).
var ret = cDescriptor.makeArray(of: UInt8.self)!
var retF: [Float] = [0,0,0,0,0]
// Convert the UInt8 results to Float with vDSP_vfltu8, unit stride on both
// sides. The length 5 is hard-coded to match the sample arrays.
ret.withUnsafeMutableBufferPointer{
retptr in
retF.withUnsafeMutableBufferPointer{
retFptr in
vDSP_vfltu8(retptr.baseAddress!, vDSP_Stride(1), retFptr.baseAddress!, vDSP_Stride(1), vDSP_Length(5))
}
}
print(retF) // [1.0, 0.0, 1.0, 0.0, 1.0] |
3 times faster! 17.8ms -> 6ms |
Regarding #17
Official boolean indexing code is
https://github.com/numpy/numpy/blob/cf1306a842d7b1064270bd06951a485121e60816/numpy/core/src/multiarray/mapping.c#L1010
SIMD function is
https://github.com/numpy/numpy/blob/45bc13e6d922690eea43b9d807d476e0f243f836/numpy/core/src/umath/loops_comparison.dispatch.c.src#L36
The text was updated successfully, but these errors were encountered: