For the following code in miniFE, the comments state that __syncthreads() is not needed within a warp. However, I think __syncthreads() is actually needed to produce a correct sum: I get incorrect results when the calls are omitted. Could you reproduce the issue? Thank you for your comments.
template<typename Vector>
__global__ void dot_kernel(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {
  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;
  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }
  //Do a shared memory reduction on the dot product
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  //__syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64) {sum+=red[threadIdx.x+64]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32) {sum+=red[threadIdx.x+32]; red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8) {sum+=red[threadIdx.x+8]; red[threadIdx.x]=sum;}
  if(threadIdx.x<4) {sum+=red[threadIdx.x+4]; red[threadIdx.x]=sum;}
  if(threadIdx.x<2) {sum+=red[threadIdx.x+2]; red[threadIdx.x]=sum;}
  if(threadIdx.x<1) {sum+=red[threadIdx.x+1];}
  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}
template<typename Scalar>
__global__ void dot_final_reduce_kernel(Scalar *d) {
  const int BLOCK_SIZE=1024;
  Scalar sum=d[threadIdx.x];
  __shared__ volatile Scalar red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64) {sum+=red[threadIdx.x+64]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32) {sum+=red[threadIdx.x+32]; red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8) {sum+=red[threadIdx.x+8]; red[threadIdx.x]=sum;}
  if(threadIdx.x<4) {sum+=red[threadIdx.x+4]; red[threadIdx.x]=sum;}
  if(threadIdx.x<2) {sum+=red[threadIdx.x+2]; red[threadIdx.x]=sum;}
  if(threadIdx.x<1) {sum+=red[threadIdx.x+1];}
  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}
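For reference, the warp-synchronous assumption behind the commented lines is what seems to break: on GPUs with independent thread scheduling (Volta / sm_70 and newer) the threads of a warp are not guaranteed to execute these lines in lockstep, even with volatile shared memory. Below is a minimal sketch of how the tail of such a reduction could be written with explicit warp barriers instead. It is my own variant, not the miniFE source; it assumes CUDA 9+ for __syncwarp() and is meant to replace everything from the "__syncthreads(); if(threadIdx.x<32)" step onward in the kernels above, reusing their sum, red and d names.
  // Sketch: tail of the block reduction with explicit warp barriers.
  // All 32 threads of the first warp run every step; each __syncwarp()
  // separates a shared-memory read from the following write, so thread i
  // always reads red[i+offset] before thread i+offset overwrites it.
  __syncthreads();
  if(threadIdx.x<32) {
    sum+=red[threadIdx.x+32]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+16]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+8];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+4];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+2];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+1];
  }
  //save partial dot products as before
  if(threadIdx.x==0) d[blockIdx.x]=sum;
The same tail applies to dot_final_reduce_kernel, with the final store going to d[0] instead of d[blockIdx.x].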
#define BLOCK_SIZE 256
#include <stdio.h>
#include <stdlib.h> // for srand()/rand()
#include <cuda.h>
__global__ void dot_kernel(const int n, const int* x, const int* y, int *d) {
  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }
  //Do a shared memory reduction on the dot product
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) { // incorrect results when __syncthreads() is omitted within a warp
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }
  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}
__global__ void final(int *d) {
  int sum=d[threadIdx.x];
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }
  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}
#define LEN 1025
int main() {
  int a[LEN];
  int b[LEN];
  srand(2);
  int sum = 0;
  int d_sum = 0;
  // sum on the host
  for (int i = 0; i < LEN; i++) {
    a[i] = rand() % 3;
    b[i] = rand() % 3;
    sum += a[i]*b[i];
  }
  // sum on the device
  int *da, *db;
  int *dr;
  const int n = LEN;
  cudaMalloc((void**)&da, sizeof(int)*LEN);
  cudaMalloc((void**)&db, sizeof(int)*LEN);
  cudaMalloc((void**)&dr, sizeof(int)*256);
  cudaMemcpy(da, a, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(db, b, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  // dot_kernel only writes (n+255)/256 partial sums, but final() reads all
  // 256 entries of dr, so zero the buffer first
  cudaMemset(dr, 0, sizeof(int)*256);
  dot_kernel<<<(n+255)/256, 256>>>(n, da, db, dr);
  final<<<1, 256>>>(dr);
  cudaMemcpy(&d_sum, dr, sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d\n", sum, d_sum);
  cudaFree(da);
  cudaFree(db);
  cudaFree(dr);
  return 0;
}
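As a cross-check, the intra-warp part of the reduction can also be kept entirely in registers with warp shuffles, which sidesteps the shared-memory ordering question for the last five steps. The sketch below is my own alternative to the reproducer's dot_kernel (the name dot_kernel_shfl is just illustrative); it assumes CUDA 9+ for __shfl_down_sync and reuses BLOCK_SIZE from above. final() could be rewritten the same way.
__global__ void dot_kernel_shfl(const int n, const int* x, const int* y, int *d) {
  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  // inter-warp steps still need __syncthreads()
  for (int s = BLOCK_SIZE/2; s >= 32; s /= 2) {
    __syncthreads();
    if(threadIdx.x<s) {sum+=red[threadIdx.x+s]; red[threadIdx.x]=sum;}
  }
  // intra-warp steps in registers: every lane of the first warp participates,
  // so the full mask is valid, and there are no shared-memory reads to race on
  if(threadIdx.x<32) {
    for (int offset = 16; offset > 0; offset /= 2) {
      sum += __shfl_down_sync(0xffffffffu, sum, offset);
    }
  }
  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}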