__syncthreads() needed for reduction? #13

Open
zjin-lcf opened this issue Dec 16, 2020 · 1 comment

Comments

@zjin-lcf

For the following code in miniFE, the comments state that __syncthreads() is not needed within a warp. However, I think __syncthreads() is actually needed to produce a correct sum: I get incorrect results when it is omitted. Could you reproduce the issue? Thank you for your comments.

template<typename Vector>
__global__ void dot_kernel(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {

  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;

  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  //__syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256)  {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128)  {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)   {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)   {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
                   if(threadIdx.x<16)   {sum+=red[threadIdx.x+16];  red[threadIdx.x]=sum;}
                   if(threadIdx.x<8)    {sum+=red[threadIdx.x+8];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<4)    {sum+=red[threadIdx.x+4];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<2)    {sum+=red[threadIdx.x+2];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<1)    {sum+=red[threadIdx.x+1];}

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

template<typename Scalar>
__global__ void dot_final_reduce_kernel(Scalar *d) {
  const int BLOCK_SIZE=1024;
  Scalar sum=d[threadIdx.x];
  __shared__ volatile Scalar red[BLOCK_SIZE];

  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<512)  {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256)  {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128)  {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)   {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)   {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
                   if(threadIdx.x<16)   {sum+=red[threadIdx.x+16];  red[threadIdx.x]=sum;}
                   if(threadIdx.x<8)    {sum+=red[threadIdx.x+8];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<4)    {sum+=red[threadIdx.x+4];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<2)    {sum+=red[threadIdx.x+2];   red[threadIdx.x]=sum;}
                   if(threadIdx.x<1)    {sum+=red[threadIdx.x+1];}

  //save final dot product at the front
                   if(threadIdx.x==0) d[0]=sum;
}
Here is a standalone reproducer; the host and device sums disagree when the __syncthreads() calls are omitted from the warp-synchronous steps:

#define BLOCK_SIZE  256

#include <stdio.h>
#include <stdlib.h>   // srand, rand
#include <cuda.h>

__global__ void dot_kernel(const int n, const int* x, const int* y, int *d) {

  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int s = 128; s > 0; s /= 2) {   // incorrect results when __syncthreads() is omitted within a warp
     __syncthreads();
     if(threadIdx.x<s)  {sum+=red[threadIdx.x+s]; red[threadIdx.x]=sum;}
  }

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

__global__ void final(int *d) {
  int sum=d[threadIdx.x];
  __shared__ int red[BLOCK_SIZE];

  red[threadIdx.x]=sum;
  #pragma unroll
  for (int s = 128; s > 0; s /= 2) {
     __syncthreads();
     if(threadIdx.x<s)  {sum+=red[threadIdx.x+s]; red[threadIdx.x]=sum;}
  }
  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}

#define LEN 1025
int main() {
  int a[LEN];
  int b[LEN];
  srand(2);
  int sum = 0;
  int d_sum = 0;

// sum on the host
  for (int i = 0; i < LEN; i++) {
    a[i] = rand() % 3;
    b[i] = rand() % 3;
    sum += a[i]*b[i];
  }

// sum on the device
  int *da, *db;
  int *dr;
  const int n = LEN;
  cudaMalloc((void**)&da, sizeof(int)*LEN);
  cudaMalloc((void**)&db, sizeof(int)*LEN);
  cudaMalloc((void**)&dr, sizeof(int)*256);
  cudaMemset(dr, 0, sizeof(int)*256);  // zero the partial sums: only (n+255)/256 blocks write to dr
  cudaMemcpy(da, a, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(db, b, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  dot_kernel<<<(n+255)/256, 256 >>>(n, da,db,dr);
  final<<<1, 256>>>(dr);
  cudaMemcpy(&d_sum, dr, sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d\n", sum, d_sum);
  cudaFree(da);
  cudaFree(db);
  cudaFree(dr);
  return 0;
}
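
For reference, the reproducer builds with a plain nvcc invocation (the file name repro.cu and the architecture flag are placeholders, not part of the original report); a correct run prints the same sum twice:

nvcc -arch=sm_70 repro.cu -o repro
./repro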
@crtrott
Member

crtrott commented Dec 19, 2020

Actually, this needs to be updated for Volta: it needs warp synchronization, not block synchronization.
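
For illustration, here is a minimal sketch of that update applied to the tail of dot_kernel, assuming the red buffer and sum variable from the code above (this is one possible fix, not necessarily the patch the project adopted): each formerly warp-synchronous step is separated by an explicit __syncwarp(), which the independent thread scheduling introduced with Volta requires.

  //warp lock-step can no longer be assumed on Volta+; put an explicit
  //__syncwarp() between the intra-warp steps (every thread in the
  //block reaches these barriers, so each warp synchronizes itself)
  __syncthreads(); if(threadIdx.x<32) {sum+=red[threadIdx.x+32]; red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<8)  {sum+=red[threadIdx.x+8];  red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<4)  {sum+=red[threadIdx.x+4];  red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<2)  {sum+=red[threadIdx.x+2];  red[threadIdx.x]=sum;}
  __syncwarp();    if(threadIdx.x<1)  {sum+=red[threadIdx.x+1];}

An alternative is to finish the reduction with register shuffles (__shfl_down_sync with a full mask), which synchronize the warp implicitly and make the volatile shared buffer unnecessary for the last 32 values.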
