diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 29e5e82049b5c..7eb90cffa2d77 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -38,7 +38,7 @@ namespace pixelgpudetails { - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() { + SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; cudaMallocHost(&word, sizeof(unsigned int)*WSIZE); cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE); @@ -90,6 +90,12 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); + + cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); + gpuProduct = getProduct(); + assert(xx_d==gpuProduct.xx_d); + + cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { @@ -111,6 +117,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(clus_d)); cudaCheck(cudaFree(clusInModule_d)); cudaCheck(cudaFree(moduleId_d)); + cudaCheck(cudaFree(gpuProduct_d)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -478,7 +485,7 @@ namespace pixelgpudetails { XX[gIndex] = 0; // 0 is an indicator of a noise/dead channel YY[gIndex] = 0; // skip these pixels during clusterization ADC[gIndex] = 0; - continue ; // 0: bad word + continue; // 0: bad word } uint32_t link = getLink(ww); // Extract link @@ -521,9 +528,9 @@ namespace pixelgpudetails { // endcap ids layer = 0; panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; - //disk = (rawId >> diskStartBit_) & diskMask_ ; + //disk = (rawId >> diskStartBit_) & diskMask_; side = (panel == 1)? -1 : 1; - //blade = (rawId>>bladeStartBit_) & bladeMask_; + //blade = (rawId >> bladeStartBit_) & bladeMask_; } // ***special case of layer to 1 be handled here @@ -558,8 +565,8 @@ namespace pixelgpudetails { } pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); - XX[gIndex] = globalPix.row ; // origin shifting by 1 0-159 - YY[gIndex] = globalPix.col ; // origin shifting by 1 0-415 + XX[gIndex] = globalPix.row; // origin shifting by 1 0-159 + YY[gIndex] = globalPix.col; // origin shifting by 1 0-415 ADC[gIndex] = getADC(ww); pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]); moduleId[gIndex] = detId.moduleId; @@ -583,7 +590,6 @@ namespace pixelgpudetails { const int threadsPerBlock = 512; const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - assert(0 == wordCounter%2); // wordCounter is the total no of words in each event to be trasfered on device cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); @@ -630,7 +636,6 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigis<<>>( moduleInd_d, xx_d, yy_d, adc_d, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 2b0b205c9f536..2f7436052902b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -83,7 +83,7 @@ namespace pixelgpudetails { class Packing { public: using PackedDigiType = uint32_t; - + // Constructor: pre-computes masks and shifts from field widths __host__ __device__ inline @@ -144,22 +144,32 @@ namespace pixelgpudetails { (adc << thePacking.adc_shift); } + constexpr + uint32_t pixelToChannel( int row, int col) { + constexpr Packing thePacking = packing(); + return (row << thePacking.column_width) | col; + } + + using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; class SiPixelRawToClusterGPUKernel { public: - SiPixelRawToClusterGPUKernel(); + + using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; + + SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); ~SiPixelRawToClusterGPUKernel(); - + SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete; SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - + // Not really very async yet... void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, @@ -170,8 +180,9 @@ namespace pixelgpudetails { auto getProduct() const { return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - nDigis, nModulesActive, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d + gpuProduct_d, + xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, + nDigis, nModulesActive }; } @@ -181,6 +192,11 @@ namespace pixelgpudetails { unsigned char *fedId_h = nullptr; // to hold fed index for each word // output + GPUProduct gpuProduct; + GPUProduct * gpuProduct_d; + + // FIXME cleanup all these are in the gpuProduct above... + uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output pixelgpudetails::error_obj *data_h = nullptr; @@ -209,7 +225,7 @@ namespace pixelgpudetails { uint32_t * clusInModule_d; uint32_t * moduleId_d; }; - + // configuration and memory buffers alocated on the GPU struct context { uint32_t * word_d; @@ -223,7 +239,7 @@ namespace pixelgpudetails { GPU::SimpleVector * error_d; error_obj * data_d; - + // these are for the clusterizer (to be moved) uint32_t * moduleStart_d; int32_t * clus_d;