From 1603b2c34ad49f4926a1bc5daf7240eb9947b493 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 31 Jul 2018 12:03:15 +0200 Subject: [PATCH] Heterogeneous ClusterTPAssociation (#105) Implement a heterogeneous Cluster-to-TrackingParticle associator running on the GPU. --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 21 +++++++----- .../plugins/SiPixelRawToClusterGPUKernel.h | 32 ++++++++++++++----- .../SiPixelRecHits/plugins/PixelRecHits.cu | 20 +++++------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 29e5e82049b5c..7eb90cffa2d77 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -38,7 +38,7 @@ namespace pixelgpudetails { - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() { + SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; cudaMallocHost(&word, sizeof(unsigned int)*WSIZE); cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE); @@ -90,6 +90,12 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); + + cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); + gpuProduct = getProduct(); + assert(xx_d==gpuProduct.xx_d); + + cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { @@ -111,6 +117,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(clus_d)); cudaCheck(cudaFree(clusInModule_d)); cudaCheck(cudaFree(moduleId_d)); + cudaCheck(cudaFree(gpuProduct_d)); } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { @@ -478,7 +485,7 @@ namespace pixelgpudetails { XX[gIndex] = 0; // 0 is an indicator of a noise/dead channel YY[gIndex] = 0; // skip these pixels during clusterization ADC[gIndex] = 0; - continue ; // 0: bad word + continue; // 0: bad word } uint32_t link = getLink(ww); // Extract link @@ -521,9 +528,9 @@ namespace pixelgpudetails { // endcap ids layer = 0; panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask; - //disk = (rawId >> diskStartBit_) & diskMask_ ; + //disk = (rawId >> diskStartBit_) & diskMask_; side = (panel == 1)? -1 : 1; - //blade = (rawId>>bladeStartBit_) & bladeMask_; + //blade = (rawId >> bladeStartBit_) & bladeMask_; } // ***special case of layer to 1 be handled here @@ -558,8 +565,8 @@ namespace pixelgpudetails { } pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); - XX[gIndex] = globalPix.row ; // origin shifting by 1 0-159 - YY[gIndex] = globalPix.col ; // origin shifting by 1 0-415 + XX[gIndex] = globalPix.row; // origin shifting by 1 0-159 + YY[gIndex] = globalPix.col; // origin shifting by 1 0-415 ADC[gIndex] = getADC(ww); pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]); moduleId[gIndex] = detId.moduleId; @@ -583,7 +590,6 @@ namespace pixelgpudetails { const int threadsPerBlock = 512; const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all - assert(0 == wordCounter%2); // wordCounter is the total no of words in each event to be trasfered on device cudaCheck(cudaMemcpyAsync(&word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); @@ -630,7 +636,6 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigis<<>>( moduleInd_d, xx_d, yy_d, adc_d, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 2b0b205c9f536..2f7436052902b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -83,7 +83,7 @@ namespace pixelgpudetails { class Packing { public: using PackedDigiType = uint32_t; - + // Constructor: pre-computes masks and shifts from field widths __host__ __device__ inline @@ -144,22 +144,32 @@ namespace pixelgpudetails { (adc << thePacking.adc_shift); } + constexpr + uint32_t pixelToChannel( int row, int col) { + constexpr Packing thePacking = packing(); + return (row << thePacking.column_width) | col; + } + + using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; class SiPixelRawToClusterGPUKernel { public: - SiPixelRawToClusterGPUKernel(); + + using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; + + SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); ~SiPixelRawToClusterGPUKernel(); - + SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete; SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length); - + // Not really very async yet... void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, @@ -170,8 +180,9 @@ namespace pixelgpudetails { auto getProduct() const { return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - nDigis, nModulesActive, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d + gpuProduct_d, + xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, + nDigis, nModulesActive }; } @@ -181,6 +192,11 @@ namespace pixelgpudetails { unsigned char *fedId_h = nullptr; // to hold fed index for each word // output + GPUProduct gpuProduct; + GPUProduct * gpuProduct_d; + + // FIXME cleanup all these are in the gpuProduct above... + uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output pixelgpudetails::error_obj *data_h = nullptr; @@ -209,7 +225,7 @@ namespace pixelgpudetails { uint32_t * clusInModule_d; uint32_t * moduleId_d; }; - + // configuration and memory buffers alocated on the GPU struct context { uint32_t * word_d; @@ -223,7 +239,7 @@ namespace pixelgpudetails { GPU::SimpleVector * error_d; error_obj * data_d; - + // these are for the clusterizer (to be moved) uint32_t * moduleStart_d; int32_t * clus_d; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 999fdcd6eff19..62253ca9d7e1b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -36,11 +36,10 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); -// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist))); - + cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist))); cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU))); + gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id())); - } PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { @@ -59,8 +58,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.sortIndex_d)); cudaCheck(cudaFree(gpu_.mr_d)); cudaCheck(cudaFree(gpu_.mc_d)); - // cudaCheck(cudaFree(gpu_.hist_d)); - + cudaCheck(cudaFree(gpu_.hist_d)); cudaCheck(cudaFree(gpu_d)); } @@ -78,7 +76,7 @@ namespace pixelgpudetails { input.clusInModule_d, input.clusInModule_d + gpuClustering::MaxNumModules, &gpu_.hitsModuleStart_d[1]); - + int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) gpuPixelRecHits::getHits<<>>( @@ -96,20 +94,20 @@ namespace pixelgpudetails { gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d, gpu_.iphi_d, gpu_.xl_d, gpu_.yl_d, - gpu_.xerr_d, gpu_.yerr_d, + gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d, gpu_.mc_d ); // needed only if hits on CPU are required... cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - + // to be moved to gpu? auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules]; for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]]; hitsLayerStart_[10]=nhits; #ifdef GPU_DEBUG - std::cout << "hit layerStart "; + std::cout << "hit layerStart "; for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' '; std::cout << "end:" << hitsLayerStart_[10] << std::endl; #endif @@ -119,9 +117,7 @@ namespace pixelgpudetails { // for timing test // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d); - // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream); - - + cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id()); } HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {