diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index baa2e7f64c613..c398b29a64c69 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -8,14 +8,6 @@ #include #endif // __CUDA_ARCH__ -#ifdef __CUDACC__ -#include -#else -#define __device__ -#define __global__ -#define __host__ -#endif // __CUDACC__ - #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #ifdef __CUDACC__ diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h index 4bfa4e694d9dc..c107f4b228539 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h @@ -1,6 +1,9 @@ #ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h #define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h +#include + + #include // reimplementation of std algorithms able to compile with CUDA and run on GPUs, @@ -10,6 +13,7 @@ namespace cuda_std { template struct less { + __device__ __host__ constexpr bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; } @@ -18,10 +22,12 @@ namespace cuda_std { template<> struct less { template + __device__ __host__ constexpr bool operator()(const T &lhs, const U &rhs ) const { return lhs < rhs;} }; template> + __device__ __host__ constexpr RandomIt lower_bound(RandomIt first, RandomIt last, const T& value, Compare comp={}) { @@ -43,6 +49,7 @@ namespace cuda_std { } template> + __device__ __host__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T& value, Compare comp={}) { @@ -64,6 +71,7 @@ namespace cuda_std { } template> + __device__ __host__ constexpr RandomIt binary_find(RandomIt first, RandomIt last, const T& value, Compare comp={}) { diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 1c1d28dbf3761..ab97c243c4385 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -1,31 +1,31 @@ + + - + + + - - - - + + + - - - diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 29e5e82049b5c..88004245dddae 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -38,7 +38,7 @@ namespace pixelgpudetails { - SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() { + SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) { int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; cudaMallocHost(&word, sizeof(unsigned int)*WSIZE); cudaMallocHost(&fedId_h, sizeof(unsigned char)*WSIZE); @@ -90,6 +90,13 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) )); cudaCheck(cudaMalloc((void**) & moduleId_d, (MaxNumModules)*sizeof(uint32_t) )); + + cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct))); + gpuProduct = getProduct(); + assert(xx_d==gpuProduct.xx_d); + + cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id())); + } SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() { @@ -111,6 +118,10 @@ namespace pixelgpudetails { cudaCheck(cudaFree(clus_d)); cudaCheck(cudaFree(clusInModule_d)); cudaCheck(cudaFree(moduleId_d)); + + cudaCheck(cudaFree(gpuProduct_d)); + + } void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 2b0b205c9f536..bb7768ebf5a76 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -144,12 +144,22 @@ namespace pixelgpudetails { (adc << thePacking.adc_shift); } + constexpr + uint32_t pixelToChannel( int row, int col) { + constexpr Packing thePacking = packing(); + return (row << thePacking.column_width) | col; + } + + using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; class SiPixelRawToClusterGPUKernel { public: - SiPixelRawToClusterGPUKernel(); + + using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; + + SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream); ~SiPixelRawToClusterGPUKernel(); @@ -170,8 +180,9 @@ namespace pixelgpudetails { auto getProduct() const { return siPixelRawToClusterHeterogeneousProduct::GPUProduct{ pdigi_h, rawIdArr_h, clus_h, adc_h, error_h, - nDigis, nModulesActive, - xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d + gpuProduct_d, + xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d, + nDigis, nModulesActive }; } @@ -181,6 +192,11 @@ namespace pixelgpudetails { unsigned char *fedId_h = nullptr; // to hold fed index for each word // output + GPUProduct gpuProduct; + GPUProduct * gpuProduct_d; + + // FIXME cleanup all these are in the gpuProduct above... + uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr; // host copy of output uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output pixelgpudetails::error_obj *data_h = nullptr; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 196a08b85d861..e2c2dcf828685 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -425,7 +425,7 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c // ----------------------------------------------------------------------------- void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { // Allocate GPU resources here - gpuAlgo_ = std::make_unique(); + gpuAlgo_ = std::make_unique(cudaStream); gpuModulesToUnpack_ = std::make_unique(); } @@ -523,6 +523,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { auto output = std::make_unique(gpuAlgo_->getProduct()); + assert(output->me_d); ev.put(std::move(output), [this](const GPUProduct& gpu, CPUProduct& cpu) { this->convertGPUtoCPU(gpu, cpu); }); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h index 6d6da10934532..5c0d027fe57b5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h @@ -36,6 +36,7 @@ namespace siPixelRawToClusterHeterogeneousProduct { { } }; + // FIXME split in two struct GPUProduct { // Needed for digi and cluster CPU output uint32_t const * pdigi_h = nullptr; @@ -44,9 +45,9 @@ namespace siPixelRawToClusterHeterogeneousProduct { uint16_t const * adc_h = nullptr; GPU::SimpleVector const * error_h = nullptr; + GPUProduct const * me_d = nullptr; + // Needed for GPU rechits - uint32_t nDigis; - uint32_t nModules; uint16_t const * xx_d; uint16_t const * yy_d; uint16_t const * adc_d; @@ -55,6 +56,10 @@ namespace siPixelRawToClusterHeterogeneousProduct { int32_t const * clus_d; uint32_t const * clusInModule_d; uint32_t const * moduleId_d; + + uint32_t nDigis; + uint32_t nModules; + }; using HeterogeneousDigiCluster = HeterogeneousProductImpl, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 999fdcd6eff19..1bf3c7dce3d94 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -36,9 +36,10 @@ namespace pixelgpudetails { cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); -// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist))); cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU))); + gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id())); } @@ -59,7 +60,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.sortIndex_d)); cudaCheck(cudaFree(gpu_.mr_d)); cudaCheck(cudaFree(gpu_.mc_d)); - // cudaCheck(cudaFree(gpu_.hist_d)); + cudaCheck(cudaFree(gpu_.hist_d)); cudaCheck(cudaFree(gpu_d)); } @@ -119,7 +120,7 @@ namespace pixelgpudetails { // for timing test // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d); - // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream); + cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id()); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 6ccd435faa7ba..79608c13713d7 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -8,7 +8,7 @@ #include #include -// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" namespace siPixelRecHitsHeterogeneousProduct { @@ -31,22 +31,26 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - // using Hist = HistoContainer; - // Hist * hist_d; + using Hist = HistoContainer; + Hist * hist_d; + + HitsOnGPU const * me_d=nullptr; }; struct HitsOnCPU { HitsOnCPU() = default; explicit HitsOnCPU(uint32_t nhits) : - charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits){} + charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits), + nHits(nhits){} uint32_t hitsModuleStart[2001]; std::vector charge; std::vector xl, yl; std::vector xe, ye; std::vector mr; std::vector mc; - + + uint32_t nHits; HitsOnGPU const * gpu_d=nullptr; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc index 186a84617e8d3..e9873247a119f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc @@ -18,6 +18,11 @@ #include "CAHitQuadrupletGeneratorGPU.h" +// gpu +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + + + namespace { void fillNtuplets(RegionsSeedingHitSets::RegionFiller &seedingHitSetsFiller, const OrderedHitSeeds &quadruplets) { @@ -31,6 +36,10 @@ class CAHitNtupletHeterogeneousEDProducer : public HeterogeneousEDProducer> { public: + + using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; + + CAHitNtupletHeterogeneousEDProducer(const edm::ParameterSet &iConfig); ~CAHitNtupletHeterogeneousEDProducer() = default; @@ -50,6 +59,9 @@ class CAHitNtupletHeterogeneousEDProducer private: edm::EDGetTokenT doubletToken_; + edm::EDGetTokenT tGpuHits; + + edm::RunningAverage localRA_; CAHitQuadrupletGeneratorGPU GPUGenerator_; CAHitQuadrupletGenerator CPUGenerator_; @@ -63,6 +75,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer( : HeterogeneousEDProducer(iConfig), doubletToken_(consumes( iConfig.getParameter("doublets"))), + tGpuHits(consumesHeterogeneous(iConfig.getParameter("heterogeneousPixelRecHitSrc"))), GPUGenerator_(iConfig, consumesCollector()), CPUGenerator_(iConfig, consumesCollector()) { produces(); @@ -73,6 +86,9 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions( edm::ParameterSetDescription desc; desc.add("doublets", edm::InputTag("hitPairEDProducer")); + + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous")); + CAHitQuadrupletGeneratorGPU::fillDescriptions(desc); HeterogeneousEDProducer::fillPSetDescription(desc); auto label = "caHitQuadrupletHeterogeneousEDProducer"; @@ -106,6 +122,13 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda( seedingHitSets_ = std::make_unique(); + edm::Handle gh; + iEvent.getByToken(tGpuHits, gh); + auto const & gHits = *gh; +// auto nhits = gHits.nHits; + + GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id()); + if (regionDoublets.empty()) { emptyRegionDoublets = true; } else { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 4a7f3ce5aa5f1..b94aad94bf16b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -6,6 +6,8 @@ #include "GPUCACell.h" #include "CAHitQuadrupletGeneratorGPU.h" +#include "gpuPixelDoublets.h" + __global__ void kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_, const GPULayerDoublets *gpuDoublets, @@ -490,3 +492,18 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cud } return quadsInterface; } + + + + +void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) { + auto nhits = hh.nHits; + + float phiCut=0.06; + int threadsPerBlock = 256; + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + + gpuPixelDoublets::getDoubletsFromHisto<<>>(hh.gpu_d,phiCut); + + +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index a6e9db46e68e7..717164c9cff62 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -3,6 +3,10 @@ #include +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + + + #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" @@ -32,6 +36,11 @@ namespace edm { class CAHitQuadrupletGeneratorGPU { public: + + using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + + typedef LayerHitMapCache LayerCacheType; static constexpr unsigned int minLayers = 4; @@ -49,6 +58,8 @@ class CAHitQuadrupletGeneratorGPU { void initEvent(const edm::Event& ev, const edm::EventSetup& es); + void buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream); + void hitNtuplets(const IntermediateHitDoublets& regionDoublets, const edm::EventSetup& es, const SeedingLayerSetsHits& layers, cudaStream_t stream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h new file mode 100644 index 0000000000000..019e775209f96 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -0,0 +1,161 @@ +#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h +#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h + +#include +#include +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" + +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + +namespace gpuPixelDoublets { + + + __device__ + std::pair + findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) { + + assert(iphicut>0); + + // find extreemes in top + int16_t minPhi = phiMe-iphicut; + int16_t maxPhi = phiMe+iphicut; + + // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl; + + // guess and adjust + auto findLimit = [&](int16_t mPhi) { + int jm = float(0.5f*size)*(1.f+float(mPhi)/float(std::numeric_limits::max())); + // std::cout << "jm for " << mPhi << ' ' << jm << std::endl; + jm = std::min(size-1,std::max(0,jm)); + bool notDone=true; + while(jm>0 && mPhiiphi[index[++jm]]){} + jm = std::min(size-1,std::max(0,jm)); + return jm; + }; + + auto jmin = findLimit(minPhi); + auto jmax = findLimit(maxPhi); + + + /* + std::cout << "j min/max " << jmin << ' ' << jmax << std::endl; + std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl; + std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl; + std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl; + */ + + return std::make_pair(jmin,jmax); + } + + + __global__ + void getDoubletsFromSorted(int16_t * iphi, uint16_t * index, uint32_t * offsets, float phiCut) { + + auto iphicut = phi2short(phiCut); + + auto i = blockIdx.x*blockDim.x + threadIdx.x; + if (i>=offsets[9]) return; // get rid of last layer + + assert(0==offsets[0]); + int top = (i>offsets[5]) ? 5: 0; + while (i>=offsets[++top]){}; + assert(top<10); + auto bottom = top-1; + if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already) + assert(i>=offsets[bottom]); + assert(i= (offsets[top]-offsets[bottom])) { + printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]); + return; + } + + assert(index[i]::max()); + + auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut); + + auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) { + auto topPhi = iphi+offsets[top]; + uint16_t imax = std::numeric_limits::max(); + uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0; + int n=0; + for (uint16_t i = mymin+offset; i!=mymax; i++) { + assert(i<=imax); + uint16_t k = (i>mymax) ? i-offset : i; + assert(k=mymin || k2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) + printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut); + n++; + } + int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin; + assert(n==tot); + }; + + slidingWindow(size,jLimits.first,jLimits.second); + + } + + template + __device__ + void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) { + + auto iphicut = phi2short(phiCut); + + auto i = blockIdx.x*blockDim.x + threadIdx.x; + if (i>=offsets[9]) return; // get rid of last layer + + assert(0==offsets[0]); + int top = (i>offsets[5]) ? 5: 0; + while (i>=offsets[++top]){}; + assert(top<10); + auto bottom = top-1; + if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already) + assert(i>=offsets[bottom]); + assert(i iphicut ) continue; + ++tot; + } + } + if (0==hist[top].nspills) assert(tot>=nmin); + // look in spill bin as well.... + + } + + void + __global__ + getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) { + auto const & hh = *hhp; + doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut); + + } + +} // namespace end + +#endif diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index 7cc02d9313e26..c767b1e68936a 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,5 +1,14 @@ + + + - + + + + + + + diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu new file mode 100644 index 0000000000000..7a3e4eb83161d --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -0,0 +1,220 @@ +#include "ClusterSLOnGPU.h" + +// for the "packing" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include +#include +#include +#include + + +using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; + +__global__ +void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { + + assert(slp==slp->me_d); + + constexpr int32_t invTK = 0; // std::numeric_limits::max(); + + constexpr uint16_t InvId=9999; // must be > MaxNumModules + + auto const & dd = *ddp; + auto const & hh = *hhp; + auto const & sl = *slp; + auto i = blockIdx.x*blockDim.x + threadIdx.x; + + if (i>ndigis) return; + + auto id = dd.moduleInd_d[i]; + if (InvId==id) return; + assert(id<2000); + + auto ch = pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]); + auto first = hh.hitsModuleStart_d[id]; + auto cl = first + dd.clus_d[i]; + assert(cl<256*2000); + + const std::array me{{id,ch,0,0}}; + + auto less = [] __device__ __host__ (std::array const & a, std::array const & b)->bool { + return a[0] const & a, std::array const & b)->bool { + return a[0]==b[0] && a[1]==b[1]; // in this context we do not care of [2] + }; + + auto const * b = sl.links_d; + auto const * e = b+n; + + auto p = cuda_std::lower_bound(b,e,me,less); + int32_t j = p-sl.links_d; + assert(j>=0); + + auto getTK = [&](int i) { auto const & l = sl.links_d[i]; return l[2];}; + + j = std::min(int(j),int(n-1)); + if (equal(me,sl.links_d[j])) { + auto const itk = j; + auto const tk = getTK(j); + auto old = atomicCAS(&sl.tkId_d[cl],invTK,itk); + if (invTK==old || tk==getTK(old)) { + atomicAdd(&sl.n1_d[cl],1); +// sl.n1_d[cl] = tk; + } else { + auto old = atomicCAS(&sl.tkId2_d[cl],invTK,itk); + if (invTK==old || tk==getTK(old)) atomicAdd(&sl.n2_d[cl],1); + } +// if (92==tk) printf("TK3: %d %d %d %d: %d,%d ?%d?%d\n", j, cl, id, i, dd.xx_d[i], dd.yy_d[i], hh.mr_d[cl], hh.mc_d[cl]); + } + /* + else { + auto const & k=sl.links_d[j]; + auto const & kk = j+1nhits) return; + +// auto const & dd = *ddp; +// auto const & hh = *hhp; + auto const & sl = *slp; + + assert(sl.tkId_d[i]==0); + auto const & tk = sl.links_d[0]; + assert(tk[0]==0); + assert(tk[1]==0); + assert(tk[2]==0); + assert(tk[3]==0); + + // if (i==0) printf("xx_d gpu %x\n",dd.xx_d); + +} + + +__global__ +void dumpLink(int first, int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) { + auto i = first + blockIdx.x*blockDim.x + threadIdx.x; + if (i>nhits) return; + + auto const & hh = *hhp; + auto const & sl = *slp; + + auto const & tk1 = sl.links_d[sl.tkId_d[i]]; + auto const & tk2 = sl.links_d[sl.tkId2_d[i]]; + + printf("HIT: %d %d %d %d %f %f %f %f %d %d %d %d %d %d %d\n",ev, i, + hh.detInd_d[i], hh.charge_d[i], + hh.xg_d[i],hh.yg_d[i],hh.zg_d[i],hh.rg_d[i],hh.iphi_d[i], + tk1[2],tk1[3],sl.n1_d[i], + tk2[2],tk2[3],sl.n2_d[i] + ); + +} + + + +namespace clusterSLOnGPU { + + constexpr uint32_t invTK = 0; // std::numeric_limits::max(); + + void printCSVHeader() { + printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "ev", "ind", + "det", "charge", + "xg","yg","zg","rg","iphi", + "tkId","pt","n1","tkId2","pt2","n2" + ); + } + + + std::atomic evId(0); + std::once_flag doneCSVHeader; + + Kernel::Kernel(cuda::stream_t<>& stream, bool dump) : doDump(dump) { + if (doDump) std::call_once(doneCSVHeader,printCSVHeader); + alloc(stream); + } + + + void + Kernel::alloc(cuda::stream_t<>& stream) { + cudaCheck(cudaMalloc((void**) & slgpu.links_d,(ClusterSLGPU::MAX_DIGIS)*sizeof(std::array))); + + cudaCheck(cudaMalloc((void**) & slgpu.tkId_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**) & slgpu.tkId2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**) & slgpu.n1_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**) & slgpu.n2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t))); + + + cudaCheck(cudaMalloc((void**) & slgpu.me_d, sizeof(ClusterSLGPU))); + cudaCheck(cudaMemcpyAsync(slgpu.me_d, &slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id())); + } + + void + Kernel::deAlloc() { + cudaCheck(cudaFree(slgpu.links_d)); + cudaCheck(cudaFree(slgpu.tkId_d)); + cudaCheck(cudaFree(slgpu.tkId2_d)); + cudaCheck(cudaFree(slgpu.n1_d)); + cudaCheck(cudaFree(slgpu.n2_d)); + cudaCheck(cudaFree(slgpu.me_d)); +} + + + void + Kernel::zero(cudaStream_t stream) { + cudaCheck(cudaMemsetAsync(slgpu.tkId_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); + cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); + } + + + void + Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { + + /* + size_t pfs = 16*1024*1024; + // cudaDeviceSetLimit(cudaLimitPrintfFifoSize,pfs); + cudaDeviceGetLimit(&pfs,cudaLimitPrintfFifoSize); + std::cout << "cudaLimitPrintfFifoSize " << pfs << std::endl; + */ + + zero(stream.id()); + + ClusterSLGPU const & sl = slgpu; + + int ev = ++evId; + int threadsPerBlock = 256; + + int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; + verifyZero<<>>(ev, dd.me_d, hh.gpu_d, nhits, sl.me_d); + + + blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; + + assert(sl.me_d); + simLink<<>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n); + + if (doDump) { + cudaStreamSynchronize(stream.id()); // flush previous printf + // one line == 200B so each kernel can print only 5K lines.... + blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock; + for (int first=0; first>>(first, ev, hh.gpu_d, nhits, sl.me_d); + cudaStreamSynchronize(stream.id()); + } + } + cudaCheck(cudaGetLastError()); + + } + +} diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h new file mode 100644 index 0000000000000..352337e783105 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h @@ -0,0 +1,44 @@ +#ifndef SimTrackerTrackerHitAssociationClusterSLOnGPU_H +#define SimTrackerTrackerHitAssociationClusterSLOnGPU_H + +#include +#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +#include "trackerHitAssociationHeterogeneousProduct.h" + +#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + + + + +namespace clusterSLOnGPU { + + using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; + using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct; + + using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct; + using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + + + class Kernel { + public: + Kernel(cuda::stream_t<>& stream, bool dump); + ~Kernel() {deAlloc();} + void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); + GPUProduct getProduct() { return GPUProduct{slgpu.me_d};} + + private: + void alloc(cuda::stream_t<>& stream); + void deAlloc(); + void zero(cudaStream_t stream); + public: + ClusterSLGPU slgpu; + bool doDump; + }; +} + +#endif diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc new file mode 100644 index 0000000000000..6cffed8ad0cd5 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -0,0 +1,442 @@ +#include +#include +#include + +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" + + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/DetSetVectorNew.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/SiPixelDetId/interface/PixelChannelIdentifier.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" +#include "DataFormats/Phase2TrackerCluster/interface/Phase2TrackerCluster1D.h" + +#include "SimDataFormats/Track/interface/SimTrackContainer.h" +#include "SimDataFormats/TrackerDigiSimLink/interface/StripDigiSimLink.h" +#include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" +#include "DataFormats/Phase2TrackerDigi/interface/Phase2TrackerDigi.h" +#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticle.h" +#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticleFwd.h" +#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h" + +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" + + +// gpu +#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "ClusterSLOnGPU.h" + +class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer> +{ +public: + typedef std::vector OmniClusterCollection; + + using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; + using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct; + using CPUProduct = trackerHitAssociationHeterogeneousProduct::CPUProduct; + using Output = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct; + + using PixelDigiClustersH = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; + using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; + + explicit ClusterTPAssociationHeterogeneous(const edm::ParameterSet&); + ~ClusterTPAssociationHeterogeneous() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + + void beginStreamGPUCuda(edm::StreamID streamId, + cuda::stream_t<> &cudaStream) override; + + void acquireGPUCuda(const edm::HeterogeneousEvent &iEvent, + const edm::EventSetup &iSetup, + cuda::stream_t<> &cudaStream) override; + void produceGPUCuda(edm::HeterogeneousEvent &iEvent, + const edm::EventSetup &iSetup, + cuda::stream_t<> &cudaStream) override; + void produceCPU(edm::HeterogeneousEvent &iEvent, + const edm::EventSetup &iSetup) override; + + void makeMap(const edm::HeterogeneousEvent &iEvent); + std::unique_ptr produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es); + + template + std::vector > + getSimTrackId(const edm::Handle >& simLinks, const DetId& detId, uint32_t channel) const; + + edm::EDGetTokenT > sipixelSimLinksToken_; + edm::EDGetTokenT > sistripSimLinksToken_; + edm::EDGetTokenT > siphase2OTSimLinksToken_; + edm::EDGetTokenT > pixelClustersToken_; + edm::EDGetTokenT > stripClustersToken_; + edm::EDGetTokenT > phase2OTClustersToken_; + edm::EDGetTokenT trackingParticleToken_; + + edm::EDGetTokenT tGpuDigis; + edm::EDGetTokenT tGpuHits; + + std::unique_ptr gpuAlgo; + + std::map, TrackingParticleRef> mapping; + + std::vector> digi2tp; + + bool doDump; + +}; + +ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg) + : HeterogeneousEDProducer(cfg), + sipixelSimLinksToken_(consumes >(cfg.getParameter("pixelSimLinkSrc"))), + sistripSimLinksToken_(consumes >(cfg.getParameter("stripSimLinkSrc"))), + siphase2OTSimLinksToken_(consumes >(cfg.getParameter("phase2OTSimLinkSrc"))), + pixelClustersToken_(consumes >(cfg.getParameter("pixelClusterSrc"))), + stripClustersToken_(consumes >(cfg.getParameter("stripClusterSrc"))), + phase2OTClustersToken_(consumes >(cfg.getParameter("phase2OTClusterSrc"))), + trackingParticleToken_(consumes(cfg.getParameter("trackingParticleSrc"))), + tGpuDigis(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuHits(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelRecHitSrc"))), + doDump(cfg.getParameter("dumpCSV")) +{ + produces(); +} + +void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("simTrackSrc", edm::InputTag("g4SimHits")); + desc.add("pixelSimLinkSrc", edm::InputTag("simSiPixelDigis")); + desc.add("stripSimLinkSrc", edm::InputTag("simSiStripDigis")); + desc.add("phase2OTSimLinkSrc", edm::InputTag("simSiPixelDigis","Tracker")); + desc.add("pixelClusterSrc", edm::InputTag("siPixelClusters")); + desc.add("stripClusterSrc", edm::InputTag("siStripClusters")); + desc.add("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters")); + desc.add("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth")); + desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersHeterogeneous")); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous")); + + desc.add("dumpCSV",false); + + HeterogeneousEDProducer::fillPSetDescription(desc); + + descriptions.add("tpClusterProducerHeterogeneousDefault", desc); +} + + +void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, + cuda::stream_t<> &cudaStream) { + + gpuAlgo = std::make_unique(cudaStream,doDump); + +} + +void ClusterTPAssociationHeterogeneous::makeMap(const edm::HeterogeneousEvent &iEvent) { + // TrackingParticle + edm::Handle TPCollectionH; + iEvent.getByToken(trackingParticleToken_,TPCollectionH); + + + // prepare temporary map between SimTrackId and TrackingParticle index + mapping.clear(); + for (TrackingParticleCollection::size_type itp = 0; + itp < TPCollectionH.product()->size(); ++itp) { + TrackingParticleRef trackingParticle(TPCollectionH, itp); + + // SimTracks inside TrackingParticle + EncodedEventId eid(trackingParticle->eventId()); + for (auto itrk = trackingParticle->g4Track_begin(); + itrk != trackingParticle->g4Track_end(); ++itrk) { + std::pair trkid(itrk->trackId(), eid); + //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl; + mapping.insert(std::make_pair(trkid, trackingParticle)); + } + } + + +} + + +void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent &iEvent, + const edm::EventSetup &iSetup, + cuda::stream_t<> &cudaStream) { + + edm::ESHandle geom; + iSetup.get().get( geom ); + + // Pixel DigiSimLink + edm::Handle > sipixelSimLinks; + // iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks); + iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks); + + // TrackingParticle + edm::Handle TPCollectionH; + iEvent.getByToken(trackingParticleToken_,TPCollectionH); + + makeMap(iEvent); + + // gpu stuff ------------------------ + + // std::cout << "In tpsimlink " << mapping.size() << std::endl; + + edm::Handle gd; + edm::Handle gh; + iEvent.getByToken(tGpuDigis, gd); + iEvent.getByToken(tGpuHits, gh); + auto const & gDigis = *gd; + auto const & gHits = *gh; + auto ndigis = gDigis.nDigis; + auto nhits = gHits.nHits; + + uint32_t nn=0, ng=0, ng10=0; + digi2tp.clear(); + {std::array a{{0,0,0,0}}; digi2tp.push_back(a);} // put at 0 0 + for (auto const & links : *sipixelSimLinks) { + DetId detId(links.detId()); + const GeomDetUnit * genericDet = geom->idToDetUnit(detId); + uint32_t gind = genericDet->index(); + for (auto const & link : links) { + ++ng; + if (link.fraction() > 0.3f) ++ng10; + if (link.fraction() < 0.5f) continue; + auto tkid = std::make_pair(link.SimTrackId(), link.eventId()); + auto ipos = mapping.find(tkid); + if (ipos != mapping.end()) { + uint32_t pt = 1000*(*ipos).second->pt(); + ++nn; + std::array a{{gind,uint32_t(link.channel()),(*ipos).second.key(),pt}}; + digi2tp.push_back(a); + } + } + } + std::sort(digi2tp.begin(),digi2tp.end()); + + // std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl; + + cudaCheck(cudaMemcpyAsync(gpuAlgo->slgpu.links_d, digi2tp.data(), sizeof(std::array)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id())); + gpuAlgo->algo(gDigis, ndigis, gHits, nhits, digi2tp.size(),cudaStream); + + // end gpu stuff --------------------- + + +} + +void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent, + const edm::EventSetup &iSetup, + cuda::stream_t<> &cudaStream) { + + auto output = std::make_unique(gpuAlgo->getProduct()); + + auto legacy = produceLegacy(iEvent,iSetup).release(); + + iEvent.put(std::move(output), [legacy](const GPUProduct& hits, CPUProduct& cpu) { + cpu = *legacy; delete legacy; + }); + +} + + +void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) { + + makeMap(iEvent); + iEvent.put(std::move(produceLegacy(iEvent,es))); + +} + +std::unique_ptr +ClusterTPAssociationHeterogeneous::produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) { + + + // Pixel DigiSimLink + edm::Handle > sipixelSimLinks; + // iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks); + iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks); + + // SiStrip DigiSimLink + edm::Handle > sistripSimLinks; + iEvent.getByToken(sistripSimLinksToken_,sistripSimLinks); + + // Phase2 OT DigiSimLink + edm::Handle > siphase2OTSimLinks; + iEvent.getByToken(siphase2OTSimLinksToken_, siphase2OTSimLinks); + + // Pixel Cluster + edm::Handle > pixelClusters; + bool foundPixelClusters = iEvent.getByToken(pixelClustersToken_,pixelClusters); + + // Strip Cluster + edm::Handle > stripClusters; + bool foundStripClusters = iEvent.getByToken(stripClustersToken_,stripClusters); + + // Phase2 Cluster + edm::Handle > phase2OTClusters; + bool foundPhase2OTClusters = iEvent.getByToken(phase2OTClustersToken_, phase2OTClusters); + + // TrackingParticle + edm::Handle TPCollectionH; + iEvent.getByToken(trackingParticleToken_,TPCollectionH); + + auto output = std::make_unique(TPCollectionH); + auto & clusterTPList = output->collection; + + if ( foundPixelClusters ) { + // Pixel Clusters + for (edmNew::DetSetVector::const_iterator iter = pixelClusters->begin(); + iter != pixelClusters->end(); ++iter) { + uint32_t detid = iter->id(); + DetId detId(detid); + edmNew::DetSet link_pixel = (*iter); + for (edmNew::DetSet::const_iterator di = link_pixel.begin(); + di != link_pixel.end(); ++di) { + const SiPixelCluster& cluster = (*di); + edm::Ref, SiPixelCluster> c_ref = + edmNew::makeRefTo(pixelClusters, di); + + std::set > simTkIds; + for (int irow = cluster.minPixelRow(); irow <= cluster.maxPixelRow(); ++irow) { + for (int icol = cluster.minPixelCol(); icol <= cluster.maxPixelCol(); ++icol) { + uint32_t channel = PixelChannelIdentifier::pixelToChannel(irow, icol); + std::vector > trkid(getSimTrackId(sipixelSimLinks, detId, channel)); + if (trkid.empty()) continue; + simTkIds.insert(trkid.begin(),trkid.end()); + } + } + for (std::set >::const_iterator iset = simTkIds.begin(); + iset != simTkIds.end(); iset++) { + auto ipos = mapping.find(*iset); + if (ipos != mapping.end()) { + //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl; + clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second); + } + } + } + } + } + + if ( foundStripClusters ) { + // Strip Clusters + for (edmNew::DetSetVector::const_iterator iter = stripClusters->begin(false), eter = stripClusters->end(false); + iter != eter; ++iter) { + if (!(*iter).isValid()) continue; + uint32_t detid = iter->id(); + DetId detId(detid); + edmNew::DetSet link_strip = (*iter); + for (edmNew::DetSet::const_iterator di = link_strip.begin(); + di != link_strip.end(); di++) { + const SiStripCluster& cluster = (*di); + edm::Ref, SiStripCluster> c_ref = + edmNew::makeRefTo(stripClusters, di); + + std::set > simTkIds; + int first = cluster.firstStrip(); + int last = first + cluster.amplitudes().size(); + + for (int istr = first; istr < last; ++istr) { + std::vector > trkid(getSimTrackId(sistripSimLinks, detId, istr)); + if (trkid.empty()) continue; + simTkIds.insert(trkid.begin(),trkid.end()); + } + for (std::set >::const_iterator iset = simTkIds.begin(); + iset != simTkIds.end(); iset++) { + auto ipos = mapping.find(*iset); + if (ipos != mapping.end()) { + //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl; + clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second); + } + } + } + } + } + + if ( foundPhase2OTClusters ) { + + // Phase2 Clusters + if(phase2OTClusters.isValid()){ + for (edmNew::DetSetVector::const_iterator iter = phase2OTClusters->begin(false), eter = phase2OTClusters->end(false); + iter != eter; ++iter) { + if (!(*iter).isValid()) continue; + uint32_t detid = iter->id(); + DetId detId(detid); + edmNew::DetSet link_phase2 = (*iter); + for (edmNew::DetSet::const_iterator di = link_phase2.begin(); + di != link_phase2.end(); di++) { + const Phase2TrackerCluster1D& cluster = (*di); + edm::Ref, Phase2TrackerCluster1D> c_ref = + edmNew::makeRefTo(phase2OTClusters, di); + + std::set > simTkIds; + + for (unsigned int istr(0); istr < cluster.size(); ++istr) { + uint32_t channel = Phase2TrackerDigi::pixelToChannel(cluster.firstRow() + istr, cluster.column()); + std::vector > trkid(getSimTrackId(siphase2OTSimLinks, detId, channel)); + if (trkid.empty()) continue; + simTkIds.insert(trkid.begin(),trkid.end()); + } + + for (std::set >::const_iterator iset = simTkIds.begin(); + iset != simTkIds.end(); iset++) { + auto ipos = mapping.find(*iset); + if (ipos != mapping.end()) { + clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second); + } + } + } + } + } + + } + + + clusterTPList.sortAndUnique(); + + return output; + mapping.clear(); +} + +template +std::vector > +//std::pair +ClusterTPAssociationHeterogeneous::getSimTrackId(const edm::Handle >& simLinks, + const DetId& detId, uint32_t channel) const +{ + //std::pair simTrkId; + std::vector > simTrkId; + auto isearch = simLinks->find(detId); + if (isearch != simLinks->end()) { + // Loop over DigiSimLink in this det unit + edm::DetSet link_detset = (*isearch); + for (typename edm::DetSet::const_iterator it = link_detset.data.begin(); + it != link_detset.data.end(); ++it) { + if (channel == it->channel()) { + simTrkId.push_back(std::make_pair(it->SimTrackId(), it->eventId())); + } + } + } + return simTrkId; +} +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneous); diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc new file mode 100644 index 0000000000000..d8b9e099c3474 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc @@ -0,0 +1,59 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" + +#include "trackerHitAssociationHeterogeneousProduct.h" + + +class ClusterTPAssociationHeterogeneousConverter: public edm::global::EDProducer<> { +public: + + using Input = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct; + using Product = ClusterTPAssociation; + + explicit ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig); + ~ClusterTPAssociationHeterogeneousConverter() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT token_; +}; + +ClusterTPAssociationHeterogeneousConverter::ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig): + token_(consumes(iConfig.getParameter("src"))) +{ + produces(); +} + +void ClusterTPAssociationHeterogeneousConverter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("tpClusterProducerHeterogeneos")); + + descriptions.add("tpClusterHeterogeneousConverter",desc); +} + +namespace { + template + auto copy_unique(const T& t) { + return std::make_unique(t); + } +} + +void ClusterTPAssociationHeterogeneousConverter::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::Handle hinput; + iEvent.getByToken(token_, hinput); + + const auto& input = hinput->get().getProduct(); + + iEvent.put(copy_unique(input.collection)); +} + + +DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneousConverter); diff --git a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h new file mode 100644 index 0000000000000..c3f86ddf855c8 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h @@ -0,0 +1,47 @@ +#ifndef SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H +#define SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H + +#ifndef __CUDACC__ +#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h" +#endif + +#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" + + +namespace trackerHitAssociationHeterogeneousProduct { + +#ifndef __CUDACC__ + struct CPUProduct { + CPUProduct() = default; + template + explicit CPUProduct(T const & t) : collection(t){} + ClusterTPAssociation collection; + }; +#endif + + struct ClusterSLGPU { + + ClusterSLGPU * me_d=nullptr; + std::array * links_d; + uint32_t * tkId_d; + uint32_t * tkId2_d; + uint32_t * n1_d; + uint32_t * n2_d; + + static constexpr uint32_t MAX_DIGIS = 2000*150; + static constexpr uint32_t MaxNumModules = 2000; + + }; + + struct GPUProduct { + ClusterSLGPU * gpu_d=nullptr; + }; + +#ifndef __CUDACC__ + using ClusterTPAHeterogeneousProduct = HeterogeneousProductImpl, + heterogeneous::GPUCudaProduct >; +#endif + +} + +#endif diff --git a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py index 8757a67226fb8..e8330c074dfac 100644 --- a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py +++ b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py @@ -1,4 +1,5 @@ import FWCore.ParameterSet.Config as cms +from Configuration.ProcessModifiers.gpu_cff import gpu from SimTracker.TrackerHitAssociation.tpClusterProducerDefault_cfi import tpClusterProducerDefault as _tpClusterProducerDefault @@ -18,3 +19,12 @@ stripSimLinkSrc = "mixData:StripDigiSimLink", phase2OTSimLinkSrc = "mixData:Phase2OTDigiSimLink", ) + + +from SimTracker.TrackerHitAssociation.tpClusterProducerHeterogeneousDefault_cfi import tpClusterProducerHeterogeneousDefault as _tpClusterProducerHeterogeneous +tpClusterProducerHeterogeneous = _tpClusterProducerHeterogeneous.clone() + +from SimTracker.TrackerHitAssociation.tpClusterHeterogeneousConverter_cfi import tpClusterHeterogeneousConverter as _tpHeterogeneousConverter + +gpu.toReplaceWith(tpClusterProducer, _tpHeterogeneousConverter.clone()) + diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index b6251473fcab4..dff571862bb46 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -709,9 +709,14 @@ def _uniqueFirstLayers(layerList): ### Pixel tracking only mode (placeholder for now) -tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone( - pixelClusterSrc = "siPixelClustersPreSplitting" + +tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" ) +tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone() +# Need to use the modifier to customize because the exact EDProducer type depends on the modifier +gpu.toModify(tpClusterProducerPixelTrackingOnly, src = "tpClusterProducerHeterogeneousPixelTrackingOnly") + quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone( cluster2TPSrc = "tpClusterProducerPixelTrackingOnly" ) @@ -739,12 +744,18 @@ def _uniqueFirstLayers(layerList): tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly) tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) + +_tracksValidationTruthPixelTrackingOnlyGPU = tracksValidationTruthPixelTrackingOnly.copy() +_tracksValidationTruthPixelTrackingOnlyGPU.insert(0, tpClusterProducerHeterogeneousPixelTrackingOnly) +gpu.toReplaceWith(tracksValidationTruthPixelTrackingOnly, _tracksValidationTruthPixelTrackingOnlyGPU) + tracksValidationPixelTrackingOnly = cms.Sequence( tracksValidationTruthPixelTrackingOnly + trackValidatorPixelTrackingOnly ) + ### Lite mode (only generalTracks and HP) trackValidatorLite = trackValidator.clone( label = ["generalTracks", "cutsRecoTracksHp"] diff --git a/Validation/RecoTrack/python/associators_cff.py b/Validation/RecoTrack/python/associators_cff.py index 73c6f416e509c..8c2c8b7c7c85e 100644 --- a/Validation/RecoTrack/python/associators_cff.py +++ b/Validation/RecoTrack/python/associators_cff.py @@ -3,7 +3,8 @@ #### TrackAssociation import SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi import SimTracker.TrackAssociatorProducers.trackAssociatorByPosition_cfi -from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer +# from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer +from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducerHeterogeneous as _tpClusterProducer from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import trackingParticleRecoTrackAsssociation as _trackingParticleRecoTrackAsssociation hltTPClusterProducer = _tpClusterProducer.clone(