diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 084763385bdb4..d685ced488233 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1 +1,192 @@ -#include "gpuVertexFinderImpl.h" +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" +#include "gpuFitVertices.h" +#include "gpuSortByPt2.h" +#include "gpuSplitVertices.h" + +#undef PIXVERTEX_DEBUG_PRODUCE + +namespace gpuVertexFinder { + + // reject outlier tracks that contribute more than this to the chi2 of the vertex fit + constexpr float maxChi2ForFirstFit = 50.f; + constexpr float maxChi2ForFinalFit = 5000.f; + + // split vertices with a chi2/NDoF greater than this + constexpr float maxChi2ForSplit = 9.f; + + __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { + assert(ptracks); + assert(soa); + auto const& tracks = *ptracks; + auto const& fit = tracks.stateAtBS; + auto const* quality = tracks.qualityData(); + + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... + soa->idv[idx] = -1; + + if (nHits < 4) + continue; // no triplets + if (quality[idx] != pixelTrack::Quality::loose) + continue; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + continue; + + auto& data = *pws; + auto it = atomicAdd(&data.ntrks, 1); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + } + } + +// #define THREE_KERNELS +#ifndef THREE_KERNELS + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + __syncthreads(); + splitVertices(pdata, pws, maxChi2ForSplit); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#else + __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + } + + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#endif + +#ifdef __CUDACC__ + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on GPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); +#else + ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on CPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexHeterogeneous vertices(std::make_unique()); +#endif + assert(tksoa); + auto* soa = vertices.get(); + assert(soa); + +#ifdef __CUDACC__ + auto ws_d = cms::cuda::make_device_unique(stream); +#else + auto ws_d = std::make_unique(); +#endif + +#ifdef __CUDACC__ + init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + auto blockSize = 128; + auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); + cudaCheck(cudaGetLastError()); +#else + init(soa, ws_d.get()); + loadTracks(tksoa, soa, ws_d.get(), ptMin); +#endif + +#ifdef __CUDACC__ + // Running too many thread lead to problems when printf is enabled. + constexpr int maxThreadsForPrint = 1024 - 256; + constexpr int numBlocks = 1024; + constexpr int threadsPerBlock = 128; + + if (oneKernel_) { + // implemented only for density clustesrs +#ifndef THREE_KERNELS + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); +#else + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + cudaCheck(cudaGetLastError()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); +#endif + } else { // five kernels + if (useDensity_) { + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + cudaCheck(cudaGetLastError()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + } + cudaCheck(cudaGetLastError()); +#else // __CUDACC__ + if (useDensity_) { + clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + } +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + // one block per vertex! + splitVertices(soa, ws_d.get(), maxChi2ForSplit); + fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.get()); +#endif + + return vertices; + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu index 084763385bdb4..9674eac7d8784 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu @@ -1 +1 @@ -#include "gpuVertexFinderImpl.h" +#include "gpuVertexFinder.cc" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h deleted file mode 100644 index d685ced488233..0000000000000 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h +++ /dev/null @@ -1,192 +0,0 @@ -#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h -#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -#include "gpuClusterTracksByDensity.h" -#include "gpuClusterTracksDBSCAN.h" -#include "gpuClusterTracksIterative.h" -#include "gpuFitVertices.h" -#include "gpuSortByPt2.h" -#include "gpuSplitVertices.h" - -#undef PIXVERTEX_DEBUG_PRODUCE - -namespace gpuVertexFinder { - - // reject outlier tracks that contribute more than this to the chi2 of the vertex fit - constexpr float maxChi2ForFirstFit = 50.f; - constexpr float maxChi2ForFinalFit = 5000.f; - - // split vertices with a chi2/NDoF greater than this - constexpr float maxChi2ForSplit = 9.f; - - __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); - - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); - if (nHits == 0) - break; // this is a guard: maybe we need to move to nTracks... - - // initialize soa... - soa->idv[idx] = -1; - - if (nHits < 4) - continue; // no triplets - if (quality[idx] != pixelTrack::Quality::loose) - continue; - - auto pt = tracks.pt(idx); - - if (pt < ptMin) - continue; - - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; - } - } - -// #define THREE_KERNELS -#ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, - ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); - __syncthreads(); - fitVertices(pdata, pws, maxChi2ForFirstFit); - __syncthreads(); - splitVertices(pdata, pws, maxChi2ForSplit); - __syncthreads(); - fitVertices(pdata, pws, maxChi2ForFinalFit); - __syncthreads(); - sortByPt2(pdata, pws); - } -#else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster, - ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); - __syncthreads(); - fitVertices(pdata, pws, maxChi2ForFirstFit); - } - - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { - fitVertices(pdata, pws, maxChi2ForFinalFit); - __syncthreads(); - sortByPt2(pdata, pws); - } -#endif - -#ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { -#ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on GPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); -#else - ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { -#ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on CPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); -#endif - assert(tksoa); - auto* soa = vertices.get(); - assert(soa); - -#ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif - -#ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); - auto blockSize = 128; - auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); - cudaCheck(cudaGetLastError()); -#else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin); -#endif - -#ifdef __CUDACC__ - // Running too many thread lead to problems when printf is enabled. - constexpr int maxThreadsForPrint = 1024 - 256; - constexpr int numBlocks = 1024; - constexpr int threadsPerBlock = 128; - - if (oneKernel_) { - // implemented only for density clustesrs -#ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); -#else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - cudaCheck(cudaGetLastError()); - // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); - cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); -#endif - } else { // five kernels - if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); - } - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); - cudaCheck(cudaGetLastError()); - // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); - cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); - cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); - } - cudaCheck(cudaGetLastError()); -#else // __CUDACC__ - if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); - } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); - } -#ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); - // one block per vertex! - splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); -#endif - - return vertices; - } - -} // namespace gpuVertexFinder - -#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h