Skip to content

Commit

Permalink
Move the implementation of the gpuVertexFinder kernels to gpuVertexFi…
Browse files Browse the repository at this point in the history
…nder.cc
  • Loading branch information
fwyzard committed Apr 27, 2021
1 parent 109fbc4 commit ad06d78
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 194 deletions.
193 changes: 192 additions & 1 deletion RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
Original file line number Diff line number Diff line change
@@ -1 +1,192 @@
#include "gpuVertexFinderImpl.h"
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include "gpuClusterTracksByDensity.h"
#include "gpuClusterTracksDBSCAN.h"
#include "gpuClusterTracksIterative.h"
#include "gpuFitVertices.h"
#include "gpuSortByPt2.h"
#include "gpuSplitVertices.h"

#undef PIXVERTEX_DEBUG_PRODUCE

namespace gpuVertexFinder {

// reject outlier tracks that contribute more than this to the chi2 of the vertex fit
constexpr float maxChi2ForFirstFit = 50.f;
constexpr float maxChi2ForFinalFit = 5000.f;

// split vertices with a chi2/NDoF greater than this
constexpr float maxChi2ForSplit = 9.f;

__global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
assert(ptracks);
assert(soa);
auto const& tracks = *ptracks;
auto const& fit = tracks.stateAtBS;
auto const* quality = tracks.qualityData();

auto first = blockIdx.x * blockDim.x + threadIdx.x;
for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
auto nHits = tracks.nHits(idx);
if (nHits == 0)
break; // this is a guard: maybe we need to move to nTracks...

// initialize soa...
soa->idv[idx] = -1;

if (nHits < 4)
continue; // no triplets
if (quality[idx] != pixelTrack::Quality::loose)
continue;

auto pt = tracks.pt(idx);

if (pt < ptMin)
continue;

auto& data = *pws;
auto it = atomicAdd(&data.ntrks, 1);
data.itrk[it] = idx;
data.zt[it] = tracks.zip(idx);
data.ezt2[it] = fit.covariance(idx)(14);
data.ptt2[it] = pt * pt;
}
}

// #define THREE_KERNELS
#ifndef THREE_KERNELS
__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster,
) {
clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
__syncthreads();
fitVertices(pdata, pws, maxChi2ForFirstFit);
__syncthreads();
splitVertices(pdata, pws, maxChi2ForSplit);
__syncthreads();
fitVertices(pdata, pws, maxChi2ForFinalFit);
__syncthreads();
sortByPt2(pdata, pws);
}
#else
__global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
gpuVertexFinder::WorkSpace* pws,
int minT, // min number of neighbours to be "seed"
float eps, // max absolute distance to cluster
float errmax, // max error to be "seed"
float chi2max // max normalized distance to cluster,
) {
clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
__syncthreads();
fitVertices(pdata, pws, maxChi2ForFirstFit);
}

__global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
fitVertices(pdata, pws, maxChi2ForFinalFit);
__syncthreads();
sortByPt2(pdata, pws);
}
#endif

#ifdef __CUDACC__
ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
std::cout << "producing Vertices on GPU" << std::endl;
#endif // PIXVERTEX_DEBUG_PRODUCE
ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
#else
ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
std::cout << "producing Vertices on CPU" << std::endl;
#endif // PIXVERTEX_DEBUG_PRODUCE
ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
#endif
assert(tksoa);
auto* soa = vertices.get();
assert(soa);

#ifdef __CUDACC__
auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
#else
auto ws_d = std::make_unique<WorkSpace>();
#endif

#ifdef __CUDACC__
init<<<1, 1, 0, stream>>>(soa, ws_d.get());
auto blockSize = 128;
auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
cudaCheck(cudaGetLastError());
#else
init(soa, ws_d.get());
loadTracks(tksoa, soa, ws_d.get(), ptMin);
#endif

#ifdef __CUDACC__
// Running too many thread lead to problems when printf is enabled.
constexpr int maxThreadsForPrint = 1024 - 256;
constexpr int numBlocks = 1024;
constexpr int threadsPerBlock = 128;

if (oneKernel_) {
// implemented only for density clustesrs
#ifndef THREE_KERNELS
vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
#else
vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
cudaCheck(cudaGetLastError());
// one block per vertex...
splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
cudaCheck(cudaGetLastError());
vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
#endif
} else { // five kernels
if (useDensity_) {
clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useDBSCAN_) {
clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useIterative_) {
clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
}
cudaCheck(cudaGetLastError());
fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
cudaCheck(cudaGetLastError());
// one block per vertex...
splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
cudaCheck(cudaGetLastError());
fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
cudaCheck(cudaGetLastError());
sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
}
cudaCheck(cudaGetLastError());
#else // __CUDACC__
if (useDensity_) {
clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useDBSCAN_) {
clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
} else if (useIterative_) {
clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
}
#ifdef PIXVERTEX_DEBUG_PRODUCE
std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
#endif // PIXVERTEX_DEBUG_PRODUCE
fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
// one block per vertex!
splitVertices(soa, ws_d.get(), maxChi2ForSplit);
fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
sortByPt2(soa, ws_d.get());
#endif

return vertices;
}

} // namespace gpuVertexFinder

#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
Original file line number Diff line number Diff line change
@@ -1 +1 @@
#include "gpuVertexFinderImpl.h"
#include "gpuVertexFinder.cc"
192 changes: 0 additions & 192 deletions RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h

This file was deleted.

0 comments on commit ad06d78

Please sign in to comment.