-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move the implementation of the gpuVertexFinder kernels to gpuVertexFi…
…nder.cc
- Loading branch information
Showing
3 changed files
with
193 additions
and
194 deletions.
There are no files selected for viewing
193 changes: 192 additions & 1 deletion
193
RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,192 @@ | ||
#include "gpuVertexFinderImpl.h" | ||
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h | ||
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" | ||
|
||
#include "gpuClusterTracksByDensity.h" | ||
#include "gpuClusterTracksDBSCAN.h" | ||
#include "gpuClusterTracksIterative.h" | ||
#include "gpuFitVertices.h" | ||
#include "gpuSortByPt2.h" | ||
#include "gpuSplitVertices.h" | ||
|
||
#undef PIXVERTEX_DEBUG_PRODUCE | ||
|
||
namespace gpuVertexFinder { | ||
|
||
// reject outlier tracks that contribute more than this to the chi2 of the vertex fit | ||
constexpr float maxChi2ForFirstFit = 50.f; | ||
constexpr float maxChi2ForFinalFit = 5000.f; | ||
|
||
// split vertices with a chi2/NDoF greater than this | ||
constexpr float maxChi2ForSplit = 9.f; | ||
|
||
__global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { | ||
assert(ptracks); | ||
assert(soa); | ||
auto const& tracks = *ptracks; | ||
auto const& fit = tracks.stateAtBS; | ||
auto const* quality = tracks.qualityData(); | ||
|
||
auto first = blockIdx.x * blockDim.x + threadIdx.x; | ||
for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { | ||
auto nHits = tracks.nHits(idx); | ||
if (nHits == 0) | ||
break; // this is a guard: maybe we need to move to nTracks... | ||
|
||
// initialize soa... | ||
soa->idv[idx] = -1; | ||
|
||
if (nHits < 4) | ||
continue; // no triplets | ||
if (quality[idx] != pixelTrack::Quality::loose) | ||
continue; | ||
|
||
auto pt = tracks.pt(idx); | ||
|
||
if (pt < ptMin) | ||
continue; | ||
|
||
auto& data = *pws; | ||
auto it = atomicAdd(&data.ntrks, 1); | ||
data.itrk[it] = idx; | ||
data.zt[it] = tracks.zip(idx); | ||
data.ezt2[it] = fit.covariance(idx)(14); | ||
data.ptt2[it] = pt * pt; | ||
} | ||
} | ||
|
||
// #define THREE_KERNELS | ||
#ifndef THREE_KERNELS | ||
__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, | ||
gpuVertexFinder::WorkSpace* pws, | ||
int minT, // min number of neighbours to be "seed" | ||
float eps, // max absolute distance to cluster | ||
float errmax, // max error to be "seed" | ||
float chi2max // max normalized distance to cluster, | ||
) { | ||
clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); | ||
__syncthreads(); | ||
fitVertices(pdata, pws, maxChi2ForFirstFit); | ||
__syncthreads(); | ||
splitVertices(pdata, pws, maxChi2ForSplit); | ||
__syncthreads(); | ||
fitVertices(pdata, pws, maxChi2ForFinalFit); | ||
__syncthreads(); | ||
sortByPt2(pdata, pws); | ||
} | ||
#else | ||
__global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, | ||
gpuVertexFinder::WorkSpace* pws, | ||
int minT, // min number of neighbours to be "seed" | ||
float eps, // max absolute distance to cluster | ||
float errmax, // max error to be "seed" | ||
float chi2max // max normalized distance to cluster, | ||
) { | ||
clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); | ||
__syncthreads(); | ||
fitVertices(pdata, pws, maxChi2ForFirstFit); | ||
} | ||
|
||
__global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { | ||
fitVertices(pdata, pws, maxChi2ForFinalFit); | ||
__syncthreads(); | ||
sortByPt2(pdata, pws); | ||
} | ||
#endif | ||
|
||
#ifdef __CUDACC__ | ||
ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { | ||
#ifdef PIXVERTEX_DEBUG_PRODUCE | ||
std::cout << "producing Vertices on GPU" << std::endl; | ||
#endif // PIXVERTEX_DEBUG_PRODUCE | ||
ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream)); | ||
#else | ||
ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { | ||
#ifdef PIXVERTEX_DEBUG_PRODUCE | ||
std::cout << "producing Vertices on CPU" << std::endl; | ||
#endif // PIXVERTEX_DEBUG_PRODUCE | ||
ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>()); | ||
#endif | ||
assert(tksoa); | ||
auto* soa = vertices.get(); | ||
assert(soa); | ||
|
||
#ifdef __CUDACC__ | ||
auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream); | ||
#else | ||
auto ws_d = std::make_unique<WorkSpace>(); | ||
#endif | ||
|
||
#ifdef __CUDACC__ | ||
init<<<1, 1, 0, stream>>>(soa, ws_d.get()); | ||
auto blockSize = 128; | ||
auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; | ||
loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin); | ||
cudaCheck(cudaGetLastError()); | ||
#else | ||
init(soa, ws_d.get()); | ||
loadTracks(tksoa, soa, ws_d.get(), ptMin); | ||
#endif | ||
|
||
#ifdef __CUDACC__ | ||
// Running too many thread lead to problems when printf is enabled. | ||
constexpr int maxThreadsForPrint = 1024 - 256; | ||
constexpr int numBlocks = 1024; | ||
constexpr int threadsPerBlock = 128; | ||
|
||
if (oneKernel_) { | ||
// implemented only for density clustesrs | ||
#ifndef THREE_KERNELS | ||
vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
#else | ||
vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
cudaCheck(cudaGetLastError()); | ||
// one block per vertex... | ||
splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit); | ||
cudaCheck(cudaGetLastError()); | ||
vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); | ||
#endif | ||
} else { // five kernels | ||
if (useDensity_) { | ||
clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} else if (useDBSCAN_) { | ||
clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} else if (useIterative_) { | ||
clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} | ||
cudaCheck(cudaGetLastError()); | ||
fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); | ||
cudaCheck(cudaGetLastError()); | ||
// one block per vertex... | ||
splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit); | ||
cudaCheck(cudaGetLastError()); | ||
fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); | ||
cudaCheck(cudaGetLastError()); | ||
sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); | ||
} | ||
cudaCheck(cudaGetLastError()); | ||
#else // __CUDACC__ | ||
if (useDensity_) { | ||
clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} else if (useDBSCAN_) { | ||
clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} else if (useIterative_) { | ||
clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); | ||
} | ||
#ifdef PIXVERTEX_DEBUG_PRODUCE | ||
std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; | ||
#endif // PIXVERTEX_DEBUG_PRODUCE | ||
fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); | ||
// one block per vertex! | ||
splitVertices(soa, ws_d.get(), maxChi2ForSplit); | ||
fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); | ||
sortByPt2(soa, ws_d.get()); | ||
#endif | ||
|
||
return vertices; | ||
} | ||
|
||
} // namespace gpuVertexFinder | ||
|
||
#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h |
2 changes: 1 addition & 1 deletion
2
RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
#include "gpuVertexFinderImpl.h" | ||
#include "gpuVertexFinder.cc" |
192 changes: 0 additions & 192 deletions
192
RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
This file was deleted.
Oops, something went wrong.