From 4677b9d47bc0083f06003dcb7b8855462b26fca5 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 23 Mar 2021 22:51:45 +0100 Subject: [PATCH] Squash all Patatrack developments on top of CMSSW_11_3_0_pre5 --- CUDADataFormats/Track/BuildFile.xml | 9 + .../Track/interface/PixelTrackHeterogeneous.h | 9 + .../Track/interface/TrackSoAHeterogeneousT.h | 73 ++ .../Track/interface/TrajectoryStateSoAT.h | 59 + CUDADataFormats/Track/src/classes.h | 9 + CUDADataFormats/Track/src/classes_def.xml | 6 + CUDADataFormats/Track/test/BuildFile.xml | 13 + .../Track/test/TrajectoryStateSOA_t.cpp | 1 + .../Track/test/TrajectoryStateSOA_t.cu | 1 + .../Track/test/TrajectoryStateSOA_t.h | 75 ++ CUDADataFormats/Vertex/BuildFile.xml | 9 + .../Vertex/interface/ZVertexHeterogeneous.h | 14 + CUDADataFormats/Vertex/interface/ZVertexSoA.h | 26 + CUDADataFormats/Vertex/src/classes.h | 8 + CUDADataFormats/Vertex/src/classes_def.xml | 6 + .../PyReleaseValidation/python/relval_2017.py | 18 +- .../PyReleaseValidation/python/relval_gpu.py | 18 +- .../python/relval_steps.py | 12 +- .../python/upgradeWorkflowComponents.py | 41 +- .../clients/beam_dqm_sourceclient-live_cfg.py | 3 +- .../beampixel_dqm_sourceclient-live_cfg.py | 2 +- .../pixelTrackingEffFromHitPattern_cff.py | 5 +- .../python/pixelVertexResolutionClient_cfi.py | 7 + .../python/pixelTracksMonitoring_cff.py | 92 +- .../python/DQMOffline_SecondStep_cff.py | 4 +- .../Configuration/python/DQMOffline_cff.py | 4 +- .../RecoB/python/PixelVertexMonitor_cff.py | 8 + .../Tracking/python/SeedingMigration.py | 5 +- .../python/customizeHLTforPatatrack.py | 802 +++++++++++++ .../python/RecoPixelVertexing_cff.py | 18 +- .../customizePixelTracksForProfiling.py | 15 - .../python/customizePixelTracksSoAonCPU.py | 61 + .../PixelTrackFitting/BuildFile.xml | 2 + .../PixelTrackFitting/interface/BrokenLine.h | 606 ++++++++++ .../PixelTrackFitting/interface/FitResult.h | 65 ++ .../PixelTrackFitting/interface/FitUtils.h | 243 ++++ 
.../interface/PixelNtupletsFitter.h | 27 + .../PixelTrackFitting/interface/RiemannFit.h | 1008 +++++++++++++++++ .../PixelTrackFitting/plugins/BuildFile.xml | 3 + .../plugins/PixelNtupletsFitterProducer.cc | 44 + .../plugins/PixelTrackDumpCUDA.cc | 86 ++ .../plugins/PixelTrackProducer.cc | 78 +- .../plugins/PixelTrackProducer.h | 9 +- .../plugins/PixelTrackProducerFromSoA.cc | 205 ++++ .../plugins/PixelTrackSoAFromCUDA.cc | 86 ++ .../PixelTrackFitting/plugins/storeTracks.h | 72 ++ .../python/PixelTracks_cff.py | 23 + .../python/pixelNtupletsFitter_cfi.py | 6 + .../src/PixelNtupletsFitter.cc | 102 ++ .../PixelTrackFitting/test/BuildFile.xml | 82 +- .../PixelTrackFitting/test/PixelTrackFits.cc | 431 +++++++ .../PixelTrackFitting/test/testEigenGPU.cu | 343 ++++++ .../test/testEigenGPUNoFit.cu | 248 ++++ .../test/testEigenJacobian.cpp | 134 +++ .../PixelTrackFitting/test/testFits.cpp | 154 +++ .../PixelTrackFitting/test/test_common.h | 47 + .../interface/CAHitQuadrupletGenerator.h | 2 +- .../PixelTriplets/interface/CircleEq.h | 97 ++ .../plugins/BrokenLineFitOnGPU.cc | 70 ++ .../plugins/BrokenLineFitOnGPU.cu | 85 ++ .../plugins/BrokenLineFitOnGPU.h | 184 +++ .../PixelTriplets/plugins/BuildFile.xml | 12 +- .../PixelTriplets/plugins/CAConstants.h | 83 ++ .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 83 ++ .../plugins/CAHitNtupletGeneratorKernels.cc | 184 +++ .../plugins/CAHitNtupletGeneratorKernels.cu | 308 +++++ .../plugins/CAHitNtupletGeneratorKernels.h | 223 ++++ .../CAHitNtupletGeneratorKernelsAlloc.cc | 1 + .../CAHitNtupletGeneratorKernelsAlloc.cu | 1 + .../CAHitNtupletGeneratorKernelsAlloc.h | 35 + .../CAHitNtupletGeneratorKernelsImpl.h | 593 ++++++++++ .../plugins/CAHitNtupletGeneratorOnGPU.cc | 229 ++++ .../plugins/CAHitNtupletGeneratorOnGPU.h | 65 ++ .../PixelTriplets/plugins/GPUCACell.h | 347 ++++++ .../PixelTriplets/plugins/HelixFitOnGPU.cc | 16 + .../PixelTriplets/plugins/HelixFitOnGPU.h | 68 ++ .../PixelTriplets/plugins/RiemannFitOnGPU.cc | 113 ++ 
.../PixelTriplets/plugins/RiemannFitOnGPU.cu | 131 +++ .../PixelTriplets/plugins/RiemannFitOnGPU.h | 187 +++ .../PixelTriplets/plugins/gpuFishbone.h | 91 ++ .../PixelTriplets/plugins/gpuPixelDoublets.h | 130 +++ .../plugins/gpuPixelDoubletsAlgos.h | 243 ++++ .../python/caHitQuadrupletEDProducer_cfi.py | 4 + .../PixelTriplets/test/BuildFile.xml | 11 + .../PixelTriplets/test/CAsizes_t.cpp | 25 + .../PixelTriplets/test/CircleEq_t.cpp | 77 ++ .../PixelTriplets/test/fastDPHI_t.cpp | 165 +++ .../PixelVertexFinding/plugins/BuildFile.xml | 5 +- .../plugins/PixelVertexProducerCUDA.cc | 125 ++ .../plugins/PixelVertexProducerFromSoA.cc | 175 +++ .../plugins/PixelVertexSoAFromCUDA.cc | 65 ++ .../plugins/gpuClusterTracksByDensity.h | 234 ++++ .../plugins/gpuClusterTracksDBSCAN.h | 242 ++++ .../plugins/gpuClusterTracksIterative.h | 213 ++++ .../plugins/gpuFitVertices.h | 113 ++ .../PixelVertexFinding/plugins/gpuSortByPt2.h | 73 ++ .../plugins/gpuSplitVertices.h | 139 +++ .../plugins/gpuVertexFinder.cc | 1 + .../plugins/gpuVertexFinder.cu | 1 + .../plugins/gpuVertexFinder.h | 83 ++ .../plugins/gpuVertexFinderImpl.h | 169 +++ .../python/PixelVertexes_cfi.py | 2 - .../PixelVertexFinding/test/BuildFile.xml | 37 +- .../PixelVertexFinding/test/VertexFinder_t.h | 347 ++++++ .../test/cpuVertexFinder_t.cpp | 1 + .../test/gpuVertexFinder_t.cu | 1 + .../python/customizePixelOnlyForProfiling.py | 59 + .../TkSeedGenerator/plugins/BuildFile.xml | 18 +- .../plugins/SeedProducerFromSoA.cc | 170 +++ .../TrackerHitAssociation/BuildFile.xml | 2 + .../trackerHitAssociationHeterogeneous.h | 69 ++ .../plugins/BuildFile.xml | 7 +- .../plugins/ClusterSLOnGPU.cu | 224 ++++ .../plugins/ClusterSLOnGPU.h | 36 + .../ClusterTPAssociationProducerCUDA.cc | 227 ++++ .../python/tpClusterProducer_cfi.py | 3 + .../TrackerHitAssociation/src/classes.h | 2 + .../TrackerHitAssociation/src/classes_def.xml | 5 + .../TrackerHitAssociation/test/BuildFile.xml | 2 + .../test/ClusterTPCUDAdump.cc | 66 ++ 
.../python/PostProcessorTracker_cfi.py | 4 +- .../RecoTrack/python/TrackValidation_cff.py | 72 +- ...kingParticleSelectionsForEfficiency_cff.py | 2 +- Validation/RecoTrack/python/plotting/html.py | 47 +- .../python/plotting/trackingPlots.py | 25 +- 125 files changed, 12290 insertions(+), 176 deletions(-) create mode 100644 CUDADataFormats/Track/BuildFile.xml create mode 100644 CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h create mode 100644 CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoAT.h create mode 100644 CUDADataFormats/Track/src/classes.h create mode 100644 CUDADataFormats/Track/src/classes_def.xml create mode 100644 CUDADataFormats/Track/test/BuildFile.xml create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h create mode 100644 CUDADataFormats/Vertex/BuildFile.xml create mode 100644 CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h create mode 100644 CUDADataFormats/Vertex/interface/ZVertexSoA.h create mode 100644 CUDADataFormats/Vertex/src/classes.h create mode 100644 CUDADataFormats/Vertex/src/classes_def.xml create mode 100644 DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py create mode 100644 DQMOffline/RecoB/python/PixelVertexMonitor_cff.py create mode 100644 HLTrigger/Configuration/python/customizeHLTforPatatrack.py delete mode 100644 RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py create mode 100644 RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h create mode 100644 
RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py create mode 100644 RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp create mode 100644 RecoPixelVertexing/PixelTrackFitting/test/test_common.h create mode 100644 RecoPixelVertexing/PixelTriplets/interface/CircleEq.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h create mode 100644 
RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h create mode 100644 RecoPixelVertexing/PixelTriplets/python/caHitQuadrupletEDProducer_cfi.py create mode 100644 RecoPixelVertexing/PixelTriplets/test/CAsizes_t.cpp create mode 100644 RecoPixelVertexing/PixelTriplets/test/CircleEq_t.cpp create mode 100644 RecoPixelVertexing/PixelTriplets/test/fastDPHI_t.cpp create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h create mode 100644 
RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp create mode 100644 RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu create mode 100644 RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py create mode 100644 RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc create mode 100644 SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h create mode 100644 SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc create mode 100644 SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml new file mode 100644 index 0000000000000..e3f9a0910bbd8 --- /dev/null +++ b/CUDADataFormats/Track/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h new file mode 100644 index 
0000000000000..3ee5af80353dd --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +#define CUDADataFormats_Track_PixelTrackHeterogeneous_h + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" + +using PixelTrackHeterogeneous = HeterogeneousSoA; + +#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h \ No newline at end of file diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h new file mode 100644 index 0000000000000..bd39f3c4d3bfe --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H +#define CUDADataFormats_Track_TrackHeterogeneousT_H + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +namespace pixelTrack { + enum class Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAHeterogeneousT { +public: + static constexpr int32_t stride() { return S; } + + using Quality = pixelTrack::Quality; + using hindex_type = uint32_t; + using HitContainer = cms::cuda::OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... 
+private: + eigenSoA::ScalarSoA quality_; + +public: + constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } + constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i); } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoAT stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... 
+ // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAHeterogeneousT; + using TrajectoryState = TrajectoryStateSoAT; + using HitContainer = TrackSoA::HitContainer; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h new file mode 100644 index 0000000000000..64fcd573a6991 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h @@ -0,0 +1,59 @@ +#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H +#define CUDADataFormats_Track_TrajectoryStateSOAT_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct TrajectoryStateSoAT { + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + + using Vector5d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + + static constexpr int32_t stride() { return S; } + + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + template + __host__ __device__ inline void copyFromCircle( + V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *= b; + auto cov = covariance(i); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void 
copyFromDense(V5 const& v, M5 const& cov, int32_t i) { + state(i) = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + covariance(i)(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { + v = state(i).template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = covariance(i)(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = covariance(i)(ind++); + } + } +}; + +#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h new file mode 100644 index 0000000000000..97c116f6c88d3 --- /dev/null +++ b/CUDADataFormats/Track/src/classes.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_Track_src_classes_h +#define CUDADataFormats_Track_src_classes_h + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml new file mode 100644 index 0000000000000..9c80ae91baf29 --- /dev/null +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml new file mode 100644 index 0000000000000..598b345d4709d --- /dev/null +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git 
a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h new file mode 100644 index 0000000000000..97b88873c2613 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -0,0 +1,75 @@ +#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + +__host__ __device__ Matrix5d loadCov(Vector5d const& e) { + Matrix5d cov; + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); + } + } + return cov; +} + +using TS = TrajectoryStateSoAT<128>; + +__global__ void testTSSoA(TS* pts, int n) { + assert(n <= 128); + + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d e0; + e0 << 0.01, 0.01, 0.035, -0.03, -0.01; + auto cov0 = loadCov(e0); + + TS& ts = *pts; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + ts.copyFromDense(par0, cov0, i); + Vector5d par1; + Matrix5d cov1; + ts.copyToDense(par1, cov1, i); + Vector5d delV = par1 - par0; + Matrix5d delM = cov1 - cov0; + for (int j = 0; j < 5; ++j) { + assert(std::abs(delV(j)) < 1.e-5); + for (auto k = j; k < 5; ++k) { + assert(cov0(k, j) == cov0(j, k)); + assert(cov1(k, j) == cov1(j, k)); + assert(std::abs(delM(k, j)) < 1.e-5); + } + } + } +} + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif 
+ +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + TS ts; + +#ifdef __CUDACC__ + TS* ts_d; + cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); + testTSSoA<<<1, 64>>>(ts_d, 128); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testTSSoA(&ts, 128); +#endif +} diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml new file mode 100644 index 0000000000000..e3f9a0910bbd8 --- /dev/null +++ b/CUDADataFormats/Vertex/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h new file mode 100644 index 0000000000000..aacfddc6fe7e2 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H +#define CUDADataFormatsVertexZVertexHeterogeneous_H + +#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +using ZVertexHeterogeneous = HeterogeneousSoA; +#ifndef __CUDACC__ +#include "CUDADataFormats/Common/interface/Product.h" +using ZVertexCUDAProduct = cms::cuda::Product; +#endif + +#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h new file mode 100644 index 0000000000000..5f0699d5831ec --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormatsVertexZVertexSoA_H +#define CUDADataFormatsVertexZVertexSoA_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +// SOA for vertices +// These vertices are clusterized and fitted only along the beam line (z) +// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the 
beam angle as well) +struct ZVertexSoA { + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) + float zv[MAXVTX]; // output z-posistion of found vertices + float wv[MAXVTX]; // output weight (1/error^2) on the above + float chi2[MAXVTX]; // vertices chi2 + float ptv2[MAXVTX]; // vertices pt^2 + int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) + uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending + uint32_t nvFinal; // the number of vertices + + __host__ __device__ void init() { nvFinal = 0; } +}; + +#endif // CUDADataFormatsVertexZVertexSoA.H diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h new file mode 100644 index 0000000000000..e7fea871f7d39 --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats__src_classes_h +#define CUDADataFormats__src_classes_h + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml new file mode 100644 index 0000000000000..ea633080af9af --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/Configuration/PyReleaseValidation/python/relval_2017.py b/Configuration/PyReleaseValidation/python/relval_2017.py index 193ab79bcd384..87130136c154c 100644 --- a/Configuration/PyReleaseValidation/python/relval_2017.py +++ b/Configuration/PyReleaseValidation/python/relval_2017.py @@ -5,7 +5,7 @@ # here only define the workflows as a combination of the steps defined above: workflows = Matrix() -# each workflow defines a name and a list of steps to be done. 
+# each workflow defines a name and a list of steps to be done. # if no explicit name/label given for the workflow (first arg), # the name of step1 will be used @@ -24,16 +24,16 @@ # (HE collapse: TTbar, TTbar PU, TTbar design) # (ParkingBPH: TTbar) # (TTbar PU with JME NanoAOD) -# (Patatrack pixel-only: ZMM - on CPU) -# (Patatrack pixel-only: TTbar - on CPU) +# (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets) +# (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets) # (Patatrack ECAL-only: TTbar - on CPU) # (Patatrack HCAL-only: TTbar - on CPU) # 2021 (DD4HEP: TTbar, ZMM) # (ele guns 10, 35, 1000; pho guns 10, 35; mu guns 1, 10, 100, 1000, QCD 3TeV, QCD Flat) # (ZMM, TTbar, ZEE, MinBias, TTbar PU, TTbar PU premix, ZEE PU, TTbar design) # (TTbar trackingOnly, pixelTrackingOnly, trackingMkFit, trackdnn) -# (Patatrack pixel-only: ZMM - on CPU) -# (Patatrack pixel-only: TTbar - on CPU) +# (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets) +# (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets) # (Patatrack ECAL-only: TTbar - on CPU) # (Patatrack HCAL-only: TTbar - on CPU) # (TTbar 0T, TTbar PU 0T) @@ -51,16 +51,16 @@ 10824.6,11024.6,11224.6, 10824.8, 11024.15, - 10842.501, - 10824.501, + 10842.501,10842.505, + 10824.501,10824.505, 10824.511, 10824.521, 11634.911, 11650.911, 11601.0,11602.0,11603.0,11604.0,11605.0,11606.0,11607.0,11608.0,11609.0,11630.0,11643.0, 11650.0,11634.0,11646.0,11640.0,11834.0,11834.99,11846.0,12024.0, 11634.1,11634.5,11634.7,11634.91, - 11650.501, - 11634.501, + 11650.501,11650.505, + 11634.501,11634.505, 11634.511, 11634.521, 11634.24,11834.24, diff --git a/Configuration/PyReleaseValidation/python/relval_gpu.py b/Configuration/PyReleaseValidation/python/relval_gpu.py index 4e49467a0e2e8..43353279ea4ad 100644 --- a/Configuration/PyReleaseValidation/python/relval_gpu.py +++ b/Configuration/PyReleaseValidation/python/relval_gpu.py @@ -5,7 +5,7 @@ # here only define the workflows as a combination of the steps 
defined above: workflows = Matrix() -# each workflow defines a name and a list of steps to be done. +# each workflow defines a name and a list of steps to be done. # if no explicit name/label given for the workflow (first arg), # the name of step1 will be used @@ -14,21 +14,29 @@ #just define all of them #WFs to run in IB: -# mc 2018 (Patatrack pixel-only: ZMM - on GPU, both CPU and GPU, auto) -# (Patatrack pixel-only: TTbar - on GPU, both CPU and GPU, auto) +# mc 2018 (Patatrack pixel-only quadruplets: ZMM - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only triplets: ZMM - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only quadruplets: TTbar - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only triplets: TTbar - on GPU, both CPU and GPU, auto) # (Patatrack ECAL-only: TTbar - on GPU, both CPU and GPU, auto) # (Patatrack HCAL-only: TTbar - on GPU, both CPU and GPU, auto) -# mc 2021 (Patatrack pixel-only: ZMM - on GPU, both CPU and GPU, auto) -# (Patatrack pixel-only: TTbar - on GPU, both CPU and GPU, auto) +# mc 2021 (Patatrack pixel-only quadruplets: ZMM - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only triplets: ZMM - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only quadruplets: TTbar - on GPU, both CPU and GPU, auto) +# (Patatrack pixel-only triplets: TTbar - on GPU, both CPU and GPU, auto) # (Patatrack ECAL-only: TTbar - on GPU, both CPU and GPU, auto) # (Patatrack HCAL-only: TTbar - on GPU, both CPU and GPU, auto) numWFIB = [ 10842.502, # 10842.503,10842.504, + 10842.506, # 10842.507,10842.508, 10824.502, # 10824.503,10824.504, + 10824.506, # 10824.507,10824.508, 10824.512, # 10824.513,10824.514, 10824.522, # 10824.523,10824.524, 11650.502, # 11650.503,11650.504, + 11650.506, # 11650.507,11650.508, 11634.502, # 11634.503,11634.504, + 11634.506, # 11634.507,11634.508, 11634.512, # 11634.513,11634.514, 11634.522, # 11634.523,11634.524 ] diff --git a/Configuration/PyReleaseValidation/python/relval_steps.py 
b/Configuration/PyReleaseValidation/python/relval_steps.py index 3556eda005ada..0fc667283361b 100644 --- a/Configuration/PyReleaseValidation/python/relval_steps.py +++ b/Configuration/PyReleaseValidation/python/relval_steps.py @@ -2186,8 +2186,11 @@ def gen2021HiMix(fragment,howMuch): '--era' :'Run2_2016' } -step3_pixelNtupleFit = { - '--procModifiers': 'pixelNtupleFit', +step3_pixel_ntuplet_cpu = { + '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU' +} +step3_pixel_triplets = { + '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets' } step3_gpu = { '--procModifiers': 'gpu', @@ -2320,8 +2323,11 @@ def gen2021HiMix(fragment,howMuch): steps['RECODR2_2018reHLT_Prompt']=merge([{'--conditions':'auto:run2_data'},steps['RECODR2_2018reHLT']]) steps['RECODR2_2018reHLT_ZBPrompt']=merge([{'--conditions':'auto:run2_data','-s':'RAW2DIGI,L1Reco,RECO,EI,PAT,ALCA:SiStripCalZeroBias+SiStripCalMinBias+TkAlMinBias+EcalESAlign,DQM:@rerecoZeroBias+@ExtraHLT+@miniAODDQM'},steps['RECODR2_2018reHLT']]) steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']=merge([{'-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,DQM:@pixelTrackingOnlyDQM'},steps['RECODR2_2018reHLT_Prompt']]) -steps['RECODR2_2018reHLT_Patatrack_PixelOnlyCPU']=merge([step3_pixelNtupleFit, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) +steps['RECODR2_2018reHLT_Patatrack_PixelOnlyCPU']=merge([step3_pixel_ntuplet_cpu, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) steps['RECODR2_2018reHLT_Patatrack_PixelOnlyGPU']=merge([step3_gpu, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) +steps['RECODR2_2018reHLT_Patatrack_PixelOnlyTripletsCPU']=merge([step3_pixel_ntuplet_cpu, step3_pixel_triplets, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) +steps['RECODR2_2018reHLT_Patatrack_PixelOnlyTripletsGPU']=merge([step3_gpu, step3_pixel_triplets, 
steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) + steps['RECODR2_2018reHLT_ECALOnlyCPU']=merge([{'-s': 'RAW2DIGI:RawToDigi_ecalOnly,RECO:reconstruction_ecalOnly,DQM:@ecalOnly'},steps['RECODR2_2018reHLT_Prompt']]) steps['RECODR2_2018reHLT_ECALOnlyGPU']=merge([step3_gpu, steps['RECODR2_2018reHLT_ECALOnlyCPU']]) steps['RECODR2_2018reHLT_HCALOnlyCPU']=merge([{'-s': 'RAW2DIGI:RawToDigi_hcalOnly,RECO:reconstruction_hcalOnly,DQM:@hcalOnly+@hcal2Only'},steps['RECODR2_2018reHLT_Prompt']]) diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index 9ff0ae9f22e0d..513462593300e 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -453,7 +453,26 @@ def condition_(self, fragment, stepList, key, hasHarvest): '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', '--datatier': 'GEN-SIM-RECO,DQMIO', '--eventcontent': 'RECOSIM,DQM', - '--procModifiers': 'pixelNtupleFit' + '--customise' : 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU' +} + +upgradeWFs['PatatrackPixelOnlyTripletsCPU'] = UpgradeWorkflowPatatrack_PixelOnlyCPU( + steps = [ + 'Reco', + 'HARVEST', + 'RecoGlobal', + 'HARVESTGlobal', + ], + PU = [], + suffix = 'Patatrack_PixelOnlyTripletsCPU', + offset = 0.505, +) + +upgradeWFs['PatatrackPixelOnlyTripletsCPU'].step3 = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--datatier': 'GEN-SIM-RECO,DQMIO', + '--eventcontent': 'RECOSIM,DQM', + '--customise' : 
'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksSoAonCPU,RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets' } class UpgradeWorkflowPatatrack_PixelOnlyGPU(UpgradeWorkflowPatatrack): @@ -487,6 +506,26 @@ def condition_(self, fragment, stepList, key, hasHarvest): '--procModifiers': 'gpu' } +upgradeWFs['PatatrackPixelOnlyTripletsGPU'] = UpgradeWorkflowPatatrack_PixelOnlyGPU( + steps = [ + 'Reco', + 'HARVEST', + 'RecoGlobal', + 'HARVESTGlobal', + ], + PU = [], + suffix = 'Patatrack_PixelOnlyTripletsGPU', + offset = 0.506, +) + +upgradeWFs['PatatrackPixelOnlyTripletsGPU'].step3 = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--datatier': 'GEN-SIM-RECO,DQMIO', + '--eventcontent': 'RECOSIM,DQM', + '--procModifiers': 'gpu', + '--customise': 'RecoPixelVertexing/Configuration/customizePixelTracksSoAonCPU.customizePixelTracksForTriplets' +} + class UpgradeWorkflowPatatrack_ECALOnlyCPU(UpgradeWorkflowPatatrack): def setup_(self, step, stepName, stepDict, k, properties): if 'Reco' in step: diff --git a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py index efdde1512fcf7..4846de0887fde 100644 --- a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py @@ -317,8 +317,7 @@ process.pixelTracksTrackingRegions.RegionPSet.originXPos = 0.08 process.pixelTracksTrackingRegions.RegionPSet.originYPos = -0.03 process.pixelTracksTrackingRegions.RegionPSet.originZPos = 0. 
- -process.pixelVertices.TkFilterParameters.minPt = process.pixelTracksTrackingRegions.RegionPSet.ptMin +process.pixelVertices.PtMin = process.pixelTracksTrackingRegions.RegionPSet.ptMin process.tracking_FirstStep = cms.Sequence( process.siPixelDigis diff --git a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py index f909104a39834..a3eac2069e6ed 100644 --- a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py @@ -90,12 +90,12 @@ process.siPixelClusterShapeCachePreSplitting = siPixelClusterShapeCache.clone(src = 'siPixelClustersPreSplitting') process.load("RecoLocalTracker.SiPixelRecHits.PixelCPEGeneric_cfi") process.load("RecoPixelVertexing.Configuration.RecoPixelVertexing_cff") -process.pixelVertices.TkFilterParameters.minPt = process.pixelTracksTrackingRegions.RegionPSet.ptMin process.pixelTracksTrackingRegions.RegionPSet.originRadius = cms.double(0.4) process.pixelTracksTrackingRegions.RegionPSet.originHalfLength = cms.double(15.) process.pixelTracksTrackingRegions.RegionPSet.originXPos = cms.double(0.08) process.pixelTracksTrackingRegions.RegionPSet.originYPos = cms.double(-0.03) process.pixelTracksTrackingRegions.RegionPSet.originZPos = cms.double(0.) 
+process.pixelVertices.PtMin = process.pixelTracksTrackingRegions.RegionPSet.ptMin #---------------------------- diff --git a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py index 15ceaf93ed20a..cff85e56d94f7 100644 --- a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py +++ b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py @@ -21,7 +21,10 @@ def _layers(suffix, quant, histoPostfix): ] pixelTrackingEffFromHitPattern = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/HitEffFromHitPattern*"), + subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/pixelTracks/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/dzPV0p1/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/pt_0to1/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/pt_1/HitEffFromHitPattern*"), efficiency = cms.vstring( _layers("PU", "GoodNumVertices", "") + _layers("BX", "BX", "VsBX") + diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py new file mode 100644 index 0000000000000..2558e88d26012 --- /dev/null +++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py @@ -0,0 +1,7 @@ +import FWCore.ParameterSet.Config as cms + +from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient + +pixelVertexResolutionClient = _primaryVertexResolutionClient.clone( + subDirs = ["OfflinePixelPV/Resolution/*"] +) diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py index a075f671f05ce..d5deba78b46c8 100644 --- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py +++ 
b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py @@ -1,23 +1,77 @@ import FWCore.ParameterSet.Config as cms import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi -pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() -pixelTracksMonitoring.FolderName = 'Tracking/PixelTrackParameters' -pixelTracksMonitoring.TrackProducer = 'pixelTracks' -pixelTracksMonitoring.allTrackProducer = 'pixelTracks' -pixelTracksMonitoring.beamSpot = 'offlineBeamSpot' -pixelTracksMonitoring.primaryVertex = 'pixelVertices' -pixelTracksMonitoring.pvNDOF = 1 -pixelTracksMonitoring.doAllPlots = True -pixelTracksMonitoring.doLumiAnalysis = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doDCAPlots = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doEffFromHitPatternVsPU = False -pixelTracksMonitoring.doEffFromHitPatternVsBX = False -pixelTracksMonitoring.doEffFromHitPatternVsLUMI = False -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doPlotsVsLUMI = True -pixelTracksMonitoring.doPlotsVsBX = True +pixelTracksMonitor = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() +pixelTracksMonitor.FolderName = 'Tracking/PixelTrackParameters/pixelTracks' +pixelTracksMonitor.TrackProducer = 'pixelTracks' +pixelTracksMonitor.allTrackProducer = 'pixelTracks' +pixelTracksMonitor.beamSpot = 'offlineBeamSpot' +pixelTracksMonitor.primaryVertex = 'pixelVertices' +pixelTracksMonitor.pvNDOF = 1 +pixelTracksMonitor.doAllPlots = True +pixelTracksMonitor.doLumiAnalysis = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doDCAPlots = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doEffFromHitPatternVsPU = False +pixelTracksMonitor.doEffFromHitPatternVsBX = False 
+pixelTracksMonitor.doEffFromHitPatternVsLUMI = False +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doPlotsVsLUMI = True +pixelTracksMonitor.doPlotsVsBX = True +_trackSelector = cms.EDFilter('TrackSelector', + src = cms.InputTag('pixelTracks'), + cut = cms.string("") +) + +pixelTracksPt0to1 = _trackSelector.clone(cut = "pt >= 0 & pt < 1 ") +pixelTracksPt1 = _trackSelector.clone(cut = "pt >= 1 ") +from DQM.TrackingMonitorSource.TrackCollections2monitor_cff import highPurityPV0p1 as _highPurityPV0p1 +pixelTracksPV0p1 = _highPurityPV0p1.clone( + src = "pixelTracks", + quality = "", + vertexTag = "goodPixelVertices" +) + +pixelTracksMonitorPt0to1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt0to1", + FolderName = "Tracking/PixelTrackParameters/pt_0to1" +) +pixelTracksMonitorPt1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt1", + FolderName = "Tracking/PixelTrackParameters/pt_1" +) +pixelTracksMonitorPV0p1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPV0p1", + FolderName = "Tracking/PixelTrackParameters/dzPV0p1" +) + + +from CommonTools.ParticleFlow.goodOfflinePrimaryVertices_cfi import goodOfflinePrimaryVertices as _goodOfflinePrimaryVertices +goodPixelVertices = _goodOfflinePrimaryVertices.clone( + src = "pixelVertices", +) + +from DQM.TrackingMonitor.primaryVertexResolution_cfi import primaryVertexResolution as _primaryVertexResolution +pixelVertexResolution = _primaryVertexResolution.clone( + vertexSrc = "goodPixelVertices", + rootFolder = "OfflinePixelPV/Resolution", +) + +pixelTracksMonitoringTask = cms.Task( + goodPixelVertices, + pixelTracksPt0to1, + pixelTracksPt1, + pixelTracksPV0p1, +) + +pixelTracksMonitoring = cms.Sequence( + pixelTracksMonitor + + pixelTracksMonitorPt0to1 + + pixelTracksMonitorPt1 + + pixelTracksMonitorPV0p1 + + pixelVertexResolution, + pixelTracksMonitoringTask +) diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py 
b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py index 368b328632fd8..29bf311c474d4 100644 --- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py @@ -122,6 +122,7 @@ from DQM.CTPPS.ctppsDQM_cff import * from Validation.RecoTau.DQMSequences_cfi import * from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import * +from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import * DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient ) @@ -179,7 +180,8 @@ DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias * dqmFastTimerServiceClient ) -DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern ) +DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern * + pixelVertexResolutionClient ) DQMHarvestOuterTracker = cms.Sequence( OuterTrackerClient * diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py index 2001c22352a48..ac28700d4eaf4 100644 --- a/DQMOffline/Configuration/python/DQMOffline_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_cff.py @@ -157,10 +157,12 @@ #DQMOfflineCommon from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import * +from DQMOffline.RecoB.PixelVertexMonitor_cff import * from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import * from Validation.RecoTau.DQMSequences_cfi import * -DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring ) +DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring * + pixelPVMonitor ) DQMOuterTracker = cms.Sequence( DQMOfflineDCS * OuterTrackerSource * diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py new file mode 100644 index 0000000000000..9e293f4478bd6 --- /dev/null +++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py @@ -0,0 +1,8 @@ +import FWCore.ParameterSet.Config as cms + +from 
DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor +pixelPVMonitor = _pvMonitor.clone( + TopFolderName = "OfflinePixelPV", + vertexLabel = "pixelVertices", + ndof = cms.int32( 1 ) +) diff --git a/FastSimulation/Tracking/python/SeedingMigration.py b/FastSimulation/Tracking/python/SeedingMigration.py index 751670daa50c8..3a982eba55e36 100644 --- a/FastSimulation/Tracking/python/SeedingMigration.py +++ b/FastSimulation/Tracking/python/SeedingMigration.py @@ -13,8 +13,9 @@ def _hitSetProducerToFactoryPSet(producer): "PixelTripletLargeTipEDProducer": "PixelTripletLargeTipGenerator", "MultiHitFromChi2EDProducer": "MultiHitGeneratorFromChi2", "CAHitTripletEDProducer": "CAHitTripletGenerator", - "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator", - } + "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator", + "CAHitNtupletHeterogeneousEDProducer": "CAHitQuadrupletGenerator", + } ret = cms.PSet() _copy(producer, ret) ret.ComponentName = cms.string(_map[producer._TypedParameterizable__type]); diff --git a/HLTrigger/Configuration/python/customizeHLTforPatatrack.py b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py new file mode 100644 index 0000000000000..5164188c94997 --- /dev/null +++ b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py @@ -0,0 +1,802 @@ +import copy +import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +from HLTrigger.Configuration.common import * +from Configuration.Eras.Modifier_run3_common_cff import run3_common + + +# force the SwitchProducerCUDA choice to pick a specific backend: True for offloading to a gpu, False for running on cpu +def forceGpuOffload(status = True): + import HeterogeneousCore.CUDACore.SwitchProducerCUDA + HeterogeneousCore.CUDACore.SwitchProducerCUDA._cuda_enabled_cached = bool(status) + + +# reset the SwitchProducerCUDA choice to pick a backend depending on the availability of a supported gpu +def resetGpuOffload(): + 
import HeterogeneousCore.CUDACore.SwitchProducerCUDA + HeterogeneousCore.CUDACore.SwitchProducerCUDA._cuda_enabled_cached = None + HeterogeneousCore.CUDACore.SwitchProducerCUDA._switch_cuda() + + +# customisation for running the Patatrack reconstruction, common parts +def customiseCommon(process): + + # Services + + process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + + process.load("HeterogeneousCore.CUDAServices.NVProfilerService_cfi") + + + # Paths and EndPaths + + # the hltGetConditions module would force gpu-specific ESProducers to run even if no supported gpu is present + if 'hltGetConditions' in process.__dict__: + del process.hltGetConditions + + # produce a boolean to track if the events ar being processed on gpu (true) or cpu (false) + process.statusOnGPU = SwitchProducerCUDA( + cpu = cms.EDProducer("BooleanProducer", value = cms.bool(False)), + cuda = cms.EDProducer("BooleanProducer", value = cms.bool(True)) + ) + + process.statusOnGPUFilter = cms.EDFilter("BooleanFilter", + src = cms.InputTag("statusOnGPU") + ) + + if 'Status_OnGPU' in process.__dict__: + replace_with(process.Status_OnGPU, cms.Path(process.statusOnGPU + process.statusOnGPUFilter)) + else: + process.Status_OnGPU = cms.Path(process.statusOnGPU + process.statusOnGPUFilter) + if 'HLTSchedule' in process.__dict__: + process.HLTSchedule.append(process.Status_OnGPU) + if process.schedule is not None: + process.schedule.append(process.Status_OnGPU) + + + # make the ScoutingCaloMuonOutput endpath compatible with using Tasks in the Scouting paths + if 'hltOutputScoutingCaloMuon' in process.__dict__ and not 'hltPreScoutingCaloMuonOutputSmart' in process.__dict__: + process.hltPreScoutingCaloMuonOutputSmart = cms.EDFilter( "TriggerResultsFilter", + l1tIgnoreMaskAndPrescale = cms.bool( False ), + l1tResults = cms.InputTag( "" ), + hltResults = cms.InputTag( 'TriggerResults','','@currentProcess' ), + triggerConditions = process.hltOutputScoutingCaloMuon.SelectEvents.SelectEvents, + throw 
= cms.bool( True ) + ) + insert_modules_after(process, process.hltPreScoutingCaloMuonOutput, process.hltPreScoutingCaloMuonOutputSmart) + + # make the ScoutingPFOutput endpath compatible with using Tasks in the Scouting paths + if 'hltOutputScoutingPF' in process.__dict__ and not 'hltPreScoutingPFOutputSmart' in process.__dict__: + process.hltPreScoutingPFOutputSmart = cms.EDFilter( "TriggerResultsFilter", + l1tIgnoreMaskAndPrescale = cms.bool( False ), + l1tResults = cms.InputTag( "" ), + hltResults = cms.InputTag( 'TriggerResults','','@currentProcess' ), + triggerConditions = process.hltOutputScoutingPF.SelectEvents.SelectEvents, + throw = cms.bool( True ) + ) + insert_modules_after(process, process.hltPreScoutingPFOutput, process.hltPreScoutingPFOutputSmart) + + + # done + return process + + +# customisation for running the "Patatrack" pixel local reconstruction +def customisePixelLocalReconstruction(process): + + if not 'HLTDoLocalPixelSequence' in process.__dict__: + return process + + + # FIXME replace the Sequences with empty ones to avoid exanding them during the (re)definition of Modules and EDAliases + + process.HLTDoLocalPixelSequence = cms.Sequence() + + + # Event Setup + + process.load("CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi") # this should be used only on GPUs, will crash otherwise + process.load("CalibTracker.SiPixelESProducers.siPixelROCsStatusAndMappingWrapperESProducer_cfi") # this should be used only on GPUs, will crash otherwise + process.load("RecoLocalTracker.SiPixelRecHits.PixelCPEFastESProducer_cfi") + + + # Modules and EDAliases + + # referenced in HLTDoLocalPixelTask + + # transfer the beamspot to the gpu + from RecoVertex.BeamSpotProducer.offlineBeamSpotToCUDA_cfi import offlineBeamSpotToCUDA as _offlineBeamSpotToCUDA + process.hltOnlineBeamSpotToCUDA = _offlineBeamSpotToCUDA.clone( + src = "hltOnlineBeamSpot" + ) + + # reconstruct the pixel digis and clusters on the gpu + from 
RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA + process.hltSiPixelClustersCUDA = _siPixelRawToClusterCUDA.clone() + # use the pixel channel calibrations scheme for Run 3 + run3_common.toModify(process.hltSiPixelClustersCUDA, isRun2 = False) + + # copy the pixel digis errors to the host + from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA + process.hltSiPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone( + src = "hltSiPixelClustersCUDA" + ) + + # convert the pixel digis errors to the legacy format + from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA + process.hltSiPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone( + digiErrorSoASrc = "hltSiPixelDigiErrorsSoA", + UsePhase1 = True + ) + + # copy the pixel digis (except errors) and clusters to the host + from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA + process.hltSiPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( + src = "hltSiPixelClustersCUDA" + ) + + # convert the pixel digis (except errors) and clusters to the legacy format + from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA + process.hltSiPixelDigisClusters = _siPixelDigisClustersFromSoA.clone( + src = "hltSiPixelDigisSoA" + ) + + # SwitchProducer wrapping the legacy pixel digis producer or an alias combining the pixel digis information converted from SoA + process.hltSiPixelDigis = SwitchProducerCUDA( + # legacy producer + cpu = process.hltSiPixelDigis, + # alias used to access products from multiple conversion modules + cuda = cms.EDAlias( + hltSiPixelDigisClusters = cms.VPSet( + cms.PSet(type = cms.string("PixelDigiedmDetSetVector")) + ), + hltSiPixelDigiErrors = 
cms.VPSet( + cms.PSet(type = cms.string("DetIdedmEDCollection")), + cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")), + cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector")) + ) + ) + ) + + # SwitchProducer wrapping the legacy pixel cluster producer or an alias for the pixel clusters information converted from SoA + process.hltSiPixelClusters = SwitchProducerCUDA( + # legacy producer + cpu = process.hltSiPixelClusters, + # alias used to access products from multiple conversion modules + cuda = cms.EDAlias( + hltSiPixelDigisClusters = cms.VPSet( + cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector")) + ) + ) + ) + + # reconstruct the pixel rechits on the gpu + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitCUDA_cfi import siPixelRecHitCUDA as _siPixelRecHitCUDA + process.hltSiPixelRecHitsCUDA = _siPixelRecHitCUDA.clone( + src = "hltSiPixelClustersCUDA", + beamSpot = "hltOnlineBeamSpotToCUDA" + ) + + # SwitchProducer wrapping the legacy pixel rechit producer or the transfer of the pixel rechits to the host and the conversion from SoA + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA + process.hltSiPixelRecHits = SwitchProducerCUDA( + # legacy producer + cpu = process.hltSiPixelRecHits, + # converter to legacy format + cuda = _siPixelRecHitFromCUDA.clone( + pixelRecHitSrc = "hltSiPixelRecHitsCUDA", + src = "hltSiPixelClusters" + ) + ) + + + # Tasks and Sequences + + process.HLTDoLocalPixelTask = cms.Task( + process.hltOnlineBeamSpotToCUDA, # transfer the beamspot to the gpu + process.hltSiPixelClustersCUDA, # reconstruct the pixel digis and clusters on the gpu + process.hltSiPixelRecHitsCUDA, # reconstruct the pixel rechits on the gpu + process.hltSiPixelDigisSoA, # copy the pixel digis (except errors) and clusters to the host + process.hltSiPixelDigisClusters, # convert the pixel digis (except errors) and clusters to the legacy format + 
process.hltSiPixelDigiErrorsSoA, # copy the pixel digis errors to the host + process.hltSiPixelDigiErrors, # convert the pixel digis errors to the legacy format + process.hltSiPixelDigis, # SwitchProducer wrapping the legacy pixel digis producer or an alias combining the pixel digis information converted from SoA + process.hltSiPixelClusters, # SwitchProducer wrapping the legacy pixel cluster producer or an alias for the pixel clusters information converted from SoA + process.hltSiPixelClustersCache, # legacy module, used by the legacy pixel quadruplet producer + process.hltSiPixelRecHits) # SwitchProducer wrapping the legacy pixel rechit producer or the transfer of the pixel rechits to the host and the conversion from SoA + + process.HLTDoLocalPixelSequence = cms.Sequence(process.HLTDoLocalPixelTask) + + + # done + return process + + +# customisation for running the "Patatrack" pixel track reconstruction +def customisePixelTrackReconstruction(process): + + if not 'HLTRecoPixelTracksSequence' in process.__dict__: + return process + + + # FIXME replace the Sequences with empty ones to avoid exanding them during the (re)definition of Modules and EDAliases + + process.HLTRecoPixelTracksSequence = cms.Sequence() + process.HLTRecopixelvertexingSequence = cms.Sequence() + + + # Modules and EDAliases + + # referenced in process.HLTRecoPixelTracksTask + + # cpu only: convert the pixel rechits from legacy to SoA format + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitSoAFromLegacy + process.hltSiPixelRecHitSoA = _siPixelRecHitSoAFromLegacy.clone( + src = "hltSiPixelClusters", + beamSpot = "hltOnlineBeamSpot", + convertToLegacy = True + ) + + # build pixel ntuplets and pixel tracks in SoA format on gpu + from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA as _caHitNtupletCUDA + process.hltPixelTracksCUDA = _caHitNtupletCUDA.clone( + idealConditions = False, + pixelRecHitSrc 
= "hltSiPixelRecHitsCUDA", + onGPU = True + ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows + run3_common.toModify(process.hltPixelTracksCUDA, idealConditions = True) + + # SwitchProducer providing the pixel tracks in SoA format on cpu + process.hltPixelTracksSoA = SwitchProducerCUDA( + # build pixel ntuplets and pixel tracks in SoA format on cpu + cpu = _caHitNtupletCUDA.clone( + idealConditions = False, + pixelRecHitSrc = "hltSiPixelRecHitSoA", + onGPU = False + ), + # transfer the pixel tracks in SoA format to the host + cuda = cms.EDProducer("PixelTrackSoAFromCUDA", + src = cms.InputTag("hltPixelTracksCUDA") + ) + ) + # use quality cuts tuned for Run 2 ideal conditions for all Run 3 workflows + run3_common.toModify(process.hltPixelTracksSoA.cpu, idealConditions = True) + + # convert the pixel tracks from SoA to legacy format + from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackProducerFromSoA + process.hltPixelTracks = _pixelTrackProducerFromSoA.clone( + beamSpot = "hltOnlineBeamSpot", + pixelRecHitLegacySrc = "hltSiPixelRecHits", + trackSrc = "hltPixelTracksSoA" + ) + + + # referenced in process.HLTRecopixelvertexingTask + + # build pixel vertices in SoA format on gpu + from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA as _pixelVertexCUDA + process.hltPixelVerticesCUDA = _pixelVertexCUDA.clone( + pixelTrackSrc = "hltPixelTracksCUDA", + onGPU = True + ) + + # build or transfer pixel vertices in SoA format on cpu + process.hltPixelVerticesSoA = SwitchProducerCUDA( + # build pixel vertices in SoA format on cpu + cpu = _pixelVertexCUDA.clone( + pixelTrackSrc = "hltPixelTracksSoA", + onGPU = False + ), + # transfer the pixel vertices in SoA format to cpu + cuda = cms.EDProducer("PixelVertexSoAFromCUDA", + src = cms.InputTag("hltPixelVerticesCUDA") + ) + ) + + # convert the pixel vertices from SoA to legacy format + from 
RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA + process.hltPixelVertices = _pixelVertexFromSoA.clone( + src = "hltPixelVerticesSoA", + TrackCollection = "hltPixelTracks", + beamSpot = "hltOnlineBeamSpot" + ) + + + # Tasks and Sequences + + process.HLTRecoPixelTracksTask = cms.Task( + process.hltPixelTracksTrackingRegions, # from the original sequence + process.hltSiPixelRecHitSoA, # pixel rechits on cpu, converted to SoA + process.hltPixelTracksCUDA, # pixel ntuplets on gpu, in SoA format + process.hltPixelTracksSoA, # pixel ntuplets on cpu, in SoA format + process.hltPixelTracks) # pixel tracks on cpu, in legacy format + + + process.HLTRecoPixelTracksSequence = cms.Sequence(process.HLTRecoPixelTracksTask) + + process.HLTRecopixelvertexingTask = cms.Task( + process.HLTRecoPixelTracksTask, + process.hltPixelVerticesCUDA, # pixel vertices on gpu, in SoA format + process.hltPixelVerticesSoA, # pixel vertices on cpu, in SoA format + process.hltPixelVertices, # pixel vertices on cpu, in legacy format + process.hltTrimmedPixelVertices) # from the original sequence + + process.HLTRecopixelvertexingSequence = cms.Sequence( + process.hltPixelTracksFitter + # not used here, kept for compatibility with legacy sequences + process.hltPixelTracksFilter, # not used here, kept for compatibility with legacy sequences + process.HLTRecopixelvertexingTask) + + + # done + return process + + +# customisation for offloading the ECAL local reconstruction via CUDA if a supported gpu is present +def customiseEcalLocalReconstruction(process): + + if not 'HLTDoFullUnpackingEgammaEcalSequence' in process.__dict__: + return process + + + # FIXME replace the Sequences with empty ones to avoid exanding them during the (re)definition of Modules and EDAliases + + process.HLTDoFullUnpackingEgammaEcalMFSequence = cms.Sequence() + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerSequence = cms.Sequence() + 
process.HLTDoFullUnpackingEgammaEcalSequence = cms.Sequence() + + + # Event Setup + + process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitParametersGPUESProducer_cfi") + + process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi") + process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitParametersGPUESProducer_cfi") + + + # Modules and EDAliases + + # ECAL unpacker running on gpu + process.hltEcalDigisGPU = cms.EDProducer("EcalRawToDigiGPU", + InputLabel = cms.InputTag("rawDataCollector"), + FEDs = cms.vint32( + 601, 602, 603, 604, 605, + 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, + 616, 617, 618, 619, 620, + 621, 622, 623, 624, 625, + 626, 627, 628, 629, 630, + 631, 632, 633, 634, 635, + 636, 637, 638, 639, 640, + 641, 642, 643, 644, 
645, + 646, 647, 648, 649, 650, + 651, 652, 653, 654 + ), + digisLabelEB = cms.string("ebDigis"), + digisLabelEE = cms.string("eeDigis"), + maxChannelsEB = cms.uint32(61200), + maxChannelsEE = cms.uint32(14648), + ) + + # SwitchProducer wrapping the legacy ECAL unpacker or the ECAL digi converter from SoA format on gpu to legacy format on cpu + process.hltEcalDigisLegacy = process.hltEcalDigis.clone() + + process.hltEcalDigis = SwitchProducerCUDA( + # legacy producer + cpu = cms.EDAlias( + hltEcalDigisLegacy = cms.VPSet( + cms.PSet(type = cms.string("EBDigiCollection")), + cms.PSet(type = cms.string("EEDigiCollection")), + cms.PSet(type = cms.string("EBDetIdedmEDCollection")), + cms.PSet(type = cms.string("EEDetIdedmEDCollection")), + cms.PSet(type = cms.string("EBSrFlagsSorted")), + cms.PSet(type = cms.string("EESrFlagsSorted")), + cms.PSet(type = cms.string("EcalElectronicsIdedmEDCollection"), fromProductInstance = cms.string("EcalIntegrityBlockSizeErrors")), + cms.PSet(type = cms.string("EcalElectronicsIdedmEDCollection"), fromProductInstance = cms.string("EcalIntegrityTTIdErrors")) + ) + ), + # convert ECAL digis from SoA format on gpu to legacy format on cpu + cuda = cms.EDProducer("EcalCPUDigisProducer", + digisInLabelEB = cms.InputTag("hltEcalDigisGPU", "ebDigis"), + digisInLabelEE = cms.InputTag("hltEcalDigisGPU", "eeDigis"), + digisOutLabelEB = cms.string("ebDigis"), + digisOutLabelEE = cms.string("eeDigis"), + produceDummyIntegrityCollections = cms.bool(True) + ) + ) + + # ECAL multifit running on gpu + from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi import ecalUncalibRecHitProducerGPU as _ecalUncalibRecHitProducerGPU + process.hltEcalUncalibRecHitGPU = _ecalUncalibRecHitProducerGPU.clone( + digisLabelEB = ("hltEcalDigisGPU", "ebDigis"), + digisLabelEE = ("hltEcalDigisGPU", "eeDigis"), + shouldRunTimingComputation = False + ) + + # copy the ECAL uncalibrated rechits from gpu to cpu in SoA format + process.hltEcalUncalibRecHitSoA = 
cms.EDProducer("EcalCPUUncalibRecHitProducer", + containsTimingInformation = cms.bool(False), + recHitsInLabelEB = cms.InputTag("hltEcalUncalibRecHitGPU", "EcalUncalibRecHitsEB"), + recHitsInLabelEE = cms.InputTag("hltEcalUncalibRecHitGPU", "EcalUncalibRecHitsEE"), + recHitsOutLabelEB = cms.string("EcalUncalibRecHitsEB"), + recHitsOutLabelEE = cms.string("EcalUncalibRecHitsEE") + ) + + # SwitchProducer wrapping the legacy ECAL uncalibrated rechits producer or a converter from SoA to legacy format + process.hltEcalUncalibRecHit = SwitchProducerCUDA( + # legacy producer + cpu = process.hltEcalUncalibRecHit, + # convert the ECAL uncalibrated rechits from SoA to legacy format + cuda = cms.EDProducer("EcalUncalibRecHitConvertGPU2CPUFormat", + recHitsLabelGPUEB = cms.InputTag("hltEcalUncalibRecHitSoA", "EcalUncalibRecHitsEB"), + recHitsLabelGPUEE = cms.InputTag("hltEcalUncalibRecHitSoA", "EcalUncalibRecHitsEE"), + recHitsLabelCPUEB = cms.string("EcalUncalibRecHitsEB"), + recHitsLabelCPUEE = cms.string("EcalUncalibRecHitsEE") + ) + ) + + # Reconstructing the ECAL calibrated rechits on gpu works, but is extremely slow. + # Disable it for the time being, until the performance has been addressed. 
+ """ + process.hltEcalRecHitGPU = cms.EDProducer("EcalRecHitProducerGPU", + uncalibrecHitsInLabelEB = cms.InputTag("hltEcalUncalibRecHitGPU","EcalUncalibRecHitsEB"), + uncalibrecHitsInLabelEE = cms.InputTag("hltEcalUncalibRecHitGPU","EcalUncalibRecHitsEE"), + recHitsLabelEB = cms.string("EcalRecHitsEB"), + recHitsLabelEE = cms.string("EcalRecHitsEE"), + maxNumberHitsEB = cms.uint32(61200), + maxNumberHitsEE = cms.uint32(14648), + ChannelStatusToBeExcluded = cms.vstring( + "kDAC", + "kNoisy", + "kNNoisy", + "kFixedG6", + "kFixedG1", + "kFixedG0", + "kNonRespondingIsolated", + "kDeadVFE", + "kDeadFE", + "kNoDataNoTP"), + killDeadChannels = cms.bool(True), + EBLaserMIN = cms.double(0.01), + EELaserMIN = cms.double(0.01), + EBLaserMAX = cms.double(30.0), + EELaserMAX = cms.double(30.0), + flagsMapDBReco = cms.PSet( + kGood = cms.vstring("kOk","kDAC","kNoLaser","kNoisy"), + kNoisy = cms.vstring("kNNoisy","kFixedG6","kFixedG1"), + kNeighboursRecovered = cms.vstring("kFixedG0", "kNonRespondingIsolated", "kDeadVFE"), + kTowerRecovered = cms.vstring("kDeadFE"), + kDead = cms.vstring("kNoDataNoTP") + ), + recoverEBIsolatedChannels = cms.bool(False), + recoverEEIsolatedChannels = cms.bool(False), + recoverEBVFE = cms.bool(False), + recoverEEVFE = cms.bool(False), + recoverEBFE = cms.bool(True), + recoverEEFE = cms.bool(True), + ) + + process.hltEcalRecHitSoA = cms.EDProducer("EcalCPURecHitProducer", + recHitsInLabelEB = cms.InputTag("hltEcalRecHitGPU", "EcalRecHitsEB"), + recHitsInLabelEE = cms.InputTag("hltEcalRecHitGPU", "EcalRecHitsEE"), + recHitsOutLabelEB = cms.string("EcalRecHitsEB"), + recHitsOutLabelEE = cms.string("EcalRecHitsEE"), + containsTimingInformation = cms.bool(False), + ) + + # SwitchProducer wrapping the legacy ECAL calibrated rechits producer or a converter from SoA to legacy format + process.hltEcalRecHit = SwitchProducerCUDA( + # legacy producer + cpu = process.hltEcalRecHit, + # convert the ECAL calibrated rechits from SoA to legacy format + cuda = 
cms.EDProducer("EcalRecHitConvertGPU2CPUFormat", + recHitsLabelGPUEB = cms.InputTag("hltEcalRecHitSoA", "EcalRecHitsEB"), + recHitsLabelGPUEE = cms.InputTag("hltEcalRecHitSoA", "EcalRecHitsEE"), + recHitsLabelCPUEB = cms.string("EcalRecHitsEB"), + recHitsLabelCPUEE = cms.string("EcalRecHitsEE"), + ) + """ + + + # SwitchProducer wrapping the legacy ECAL rechits producer + # the gpu unpacker does not produce the TPs used for the recovery, so the SwitchProducer alias does not provide them: + # - the cpu uncalibrated rechit producer may mark them for recovery, read the TPs explicitly from the legacy unpacker + # - the gpu uncalibrated rechit producer does not flag them for recovery, so the TPs are not necessary + process.hltEcalRecHit = SwitchProducerCUDA( + cpu = process.hltEcalRecHit.clone( + triggerPrimitiveDigiCollection = cms.InputTag('hltEcalDigisLegacy', 'EcalTriggerPrimitives') + ), + cuda = process.hltEcalRecHit.clone( + triggerPrimitiveDigiCollection = cms.InputTag('unused') + ) + ) + + # Tasks and Sequences + + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask = cms.Task( + process.hltEcalDigisGPU, # unpack ECAL digis on gpu + process.hltEcalDigisLegacy, # legacy producer, referenced in the SwitchProducer + process.hltEcalDigis, # SwitchProducer + process.hltEcalUncalibRecHitGPU, # run ECAL local reconstruction and multifit on gpu + process.hltEcalUncalibRecHitSoA, # needed by hltEcalPhiSymFilter - copy to host + process.hltEcalUncalibRecHit, # needed by hltEcalPhiSymFilter - convert to legacy format + # process.hltEcalRecHitGPU, # make ECAL calibrated rechits on gpu + # process.hltEcalRecHitSoA, # copy to host + process.hltEcalDetIdToBeRecovered, # legacy producer + process.hltEcalRecHit) # legacy producer + + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerSequence = cms.Sequence( + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask) + + process.HLTPreshowerTask = cms.Task( + process.hltEcalPreshowerDigis, # unpack ECAL preshower digis on 
the host + process.hltEcalPreshowerRecHit) # build ECAL preshower rechits on the host + + process.HLTPreshowerSequence = cms.Sequence(process.HLTPreshowerTask) + + process.HLTDoFullUnpackingEgammaEcalTask = cms.Task( + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask, + process.HLTPreshowerTask) + + process.HLTDoFullUnpackingEgammaEcalSequence = cms.Sequence( + process.HLTDoFullUnpackingEgammaEcalTask) + + process.HLTDoFullUnpackingEgammaEcalMFSequence = cms.Sequence( + process.HLTDoFullUnpackingEgammaEcalTask) + + + # done + return process + +# customisation for offloading the HCAL local reconstruction via CUDA if a supported gpu is present +def customiseHcalLocalReconstruction(process): + + if not 'HLTDoLocalHcalSequence' in process.__dict__: + return process + + + # FIXME replace the Sequences with empty ones to avoid expanding them during the (re)definition of Modules and EDAliases + + process.HLTDoLocalHcalSequence = cms.Sequence() + process.HLTStoppedHSCPLocalHcalReco = cms.Sequence() + + + # Event Setup + + process.load("EventFilter.HcalRawToDigi.hcalElectronicsMappingGPUESProducer_cfi") + + process.load("RecoLocalCalo.HcalRecProducers.hcalGainsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalGainWidthsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalLUTCorrsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalsGPUESProducer_cfi") + process.hcalConvertedEffectivePedestalsGPUESProducer.label0 = "withTopoEff" + process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalWidthsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalWidthsGPUESProducer_cfi") + process.hcalConvertedEffectivePedestalWidthsGPUESProducer.label0 = "withTopoEff" + process.hcalConvertedEffectivePedestalWidthsGPUESProducer.label1 = "withTopoEff" + 
process.load("RecoLocalCalo.HcalRecProducers.hcalQIECodersGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalRecoParamsWithPulseShapesGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalRespCorrsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalTimeCorrsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi") + process.load("RecoLocalCalo.HcalRecProducers.hcalMahiPulseOffsetsGPUESProducer_cfi") + + + # Modules and EDAliases + + # The HCAL unpacker running on the gpu supports only the HB and HE digis. + # So, run the legacy unpacker on the cpu, then convert the HB and HE digis + # to SoA format and copy them to the gpu. + process.hltHcalDigisGPU = cms.EDProducer("HcalDigisProducerGPU", + hbheDigisLabel = cms.InputTag("hltHcalDigis"), + qie11DigiLabel = cms.InputTag("hltHcalDigis"), + digisLabelF01HE = cms.string(""), + digisLabelF5HB = cms.string(""), + digisLabelF3HB = cms.string(""), + maxChannelsF01HE = cms.uint32(10000), + maxChannelsF5HB = cms.uint32(10000), + maxChannelsF3HB = cms.uint32(10000) + ) + + # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu + from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPU_cfi import hbheRecHitProducerGPU as _hbheRecHitProducerGPU + process.hltHbherecoGPU = _hbheRecHitProducerGPU.clone( + digisLabelF01HE = "hltHcalDigisGPU", + digisLabelF5HB = "hltHcalDigisGPU", + digisLabelF3HB = "hltHcalDigisGPU", + recHitsLabelM0HBHE = "" + ) + + # transfer the HCAL rechits to the cpu, and convert them to the legacy format + from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer + process.hltHbherecoFromGPU = _hcalCPURecHitsProducer.clone( + recHitsM0LabelIn = "hltHbherecoGPU", + 
recHitsM0LabelOut = "", + recHitsLegacyLabelOut = "" + ) + + # SwitchProducer between the legacy producer and the copy from gpu with conversion + process.hltHbhereco = SwitchProducerCUDA( + # legacy producer + cpu = process.hltHbhereco.clone(), + # alias to the rechits converted to legacy format + cuda = cms.EDAlias( + hltHbherecoFromGPU = cms.VPSet( + cms.PSet(type = cms.string("HBHERecHitsSorted")) + ) + ) + ) + + + # Tasks and Sequences + + process.HLTDoLocalHcalTask = cms.Task( + process.hltHcalDigis, # legacy producer, unpack HCAL digis on cpu + process.hltHcalDigisGPU, # copy to gpu and convert to SoA format + process.hltHbherecoGPU, # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu + process.hltHbherecoFromGPU, # transfer the HCAL rechits to the cpu, and convert them to the legacy format + process.hltHbhereco, # SwitchProducer between the legacy producer and the copy from gpu with conversion + process.hltHfprereco, # legacy producer + process.hltHfreco, # legacy producer + process.hltHoreco) # legacy producer + + process.HLTDoLocalHcalSequence = cms.Sequence( + process.HLTDoLocalHcalTask) + + process.HLTStoppedHSCPLocalHcalRecoTask = cms.Task( + process.hltHcalDigis, # legacy producer, unpack HCAL digis on cpu + process.hltHcalDigisGPU, # copy to gpu and convert to SoA format + process.hltHbherecoGPU, # run the HCAL local reconstruction (including Method 0 and MAHI) on gpu + process.hltHbherecoFromGPU, # transfer the HCAL rechits to the cpu, and convert them to the legacy format + process.hltHbhereco) # SwitchProducer between the legacy producer and the copy from gpu with conversion + + process.HLTStoppedHSCPLocalHcalReco = cms.Sequence( + process.HLTStoppedHSCPLocalHcalRecoTask) + + + # done + return process + + +# customisation to enable pixel triplets instead of quadruplets +def enablePatatrackPixelTriplets(process): + + # configure GPU pixel tracks for triplets + process.hltPixelTracksCUDA.minHitsPerNtuplet = 3 + 
process.hltPixelTracksCUDA.includeJumpingForwardDoublets = True + + # configure CPU pixel tracks for triplets + process.hltPixelTracksSoA.cpu.minHitsPerNtuplet = 3 + process.hltPixelTracksSoA.cpu.includeJumpingForwardDoublets = True + + # done + return process + + +# customisation for running the Patatrack reconstruction, with automatic offload via CUDA when a supported gpu is available +def customizeHLTforPatatrack(process): + process = customiseCommon(process) + process = customisePixelLocalReconstruction(process) + process = customisePixelTrackReconstruction(process) + process = customiseEcalLocalReconstruction(process) + process = customiseHcalLocalReconstruction(process) + return process + + +# customisation for running the Patatrack triplets reconstruction, with automatic offload via CUDA when a supported gpu is available +def customizeHLTforPatatrackTriplets(process): + process = customiseCommon(process) + process = customisePixelLocalReconstruction(process) + process = customisePixelTrackReconstruction(process) + process = customiseEcalLocalReconstruction(process) + process = customiseHcalLocalReconstruction(process) + process = enablePatatrackPixelTriplets(process) + return process + + +def _addConsumerPath(process): + # add to a path all consumers and the tasks that define the producers + process.Consumer = cms.Path( + process.HLTBeginSequence + + process.hltPixelConsumer + + process.hltEcalConsumer + + process.hltHbheConsumer, + process.HLTDoLocalPixelTask, + process.HLTRecoPixelTracksTask, + process.HLTRecopixelvertexingTask, + process.HLTDoFullUnpackingEgammaEcalTask, + process.HLTDoLocalHcalTask, + ) + + if 'HLTSchedule' in process.__dict__: + process.HLTSchedule.append(process.Consumer) + if process.schedule is not None: + process.schedule.append(process.Consumer) + + # done + return process + + +def consumeGPUSoAProducts(process): + # consume the Pixel tracks and vertices on the GPU in SoA format + process.hltPixelConsumer = 
cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltPixelTracksCUDA', 'hltPixelVerticesCUDA' ) + ) + + # consume the ECAL uncalibrated rechits on the GPU in SoA format + process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHitGPU' ) + ) + + # consume the HCAL rechits on the GPU in SoA format + process.hltHbheConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltHbherecoGPU' ) + ) + + # add to a path all consumers and the tasks that define the producers + process = _addConsumerPath(process) + + # done + return process + + +def consumeCPUSoAProducts(process): + # consume the Pixel tracks and vertices on the CPU in SoA format + process.hltPixelConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltPixelTracksSoA', 'hltPixelVerticesSoA' ) + ) + + # consume the ECAL uncalibrated rechits on the CPU in SoA format + process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHitSoA' ) + ) + + # consume the HCAL rechits on the CPU in legacy format + process.hltHbheConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltHbhereco' ) + ) + + # add to a path all consumers and the tasks that define the producers + process = _addConsumerPath(process) + + # done + return process + +def consumeCPULegacyProducts(process): + # consume the Pixel tracks and vertices on the CPU in legacy format + process.hltPixelConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltPixelTracks', 'hltPixelVertices' ) + ) + + # consume the ECAL uncalibrated rechits on the CPU in legacy format + process.hltEcalConsumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltEcalUncalibRecHit' ) + ) + + # consume the HCAL rechits on the CPU in legacy format + process.hltHbheConsumer = 
cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring( 'hltHbhereco' ) + ) + + # add to a path all consumers and the tasks that define the producers + process = _addConsumerPath(process) + + # done + return process diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py index 34ee6fadb04de..424ac13a43627 100644 --- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py @@ -4,7 +4,21 @@ # # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex # -#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import * -from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import * +from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import * +#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import * recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices) recopixelvertexing = cms.Sequence(recopixelvertexingTask) + +from Configuration.ProcessModifiers.gpu_cff import gpu + +from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA + +_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices) + +# pixelVertexSoAonCPU = pixelVertexCUDA.clone() +# pixelVertexSoAonCPU.onGPU = False; + +gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA) +gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py deleted file mode 100644 index 4713b64e5e48a..0000000000000 --- 
a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ /dev/null @@ -1,15 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -def customizePixelTracksForProfiling(process): - process.out = cms.OutputModule("AsciiOutputModule", - outputCommands = cms.untracked.vstring( - "keep *_pixelTracks_*_*", - ), - verbosity = cms.untracked.uint32(0), - ) - - process.outPath = cms.EndPath(process.out) - - process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.outPath) - - return process diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py new file mode 100644 index 0000000000000..909959f2d81be --- /dev/null +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -0,0 +1,61 @@ +import FWCore.ParameterSet.Config as cms + +def customizePixelTracksSoAonCPU(process): + + process.CUDAService = cms.Service('CUDAService', + enabled = cms.untracked.bool(False) + ) + + # ensure the same results when running on GPU (which supports only the 'HLT' payload) and CPU + process.siPixelClustersPreSplitting.cpu.payloadType = cms.string('HLT') + + from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy + process.siPixelRecHitsPreSplitting = siPixelRecHitSoAFromLegacy.clone( + convertToLegacy = True + ) + + from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA + process.pixelTrackSoA = caHitNtupletCUDA.clone( + onGPU = False, + pixelRecHitSrc = 'siPixelRecHitsPreSplitting' + ) + + from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA + process.pixelVertexSoA = pixelVertexCUDA.clone( + onGPU = False, + pixelTrackSrc = 'pixelTrackSoA' + ) + + from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA + process.pixelTracks = pixelTrackProducerFromSoA.clone( + 
pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting' + ) + + from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA + process.pixelVertices = pixelVertexFromSoA.clone() + + process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA + + return process + + +def customizePixelTracksForTriplets(process): + + from HLTrigger.Configuration.common import producers_by_type + for producer in producers_by_type(process, 'CAHitNtupletCUDA'): + producer.includeJumpingForwardDoublets = True + producer.minHitsPerNtuplet = 3 + + return process + + +def customizePixelTracksSoAonCPUForProfiling(process): + + process.MessageLogger.cerr.FwkReport.reportEvery = 100 + + process = customizePixelTracksSoAonCPU(process) + process.siPixelRecHitSoAFromLegacy.convertToLegacy = False + + process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA) + process.schedule = cms.Schedule(process.TkSoA) + return process diff --git a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml index e6fc938dc25a7..a589aad036996 100644 --- a/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/BuildFile.xml @@ -1,3 +1,5 @@ + + diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h new file mode 100644 index 0000000000000..86fe6a278777c --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -0,0 +1,606 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h + +#include + +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +namespace brokenline { + + //!< Karimäki's parameters: (phi, d, 
k=1/R) + /*!< covariance matrix: \n + |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n + |cov(phi, d )|cov( d , d )|cov( k , d )| \n + |cov(phi, k )|cov( d , k )|cov( k , k )| \n + as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, + Nucl. Instr. and Meth. A305 (1991) 187. + */ + using karimaki_circle_fit = riemannFit::CircleFit; + + /*! + \brief data needed for the Broken Line fit procedure. + */ + template + struct PreparedBrokenLineData { + int qCharge; //!< particle charge + riemannFit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + riemannFit::VectorNd sTransverse; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + riemannFit::VectorNd sTotal; //!< total distance traveled (three-dimensional) + riemannFit::VectorNd zInSZplane; //!< orthogonal coordinate to the pre-fitted line in the sz plane + riemannFit::VectorNd varBeta; //!< kink angles in the SZ plane + }; + + /*! + \brief Computes the Coulomb multiple scattering variance of the planar angle. + + \param length length of the track in the material. + \param bField magnetic field in Gev/cm/c. + \param radius radius of curvature (needed to evaluate p). + \param layer denotes which of the four layers of the detector is the endpoint of the + * multiple scattered track. For example, if Layer=3, then the particle has + * just gone through the material between the second and the third layer. + + \todo add another Layer variable to identify also the start point of the track, + * so if there are missing hits or multiple hits, the part of the detector that + * the particle has traversed can be exactly identified. + + \warning the formula used here assumes beta=1, and so neglects the dependence + * of theta_0 on the mass of the particle at fixed momentum. + + \return the variance of the planar angle ((theta_0)^2 /3). 
+ */ + __host__ __device__ inline double multScatt( + const double& length, const double bField, const double radius, int layer, double slope) { + // limit R to 20GeV... + auto pt2 = std::min(20., bField * radius); + pt2 *= pt2; + constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + //if(Layer==1) XXI_0=0.06/16.; + // else XXI_0=0.06/16.; + //XX_0*=1; + + //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double geometry_factor = 0.7; + constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (std::abs(length) * inv_X0) * + riemannFit::sqr(1. + 0.038 * log(std::abs(length) * inv_X0)); + } + + /*! + \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. + + \param slope tangent of the angle of rotation. + + \return 2D rotation matrix. + */ + __host__ __device__ inline riemannFit::Matrix2d rotationMatrix(double slope) { + riemannFit::Matrix2d rot; + rot(0, 0) = 1. / sqrt(1. + riemannFit::sqr(slope)); + rot(0, 1) = slope * rot(0, 0); + rot(1, 0) = -rot(0, 1); + rot(1, 1) = rot(0, 0); + return rot; + } + + /*! + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a + * translation of the coordinate system, such that the old origin has coordinates (x0,y0) + * in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective + * circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + + \param circle circle fit in the old coordinate system. circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. + \param x0 x coordinate of the translation vector. + \param y0 y coordinate of the translation vector. + \param jacobian passed by reference in order to save stack. 
+ */ + __host__ __device__ inline void translateKarimaki(karimaki_circle_fit& circle, + double x0, + double y0, + riemannFit::Matrix3d& jacobian) { + // Avoid multiple access to the circle.par vector. + using scalar = std::remove_reference::type; + scalar phi = circle.par(0); + scalar dee = circle.par(1); + scalar rho = circle.par(2); + + // Avoid repeated trig. computations + scalar sinPhi = sin(phi); + scalar cosPhi = cos(phi); + + // Intermediate computations for the circle parameters + scalar deltaPara = x0 * cosPhi + y0 * sinPhi; + scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; + scalar tempSmallU = 1 + rho * dee; + scalar tempC = -rho * y0 + tempSmallU * cosPhi; + scalar tempB = rho * x0 + tempSmallU * sinPhi; + scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); + scalar tempU = sqrt(1. + rho * tempA); + + // Intermediate computations for the error matrix transform + scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); + scalar tempV = 1. + rho * deltaOrth; + scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); + scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda; + scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); + jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, + 2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; + + // translated circle parameters + // phi + circle.par(0) = atan2(tempB, tempC); + // d + circle.par(1) = tempA / (1 + tempU); + // rho after translation. It is invariant, so noop + // circle.par(2)= rho; + + // translated error matrix + circle.cov = jacobian * circle.cov * jacobian.transpose(); + } + + /*! + \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. + + \param hits hits coordinates. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). 
+ \param bField magnetic field in Gev/cm/c. + \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). + */ + template + __host__ __device__ inline void prepareBrokenLineData(const M3xN& hits, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& results) { + riemannFit::Vector2d dVec; + riemannFit::Vector2d eVec; + + dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1); + eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1); + results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1; + + const double slope = -results.qCharge / fast_fit(3); + + riemannFit::Matrix2d rotMat = rotationMatrix(slope); + + // calculate radii and s + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); + eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (u_int i = 0; i < n; i++) { + dVec = results.radii.block(0, i, 2, 1); + results.sTransverse(i) = results.qCharge * fast_fit(2) * + atan2(riemannFit::cross2D(dVec, eVec), dVec.dot(eVec)); // calculates the arc length + } + riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); + + //calculate sTotal and zVec + riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); + for (u_int i = 0; i < n; i++) { + pointsSZ(0, i) = results.sTransverse(i); + pointsSZ(1, i) = zVec(i); + pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1); + } + results.sTotal = pointsSZ.block(0, 0, 1, n).transpose(); + results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate varBeta + results.varBeta(0) = results.varBeta(n - 1) = 0; + for (u_int i = 1; i < n - 1; i++) { + results.varBeta(i) = multScatt(results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + + multScatt(results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); + } + } + + /*! 
+ \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. + * This is the whole matrix in the case of the line fit and the main n-by-n block in the case + * of the circle fit. + + \param weights weights of the first part of the cost function, the one with the measurements + * and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). + \param sTotal total distance traveled by the particle from the pre-fitted closest approach. + \param varBeta kink angles' variance. + + \return the n-by-n matrix of the linear system + */ + template + __host__ __device__ inline riemannFit::MatrixNd matrixC_u(const riemannFit::VectorNd& weights, + const riemannFit::VectorNd& sTotal, + const riemannFit::VectorNd& varBeta) { + riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); + for (u_int i = 0; i < n; i++) { + c_uMat(i, i) = weights(i); + if (i > 1) + c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); + if (i > 0 && i < n - 1) + c_uMat(i, i) += + (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); + + if (i > 0 && i < n - 1) + c_uMat(i, i + 1) = + 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i + 1) += + 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); + + if (i < n - 2) + c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); + + c_uMat(i, i) *= 0.5; + } + return c_uMat + c_uMat.transpose(); + } + + /*! + \brief A very fast helix fit. + + \param hits the measured hits. + + \return (X0,Y0,R,tan(theta)). 
+ + \warning sign of theta is (intentionally, for now) mistaken for negative charges. + */ + + template + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { + constexpr uint32_t n = M3xN::ColsAtCompileTime; + + const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1); + const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + + auto tmp = 0.5 / riemannFit::cross2D(c, a); + result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; + result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; + // check Wikipedia for these formulas + + result(2) = sqrt(a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / (2. * std::abs(riemannFit::cross2D(b, a))); + // Using Math Olympiad's formula R=abc/(4A) + + const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + + result(3) = result(2) * atan2(riemannFit::cross2D(d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + // ds/dz slope between last and first point + } + + /*! + \brief Performs the Broken Line fit in the curved track case (that is, the fit + * parameters are the interceptions u and the curvature correction \Delta\kappa). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param bField magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param circle_results struct to be filled with the results in this form: + -par parameter of the line in this form: (phi, d, k); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. 
+
+    \details The function implements the steps 2 and 3 of the Broken Line fit
+    * with the curvature correction.\n
+    * The step 2 is the least square fit, done by imposing the minimum constraint on
+    * the cost function and solving the consequent linear system. It determines the
+    * fitted parameters u and \Delta\kappa and their covariance matrix.
+    * The step 3 is the correction of the fast pre-fitted parameters for the innermost
+    * part of the track. It is first done in a comfortable coordinate system (the one
+    * in which the first hit is the origin) and then the parameters and their
+    * covariance matrix are transformed to the original coordinate system.
+  */
+  template
+  __host__ __device__ inline void circleFit(const M3xN& hits,
+                                            const M6xN& hits_ge,
+                                            const V4& fast_fit,
+                                            const double bField,
+                                            PreparedBrokenLineData& data,
+                                            karimaki_circle_fit& circle_results) {
+    circle_results.qCharge = data.qCharge;
+    auto& radii = data.radii;  // non-const: columns 0 and 1 are normalised in place further down
+    const auto& sTransverse = data.sTransverse;
+    const auto& sTotal = data.sTotal;
+    auto& zInSZplane = data.zInSZplane;  // non-const: overwritten below, so `data` is modified by this call
+    auto& varBeta = data.varBeta;  // non-const: rescaled in place on the next lines
+    const double slope = -circle_results.qCharge / fast_fit(3);
+    varBeta *= 1. + riemannFit::sqr(slope);  // the kink angles are projected!
+
+    for (u_int i = 0; i < n; i++) {
+      zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2);  // |radii_i| - R (presumably the radial residual w.r.t. the pre-fitted circle - confirm against prepareBrokenLineData)
+    }
+
+    riemannFit::Matrix2d vMat;        // covariance matrix
+    riemannFit::VectorNd weightsVec;  // weights
+    riemannFit::Matrix2d rotMat;      // rotation matrix point by point
+    for (u_int i = 0; i < n; i++) {
+      vMat(0, 0) = hits_ge.col(i)[0];               // x errors
+      vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1];  // cov_xy
+      vMat(1, 1) = hits_ge.col(i)[2];               // y errors
+      rotMat = rotationMatrix(-radii(0, i) / radii(1, i));  // NOTE(review): divides by radii(1, i), no guard for 0
+      weightsVec(i) =
+          1. / ((rotMat * vMat * rotMat.transpose())(1, 1));  // compute the orthogonal weight point by point
+    }
+
+    riemannFit::VectorNplusONEd r_uVec;  // right-hand side of the linear system
+    r_uVec(n) = 0;  // the extra (n-th) entry belongs to the curvature correction
+    for (u_int i = 0; i < n; i++) {
+      r_uVec(i) = weightsVec(i) * zInSZplane(i);
+    }
+
+    riemannFit::MatrixNplusONEd c_uMat;
+    c_uMat.block(0, 0, n, n) = matrixC_u(weightsVec, sTransverse, varBeta);  // main n-by-n block
+    c_uMat(n, n) = 0;
+    //add the border to the c_uMat matrix
+    for (u_int i = 0; i < n; i++) {
+      c_uMat(i, n) = 0;
+      if (i > 0 && i < n - 1) {
+        c_uMat(i, n) +=
+            -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+            (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1)));
+      }
+      if (i > 1) {
+        c_uMat(i, n) +=
+            (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1)));
+      }
+      if (i < n - 2) {
+        c_uMat(i, n) +=
+            (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i)));
+      }
+      c_uMat(n, i) = c_uMat(i, n);  // keep the bordered matrix symmetric
+      if (i > 0 && i < n - 1)
+        c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i));
+    }
+
+#ifdef CPP_DUMP
+    std::cout << "CU5\n" << c_uMat << std::endl;
+#endif
+    riemannFit::MatrixNplusONEd iMat;  // inverse of c_uMat; its entries are reused below as the covariance of the fitted parameters
+    math::cholesky::invert(c_uMat, iMat);
+#ifdef CPP_DUMP
+    std::cout << "I5\n" << iMat << std::endl;
+#endif
+
+    riemannFit::VectorNplusONEd uVec = iMat * r_uVec;  // obtain the fitted parameters by solving the linear system
+
+    // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin...
+
+    radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm();  // unit vector for hit 0 (modifies data.radii)
+    radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm();  // unit vector for hit 1 (modifies data.radii)
+
+    riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1);  // corrected hit 0
+    riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1);  // corrected hit 1
+
+    circle_results.par << atan2((eVec - dVec)(1), (eVec - dVec)(0)),
+        -circle_results.qCharge *
+            (fast_fit(2) - sqrt(riemannFit::sqr(fast_fit(2)) - 0.25 * (eVec - dVec).squaredNorm())),
+        circle_results.qCharge * (1. / fast_fit(2) + uVec(n));  // k = q * (1/R + fitted curvature correction)
+
+    assert(circle_results.qCharge * circle_results.par(1) <= 0);
+
+    riemannFit::Vector2d eMinusd = eVec - dVec;
+    double tmp1 = eMinusd.squaredNorm();
+    double tmp2 = sqrt(riemannFit::sqr(2 * fast_fit(2)) - tmp1);
+
+    riemannFit::Matrix3d jacobian;  // maps (u_0, u_1, Delta kappa) errors onto (phi, d, k) errors
+    jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) / tmp1,
+        (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) / tmp1, 0,
+        (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) / tmp2,  // NOTE(review): qCharge / 2 truncates if qCharge is an integral type - confirm
+        (circle_results.qCharge / 2) * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) / tmp2, 0, 0, 0,
+        circle_results.qCharge;
+
+    circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0),
+        iMat(n, 1), iMat(n, n);  // sub-block of iMat for (u_0, u_1, Delta kappa)
+
+    circle_results.cov = jacobian * circle_results.cov * jacobian.transpose();
+
+    //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction...
+
+    auto eMinusDVec = eVec - dVec;
+    translateKarimaki(circle_results, 0.5 * eMinusDVec(0), 0.5 * eMinusDVec(1), jacobian);
+    circle_results.cov(0, 0) +=
+        (1 + riemannFit::sqr(slope)) * multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope);
+
+    //...And translate back to the original system
+
+    translateKarimaki(circle_results, dVec(0), dVec(1), jacobian);
+
+    // compute chi2
+    circle_results.chi2 = 0;
+    for (u_int i = 0; i < n; i++) {
+      circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i));  // measurement term
+      if (i > 0 && i < n - 1)  // kink term, interior points only
+        circle_results.chi2 +=
+            riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) -
+                            uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+                                ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) +
+                            uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) +
+                            (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) /
+            varBeta(i);
+    }
+
+    // assert(circle_results.chi2>=0);
+  }
+
+  /*!
+    \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u).
+
+    \param hits_ge hits covariance, one column per hit; the six entries are read as xx, xy, yy, xz, yz, zz.
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param data PreparedBrokenLineData.
+    \param line_results struct to be filled with the results in this form:
+    -par parameter of the line in this form: (cot(theta), Zip); \n
+    -cov covariance matrix of the fitted parameter; \n
+    -chi2 value of the cost function in the minimum.
+
+    \details The function implements the steps 2 and 3 of the Broken Line fit without
+    * the curvature correction.\n
+    * The step 2 is the least square fit, done by imposing the minimum constraint
+    * on the cost function and solving the consequent linear system. It determines
+    * the fitted parameters u and their covariance matrix.
+    * The step 3 is the correction of the fast pre-fitted parameters for the innermost
+    * part of the track.
It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their covariance + * matrix are transformed to the original coordinate system. + */ + template + __host__ __device__ inline void lineFit(const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + const PreparedBrokenLineData& data, + riemannFit::LineFit& line_results) { + const auto& radii = data.radii; + const auto& sTotal = data.sTotal; + const auto& zInSZplane = data.zInSZplane; + const auto& varBeta = data.varBeta; + + const double slope = -data.qCharge / fast_fit(3); + riemannFit::Matrix2d rotMat = rotationMatrix(slope); + + riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ + riemannFit::Matrix2x3d jacobXYZtosZ = + riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz + vMat(2, 2) = hits_ge.col(i)[5]; // z errors + auto tmp = 1. / radii.block(0, i, 2, 1).norm(); + jacobXYZtosZ(0, 0) = radii(1, i) * tmp; + jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + jacobXYZtosZ(1, 2) = 1.; + weights(i) = 1. 
/ ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( + 1, 1)); // compute the orthogonal weight point by point + } + + riemannFit::VectorNd r_u; + for (u_int i = 0; i < n; i++) { + r_u(i) = weights(i) * zInSZplane(i); + } +#ifdef CPP_DUMP + std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; +#endif + riemannFit::MatrixNd iMat; + math::cholesky::invert(matrixC_u(weights, sTotal, varBeta), iMat); +#ifdef CPP_DUMP + std::cout << "I4\n" << iMat << std::endl; +#endif + + riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system + + // line parameters in the system in which the first hit is the origin and with axis along SZ + line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); + auto idiff = 1. / (sTotal(1) - sTotal(0)); + line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + + multScatt(sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), + (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); + + // translate to the original SZ system + riemannFit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -sTotal(0); + jacobian(1, 1) = 1.; + line_results.par(1) += -line_results.par(0) * sTotal(0); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // rotate to the original sz system + auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); + jacobian(1, 1) = 1. 
/ tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // compute chi2 + line_results.chi2 = 0; + for (u_int i = 0; i < n; i++) { + line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); + if (i > 0 && i < n - 1) + line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - + uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + + uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / + varBeta(i); + } + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n + -line fit of the hits projected on the (pre-fitted) cilinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n + Points must be passed ordered (from inner to outer layer). + + \param hits Matrix3xNd hits coordinates in this form: \n + |x1|x2|x3|...|xn| \n + |y1|y2|y3|...|yn| \n + |z1|z2|z3|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n + |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n + |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n + |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n + . . . . . . . . . . . . . . . 
\n
+    |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,x1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n
+    |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,x2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n
+    |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,x3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n
+    |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,x4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n
+    . . . . . . . . . . . . . . . \n
+    |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n
+    |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n
+    |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n
+    |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)|
+    \param bField magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation.
+
+    \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings.
+
+    \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs.
+
+    \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits.
+  */
+  template
+  inline riemannFit::HelixFit helixFit(const riemannFit::Matrix3xNd& hits,
+                                       const Eigen::Matrix& hits_ge,
+                                       const double bField) {
+    riemannFit::HelixFit helix;  // value returned to the caller
+    riemannFit::Vector4d fast_fit;  // filled by fastFit with (X0,Y0,R,tan(theta))
+    fastFit(hits, fast_fit);
+
+    PreparedBrokenLineData data;  // shared input of the two fits, filled by prepareBrokenLineData
+    karimaki_circle_fit circle;
+    riemannFit::LineFit line;
+    riemannFit::Matrix3d jacobian;
+
+    prepareBrokenLineData(hits, fast_fit, bField, data);
+    lineFit(hits_ge, fast_fit, bField, data, line);  // takes `data` by const reference
+    circleFit(hits, hits_ge, fast_fit, bField, data, circle);  // takes `data` by non-const reference and modifies it, so it runs after lineFit
+
+    // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix
+    jacobian << 1., 0, 0, 0, 1., 0, 0, 0,
+        -std::abs(circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2));  // d(bField/|k|)/dk
+    circle.par(2) = bField / std::abs(circle.par(2));  // p_t = bField / |k|
+    circle.cov = jacobian * circle.cov * jacobian.transpose();
+
+    helix.par << circle.par, line.par;  // three circle parameters followed by two line parameters
+    helix.cov = riemannFit::MatrixXd::Zero(5, 5);  // circle/line cross terms are left at zero
+    helix.cov.block(0, 0, 3, 3) = circle.cov;
+    helix.cov.block(3, 3, 2, 2) = line.cov;
+    helix.qCharge = circle.qCharge;
+    helix.chi2_circle = circle.chi2;
+    helix.chi2_line = line.chi2;
+
+    return helix;
+  }
+
+}  // namespace brokenline
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h
new file mode 100644
index 0000000000000..01497719d2998
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h
@@ -0,0 +1,65 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace riemannFit {
+
+  using Vector2d = Eigen::Vector2d;
+  using Vector3d = Eigen::Vector3d;
+  using Vector4d = Eigen::Vector4d;
+  using Vector5d = Eigen::Matrix;
+  using Matrix2d = Eigen::Matrix2d;
+  using Matrix3d = Eigen::Matrix3d;
+  
using Matrix4d = Eigen::Matrix4d;
+  using Matrix5d = Eigen::Matrix;
+  using Matrix6d = Eigen::Matrix;
+
+  template
+  using Matrix3xNd = Eigen::Matrix;  // used for inputs hits
+
+  struct CircleFit {
+    Vector3d par;  //!< parameter: (X0,Y0,R)
+    Matrix3d cov;
+    /*!< covariance matrix: \n
+      |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n
+      |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n
+      |cov(X0, R)|cov(Y0, R)|cov( R, R)|
+    */
+    int32_t qCharge;  //!< particle charge
+    float chi2;  // NOTE(review): float here, while LineFit::chi2 is double
+  };
+
+  struct LineFit {
+    Vector2d par;  //!<(cotan(theta),Zip)
+    Matrix2d cov;
+    /*!<
+      |cov(c_t,c_t)|cov(Zip,c_t)| \n
+      |cov(c_t,Zip)|cov(Zip,Zip)|
+    */
+    double chi2;
+  };
+
+  struct HelixFit {
+    Vector5d par;  //!<(phi,Tip,pt,cotan(theta),Zip)
+    Matrix5d cov;
+    /*!< ()->cov() \n
+      |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n
+      |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n
+      |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n
+      |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n
+      |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)|
+    */
+    float chi2_circle;  //!< chi2 of the circle fit
+    float chi2_line;  //!< chi2 of the line fit
+    // Vector4d fast_fit;
+    int32_t qCharge;  //!< particle charge
+  };                  // __attribute__((aligned(16)));
+
+}  // namespace riemannFit
+#endif
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
new file mode 100644
index 0000000000000..2fe74f53a7bd2
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
@@ -0,0 +1,243 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
+
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
+
+namespace riemannFit {
+
+  constexpr double epsilon = 1.e-4;  //!< used in numerical derivative (J2 in Circle_fit())
+
+  using VectorXd = Eigen::VectorXd;
+  using MatrixXd = Eigen::MatrixXd;
+  template
+  using MatrixNd = Eigen::Matrix;
+  template
+  using MatrixNplusONEd = Eigen::Matrix;
+  template
+  using ArrayNd = Eigen::Array;
+  template
+  using Matrix2Nd = Eigen::Matrix;
+  template
+  using Matrix3Nd = Eigen::Matrix;
+  template
+  using Matrix2xNd = Eigen::Matrix;
+  template
+  using Array2xNd = Eigen::Array;
+  template
+  using MatrixNx3d = Eigen::Matrix;
+  template
+  using MatrixNx5d = Eigen::Matrix;
+  template
+  using VectorNd = Eigen::Matrix;
+  template
+  using VectorNplusONEd = Eigen::Matrix;
+  template
+  using Vector2Nd = Eigen::Matrix;
+  template
+  using Vector3Nd = Eigen::Matrix;
+  template
+  using RowVectorNd = Eigen::Matrix;
+  template
+  using RowVector2Nd = Eigen::Matrix;
+
+  using Matrix2x3d = Eigen::Matrix;
+
+  using Matrix3f = Eigen::Matrix3f;
+  using Vector3f = Eigen::Vector3f;
+  using Vector4f = Eigen::Vector4f;
+  using Vector6f = Eigen::Matrix;
+  // Debug helper: prints every element of *m, one per line, prefixed by `prefix`.
+  template
+  __host__ __device__ void printIt(C* m, const char* prefix = "") {
+#ifdef RFIT_DEBUG  // without RFIT_DEBUG the body is empty and the call does nothing
+    for (uint r = 0; r < m->rows(); ++r) {
+      for (uint c = 0; c < m->cols(); ++c) {
+        printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c));  // NOTE(review): r and c are unsigned but printed with %d
+      }
+    }
+#endif
+  }
+
+  /*!
+    \brief raise to square: returns a * a.
+  */
+  template
+  constexpr T sqr(const T a) {
+    return a * a;
+  }
+
+  /*!
+    \brief Compute cross product of two 2D vectors (assuming z component 0),
+    returning z component of the result.
+    \param a first 2D vector in the product.
+    \param b second 2D vector in the product.
+    \return z component of the cross product.
+  */
+
+  __host__ __device__ inline double cross2D(const Vector2d& a, const Vector2d& b) {
+    return a.x() * b.y() - a.y() * b.x();
+  }
+
+  /*!
+   * load error in CMSSW format to our formalism
+   *
+   */
+  template
+  __host__ __device__ void loadCovariance2D(M6xNf const& ge, M2Nd& hits_cov) {
+    // Index numerology:
+    // i: index of the hits/point (0,..,3)
+    // j: index of space component (x,y,z)
+    // l: index of space components (x,y,z)
+    // ge is always in sync with the index i and is formatted as:
+    // ge[] ==> [xx, xy, yy, xz, yz, zz]
+    // in (j,l) notation, we have:
+    // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)]
+    // so the index ge_idx corresponds to the matrix elements:
+    // | 0  1  3 |
+    // | 1  2  4 |
+    // | 3  4  5 |
+    constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime;  // one column of ge per hit
+    for (uint32_t i = 0; i < hits_in_fit; ++i) {  // only same-hit (i, i) entries of each block are written
+      {
+        constexpr uint32_t ge_idx = 0, j = 0, l = 0;  // xx
+        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 2, j = 1, l = 1;  // yy
+        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 1, j = 1, l = 0;  // xy, stored symmetrically
+        hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) =
+            ge.col(i)[ge_idx];
+      }
+    }
+  }
+  // Same loading as loadCovariance2D, additionally filling the z entries (zz, xz, yz).
+  template
+  __host__ __device__ void loadCovariance(M6xNf const& ge, M3xNd& hits_cov) {
+    // Index numerology:
+    // i: index of the hits/point (0,..,3)
+    // j: index of space component (x,y,z)
+    // l: index of space components (x,y,z)
+    // ge is always in sync with the index i and is formatted as:
+    // ge[] ==> [xx, xy, yy, xz, yz, zz]
+    // in (j,l) notation, we have:
+    // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)]
+    // so the index ge_idx corresponds to the matrix elements:
+    // | 0  1  3 |
+    // | 1  2  4 |
+    // | 3  4  5 |
+    constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime;  // one column of ge per hit
+    for (uint32_t i = 0; i < hits_in_fit; ++i) {  // only same-hit (i, i) entries of each block are written
+      {
+        constexpr uint32_t ge_idx = 0, j = 0, l = 0;  // xx
+        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 2, j = 1, l = 1;  // yy
+        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 5, j = 2, l = 2;  // zz
+        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 1, j = 1, l = 0;  // xy, stored symmetrically
+        hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) =
+            ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 3, j = 2, l = 0;  // xz, stored symmetrically
+        hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) =
+            ge.col(i)[ge_idx];
+      }
+      {
+        constexpr uint32_t ge_idx = 4, j = 2, l = 1;  // yz, stored symmetrically
+        hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) =
+            ge.col(i)[ge_idx];
+      }
+    }
+  }
+
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and
+    consequently covariance matrix. The struct is updated in place.
+    \param circle parameter (X0,Y0,R), covariance matrix to
+    be transformed and particle charge.
+    \param B magnetic field in GeV/cm/c unit.
+    \param error flag for errors computation: the covariance is transformed only when true.
+  */
+  __host__ __device__ inline void par_uvrtopak(CircleFit& circle, const double B, const bool error) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();  // X0^2 + Y0^2
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+        circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B;  // (phi, Tip, p_t = R * B)
+    if (error) {
+      const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+      const double temp3 = 1. / temp1 * circle.qCharge;
+      Matrix3d j4Mat;  // jacobian of the parameter change; NOTE(review): divides by circle.par(0), X0 == 0 is not guarded
+      j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
+          circle.par(1) * temp3, -circle.qCharge, 0., 0., B;
+      circle.cov = j4Mat * circle.cov * j4Mat.transpose();
+    }
+    circle.par = par_pak;  // parameters are replaced whether or not the covariance was transformed
+  }
+
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
+    consequently covariance matrix.
+    \param circle_uvr parameter (X0,Y0,R), covariance matrix to
+    be transformed and particle charge.
+  */
+  __host__ __device__ inline void fromCircleToPerigee(CircleFit& circle) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();  // X0^2 + Y0^2
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+        circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2);  // (phi, Tip, q/R)
+
+    const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+    const double temp3 = 1. / temp1 * circle.qCharge;
+    Matrix3d j4Mat;  // jacobian of the parameter change; NOTE(review): divides by circle.par(0), X0 == 0 is not guarded
+    j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
+        circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2));
+    circle.cov = j4Mat * circle.cov * j4Mat.transpose();  // unlike par_uvrtopak, the covariance is always transformed
+
+    circle.par = par_pak;
+  }
+
+  // transformation from the "perigee" to the cmssw localcoord frame
+  // the plane of the latter is the perigee plane...
+  // from (phi,Tip,q/pt,cotan(theta),Zip)
+  // to q/p,dx/dz,dy/dz,x,z
+  template
+  __host__ __device__ inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) {
+    auto sinTheta2 = 1. / (1. + ip(3) * ip(3));  // sin^2(theta) = 1 / (1 + cot^2(theta))
+    auto sinTheta = std::sqrt(sinTheta2);
+    auto cosTheta = ip(3) * sinTheta;
+
+    op(0) = sinTheta * ip(2);  // q/p = sin(theta) * q/pt
+    op(1) = 0.;
+    op(2) = -ip(3);
+    op(3) = ip(1);
+    op(4) = -ip(4);
+
+    Matrix5d jMat = Matrix5d::Zero();  // jacobian d(op)/d(ip)
+
+    jMat(0, 2) = sinTheta;
+    jMat(0, 3) = -sinTheta2 * cosTheta * ip(2);  // derivative of sinTheta * ip(2) w.r.t. ip(3)
+    jMat(1, 0) = 1.;
+    jMat(2, 3) = -1.;
+    jMat(3, 1) = 1.;
+    jMat(4, 4) = -1;
+
+    ocov = jMat * icov * jMat.transpose();
+  }
+
+}  // namespace riemannFit
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_FitUtils_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h b/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h
new file mode 100644
index 0000000000000..9fb8843589669
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h
@@ -0,0 +1,27 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
+
+#include
+
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitterBase.h"
+#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
+// PixelFitterBase implementation; the constructor arguments are kept in the private members below.
+class PixelNtupletsFitter final : public PixelFitterBase {
+public:
+  explicit PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit);
+  ~PixelNtupletsFitter() override = default;
+  std::unique_ptr run(const std::vector& hits,
+                      const TrackingRegion& region,
+                      const edm::EventSetup& setup) const override;
+
+private:
+  float nominalB_;
+  const MagneticField* field_;  // raw pointer member; ownership is not visible from this header
+  bool useRiemannFit_;  // presumably selects the Riemann fit over the Broken Line fit - confirm in PixelNtupletsFitter.cc
+};
+
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_PixelNtupletsFitter_h
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h 
b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h
new file mode 100644
index 0000000000000..52cf4b637fb37
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h
@@ -0,0 +1,1008 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+namespace riemannFit {
+
+  /*!  Compute the Radiation length in the uniform hypothesis
+   *
+   * The Pixel detector, barrel and forward, is considered as a homogeneous
+   * cylinder of material, whose radiation length has been derived from the TDR
+   * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore
+   * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation
+   * lengths are computed using this unique number, in both regions, barrel and
+   * endcap.
+   *
+   * NB: no angle corrections nor projections are computed inside this routine.
+   * It is therefore the responsibility of the caller to supply the proper
+   * lengths in input. These lengths are the path traveled by the particle along
+   * its trajectory, namely the so called S of the helix in 3D space.
+   *
+   * \param length_values vector of incremental distances that will be translated
+   * into radiation length equivalent. Each radiation length i is computed
+   * incrementally with respect to the previous length i-1. The first length has
+   * no reference point (i.e. it has the dca).
+   *
+   * \param rad_lengths output: incremental radiation lengths that correspond to each segment (the function returns void).
+   */
+
+  template
+  __host__ __device__ inline void computeRadLenUniformMaterial(const VNd1& length_values, VNd2& rad_lengths) {
+    // Radiation length of the pixel detector in the uniform assumption, with
+    // 0.06 rad_len at 16 cm
+    constexpr double xx_0_inv = 0.06 / 16.;
+    uint n = length_values.rows();
+    rad_lengths(0) = length_values(0) * xx_0_inv;  // first segment: the full first length is used
+    for (uint j = 1; j < n; ++j) {
+      rad_lengths(j) = std::abs(length_values(j) - length_values(j - 1)) * xx_0_inv;  // increment w.r.t. the previous point
+    }
+  }
+
+  /*!
+    \brief Compute the covariance matrix along cartesian S-Z of points due to
+    multiple Coulomb scattering to be used in the line_fit, for the barrel
+    and forward cases.
+    The input covariance matrix is in the variables s-z, original and
+    unrotated.
+    The multiple scattering component is computed in the usual linear
+    approximation, using the 3D path which is computed as the square root of
+    the squared sum of the s and z components passed in.
+    Internally a rotation by theta is performed and the covariance matrix
+    returned is the one in the direction orthogonal to the rotated S3D axis,
+    i.e. along the rotated Z axis.
+    The choice of the rotation is not arbitrary, but derived from the fact that
+    putting the horizontal axis along the S3D direction allows the usage of the
+    ordinary least squares fitting techniques with the trivial parametrization y
+    = mx + q, avoiding the pathological case with m = +/- inf, that would
+    correspond to the case at eta = 0.
+  */
+  // NOTE(review): `theta` is not referenced anywhere in the body below; the result is written to `ret`.
+  template
+  __host__ __device__ inline auto scatterCovLine(Matrix2d const* cov_sz,
+                                                 const V4& fast_fit,
+                                                 VNd1 const& s_arcs,
+                                                 VNd2 const& z_values,
+                                                 const double theta,
+                                                 const double bField,
+                                                 MatrixNd& ret) {
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: ");
+#endif
+    constexpr uint n = N;
+    double p_t = std::min(20., fast_fit(2) * bField);  // limit pt to avoid too small error!!!
+    double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3)));
+    VectorNd rad_lengths_S;
+    // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html
+    // Basically, to perform cwise operations on Matrices and Vectors, you need
+    // to transform them into Array-like objects.
+    VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array();
+    s_values = s_values.array().sqrt();  // 3D path: sqrt(s^2 + z^2) per point
+    computeRadLenUniformMaterial(s_values, rad_lengths_S);
+    VectorNd sig2_S;  // squared scattering term per segment
+    sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array();
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: ");
+#endif
+    Matrix2Nd tmp = Matrix2Nd::Zero();
+    for (uint k = 0; k < n; ++k) {  // load the per-point s-z covariances: s block, z block and the s-z cross terms
+      tmp(k, k) = cov_sz[k](0, 0);
+      tmp(k + n, k + n) = cov_sz[k](1, 1);
+      tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1);
+    }
+    for (uint k = 0; k < n; ++k) {
+      for (uint l = k; l < n; ++l) {
+        for (uint i = 0; i < std::min(k, l); ++i) {  // sum over the points preceding both k and l
+          tmp(k + n, l + n) += std::abs(s_values(k) - s_values(i)) * std::abs(s_values(l) - s_values(i)) * sig2_S(i);
+        }
+        tmp(l + n, k + n) = tmp(k + n, l + n);  // keep the block symmetric
+      }
+    }
+    // We are interested only in the errors orthogonal to the rotated s-axis
+    // which, in our formalism, are in the lower square matrix.
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: ");
+#endif
+    ret = tmp.block(n, n, n, n);
+  }
+
+  /*!
+    \brief Compute the covariance matrix (in radial coordinates) of points in
+    the transverse plane due to multiple Coulomb scattering.
+    \param p2D 2D points in the transverse plane.
+    \param fast_fit fast_fit Vector4d result of the previous pre-fit
+    structured in this form:(X0, Y0, R, Tan(Theta))).
+    \param rad radii of the points, indexed like the columns of p2D.
+    \param B magnetic field used to compute p
+    \return scatter_cov_rad errors due to multiple scattering.
+    \warning input points must be ordered radially from the detector center
+    (from inner layer to outer ones; points on the same layer must be ordered too).
+    \details Only the tangential component is computed (the radial one is
+    negligible).
+ */
+  template
+  __host__ __device__ inline MatrixNd scatter_cov_rad(const M2xN& p2D,
+                                                      const V4& fast_fit,
+                                                      VectorNd const& rad,
+                                                      double B) {
+    constexpr uint n = N;
+    double p_t = std::min(20., fast_fit(2) * B);  // limit pt to avoid too small error!!!
+    double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3)));
+    double theta = atan(fast_fit(3));
+    theta = theta < 0. ? theta + M_PI : theta;  // map theta into [0, pi)
+    VectorNd s_values;
+    VectorNd rad_lengths;
+    const Vector2d oVec(fast_fit(0), fast_fit(1));  // centre of the pre-fitted circle
+
+    // associated Jacobian, used in weights and errors computation
+    for (uint i = 0; i < n; ++i) {  // x
+      Vector2d pVec = p2D.block(0, i, 2, 1) - oVec;
+      const double cross = cross2D(-oVec, pVec);
+      const double dot = (-oVec).dot(pVec);
+      const double tempAtan2 = atan2(cross, dot);  // angle between -oVec and pVec
+      s_values(i) = std::abs(tempAtan2 * fast_fit(2));  // arc length = |angle| * R
+    }
+    computeRadLenUniformMaterial(s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths);
+    MatrixNd scatter_cov_rad = MatrixNd::Zero();
+    VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array();
+    sig2 *= 0.000225 / (p_2 * sqr(sin(theta)));
+    for (uint k = 0; k < n; ++k) {
+      for (uint l = k; l < n; ++l) {
+        for (uint i = 0; i < std::min(k, l); ++i) {  // sum over the points preceding both k and l
+          scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i);
+        }
+        scatter_cov_rad(l, k) = scatter_cov_rad(k, l);  // keep the matrix symmetric
+      }
+    }
+#ifdef RFIT_DEBUG
+    riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: ");
+#endif
+    return scatter_cov_rad;
+  }
+
+  /*!
+    \brief Transform covariance matrix from radial (only tangential component)
+    to Cartesian coordinates (only transverse plane component).
+    \param p2D 2D points in the transverse plane.
+    \param cov_rad covariance matrix in radial coordinate.
+    \param rad radii of the points; their element-wise inverse is used, so no entry may be zero.
+    \return cov_cart covariance matrix in Cartesian coordinates.
+*/
+
+  template <typename M2xN, int N>
+  __host__ __device__ inline Matrix2Nd<N> cov_radtocart(const M2xN& p2D,
+                                                        const MatrixNd<N>& cov_rad,
+                                                        const VectorNd<N>& rad) {
+#ifdef RFIT_DEBUG
+    printf("Address of p2D: %p\n", &p2D);
+#endif
+    printIt(&p2D, "cov_radtocart - p2D:");
+    constexpr uint n = N;
+    Matrix2Nd<N> cov_cart = Matrix2Nd<N>::Zero();  // layout: [xx | xy ; yx | yy], each block n x n
+    VectorNd<N> rad_inv = rad.cwiseInverse();  // 1/r per hit; no guard here against rad(i) == 0
+    printIt(&rad_inv, "cov_radtocart - rad_inv:");
+    for (uint i = 0; i < n; ++i) {
+      for (uint j = i; j < n; ++j) {  // fill the j >= i half, then mirror it
+        cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j);
+        cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j);
+        cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j);
+        cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j);
+        cov_cart(j, i) = cov_cart(i, j);
+        cov_cart(j + n, i + n) = cov_cart(i + n, j + n);
+        cov_cart(j + n, i) = cov_cart(i, j + n);
+        cov_cart(j, i + n) = cov_cart(i + n, j);
+      }
+    }
+    return cov_cart;
+  }
+
+  /*!
+    \brief Transform covariance matrix from Cartesian coordinates (only
+    transverse plane component) to radial coordinates (both radial and
+    tangential component but only diagonal terms, correlation between different
+    point are not managed).
+    \param p2D 2D points in transverse plane.
+    \param cov_cart covariance matrix in Cartesian coordinates.
+    \return cov_rad covariance matrix in radial coordinate.
+    \warning correlation between different point are not computed.
+*/ + template + __host__ __device__ inline VectorNd cov_carttorad(const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); + else { + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } + } + return cov_rad; + } + + /*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to coordinates system orthogonal to the + pre-fitted circle in each point. + Further information in attached documentation. + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, tan(theta))). + \return cov_rad covariance matrix in the pre-fitted circle's + orthogonal system. +*/ + template + __host__ __device__ inline VectorNd cov_carttorad_prefit(const M2xN& p2D, + const Matrix2Nd& cov_cart, + V4& fast_fit, + const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i) = + 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } + } + return cov_rad; + } + + /*! + \brief Compute the points' weights' vector for the circle fit when multiple + scattering is managed. 
+ Further information in attached documentation. + \param cov_rad_inv covariance matrix inverse in radial coordinated + (or, beter, pre-fitted circle's orthogonal system). + \return weight VectorNd points' weights' vector. + \bug I'm not sure this is the right way to compute the weights for non + diagonal cov matrix. Further investigation needed. +*/ + + template + __host__ __device__ inline VectorNd weightCircle(const MatrixNd& cov_rad_inv) { + return cov_rad_inv.colwise().sum().transpose(); + } + + /*! + \brief Find particle q considering the sign of cross product between + particles velocity (estimated by the first 2 hits) and the vector radius + between the first hit and the center of the fitted circle. + \param p2D 2D points in transverse plane. + \param par_uvr result of the circle fit in this form: (X0,Y0,R). + \return q int 1 or -1. +*/ + template + __host__ __device__ inline int32_t charge(const M2xN& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; + } + + /*! + \brief Compute the eigenvector associated to the minimum eigenvalue. + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored. + \return the eigenvector associated to the minimum eigenvalue. + \warning double precision is needed for a correct assessment of chi2. + \details The minimus eigenvalue is related to chi2. + We exploit the fact that the matrix is symmetrical and small (2x2 for line + fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from Eigen + library is used, with the computedDirect method (available only for 2x2 + and 3x3 Matrix) wich computes eigendecomposition of given matrix using a + fast closed-form algorithm. + For this optimization the matrix type must be known at compiling time. 
+*/ + + __host__ __device__ inline Vector3d min_eigen3D(const Matrix3d& A, double& chi2) { +#ifdef RFIT_DEBUG + printf("min_eigen3D - enter\n"); +#endif + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); +#ifdef RFIT_DEBUG + printf("min_eigen3D - exit\n"); +#endif + return solver.eigenvectors().col(min_index); + } + + /*! + \brief A faster version of min_eigen3D() where double precision is not + needed. + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored + \return the eigenvector associated to the minimum eigenvalue. + \detail The computedDirect() method of SelfAdjointEigenSolver for 3x3 Matrix + indeed, use trigonometry function (it solves a third degree equation) which + speed up in single precision. +*/ + + __host__ __device__ inline Vector3d min_eigen3D_fast(const Matrix3d& A) { + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); + } + + /*! + \brief 2D version of min_eigen3D(). + \param aMat the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double were the chi2-related quantity will be stored + \return the eigenvector associated to the minimum eigenvalue. + \detail The computedDirect() method of SelfAdjointEigenSolver for 2x2 Matrix + do not use special math function (just sqrt) therefore it doesn't speed up + significantly in single precision. +*/ + + __host__ __device__ inline Vector2d min_eigen2D(const Matrix2d& aMat, double& chi2) { + Eigen::SelfAdjointEigenSolver solver(2); + solver.computeDirect(aMat); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); + } + + /*! 
+ \brief A very fast helix fit: it fits a circle by three points (first, middle + and last point) and a line by two points (first and last). + \param hits points to be fitted + \return result in this form: (X0,Y0,R,tan(theta)). + \warning points must be passed ordered (from internal layer to external) in + order to maximize accuracy and do not mistake tan(theta) sign. + \details This fast fit is used as pre-fit which is needed for: + - weights estimation and chi2 computation in line fit (fundamental); + - weights estimation and chi2 computation in circle fit (useful); + - computation of error due to multiple scattering. +*/ + + template + __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + printIt(&hits, "Fast_fit - hits: "); + + // CIRCLE FIT + // Make segments between middle-to-first(b) and last-to-first(c) hits + const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&bVec, "Fast_fit - b: "); + printIt(&cVec, "Fast_fit - c: "); + // Compute their lengths + auto b2 = bVec.squaredNorm(); + auto c2 = cVec.squaredNorm(); + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing from the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + bool flip = abs(bVec.x()) < abs(bVec.y()); + auto bx = flip ? bVec.y() : bVec.x(); + auto by = flip ? bVec.x() : bVec.y(); + auto cx = flip ? cVec.y() : cVec.x(); + auto cy = flip ? cVec.x() : cVec.y(); + //!< in case b.x is 0 (2 hits with same x) + auto div = 2. * (cx * by - bx * cy); + // if aligned TO FIX + auto y0 = (cx * b2 - bx * c2) / div; + auto x0 = (0.5 * b2 - y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? 
y0 : x0); + result(1) = hits(1, 0) + (flip ? x0 : y0); + result(2) = sqrt(sqr(x0) + sqr(y0)); + printIt(&result, "Fast_fit - result: "); + + // LINE FIT + const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&eVec, "Fast_fit - e: "); + printIt(&dVec, "Fast_fit - d: "); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + auto dr = result(2) * atan2(cross2D(dVec, eVec), dVec.dot(eVec)); + // Simple difference in Z between last and first hit + auto dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); + +#ifdef RFIT_DEBUG + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); +#endif + } + + /*! + \brief Fit a generic number of 2D points with a circle using Riemann-Chernov + algorithm. Covariance matrix of fitted parameter is optionally computed. + Multiple scattering (currently only in barrel layer) is optionally handled. + \param hits2D 2D points to be fitted. + \param hits_cov2D covariance matrix of 2D points. + \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). + (tan(theta) is not used). + \param bField magnetic field + \param error flag for error computation. + \param scattering flag for multiple scattering + \return circle circle_fit: + -par parameter of the fitted circle in this form (X0,Y0,R); \n + -cov covariance matrix of the fitted parameter (not initialized if + error = false); \n + -q charge of the particle; \n + -chi2. + \warning hits must be passed ordered from inner to outer layer (double hits + on the same layer must be ordered too) so that multiple scattering is + treated properly. + \warning Multiple scattering for barrel is still not tested. + \warning Multiple scattering for endcap hits is not handled (yet). Do not + fit endcap hits with scattering = true ! + \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated. 
+ \bug further investigation needed for error propagation with multiple + scattering. +*/ + template + __host__ __device__ inline CircleFit circleFit(const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double bField, + const bool error) { +#ifdef RFIT_DEBUG + printf("circle_fit - enter\n"); +#endif + // INITIALIZATION + Matrix2Nd vMat = hits_cov2D; + constexpr uint n = N; + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - WEIGHT COMPUTATION\n"); +#endif + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd gMat; + double renorm; + { + MatrixNd cov_rad = cov_carttorad_prefit(hits2D, vMat, fast_fit, rad).asDiagonal(); + MatrixNd scatterCovRadMat = scatter_cov_rad(hits2D, fast_fit, rad, bField); + printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); +#ifdef RFIT_DEBUG + printf("Address of hits2D: a) %p\n", &hits2D); +#endif + vMat += cov_radtocart(hits2D, scatterCovRadMat, rad); + printIt(&vMat, "circle_fit - V:"); + cov_rad += scatterCovRadMat; + printIt(&cov_rad, "circle_fit - cov_rad:"); + math::cholesky::invert(cov_rad, gMat); + // gMat = cov_rad.inverse(); + renorm = gMat.sum(); + gMat *= 1. 
/ renorm; + weight = weightCircle(gMat); + } + printIt(&weight, "circle_fit - weight:"); + + // SPACE TRANSFORMATION +#ifdef RFIT_DEBUG + printf("circle_fit - SPACE TRANSFORMATION\n"); +#endif + + // center +#ifdef RFIT_DEBUG + printf("Address of hits2D: b) %p\n", &hits2D); +#endif + const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid + printIt(&hCentroid, "circle_fit - h_:"); + Matrix3xNd p3D; + p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; + printIt(&p3D, "circle_fit - p3D: a)"); + Vector2Nd mc; // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):"); + + // scale + const double tempQ = mc.squaredNorm(); + const double tempS = sqrt(n * 1. / tempQ); // scaling factor + p3D *= tempS; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)"); + +#ifdef RFIT_DEBUG + printf("circle_fit - COST FUNCTION\n"); +#endif + // COST FUNCTION + + // compute + Vector3d r0; + r0.noalias() = p3D * weight; // center of gravity + const Matrix3xNd xMat = p3D.colwise() - r0; + Matrix3d aMat = xMat * gMat * xMat.transpose(); + printIt(&aMat, "circle_fit - A:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - MINIMIZE\n"); +#endif + // minimize + double chi2; + Vector3d vVec = min_eigen3D(aMat, chi2); +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN\n"); +#endif + printIt(&vVec, "v BEFORE INVERSION"); + vVec *= (vVec(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&vVec, "v AFTER INVERSION"); + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. 
+#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 1\n"); +#endif + Eigen::Matrix cm; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 2\n"); +#endif + cm = -vVec.transpose() * r0; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 3\n"); +#endif + const double tempC = cm(0, 0); + +#ifdef RFIT_DEBUG + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); +#endif + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2)); + const double v2x2_inv = 1. / (2. * vVec(2)); + const double s_inv = 1. / tempS; + Vector3d par_uvr; // used in error propagation + par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; + + CircleFit circle; + circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; + circle.qCharge = charge(hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); +#ifdef RFIT_DEBUG + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); +#endif + +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PROPAGATION\n"); +#endif + // ERROR PROPAGATION + if (error) { +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); +#endif + ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points + MatrixNd cMat[3][3]; // cov matrix of 3D transformed points +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); +#endif + { + Eigen::Matrix cm; + Eigen::Matrix cm2; + cm = mc.transpose() * vMat * mc; + const double tempC2 = cm(0, 0); + Matrix2Nd tempVcsMat; + tempVcsMat.template triangularView() = + (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * + (2. * vMat.squaredNorm() + 4. 
* tempC2) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&tempVcsMat, "circle_fit - Vcs:"); + cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); + vcsMat[0][1] = tempVcsMat.block(0, n, n, n); + cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); + vcsMat[1][0] = vcsMat[0][1].transpose(); + printIt(&tempVcsMat, "circle_fit - Vcs:"); + } + + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + vcsMat[0][0] = cMat[0][0]; + cMat[0][1] = vcsMat[0][1]; + cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); + vcsMat[1][1] = cMat[1][1]; + cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); + MatrixNd tmp; + tmp.template triangularView() = + (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + + vcsMat[1][1] * vcsMat[1][1]) + + 4. 
* (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) + .matrix(); + cMat[2][2] = tmp.template selfadjointView(); + } + printIt(&cMat[0][0], "circle_fit - C[0][0]:"); + + Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (uint i = 0; i < 3; ++i) { + for (uint j = i; j < 3; ++j) { + Eigen::Matrix tmp; + tmp = weight.transpose() * cMat[i][j] * weight; + // Workaround to get things working in GPU + const double tempC = tmp(0, 0); + c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; + c0Mat(j, i) = c0Mat(i, j); + } + } + printIt(&c0Mat, "circle_fit - C0:"); + + const MatrixNd wMat = weight * weight.transpose(); + const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = hMat * p3D.transpose(); + printIt(&wMat, "circle_fit - W:"); + printIt(&hMat, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd dMat[3][3]; // cov(s_v) + dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][0] = dMat[0][1].transpose(); + dMat[2][0] = dMat[0][2].transpose(); + dMat[2][1] = dMat[1][2].transpose(); + printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); + + constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d eMat; // cov matrix of the 6 independent elements of A + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + for (uint b = a; b < 6; ++b) { + const uint k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * dMat[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. 
* dMat[i][l] * s_v.col(l); + } else { + t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); + } + + if (i == j) { + Eigen::Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; + } else { + Eigen::Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } + if (b != a) + eMat(b, a) = eMat(a, b); + } + } + printIt(&eMat, "circle_fit - E:"); + + Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + Matrix3d delta = Matrix3d::Zero(); + delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); + j2Mat.col(a) = min_eigen3D_fast(aMat + delta); + const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; + j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); + } + printIt(&j2Mat, "circle_fit - J2:"); + + Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); + Vector3d t1 = -t0 * r0; + cvcMat.block(0, 0, 3, 3) = t0; + cvcMat.block(0, 3, 3, 1) = t1; + cvcMat.block(3, 0, 1, 3) = t1.transpose(); + Eigen::Matrix cm1; + Eigen::Matrix cm3; + cm1 = (vVec.transpose() * c0Mat * vVec); + // cm2 = (c0Mat.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + // Workaround to get things working in GPU + const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); + cvcMat(3, 3) = tempC; + // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&cvcMat, "circle_fit - Cvc:"); + + Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. 
/ tempH; + j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, + vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, + -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; + } + printIt(&j3Mat, "circle_fit - J3:"); + + const RowVector2Nd Jq = mc.transpose() * tempS * 1. / n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); + + Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); + + circle.cov = cov_uvr; + } + + printIt(&circle.cov, "Circle cov:"); +#ifdef RFIT_DEBUG + printf("circle_fit - exit\n"); +#endif + return circle; + } + + /*! \brief Perform an ordinary least square fit in the s-z plane to compute + * the parameters cotTheta and Zip. + * + * The fit is performed in the rotated S3D-Z' plane, following the formalism of + * Frodesen, Chapter 10, p. 259. + * + * The system has been rotated to both try to use the combined errors in s-z + * along Z', as errors in the Y direction and to avoid the patological case of + * degenerate lines with angular coefficient m = +/- inf. + * + * The rotation is using the information on the theta angle computed in the + * fast fit. The rotation is such that the S3D axis will be the X-direction, + * while the rotated Z-axis will be the Y-direction. This pretty much follows + * what is done in the same fit in the Broken Line approach. + */ + + template + __host__ __device__ inline LineFit lineFit(const M3xN& hits, + const M6xN& hits_ge, + const CircleFit& circle, + const V4& fast_fit, + const double bField, + const bool error) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; + double theta = -circle.qCharge * atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // PROJECTION ON THE CILINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D = Matrix2xNd::Zero(); + Eigen::Matrix jxMat; + +#ifdef RFIT_DEBUG + printf("Line_fit - B: %g\n", bField); + printIt(&hits, "Line_fit points: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d oVec(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix6d covMat = Matrix6d::Zero(); + Matrix2d cov_sz[N]; + for (uint i = 0; i < n; ++i) { + Vector2d pVec = hits.block(0, i, 2, 1) - oVec; + const double cross = cross2D(-oVec, pVec); + const double dot = (-oVec).dot(pVec); + // atan2(cross, dot) give back the angle in the transverse plane so tha the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); + // p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = tempQAtan2 * circle.par(2); + + // associated Jacobian, used in weights and errors- computation + const double temp0 = -circle.qCharge * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); + d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); + d_R = tempQAtan2; + } + const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); + const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); + jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + covMat.block(0, 0, 3, 3) = circle.cov; + covMat(3, 3) = hits_ge.col(i)[0]; // x errors + covMat(4, 4) = hits_ge.col(i)[2]; // y errors + covMat(5, 5) = hits_ge.col(i)[5]; // z errors + covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy + covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz + covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = jxMat * covMat * jxMat.transpose(); + cov_sz[i].noalias() = rot * tmp * rot.transpose(); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); + + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!! 
+ MatrixNd cov_with_ms; + scatterCovLine(cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); +#ifdef RFIT_DEBUG + printIt(cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot * p2D; + +#ifdef RFIT_DEBUG + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); +#endif + + // Build the A Matrix + Matrix2xNd aMat; + aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + +#ifdef RFIT_DEBUG + printIt(&aMat, "A Matrix:"); +#endif + + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd vyInvMat; + math::cholesky::invert(cov_with_ms, vyInvMat); + // MatrixNd vyInvMat = cov_with_ms.inverse(); + Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); + // Compute the Covariance Matrix of the fit parameters + math::cholesky::invert(covParamsMat, covParamsMat); + + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); + +#ifdef RFIT_DEBUG + printIt(&sol, "Rotated solutions:"); +#endif + + // We need now to transfer back the results in the original s-z plane + const auto sinTheta = sin(theta); + const auto cosTheta = cos(theta); + auto common_factor = 1. 
/ (sinTheta - sol(1, 0) * cosTheta); + Eigen::Matrix jMat; + jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; + + double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); + double tempQ = common_factor * sol(0, 0); + auto cov_mq = jMat * covParamsMat * jMat.transpose(); + + VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; + double chi2 = res.transpose() * vyInvMat * res; + + LineFit line; + line.par << tempM, tempQ; + line.cov << cov_mq; + line.chi2 = chi2; + +#ifdef RFIT_DEBUG + printf("Common_factor: %g\n", common_factor); + printIt(&jMat, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&covParamsMat, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); +#endif + + return line; + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of hits projected in the transverse plane by Riemann-Chernov + algorithm (see Circle_fit() for further info); \n + -line fit of hits projected on cylinder surface by orthogonal distance + regression (see Line_fit for further info). \n + Points must be passed ordered (from inner to outer layer). + \param hits Matrix3xNd hits coordinates in this form: \n + |x0|x1|x2|...|xn| \n + |y0|y1|y2|...|yn| \n + |z0|z1|z2|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n + |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n + |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n + . . . . . . . . . . . 
\n + |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,x0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n + |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,x1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n + |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,x2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n + . . . . . . . . . . . \n + |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n + |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n + |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| + \param bField magnetic field in the center of the detector in Gev/cm/c + unit, in order to perform pt calculation. + \param error flag for error computation. + \param scattering flag for multiple scattering treatment. + (see Circle_fit() documentation for further info). + \warning see Circle_fit(), Line_fit() and Fast_fit() warnings. + \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. +*/ + + template + inline HelixFit helixFit(const Matrix3xNd& hits, + const Eigen::Matrix& hits_ge, + const double bField, + const bool error) { + constexpr uint n = N; + VectorNd<4> rad = (hits.block(0, 0, 2, n).colwise().norm()); + + // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
+ Vector4d fast_fit; + fastFit(hits, fast_fit); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + CircleFit circle = circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error); + LineFit line = lineFit(hits, hits_ge, circle, fast_fit, bField, error); + + par_uvrtopak(circle, bField, error); + + HelixFit helix; + helix.par << circle.par, line.par; + if (error) { + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle.cov; + helix.cov.block(3, 3, 2, 2) = line.cov; + } + helix.qCharge = circle.qCharge; + helix.chi2_circle = circle.chi2; + helix.chi2_line = line.chi2; + + return helix; + } + +} // namespace riemannFit + +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index be113d7a5a3dc..ecfbd99b667fc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -1,3 +1,6 @@ + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc new file mode 100644 index 0000000000000..f49d2f01f48c6 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelNtupletsFitterProducer.cc @@ -0,0 +1,44 @@ +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include 
"MagneticField/Engine/interface/MagneticField.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelFitter.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +class PixelNtupletsFitterProducer : public edm::global::EDProducer<> { +public: + explicit PixelNtupletsFitterProducer(const edm::ParameterSet& iConfig) + : useRiemannFit_(iConfig.getParameter("useRiemannFit")), idealMagneticFieldToken_(esConsumes()) { + produces(); + } + ~PixelNtupletsFitterProducer() override {} + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + descriptions.add("pixelNtupletsFitterDefault", desc); + } + +private: + bool useRiemannFit_; + const edm::ESGetToken idealMagneticFieldToken_; + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +}; + +void PixelNtupletsFitterProducer::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + auto const& idealField = iSetup.getData(idealMagneticFieldToken_); + float bField = 1 / PixelRecoUtilities::fieldInInvGev(iSetup); + auto impl = std::make_unique(bField, &idealField, useRiemannFit_); + auto prod = std::make_unique(std::move(impl)); + iEvent.put(std::move(prod)); +} + +DEFINE_FWK_MODULE(PixelNtupletsFitterProducer); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc new file mode 100644 index 0000000000000..2f0965be50eb8 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -0,0 +1,86 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include 
"CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { +public: + explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackDumpCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; +}; + +PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) + : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tokenGPUTrack_ = + consumes>(iConfig.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = + consumes>(iConfig.getParameter("pixelVertexSrc")); + } else { + tokenSoATrack_ = 
consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + } +} + +void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("onGPU", true); + desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA")); + desc.add("pixelVertexSrc", edm::InputTag("pixelVertexCUDA")); + descriptions.add("pixelTrackDumpCUDA", desc); +} + +void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, + edm::Event const& iEvent, + const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const& hTracks = iEvent.get(tokenGPUTrack_); + cms::cuda::ScopedContextProduce ctx{hTracks}; + + auto const& tracks = ctx.get(hTracks); + auto const* tsoa = tracks.get(); + assert(tsoa); + + auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); + auto const* vsoa = vertices.get(); + assert(vsoa); + + } else { + auto const* tsoa = iEvent.get(tokenSoATrack_).get(); + assert(tsoa); + + auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); + assert(vsoa); + } +} + +DEFINE_FWK_MODULE(PixelTrackDumpCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc index bd390f5f65352..91c3a44cc8643 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.cc @@ -1,23 +1,22 @@ -#include "PixelTrackProducer.h" +#include +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" #include 
"FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" -#include "DataFormats/TrackReco/interface/Track.h" -#include "DataFormats/TrackReco/interface/TrackFwd.h" -#include "DataFormats/TrackReco/interface/TrackExtra.h" -#include "DataFormats/Common/interface/OrphanHandle.h" - -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" -#include +#include "PixelTrackProducer.h" +#include "storeTracks.h" using namespace pixeltrackfitting; using edm::ParameterSet; @@ -45,62 +44,9 @@ void PixelTrackProducer::produce(edm::Event& ev, const edm::EventSetup& es) { TracksWithTTRHs tracks; theReconstruction.run(tracks, ev, es); - edm::ESHandle httopo; es.get().get(httopo); // store tracks - store(ev, tracks, *httopo); -} - -void PixelTrackProducer::store(edm::Event& ev, const TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) { - auto tracks = std::make_unique(); - auto recHits = std::make_unique(); - auto trackExtras = std::make_unique(); - - int cc = 0, nTracks = tracksWithHits.size(); - - for (int i = 0; i < nTracks; i++) { - reco::Track* track = tracksWithHits.at(i).first; - const SeedingHitSet& hits = tracksWithHits.at(i).second; - - for (unsigned int k = 0; k < hits.size(); k++) { - TrackingRecHit* hit = hits[k]->hit()->clone(); - - track->appendHitPattern(*hit, ttopo); - recHits->push_back(hit); - } - tracks->push_back(*track); - delete track; - } - - LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" - << "\n"; - edm::OrphanHandle ohRH = 
ev.put(std::move(recHits)); - - edm::RefProd hitCollProd(ohRH); - for (int k = 0; k < nTracks; k++) { - reco::TrackExtra theTrackExtra{}; - - //fill the TrackExtra with TrackingRecHitRef - unsigned int nHits = tracks->at(k).numberOfValidHits(); - theTrackExtra.setHits(hitCollProd, cc, nHits); - cc += nHits; - AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0); - reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.)); - reco::TrackExtra::Chi2sFive chi2s(nHits, 0); - theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s)); - trackExtras->push_back(theTrackExtra); - } - - LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" - << "\n"; - edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); - - for (int k = 0; k < nTracks; k++) { - const reco::TrackExtraRef theTrackExtraRef(ohTE, k); - (tracks->at(k)).setExtra(theTrackExtraRef); - } - - ev.put(std::move(tracks)); + storeTracks(ev, tracks, *httopo); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h index d756a9cf963f5..c38fd44c0d7f5 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducer.h @@ -1,8 +1,7 @@ -#ifndef PixelTrackProducer_H -#define PixelTrackProducer_H +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h #include "FWCore/Framework/interface/stream/EDProducer.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackReconstruction.h" namespace edm { @@ -24,7 +23,7 @@ class PixelTrackProducer : public edm::stream::EDProducer<> { void produce(edm::Event& ev, const edm::EventSetup& es) override; private: - void store(edm::Event& ev, const pixeltrackfitting::TracksWithTTRHs& 
selectedTracks, const TrackerTopology& ttopo); PixelTrackReconstruction theReconstruction; }; -#endif + +#endif // RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackProducer_h diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc new file mode 100644 index 0000000000000..94c490e948575 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -0,0 +1,205 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include 
"TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" + +#include "storeTracks.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" + +/** + * This class creates "legacy" reco::Track + * objects from the output of SoA CA. + */ +class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { +public: + using IndToEdm = std::vector; + + explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + + // using HitModuleStart = std::array; + using HMSstorage = HostProduct; + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + // Event Data tokens + const edm::EDGetTokenT tBeamSpot_; + const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT cpuHits_; + const edm::EDGetTokenT hmsToken_; + // Event Setup tokens + const edm::ESGetToken idealMagneticFieldToken_; + const edm::ESGetToken ttTopoToken_; + + int32_t const minNumberOfHits_; +}; + +PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) + : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + idealMagneticFieldToken_(esConsumes()), + ttTopoToken_(esConsumes()), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) { + produces(); + produces(); + produces(); + 
produces(); +} + +void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("trackSrc", edm::InputTag("pixelTrackSoA")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsPreSplittingLegacy")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + +void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { + // std::cout << "Converting gpu helix in reco tracks" << std::endl; + + auto indToEdmP = std::make_unique(); + auto &indToEdm = *indToEdmP; + + auto const &idealField = iSetup.getData(idealMagneticFieldToken_); + + pixeltrackfitting::TracksWithRecHits tracks; + + auto const &httopo = iSetup.getData(ttTopoToken_); + + const auto &bsh = iEvent.get(tBeamSpot_); + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + auto const &rechits = iEvent.get(cpuHits_); + std::vector hitmap; + auto const &rcs = rechits.data(); + auto nhits = rcs.size(); + hitmap.resize(nhits, nullptr); + + auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); + auto fc = hitsModuleStart; + + for (auto const &h : rcs) { + auto const &thit = static_cast(h); + auto detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + if (i >= hitmap.size()) + hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module + assert(nullptr == hitmap[i]); + hitmap[i] = &h; + } + + std::vector hits; + hits.reserve(5); + + const auto &tsoa = *iEvent.get(tokenTrack_); + + auto const *quality = tsoa.qualityData(); + auto const &fit = tsoa.stateAtBS; + auto const &hitIndices = tsoa.hitIndices; + auto maxTracks = tsoa.stride(); + + int32_t nt = 0; + + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) + 
break; // this is a guard: maybe we need to move to nTracks... + indToEdm.push_back(-1); + auto q = quality[it]; + if (q != pixelTrack::Quality::loose) + continue; + if (nHits < minNumberOfHits_) + continue; + indToEdm.back() = nt; + ++nt; + + hits.resize(nHits); + auto b = hitIndices.begin(it); + for (int iHit = 0; iHit < nHits; ++iHit) + hits[iHit] = hitmap[*(b + iHit)]; + + // mind: these values are with respect to the beamspot! + + float chi2 = tsoa.chi2(it); + float phi = tsoa.phi(it); + + riemannFit::Vector5d ipar, opar; + riemannFit::Matrix5d icov, ocov; + fit.copyToDense(ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); + + LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); + AlgebraicSymMatrix55 m; + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = ocov(i, j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); + + Plane impPointPlane(bs, rot); + GlobalTrajectoryParameters gp( + impPointPlane.toGlobal(lpar.position()), impPointPlane.toGlobal(lpar.momentum()), lpar.charge(), &idealField); + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, idealField); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); + + int ndof = 2 * hits.size() - 5; + chi2 = chi2 * ndof; + GlobalPoint vv = gp.position(); + math::XYZPoint pos(vv.x(), vv.y(), vv.z()); + GlobalVector pp = gp.momentum(); + math::XYZVector mom(pp.x(), pp.y(), pp.z()); + + auto track = std::make_unique(chi2, ndof, pos, mom, gp.charge(), CurvilinearTrajectoryError(mo)); + // filter??? 
+ tracks.emplace_back(track.release(), hits); + } + // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; + + // store tracks + storeTracks(iEvent, tracks, httopo); + iEvent.put(std::move(indToEdmP)); +} + +DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc new file mode 100644 index 0000000000000..2de8ec6c335b5 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -0,0 +1,86 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +// Switch on to enable checks and printout for found tracks +#undef PIXEL_DEBUG_PRODUCE + +class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& 
iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cms::cuda::host::unique_ptr soa_; +}; + +PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} + +void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("src", edm::InputTag("caHitNtupletCUDA")); + descriptions.add("pixelTrackSoA", desc); +} + +void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + soa_ = inputData.toHostAsync(ctx.stream()); +} + +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +#ifdef PIXEL_DEBUG_PRODUCE + auto const& tsoa = *soa_; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits == int(tsoa.hitIndices.size(it))); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl; +#endif + + // DO NOT make a copy (actually TWO....) 
+ iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_))); + + assert(!soa_); +} + +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h new file mode 100644 index 0000000000000..59101b6ba5214 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h @@ -0,0 +1,72 @@ +#ifndef RecoPixelVertexingPixelTrackFittingStoreTracks_H +#define RecoPixelVertexingPixelTrackFittingStoreTracks_H + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" + +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" + +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" + +template +void storeTracks(Ev& ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) { + auto tracks = std::make_unique(); + auto recHits = std::make_unique(); + auto trackExtras = std::make_unique(); + + int cc = 0, nTracks = tracksWithHits.size(); + + for (int i = 0; i < nTracks; i++) { + reco::Track* track = tracksWithHits[i].first; + const auto& hits = tracksWithHits[i].second; + + for (unsigned int k = 0; k < hits.size(); k++) { + auto* hit = hits[k]->clone(); + + track->appendHitPattern(*hit, ttopo); + recHits->push_back(hit); + } + tracks->push_back(*track); + delete track; + } + + LogDebug("TrackProducer") << "put the collection of TrackingRecHit in the event" + << "\n"; + edm::OrphanHandle ohRH = 
ev.put(std::move(recHits)); + + edm::RefProd hitCollProd(ohRH); + for (int k = 0; k < nTracks; k++) { + reco::TrackExtra theTrackExtra{}; + + //fill the TrackExtra with TrackingRecHitRef + unsigned int nHits = tracks->at(k).numberOfValidHits(); + theTrackExtra.setHits(hitCollProd, cc, nHits); + cc += nHits; + AlgebraicVector5 v = AlgebraicVector5(0, 0, 0, 0, 0); + reco::TrackExtra::TrajParams trajParams(nHits, LocalTrajectoryParameters(v, 1.)); + reco::TrackExtra::Chi2sFive chi2s(nHits, 0); + theTrackExtra.setTrajParams(std::move(trajParams), std::move(chi2s)); + trackExtras->push_back(theTrackExtra); + } + + LogDebug("TrackProducer") << "put the collection of TrackExtra in the event" + << "\n"; + edm::OrphanHandle ohTE = ev.put(std::move(trackExtras)); + + for (int k = 0; k < nTracks; k++) { + const reco::TrackExtraRef theTrackExtraRef(ohTE, k); + (tracks->at(k)).setExtra(theTrackExtraRef); + } + + ev.put(std::move(tracks)); +} + +#endif diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index 4334d724358f3..5ff404cb603d4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -11,6 +11,7 @@ from RecoTracker.TkSeedingLayers.PixelLayerTriplets_cfi import * from RecoTracker.TkSeedingLayers.TTRHBuilderWithoutAngle4PixelTriplets_cfi import * from RecoPixelVertexing.PixelTrackFitting.pixelFitterByHelixProjections_cfi import pixelFitterByHelixProjections +from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitter_cfi import pixelNtupletsFitter from RecoPixelVertexing.PixelTrackFitting.pixelTrackFilterByKinematics_cfi import pixelTrackFilterByKinematics from RecoPixelVertexing.PixelTrackFitting.pixelTrackCleanerBySharedHits_cfi import pixelTrackCleanerBySharedHits from RecoPixelVertexing.PixelTrackFitting.pixelTracks_cfi import pixelTracks as _pixelTracks @@ -76,4 +77,26 @@ 
_pixelTracksTask_lowPU.replace(pixelTracksHitQuadruplets, pixelTracksHitTriplets) trackingLowPU.toReplaceWith(pixelTracksTask, _pixelTracksTask_lowPU) +# Use ntuple fit and substitute previous Fitter producer with the ntuple one +from Configuration.ProcessModifiers.pixelNtupleFit_cff import pixelNtupleFit as ntupleFit +ntupleFit.toModify(pixelTracks, Fitter = "pixelNtupletsFitter") +_pixelTracksTask_ntupleFit = pixelTracksTask.copy() +_pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter) +ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit) + + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi import pixelTrackSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackFromSoA +_pixelTracksGPUTask = cms.Task( + caHitNtupletCUDA, + pixelTrackSoA, + pixelTracks # FromSoA +) + +gpu.toReplaceWith(pixelTracksTask, _pixelTracksGPUTask) +gpu.toReplaceWith(pixelTracks,_pixelTrackFromSoA) + + pixelTracksSequence = cms.Sequence(pixelTracksTask) diff --git a/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py new file mode 100644 index 0000000000000..10e1e3852e9c4 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/python/pixelNtupletsFitter_cfi.py @@ -0,0 +1,6 @@ +import FWCore.ParameterSet.Config as cms + +from RecoPixelVertexing.PixelTrackFitting.pixelNtupletsFitterDefault_cfi import pixelNtupletsFitterDefault + +pixelNtupletsFitter = pixelNtupletsFitterDefault.clone() + diff --git a/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc new file mode 100644 index 0000000000000..96f5d5fe03448 --- /dev/null +++ 
b/RecoPixelVertexing/PixelTrackFitting/src/PixelNtupletsFitter.cc @@ -0,0 +1,102 @@ +#include "CommonTools/Utils/interface/DynArray.h" +#include "DataFormats/GeometryCommonDetAlgo/interface/GlobalError.h" +#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h" +#include "DataFormats/GeometryVector/interface/GlobalPoint.h" +#include "DataFormats/GeometryVector/interface/LocalPoint.h" +#include "DataFormats/GeometryVector/interface/Pi.h" +#include "DataFormats/TrackingRecHit/interface/TrackingRecHit.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/CommonDetUnit/interface/GeomDet.h" +#include "Geometry/CommonDetUnit/interface/GeomDetType.h" +#include "MagneticField/Engine/interface/MagneticField.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelNtupletsFitter.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackBuilder.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/PixelTrackErrorParam.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +using namespace std; + +PixelNtupletsFitter::PixelNtupletsFitter(float nominalB, const MagneticField* field, bool useRiemannFit) + : nominalB_(nominalB), field_(field), useRiemannFit_(useRiemannFit) {} + +std::unique_ptr PixelNtupletsFitter::run(const std::vector& hits, + const TrackingRegion& region, + const edm::EventSetup&) const { + using namespace riemannFit; + + std::unique_ptr ret; + + unsigned int nhits = hits.size(); + + if (nhits < 2) + return ret; + + declareDynArray(GlobalPoint, nhits, points); + declareDynArray(GlobalError, nhits, errors); + declareDynArray(bool, nhits, isBarrel); + + for (unsigned int i = 0; i != nhits; ++i) { + auto const& recHit = 
hits[i]; + points[i] = GlobalPoint(recHit->globalPosition().basicVector() - region.origin().basicVector()); + errors[i] = recHit->globalPositionError(); + isBarrel[i] = recHit->detUnit()->type().isBarrel(); + } + + assert(nhits == 4); + riemannFit::Matrix3xNd<4> hits_gp; + + Eigen::Matrix hits_ge = Eigen::Matrix::Zero(); + + for (unsigned int i = 0; i < nhits; ++i) { + hits_gp.col(i) << points[i].x(), points[i].y(), points[i].z(); + + hits_ge.col(i) << errors[i].cxx(), errors[i].cyx(), errors[i].cyy(), errors[i].czx(), errors[i].czy(), + errors[i].czz(); + } + + HelixFit fittedTrack = useRiemannFit_ ? riemannFit::helixFit(hits_gp, hits_ge, nominalB_, true) + : brokenline::helixFit(hits_gp, hits_ge, nominalB_); + + int iCharge = fittedTrack.qCharge; + + // parameters are: + // 0: phi + // 1: tip + // 2: curvature + // 3: cottheta + // 4: zip + float valPhi = fittedTrack.par(0); + + float valTip = fittedTrack.par(1); + + float valCotTheta = fittedTrack.par(3); + + float valZip = fittedTrack.par(4); + float valPt = fittedTrack.par(2); + // + // PixelTrackErrorParam param(valEta, valPt); + float errValPhi = std::sqrt(fittedTrack.cov(0, 0)); + float errValTip = std::sqrt(fittedTrack.cov(1, 1)); + + float errValPt = std::sqrt(fittedTrack.cov(2, 2)); + + float errValCotTheta = std::sqrt(fittedTrack.cov(3, 3)); + float errValZip = std::sqrt(fittedTrack.cov(4, 4)); + + float chi2 = fittedTrack.chi2_line + fittedTrack.chi2_circle; + + PixelTrackBuilder builder; + Measurement1D phi(valPhi, errValPhi); + Measurement1D tip(valTip, errValTip); + + Measurement1D pt(valPt, errValPt); + Measurement1D cotTheta(valCotTheta, errValCotTheta); + Measurement1D zip(valZip, errValZip); + + ret.reset(builder.build(pt, phi, cotTheta, tip, zip, chi2, iCharge, hits, field_, region.origin())); + return ret; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml index 44820da381dd1..98dc3d9b282f1 100644 --- 
a/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/test/BuildFile.xml @@ -1,8 +1,80 @@ - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc new file mode 100644 index 0000000000000..e5a652e9d43f8 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/PixelTrackFits.cc @@ -0,0 +1,431 @@ +#define _USE_MATH_DEFINES + +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#endif + +using namespace std; +using namespace Eigen; +using namespace riemannFit; +using std::unique_ptr; + +namespace riemannFit { + using Vector3i = Eigen::Matrix; + using Vector4i = Eigen::Matrix; + using Vector6d = Eigen::Matrix; + using Vector8d = Eigen::Matrix; +}; // namespace riemannFit + +// quadruplets... 
+struct hits_gen { + Matrix3xNd<4> hits; + Eigen::Matrix hits_ge; + Vector5d true_par; +}; + +struct geometry { + Vector8d barrel; + Vector4i barrel_2; + Vector8d R_err; + Vector8d Rp_err; + Vector8d z_err; + Vector6d hand; + Vector3i hand_2; + Vector6d xy_err; + Vector6d zh_err; + double z_max; + double r_max; +}; + +void test_helix_fit(); + +constexpr int c_speed = 299792458; +constexpr double pi = M_PI; +default_random_engine generator(1); + +void smearing(const Vector5d& err, const bool& isbarrel, double& x, double& y, double& z) { + normal_distribution dist_R(0., err[0]); + normal_distribution dist_Rp(0., err[1]); + normal_distribution dist_z(0., err[2]); + normal_distribution dist_xyh(0., err[3]); + normal_distribution dist_zh(0., err[4]); + if (isbarrel) { + double dev_Rp = dist_Rp(generator); + double dev_R = dist_R(generator); + double R = sqrt(riemannFit::sqr(x) + riemannFit::sqr(y)); + x += dev_Rp * +y / R + dev_R * -x / R; + y += dev_Rp * -x / R + dev_R * -y / R; + z += dist_z(generator); + } else { + x += dist_xyh(generator); + y += dist_xyh(generator); + z += dist_zh(generator); + } +} + +template +void Hits_cov(Eigen::Matrix& V, + const unsigned int& i, + const unsigned int& n, + const Matrix3xNd& hits, + const Vector5d& err, + bool isbarrel) { + if (isbarrel) { + double R2 = riemannFit::sqr(hits(0, i)) + riemannFit::sqr(hits(1, i)); + V.col(i)[0] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(1, i)) + + riemannFit::sqr(err[0]) * riemannFit::sqr(hits(0, i))) / + R2; + V.col(i)[2] = (riemannFit::sqr(err[1]) * riemannFit::sqr(hits(0, i)) + + riemannFit::sqr(err[0]) * riemannFit::sqr(hits(1, i))) / + R2; + V.col(i)[1] = (riemannFit::sqr(err[0]) - riemannFit::sqr(err[1])) * hits(1, i) * hits(0, i) / R2; + V.col(i)[5] = riemannFit::sqr(err[2]); + } else { + V.col(i)[0] = riemannFit::sqr(err[3]); + V.col(i)[2] = riemannFit::sqr(err[3]); + V.col(i)[5] = riemannFit::sqr(err[4]); + } +} + +hits_gen Hits_gen(const unsigned int& n, const Matrix& gen_par) { + 
hits_gen gen; + gen.hits = MatrixXd::Zero(3, n); + gen.hits_ge = Eigen::Matrix::Zero(); + // err /= 10000.; + constexpr double rad[8] = {2.95, 6.8, 10.9, 16., 3.1, 7., 11., 16.2}; + // constexpr double R_err[8] = {5./10000, 5./10000, 5./10000, 5./10000, 5./10000, + // 5./10000, 5./10000, 5./10000}; constexpr double Rp_err[8] = {35./10000, 18./10000, + // 15./10000, 34./10000, 35./10000, 18./10000, 15./10000, 34./10000}; constexpr double z_err[8] = + // {72./10000, 38./10000, 25./10000, 56./10000, 72./10000, 38./10000, 25./10000, 56./10000}; + constexpr double R_err[8] = { + 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000, 10. / 10000}; + constexpr double Rp_err[8] = { + 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000, 35. / 10000, 18. / 10000, 15. / 10000, 34. / 10000}; + constexpr double z_err[8] = { + 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000, 72. / 10000, 38. / 10000, 25. / 10000, 56. / 10000}; + const double x2 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); + const double y2 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); + const double alpha = atan2(y2, x2); + + for (unsigned int i = 0; i < n; ++i) { + const double a = gen_par(4); + const double b = rad[i]; + const double c = sqrt(riemannFit::sqr(x2) + riemannFit::sqr(y2)); + const double beta = acos((riemannFit::sqr(a) - riemannFit::sqr(b) - riemannFit::sqr(c)) / (-2. * b * c)); + const double gamma = alpha + beta; + gen.hits(0, i) = rad[i] * cos(gamma); + gen.hits(1, i) = rad[i] * sin(gamma); + gen.hits(2, i) = + gen_par(2) + + 1 / tan(gen_par(5) * pi / 180) * 2. * + asin(sqrt(riemannFit::sqr((gen_par(0) - gen.hits(0, i))) + riemannFit::sqr((gen_par(1) - gen.hits(1, i)))) / + (2. * gen_par(4))) * + gen_par(4); + // isbarrel(i) = ?? 
+ Vector5d err; + err << R_err[i], Rp_err[i], z_err[i], 0, 0; + smearing(err, true, gen.hits(0, i), gen.hits(1, i), gen.hits(2, i)); + Hits_cov(gen.hits_ge, i, n, gen.hits, err, true); + } + + return gen; +} + +Vector5d True_par(const Matrix& gen_par, const int& charge, const double& B_field) { + Vector5d true_par; + const double x0 = gen_par(0) + gen_par(4) * cos(gen_par(3) * pi / 180); + const double y0 = gen_par(1) + gen_par(4) * sin(gen_par(3) * pi / 180); + CircleFit circle; + circle.par << x0, y0, gen_par(4); + circle.qCharge = 1; + riemannFit::par_uvrtopak(circle, B_field, false); + true_par.block(0, 0, 3, 1) = circle.par; + true_par(3) = 1 / tan(gen_par(5) * pi / 180); + const int dir = ((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1)) * (gen_par(1) - y0) - + (gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)) * (gen_par(0) - x0) > + 0) + ? -1 + : 1; + true_par(4) = gen_par(2) + 1 / tan(gen_par(5) * pi / 180) * dir * 2.f * + asin(sqrt(riemannFit::sqr((gen_par(0) - cos(true_par(0) - pi / 2) * true_par(1))) + + riemannFit::sqr((gen_par(1) - sin(true_par(0) - pi / 2) * true_par(1)))) / + (2.f * gen_par(4))) * + gen_par(4); + return true_par; +} + +Matrix New_par(const Matrix& gen_par, const int& charge, const double& B_field) { + Matrix new_par; + new_par.block(0, 0, 3, 1) = gen_par.block(0, 0, 3, 1); + new_par(3) = gen_par(3) - charge * 90; + new_par(4) = gen_par(4) / B_field; + // new_par(5) = atan(sinh(gen_par(5))) * 180 / pi; + new_par(5) = 2. 
* atan(exp(-gen_par(5))) * 180 / pi; + return new_par; +} + +template +void computePull(std::array& fit, const char* label, int n_, int iteration, const Vector5d& true_par) { + Eigen::Matrix score(41, iteration); + + std::string histo_name("Phi Pull"); + histo_name += label; + TH1F phi_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dxy Pull "; + histo_name += label; + TH1F dxy_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "dz Pull "; + histo_name += label; + TH1F dz_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Theta Pull "; + histo_name += label; + TH1F theta_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Pt Pull "; + histo_name += label; + TH1F pt_pull(histo_name.data(), histo_name.data(), 100, -10., 10.); + histo_name = "Phi Error "; + histo_name += label; + TH1F phi_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dxy error "; + histo_name += label; + TH1F dxy_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "dz error "; + histo_name += label; + TH1F dz_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Theta error "; + histo_name += label; + TH1F theta_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + histo_name = "Pt error "; + histo_name += label; + TH1F pt_error(histo_name.data(), histo_name.data(), 100, 0., 0.1); + for (int x = 0; x < iteration; x++) { + // Compute PULLS information + score(0, x) = (fit[x].par(0) - true_par(0)) / sqrt(fit[x].cov(0, 0)); + score(1, x) = (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(1, 1)); + score(2, x) = (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(2, 2)); + score(3, x) = (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(3, 3)); + score(4, x) = (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(4, 4)); + phi_pull.Fill(score(0, x)); + dxy_pull.Fill(score(1, x)); + pt_pull.Fill(score(2, x)); + theta_pull.Fill(score(3, x)); + 
dz_pull.Fill(score(4, x)); + phi_error.Fill(sqrt(fit[x].cov(0, 0))); + dxy_error.Fill(sqrt(fit[x].cov(1, 1))); + pt_error.Fill(sqrt(fit[x].cov(2, 2))); + theta_error.Fill(sqrt(fit[x].cov(3, 3))); + dz_error.Fill(sqrt(fit[x].cov(4, 4))); + score(5, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / (fit[x].cov(0, 1)); + score(6, x) = (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(0, 2)); + score(7, x) = (fit[x].par(1) - true_par(1)) * (fit[x].par(2) - true_par(2)) / (fit[x].cov(1, 2)); + score(8, x) = (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / (fit[x].cov(3, 4)); + score(9, x) = fit[x].chi2_circle; + score(25, x) = fit[x].chi2_line; + score(10, x) = sqrt(fit[x].cov(0, 0)) / fit[x].par(0) * 100; + score(13, x) = sqrt(fit[x].cov(3, 3)) / fit[x].par(3) * 100; + score(14, x) = sqrt(fit[x].cov(4, 4)) / fit[x].par(4) * 100; + score(15, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(3, 3)); + score(16, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(3, 3)); + score(17, x) = + (fit[x].par(2) - true_par(2)) * (fit[x].par(3) - true_par(3)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(3, 3)); + score(18, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(4, 4)); + score(19, x) = + (fit[x].par(1) - true_par(1)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(4, 4)); + score(20, x) = + (fit[x].par(2) - true_par(2)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(2, 2)) / sqrt(fit[x].cov(4, 4)); + score(21, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(1) - true_par(1)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(1, 1)); + score(22, x) = + (fit[x].par(0) - true_par(0)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(0, 0)) / sqrt(fit[x].cov(2, 2)); + score(23, x) = + (fit[x].par(1) - 
true_par(1)) * (fit[x].par(2) - true_par(2)) / sqrt(fit[x].cov(1, 1)) / sqrt(fit[x].cov(2, 2)); + score(24, x) = + (fit[x].par(3) - true_par(3)) * (fit[x].par(4) - true_par(4)) / sqrt(fit[x].cov(3, 3)) / sqrt(fit[x].cov(4, 4)); + score(30, x) = fit[x].par(0); + score(31, x) = fit[x].par(1); + score(32, x) = fit[x].par(2); + score(33, x) = fit[x].par(3); + score(34, x) = fit[x].par(4); + score(35, x) = sqrt(fit[x].cov(0, 0)); + score(36, x) = sqrt(fit[x].cov(1, 1)); + score(37, x) = sqrt(fit[x].cov(2, 2)); + score(38, x) = sqrt(fit[x].cov(3, 3)); + score(39, x) = sqrt(fit[x].cov(4, 4)); + } + + double phi_ = score.row(0).mean(); + double a_ = score.row(1).mean(); + double pt_ = score.row(2).mean(); + double coT_ = score.row(3).mean(); + double Zip_ = score.row(4).mean(); + std::cout << std::setprecision(5) << std::scientific << label << " AVERAGE FITTED VALUES: \n" + << "phi: " << score.row(30).mean() << " +/- " << score.row(35).mean() << " [+/-] " + << sqrt(score.row(35).array().abs2().mean() - score.row(35).mean() * score.row(35).mean()) << std::endl + << "d0: " << score.row(31).mean() << " +/- " << score.row(36).mean() << " [+/-] " + << sqrt(score.row(36).array().abs2().mean() - score.row(36).mean() * score.row(36).mean()) << std::endl + << "pt: " << score.row(32).mean() << " +/- " << score.row(37).mean() << " [+/-] " + << sqrt(score.row(37).array().abs2().mean() - score.row(37).mean() * score.row(37).mean()) << std::endl + << "coT: " << score.row(33).mean() << " +/- " << score.row(38).mean() << " [+/-] " + << sqrt(score.row(38).array().abs2().mean() - score.row(38).mean() * score.row(38).mean()) << std::endl + << "Zip: " << score.row(34).mean() << " +/- " << score.row(39).mean() << " [+/-] " + << sqrt(score.row(39).array().abs2().mean() - score.row(39).mean() * score.row(39).mean()) << std::endl; + + Matrix5d correlation; + correlation << 1., score.row(21).mean(), score.row(22).mean(), score.row(15).mean(), score.row(20).mean(), + score.row(21).mean(), 1., 
score.row(23).mean(), score.row(16).mean(), score.row(19).mean(), score.row(22).mean(), + score.row(23).mean(), 1., score.row(17).mean(), score.row(20).mean(), score.row(15).mean(), score.row(16).mean(), + score.row(17).mean(), 1., score.row(24).mean(), score.row(18).mean(), score.row(19).mean(), score.row(20).mean(), + score.row(24).mean(), 1.; + + cout << "\n" + << label << " PULLS (mean, sigma, relative_error):\n" + << "phi: " << phi_ << " " << sqrt((score.row(0).array() - phi_).square().sum() / (iteration - 1)) << " " + << abs(score.row(10).mean()) << "%\n" + << "a0 : " << a_ << " " << sqrt((score.row(1).array() - a_).square().sum() / (iteration - 1)) << " " + << abs(score.row(11).mean()) << "%\n" + << "pt : " << pt_ << " " << sqrt((score.row(2).array() - pt_).square().sum() / (iteration - 1)) << " " + << abs(score.row(12).mean()) << "%\n" + << "coT: " << coT_ << " " << sqrt((score.row(3).array() - coT_).square().sum() / (iteration - 1)) << " " + << abs(score.row(13).mean()) << "%\n" + << "Zip: " << Zip_ << " " << sqrt((score.row(4).array() - Zip_).square().sum() / (iteration - 1)) << " " + << abs(score.row(14).mean()) << "%\n\n" + << "cov(phi,a0)_: " << score.row(5).mean() << "\n" + << "cov(phi,pt)_: " << score.row(6).mean() << "\n" + << "cov(a0,pt)_: " << score.row(7).mean() << "\n" + << "cov(coT,Zip)_: " << score.row(8).mean() << "\n\n" + << "chi2_circle: " << score.row(9).mean() << " vs " << n_ - 3 << "\n" + << "chi2_line: " << score.row(25).mean() << " vs " << n_ - 2 << "\n\n" + << "correlation matrix:\n" + << correlation << "\n\n" + << endl; + + phi_pull.Fit("gaus", "Q"); + dxy_pull.Fit("gaus", "Q"); + dz_pull.Fit("gaus", "Q"); + theta_pull.Fit("gaus", "Q"); + pt_pull.Fit("gaus", "Q"); + phi_pull.Write(); + dxy_pull.Write(); + dz_pull.Write(); + theta_pull.Write(); + pt_pull.Write(); + phi_error.Write(); + dxy_error.Write(); + dz_error.Write(); + theta_error.Write(); + pt_error.Write(); +} + +void test_helix_fit(bool getcin) { + int n_; + const double 
B_field = 3.8 * c_speed / pow(10, 9) / 100; + Matrix gen_par; + Vector5d true_par; + Vector5d err; + generator.seed(1); + std::cout << std::setprecision(6); + cout << "_________________________________________________________________________\n"; + cout << "n x(cm) y(cm) z(cm) phi(grad) R(Gev/c) eta iteration debug" << endl; + if (getcin) { + cout << "hits: "; + cin >> n_; + cout << "x: "; + cin >> gen_par(0); + cout << "y: "; + cin >> gen_par(1); + cout << "z: "; + cin >> gen_par(2); + cout << "phi: "; + cin >> gen_par(3); + cout << "p_t: "; + cin >> gen_par(4); + cout << "eta: "; + cin >> gen_par(5); + } else { + n_ = 4; + gen_par(0) = -0.1; // x + gen_par(1) = 0.1; // y + gen_par(2) = -1.; // z + gen_par(3) = 45.; // phi + gen_par(4) = 10.; // R (p_t) + gen_par(5) = 1.; // eta + } + + const int iteration = 5000; + gen_par = New_par(gen_par, 1, B_field); + true_par = True_par(gen_par, 1, B_field); + std::array helixRiemann_fit; + + std::cout << "\nTrue parameters: " + << "phi: " << true_par(0) << " " + << "dxy: " << true_par(1) << " " + << "pt: " << true_par(2) << " " + << "CotT: " << true_par(3) << " " + << "Zip: " << true_par(4) << " " << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + for (int i = 0; i < 100 * iteration; i++) { + hits_gen gen; + gen = Hits_gen(n_, gen_par); + // gen.hits = MatrixXd::Zero(3, 4); + // gen.hits_cov = MatrixXd::Zero(3 * 4, 3 * 4); + // gen.hits.col(0) << 1.82917642593, 2.0411875248, 7.18495464325; + // gen.hits.col(1) << 4.47041416168, 4.82704305649, 18.6394691467; + // gen.hits.col(2) << 7.25991010666, 7.74653434753, 30.6931324005; + // gen.hits.col(3) << 8.99161434174, 9.54262828827, 38.1338043213; + delta -= std::chrono::high_resolution_clock::now() - start; + helixRiemann_fit[i % iteration] = +#ifdef USE_BL + brokenline::helixFit(gen.hits, gen.hits_ge, B_field); +#else + riemannFit::helixFit(gen.hits, gen.hits_ge, B_field, true); +#endif + delta += 
std::chrono::high_resolution_clock::now() - start; + + if (helixRiemann_fit[i % iteration].par(0) > 10.) + std::cout << "error" << std::endl; + if (0 == i) + cout << std::setprecision(6) << "phi: " << helixRiemann_fit[i].par(0) << " +/- " + << sqrt(helixRiemann_fit[i].cov(0, 0)) << " vs " << true_par(0) << endl + << "Tip: " << helixRiemann_fit[i].par(1) << " +/- " << sqrt(helixRiemann_fit[i].cov(1, 1)) << " vs " + << true_par(1) << endl + << "p_t: " << helixRiemann_fit[i].par(2) << " +/- " << sqrt(helixRiemann_fit[i].cov(2, 2)) << " vs " + << true_par(2) << endl + << "theta:" << helixRiemann_fit[i].par(3) << " +/- " << sqrt(helixRiemann_fit[i].cov(3, 3)) << " vs " + << true_par(3) << endl + << "Zip: " << helixRiemann_fit[i].par(4) << " +/- " << sqrt(helixRiemann_fit[i].cov(4, 4)) << " vs " + << true_par(4) << endl + << "charge:" << helixRiemann_fit[i].qCharge << " vs 1" << endl + << "covariance matrix:" << endl + << helixRiemann_fit[i].cov << endl + << "Initial hits:\n" + << gen.hits << endl + << "Initial Covariance:\n" + << gen.hits_ge << endl; + } + std::cout << "elapsted time " << double(std::chrono::duration_cast(delta).count()) / 1.e6 + << std::endl; + computePull(helixRiemann_fit, "Riemann", n_, iteration, true_par); +} + +int main(int nargs, char**) { + TFile f("TestFitResults.root", "RECREATE"); + test_helix_fit(nargs > 1); + f.Close(); + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu new file mode 100644 index 0000000000000..d5eba9be26594 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -0,0 +1,343 @@ +#include + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else +#include 
"RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#endif + +#include "test_common.h" + +using namespace Eigen; + +namespace riemannFit { + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()>>; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()>>; + // fast fit + using Map4d = Eigen::Map>; + +} // namespace riemannFit + +template +__global__ void kernelPrintSizes(double* __restrict__ phits, float* __restrict__ phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, 4); + if (i != 0) + return; + printf("GPU sizes %lu %lu %lu %lu %lu\n", + sizeof(hits[i]), + sizeof(hits_ge[i]), + sizeof(Vector4d), + sizeof(riemannFit::LineFit), + sizeof(riemannFit::CircleFit)); +} + +template +__global__ void kernelFastFit(double* __restrict__ phits, double* __restrict__ presults) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d result(presults + i, 4); +#ifdef USE_BL + brokenline::fastFit(hits, result); +#else + riemannFit::fastFit(hits, result); +#endif +} + +#ifdef USE_BL + +template +__global__ void kernelBrokenLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + riemannFit::CircleFit* circle_fit, + riemannFit::LineFit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); + + brokenline::PreparedBrokenLineData data; + riemannFit::Matrix3d Jacob; + + auto& line_fit_results = 
line_fit[i]; + auto& circle_fit_results = circle_fit[i]; + + brokenline::prepareBrokenLineData(hits, fast_fit_input, B, data); + brokenline::lineFit(hits_ge, fast_fit_input, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_input, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); + +#ifdef TEST_DEBUG + if (0 == i) { + printf("Circle param %f,%f,%f\n", circle_fit[i].par(0), circle_fit[i].par(1), circle_fit[i].par(2)); + } +#endif +} + +#else + +template +__global__ void kernel_CircleFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double* __restrict__ pfast_fit_input, + double B, + riemannFit::CircleFit* circle_fit_resultsGPU) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); + + constexpr auto n = N; + + riemannFit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + +#ifdef TEST_DEBUG + if (0 == i) { + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(0, 0), hits.block(0, 0, 2, n)(0, 1)); + printf("hits %f, %f\n", hits.block(0, 0, 2, n)(1, 0), hits.block(0, 0, 2, n)(1, 1)); + printf("fast_fit_input(0): %f\n", fast_fit_input(0)); + printf("fast_fit_input(1): %f\n", fast_fit_input(1)); + printf("fast_fit_input(2): %f\n", fast_fit_input(2)); + printf("fast_fit_input(3): %f\n", fast_fit_input(3)); + printf("rad(0,0): %f\n", rad(0, 0)); + printf("rad(1,1): %f\n", rad(1, 1)); + printf("rad(2,2): %f\n", rad(2, 2)); + printf("hits_cov(0,0): %f\n", (*hits_cov)(0, 0)); + printf("hits_cov(1,1): %f\n", 
(*hits_cov)(1, 1)); + printf("hits_cov(2,2): %f\n", (*hits_cov)(2, 2)); + printf("hits_cov(11,11): %f\n", (*hits_cov)(11, 11)); + printf("B: %f\n", B); + } +#endif + circle_fit_resultsGPU[i] = riemannFit::circleFit(hits.block(0, 0, 2, n), hits_cov, fast_fit_input, rad, B, true); +#ifdef TEST_DEBUG + if (0 == i) { + printf("Circle param %f,%f,%f\n", + circle_fit_resultsGPU[i].par(0), + circle_fit_resultsGPU[i].par(1), + circle_fit_resultsGPU[i].par(2)); + } +#endif +} + +template +__global__ void kernelLineFit(double* __restrict__ phits, + float* __restrict__ phits_ge, + double B, + riemannFit::CircleFit* circle_fit, + double* __restrict__ pfast_fit_input, + riemannFit::LineFit* line_fit) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map4d fast_fit_input(pfast_fit_input + i, 4); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); + line_fit[i] = riemannFit::lineFit(hits, hits_ge, circle_fit[i], fast_fit_input, B, true); +} +#endif + +template +__device__ __host__ void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; + return; + } + + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 
17.793; + else + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] = 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; +} + +template +__global__ void kernelFillHitsAndHitsCov(double* __restrict__ phits, float* phits_ge) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + riemannFit::Map3xNd hits(phits + i, 3, N); + riemannFit::Map6xNf hits_ge(phits_ge + i, 6, N); + hits_ge = MatrixXf::Zero(6, N); + fillHitsAndHitsCov(hits, hits_ge); +} + +template +void testFit() { + constexpr double B = 0.0113921; + riemannFit::Matrix3xNd hits; + riemannFit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + double* hitsGPU = nullptr; + ; + float* hits_geGPU = nullptr; + double* fast_fit_resultsGPU = nullptr; + double* fast_fit_resultsGPUret = new double[riemannFit::maxNumberOfTracks() * sizeof(Vector4d)]; + riemannFit::CircleFit* circle_fit_resultsGPU = nullptr; + riemannFit::CircleFit* circle_fit_resultsGPUret = new riemannFit::CircleFit(); + riemannFit::LineFit* line_fit_resultsGPU = nullptr; + riemannFit::LineFit* line_fit_resultsGPUret = new riemannFit::LineFit(); + + fillHitsAndHitsCov(hits, hits_ge); + + std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << ' ' + << sizeof(riemannFit::LineFit) << ' ' << sizeof(riemannFit::CircleFit) << std::endl; + + std::cout << "Generated hits:\n" << hits << std::endl; + std::cout << "Generated cov:\n" << 
hits_ge << std::endl; + + // FAST_FIT_CPU +#ifdef USE_BL + Vector4d fast_fit_results; + brokenline::fastFit(hits, fast_fit_results); +#else + Vector4d fast_fit_results; + riemannFit::fastFit(hits, fast_fit_results); +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // for timing purposes we fit 4096 tracks + constexpr uint32_t Ntracks = 4096; + cudaCheck(cudaMalloc(&hitsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix3xNd))); + cudaCheck(cudaMalloc(&hits_geGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::Matrix6xNf))); + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMalloc(&line_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit))); + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::CircleFit))); + + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(Vector4d))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0, riemannFit::maxNumberOfTracks() * sizeof(riemannFit::LineFit))); + + kernelPrintSizes<<>>(hitsGPU, hits_geGPU); + kernelFillHitsAndHitsCov<<>>(hitsGPU, hits_geGPU); + + // FAST_FIT GPU + kernelFastFit<<>>(hitsGPU, fast_fit_resultsGPU); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(fast_fit_resultsGPUret, + fast_fit_resultsGPU, + riemannFit::maxNumberOfTracks() * sizeof(Vector4d), + cudaMemcpyDeviceToHost)); + riemannFit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4); + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl; + assert(isEqualFuzzy(fast_fit_results, fast_fit)); + +#ifdef USE_BL + // CIRCLE AND LINE FIT CPU + brokenline::PreparedBrokenLineData data; + brokenline::karimaki_circle_fit circle_fit_results; + riemannFit::LineFit line_fit_results; + riemannFit::Matrix3d Jacob; + brokenline::prepareBrokenLineData(hits, fast_fit_results, B, 
data); + brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); + + // fit on GPU + kernelBrokenLineFit + <<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); + +#else + // CIRCLE_FIT CPU + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::CircleFit circle_fit_results = + riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + + // CIRCLE_FIT GPU + kernel_CircleFit<<>>(hitsGPU, hits_geGPU, fast_fit_resultsGPU, B, circle_fit_resultsGPU); + cudaDeviceSynchronize(); + + // LINE_FIT CPU + riemannFit::LineFit line_fit_results = + riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + + kernelLineFit + <<>>(hitsGPU, hits_geGPU, B, circle_fit_resultsGPU, fast_fit_resultsGPU, line_fit_resultsGPU); + cudaDeviceSynchronize(); +#endif + + std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl; + + cudaCheck(cudaMemcpy( + circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(riemannFit::CircleFit), cudaMemcpyDeviceToHost)); + std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par)); + + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl; + // LINE_FIT GPU + cudaCheck( + cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, 
sizeof(riemannFit::LineFit), cudaMemcpyDeviceToHost)); + std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl; + assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU + + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; + std::cout << "Fitted cov (CircleFit) GPU:\n" << circle_fit_resultsGPUret->cov << std::endl; + std::cout << "Fitted cov (LineFit): GPU\n" << line_fit_resultsGPUret->cov << std::endl; +} + +int main(int argc, char* argv[]) { + cms::cudatest::requireDevices(); + + testFit<4>(); + testFit<3>(); + testFit<5>(); + + std::cout << "TEST FIT, NO ERRORS" << std::endl; + + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu new file mode 100644 index 0000000000000..6ac1088943305 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -0,0 +1,248 @@ +#include + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "test_common.h" + +using namespace Eigen; + +using Matrix5d = Matrix; + +__host__ __device__ void eigenValues(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { +#if TEST_DEBUG + printf("Matrix(0,0): %f\n", (*m)(0, 0)); + printf("Matrix(1,1): %f\n", (*m)(1, 1)); + printf("Matrix(2,2): %f\n", (*m)(2, 2)); +#endif + SelfAdjointEigenSolver es; + es.computeDirect(*m); + (*ret) = es.eigenvalues(); + return; +} + +__global__ void kernel(Matrix3d *m, Eigen::SelfAdjointEigenSolver::RealVectorType *ret) { + eigenValues(m, ret); +} + +__global__ void kernelInverse3x3(Matrix3d *in, Matrix3d *out) { (*out) = in->inverse(); } + +__global__ void kernelInverse4x4(Matrix4d *in, Matrix4d 
*out) { (*out) = in->inverse(); } + +__global__ void kernelInverse5x5(Matrix5d *in, Matrix5d *out) { (*out) = in->inverse(); } + +template +__global__ void kernelMultiply(M1 *J, M2 *C, M3 *result) { +// Map res(result->data()); +#if TEST_DEBUG + printf("*** GPU IN ***\n"); +#endif + printIt(J); + printIt(C); + // res.noalias() = (*J) * (*C); + // printIt(&res); + (*result) = (*J) * (*C); +#if TEST_DEBUG + printf("*** GPU OUT ***\n"); +#endif + return; +} + +template +void testMultiply() { + std::cout << "TEST MULTIPLY" << std::endl; + std::cout << "Product of type " << row1 << "x" << col1 << " * " << row2 << "x" << col2 << std::endl; + Eigen::Matrix J; + fillMatrix(J); + Eigen::Matrix C; + fillMatrix(C); + Eigen::Matrix multiply_result = J * C; +#if TEST_DEBUG + std::cout << "Input J:" << std::endl; + printIt(&J); + std::cout << "Input C:" << std::endl; + printIt(&C); + std::cout << "Output:" << std::endl; + printIt(&multiply_result); +#endif + // GPU + Eigen::Matrix *JGPU = nullptr; + Eigen::Matrix *CGPU = nullptr; + Eigen::Matrix *multiply_resultGPU = nullptr; + Eigen::Matrix *multiply_resultGPUret = new Eigen::Matrix(); + + cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix))); + cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy( + multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix), cudaMemcpyHostToDevice)); + + kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy( + multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost)); + printIt(multiply_resultGPUret); + assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); +} + +void testInverse3x3() { + std::cout << "TEST INVERSE 3x3" 
<< std::endl; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix3d m_inv = m.inverse(); + Matrix3d *mGPU = nullptr; + Matrix3d *mGPUret = nullptr; + Matrix3d *mCPUret = new Matrix3d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); + + kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testInverse4x4() { + std::cout << "TEST INVERSE 4x4" << std::endl; + Matrix4d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix4d m_inv = m.inverse(); + Matrix4d *mGPU = nullptr; + Matrix4d *mGPUret = nullptr; + Matrix4d *mCPUret = new Matrix4d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice)); + + kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testInverse5x5() { + std::cout << "TEST INVERSE 5x5" << std::endl; + Matrix5d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix5d m_inv = m.inverse(); + 
Matrix5d *mGPU = nullptr; + Matrix5d *mGPUret = nullptr; + Matrix5d *mCPUret = new Matrix5d(); + +#if TEST_DEBUG + std::cout << "Here is the matrix m:" << std::endl << m << std::endl; + std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d))); + cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d))); + cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice)); + + kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl; +#endif + assert(isEqualFuzzy(m_inv, *mCPUret)); +} + +void testEigenvalues() { + std::cout << "TEST EIGENVALUES" << std::endl; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + + Matrix3d *m_gpu = nullptr; + Matrix3d *mgpudebug = new Matrix3d(); + Eigen::SelfAdjointEigenSolver::RealVectorType *ret = + new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret1 = + new Eigen::SelfAdjointEigenSolver::RealVectorType; + Eigen::SelfAdjointEigenSolver::RealVectorType *ret_gpu = nullptr; + eigenValues(&m, ret); +#if TEST_DEBUG + std::cout << "Generated Matrix M 3x3:\n" << m << std::endl; + std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl; + std::cout << "*************************\n\n" << std::endl; +#endif + cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d))); + cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType))); + cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice)); + + kernel<<<1, 1>>>(m_gpu, ret_gpu); + cudaDeviceSynchronize(); + + cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy( + ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver::RealVectorType), 
cudaMemcpyDeviceToHost)); +#if TEST_DEBUG + std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl; + std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl; + std::cout << "*************************\n\n" << std::endl; +#endif + assert(isEqualFuzzy(*ret, *ret1)); +} + +int main(int argc, char *argv[]) { + cms::cudatest::requireDevices(); + + testEigenvalues(); + testInverse3x3(); + testInverse4x4(); + testInverse5x5(); + + testMultiply<1, 2, 2, 1>(); + testMultiply<1, 2, 2, 2>(); + testMultiply<1, 2, 2, 3>(); + testMultiply<1, 2, 2, 4>(); + testMultiply<1, 2, 2, 5>(); + testMultiply<2, 1, 1, 2>(); + testMultiply<2, 1, 1, 3>(); + testMultiply<2, 1, 1, 4>(); + testMultiply<2, 1, 1, 5>(); + testMultiply<2, 2, 2, 2>(); + testMultiply<2, 3, 3, 1>(); + testMultiply<2, 3, 3, 2>(); + testMultiply<2, 3, 3, 4>(); + testMultiply<2, 3, 3, 5>(); + testMultiply<3, 2, 2, 3>(); + testMultiply<2, 3, 3, 3>(); // DOES NOT COMPILE W/O PATCHING EIGEN + testMultiply<3, 3, 3, 3>(); + testMultiply<8, 8, 8, 8>(); + testMultiply<3, 4, 4, 3>(); + testMultiply<2, 4, 4, 2>(); + testMultiply<3, 4, 4, 2>(); // DOES NOT COMPILE W/O PATCHING EIGEN + + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp new file mode 100644 index 0000000000000..a8e040fa0df38 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -0,0 +1,134 @@ +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" +#include + +using riemannFit::Matrix5d; +using riemannFit::Vector5d; + +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" + +#include "DataFormats/GeometrySurface/interface/Surface.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" + +#include 
"DataFormats/GeometrySurface/interface/Plane.h" + +#include "MagneticField/Engine/interface/MagneticField.h" + +namespace { + + struct M5T : public MagneticField { + M5T() : mf(0., 0., 5.) {} + virtual GlobalVector inTesla(const GlobalPoint&) const { return mf; } + + GlobalVector mf; + }; + +} // namespace + +// old pixeltrack version... +Matrix5d transfFast(Matrix5d cov, Vector5d const& p) { + auto sqr = [](auto x) { return x * x; }; + auto sinTheta = 1 / std::sqrt(1 + p(3) * p(3)); + auto cosTheta = p(3) * sinTheta; + cov(2, 2) = sqr(sinTheta) * (cov(2, 2) * sqr(1. / (p(2) * p(2))) + cov(3, 3) * sqr(cosTheta * sinTheta / p(2))); + cov(3, 2) = cov(2, 3) = cov(3, 3) * cosTheta * sqr(sinTheta) / p(2); + // for (int i=0; i<5; ++i) cov(i,2) *= -sinTheta/(p(2)*p(2)); + // for (int i=0; i<5; ++i) cov(2,i) *= -sinTheta/(p(2)*p(2)); + return cov; +} + +Matrix5d loadCov(Vector5d const& e) { + Matrix5d cov; + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? 
-0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); + } + } + return cov; +} + +#include +int main() { + M5T const mf; + + for (auto charge = -1; charge < 2; charge += 2) + for (auto szip = -1; szip < 2; szip += 2) + for (auto stip = -1; stip < 2; stip += 2) { + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d del0; + del0 << 0.01, 0.01, 0.035, -0.03, -0.01; + //!<(phi,Tip,pt,cotan(theta)),Zip) + par0(1) *= stip; + par0(4) *= szip; + + Matrix5d cov0 = loadCov(del0); + + Vector5d par1; + Vector5d par2; + + Matrix5d cov1; + Matrix5d cov2; + + // Matrix5d covf = transfFast(cov0,par0); + + riemannFit::transformToPerigeePlane(par0, cov0, par1, cov1); + + std::cout << "cov1\n" << cov1 << std::endl; + + LocalTrajectoryParameters lpar(par1(0), par1(1), par1(2), par1(3), par1(4), 1.); + AlgebraicSymMatrix55 m; + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = cov1(i, j); + + float phi = par0(0); + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); + + Surface::PositionType bs(0., 0., 0.); + Plane plane(bs, rot); + GlobalTrajectoryParameters gp( + plane.toGlobal(lpar.position()), plane.toGlobal(lpar.momentum()), lpar.charge(), &mf); + std::cout << "global par " << gp.position() << ' ' << gp.momentum() << ' ' << gp.charge() << std::endl; + JacobianLocalToCurvilinear jl2c(plane, lpar, mf); + std::cout << "jac l2c" << jl2c.jacobian() << std::endl; + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); + std::cout << "curv error\n" << mo << std::endl; + + /* + + // not accurate as the perigee plane move as well... 
+ Vector5d del1 = par2-par1; + + + // don't ask: guess + std::cout << "charge " << charge << std::endl; + std::cout << "par0 " << par0.transpose() << std::endl; + std::cout << "del0 " << del0.transpose() << std::endl; + + + std::cout << "par1 " << par1.transpose() << std::endl; + std::cout << "del1 " << del1.transpose() << std::endl; + // std::cout << "del2 " << (J*del0).transpose() << std::endl; + + std::cout << "del1^2 " << (del1.array()*del1.array()).transpose() << std::endl; + std::cout << std::endl; + + std::cout << "cov0\n" << cov0 << std::endl; + std::cout << "cov1\n" << cov1 << std::endl; + std::cout << "cov2\n" << cov2 << std::endl; + */ + + std::cout << std::endl << "----------" << std::endl; + + } // loop over signs + + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp new file mode 100644 index 0000000000000..7c0dab3be3e00 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/testFits.cpp @@ -0,0 +1,154 @@ +#include + +#include +#include + +#ifdef USE_BL +#include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" +#else +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#endif + +#include "test_common.h" + +using namespace Eigen; + +namespace riemannFit { + constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; } + constexpr uint32_t stride() { return maxNumberOfTracks(); } + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride(), stride()> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride(), stride()> >; + // fast fit + using Map4d = Eigen::Map >; + +} // namespace riemannFit + +/* +Hit global: 641,0 2: 2.934787,0.773211,-10.980247 +Error: 641,0 2: 1.424715e-07,-4.996975e-07,1.752614e-06,3.660689e-11,1.644638e-09,7.346080e-05 +Hit global: 641,1 104: 
6.314229,1.816356,-23.162731 +Error: 641,1 104: 6.899177e-08,-1.873414e-07,5.087101e-07,-2.078806e-10,-2.210498e-11,4.346079e-06 +Hit global: 641,2 1521: 8.936963,2.765734,-32.759060 +Error: 641,2 1521: 1.406273e-06,4.042467e-07,6.391180e-07,-3.141497e-07,6.513821e-08,1.163863e-07 +Hit global: 641,3 1712: 10.360559,3.330824,-38.061260 +Error: 641,3 1712: 1.176358e-06,2.154100e-07,5.072816e-07,-8.161219e-08,1.437878e-07,5.951832e-08 +Hit global: 641,4 1824: 12.856387,4.422212,-47.518867 +Error: 641,4 1824: 2.852843e-05,7.956492e-06,3.117701e-06,-1.060541e-06,8.777413e-09,1.426417e-07 +*/ + +template +void fillHitsAndHitsCov(M3xN& hits, M6xN& hits_ge) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + + if (N == 5) { + hits << 2.934787, 6.314229, 8.936963, 10.360559, 12.856387, 0.773211, 1.816356, 2.765734, 3.330824, 4.422212, + -10.980247, -23.162731, -32.759060, -38.061260, -47.518867; + hits_ge.col(0) << 1.424715e-07, -4.996975e-07, 1.752614e-06, 3.660689e-11, 1.644638e-09, 7.346080e-05; + hits_ge.col(1) << 6.899177e-08, -1.873414e-07, 5.087101e-07, -2.078806e-10, -2.210498e-11, 4.346079e-06; + hits_ge.col(2) << 1.406273e-06, 4.042467e-07, 6.391180e-07, -3.141497e-07, 6.513821e-08, 1.163863e-07; + hits_ge.col(3) << 1.176358e-06, 2.154100e-07, 5.072816e-07, -8.161219e-08, 1.437878e-07, 5.951832e-08; + hits_ge.col(4) << 2.852843e-05, 7.956492e-06, 3.117701e-06, -1.060541e-06, 8.777413e-09, 1.426417e-07; + return; + } + + if (N > 3) + hits << 1.98645, 4.72598, 7.65632, 11.3151, 2.18002, 4.88864, 7.75845, 11.3134, 2.46338, 6.99838, 11.808, 17.793; + else + hits << 1.98645, 4.72598, 7.65632, 2.18002, 4.88864, 7.75845, 2.46338, 6.99838, 11.808; + + hits_ge.col(0)[0] = 7.14652e-06; + hits_ge.col(1)[0] = 2.15789e-06; + hits_ge.col(2)[0] = 1.63328e-06; + if (N > 3) + hits_ge.col(3)[0] = 6.27919e-06; + hits_ge.col(0)[2] = 6.10348e-06; + hits_ge.col(1)[2] = 2.08211e-06; + hits_ge.col(2)[2] = 1.61672e-06; + if (N > 3) + hits_ge.col(3)[2] = 6.28081e-06; + hits_ge.col(0)[5] 
= 5.184e-05; + hits_ge.col(1)[5] = 1.444e-05; + hits_ge.col(2)[5] = 6.25e-06; + if (N > 3) + hits_ge.col(3)[5] = 3.136e-05; + hits_ge.col(0)[1] = -5.60077e-06; + hits_ge.col(1)[1] = -1.11936e-06; + hits_ge.col(2)[1] = -6.24945e-07; + if (N > 3) + hits_ge.col(3)[1] = -5.28e-06; +} + +template +void testFit() { + constexpr double B = 0.0113921; + riemannFit::Matrix3xNd hits; + riemannFit::Matrix6xNf hits_ge = MatrixXf::Zero(6, N); + + fillHitsAndHitsCov(hits, hits_ge); + + std::cout << "sizes " << N << ' ' << sizeof(hits) << ' ' << sizeof(hits_ge) << ' ' << sizeof(Vector4d) << std::endl; + + std::cout << "Generated hits:\n" << hits << std::endl; + std::cout << "Generated cov:\n" << hits_ge << std::endl; + + // FAST_FIT_CPU +#ifdef USE_BL + Vector4d fast_fit_results; + brokenline::fastFit(hits, fast_fit_results); +#else + Vector4d fast_fit_results; + riemannFit::fastFit(hits, fast_fit_results); +#endif + std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]):\n" << fast_fit_results << std::endl; + + // CIRCLE_FIT CPU + +#ifdef USE_BL + brokenline::PreparedBrokenLineData data; + brokenline::karimaki_circle_fit circle_fit_results; + riemannFit::Matrix3d Jacob; + + brokenline::prepareBrokenLineData(hits, fast_fit_results, B, data); + riemannFit::LineFit line_fit_results; + brokenline::lineFit(hits_ge, fast_fit_results, B, data, line_fit_results); + brokenline::circleFit(hits, hits_ge, fast_fit_results, B, data, circle_fit_results); + Jacob << 1., 0, 0, 0, 1., 0, 0, 0, + -B / std::copysign(riemannFit::sqr(circle_fit_results.par(2)), circle_fit_results.par(2)); + circle_fit_results.par(2) = B / std::abs(circle_fit_results.par(2)); + circle_fit_results.cov = Jacob * circle_fit_results.cov * Jacob.transpose(); +#else + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + riemannFit::CircleFit circle_fit_results = + 
riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit_results, rad, B, true); + // LINE_FIT CPU + riemannFit::LineFit line_fit_results = + riemannFit::lineFit(hits, hits_ge, circle_fit_results, fast_fit_results, B, true); + riemannFit::par_uvrtopak(circle_fit_results, B, true); + +#endif + + std::cout << "Fitted values (CircleFit):\n" + << circle_fit_results.par << "\nchi2 " << circle_fit_results.chi2 << std::endl; + std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << "\nchi2 " << line_fit_results.chi2 << std::endl; + + std::cout << "Fitted cov (CircleFit) CPU:\n" << circle_fit_results.cov << std::endl; + std::cout << "Fitted cov (LineFit): CPU\n" << line_fit_results.cov << std::endl; +} + +int main(int argc, char* argv[]) { + testFit<4>(); + testFit<3>(); + testFit<5>(); + + return 0; +} diff --git a/RecoPixelVertexing/PixelTrackFitting/test/test_common.h b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h new file mode 100644 index 0000000000000..6377628b0eeca --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/test/test_common.h @@ -0,0 +1,47 @@ +#ifndef RecoPixelVertexing__PixelTrackFitting__test_common_h +#define RecoPixelVertexing__PixelTrackFitting__test_common_h + +#include +#include +#include + +template +__host__ __device__ void printIt(C* m) { +#ifdef TEST_DEBUG + printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols()); + for (u_int r = 0; r < m->rows(); ++r) { + for (u_int c = 0; c < m->cols(); ++c) { + printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r, c)); + } + } +#endif +} + +template +bool isEqualFuzzy(C1 a, C2 b, double epsilon = 1e-6) { + for (unsigned int i = 0; i < a.rows(); ++i) { + for (unsigned int j = 0; j < a.cols(); ++j) { + assert(std::abs(a(i, j) - b(i, j)) < std::min(std::abs(a(i, j)), std::abs(b(i, j))) * epsilon); + } + } + return true; +} + +bool isEqualFuzzy(double a, double b, double epsilon = 1e-6) { + return std::abs(a - b) < std::min(std::abs(a), std::abs(b)) * epsilon; +} + +template +void 
fillMatrix(T& t) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 2.0); + for (int row = 0; row < t.rows(); ++row) { + for (int col = 0; col < t.cols(); ++col) { + t(row, col) = dis(gen); + } + } + return; +} + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h index 9d149533eefbc..deb2beb6099ee 100644 --- a/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h +++ b/RecoPixelVertexing/PixelTriplets/interface/CAHitQuadrupletGenerator.h @@ -42,7 +42,7 @@ class CAHitQuadrupletGenerator { ~CAHitQuadrupletGenerator() = default; static void fillDescriptions(edm::ParameterSetDescription& desc); - static const char* fillDescriptionsLabel() { return "caHitQuadruplet"; } + static const char* fillDescriptionsLabel() { return "caHitQuadrupletDefault"; } void initEvent(const edm::Event& ev, const edm::EventSetup& es); diff --git a/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h new file mode 100644 index 0000000000000..986fe2e2992b9 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/interface/CircleEq.h @@ -0,0 +1,97 @@ +#ifndef RecoPixelVertexingPixelTripletsCircleEq_H +#define RecoPixelVertexingPixelTripletsCircleEq_H +/** +| 1) circle is parameterized as: | +| C*[(X-Xp)**2+(Y-Yp)**2] - 2*alpha*(X-Xp) - 2*beta*(Y-Yp) = 0 | +| Xp,Yp is a point on the track; | +| C = 1/r0 is the curvature ( sign of C is charge of particle ); | +| alpha & beta are the direction cosines of the radial vector at Xp,Yp | +| i.e. alpha = C*(X0-Xp), | +| beta = C*(Y0-Yp), | +| where center of circle is at X0,Y0. | +| | +| Slope dy/dx of tangent at Xp,Yp is -alpha/beta. | +| 2) the z dimension of the helix is parameterized by gamma = dZ/dSperp | +| this is also the tangent of the pitch angle of the helix. 
| +| with this parameterization, (alpha,beta,gamma) rotate like a vector. | +| 3) For tracks going inward at (Xp,Yp), C, alpha, beta, and gamma change sign| +| +*/ + +#include + +template +class CircleEq { +public: + CircleEq() {} + + constexpr CircleEq(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); } + + constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3); + + // dca to origin multiplied by |curvature| (i.e. in units of the radius); divide by |curvature| to get a length + constexpr T dca0() const { + auto x = m_c * m_xp + m_alpha; + auto y = m_c * m_yp + m_beta; + return std::sqrt(x * x + y * y) - T(1); + } + + // dca to the given point, multiplied by |curvature| (same normalization as dca0) + constexpr T dca(T x, T y) const { + x = m_c * (m_xp - x) + m_alpha; + y = m_c * (m_yp - y) + m_beta; + return std::sqrt(x * x + y * y) - T(1); + } + + // signed curvature C = 1/r0 + constexpr auto curvature() const { return m_c; } + + // alpha and beta at the reference point (Xp,Yp) + constexpr std::pair cosdir() const { return std::make_pair(m_alpha, m_beta); } + + // alpha and beta at the given point + constexpr std::pair cosdir(T x, T y) const { + return std::make_pair(m_alpha - m_c * (x - m_xp), m_beta - m_c * (y - m_yp)); + } + + // center of the circle (X0,Y0); divides by m_c, so not finite when m_c == 0 + constexpr std::pair center() const { return std::make_pair(m_xp + m_alpha / m_c, m_yp + m_beta / m_c); } + + constexpr auto radius() const { return T(1) / m_c; } + + T m_xp = 0; + T m_yp = 0; + T m_c = 0; + T m_alpha = 0; + T m_beta = 0; +}; + +template +constexpr void CircleEq::compute(T x1, T y1, T x2, T y2, T x3, T y3) { + bool noflip = std::abs(x3 - x1) < std::abs(y3 - y1); + + auto x1p = noflip ? x1 - x2 : y1 - y2; + auto y1p = noflip ? y1 - y2 : x1 - x2; + auto d12 = x1p * x1p + y1p * y1p; + auto x3p = noflip ? x3 - x2 : y3 - y2; + auto y3p = noflip ? y3 - y2 : x3 - x2; + auto d32 = x3p * x3p + y3p * y3p; + + auto num = x1p * y3p - y1p * x3p; // num also gives correct sign for CT + auto det = d12 * y3p - d32 * y1p; + + auto st2 = (d12 * x3p - d32 * x1p); + auto seq = det * det + st2 * st2; + auto al2 = T(1.) 
/ std::sqrt(seq); + auto be2 = -st2 * al2; + auto ct = T(2.) * num * al2; + al2 *= det; + + m_xp = x2; + m_yp = y2; + m_c = noflip ? ct : -ct; + m_alpha = noflip ? al2 : -be2; + m_beta = noflip ? be2 : -al2; +} + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc new file mode 100644 index 0000000000000..bebfe0e08008e --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -0,0 +1,70 @@ +#include "BrokenLineFitOnGPU.h" + +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { + assert(tuples_); + + // Fit internals + auto hitsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU_ = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernel_BLFastFit<3>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + + kernel_BLFit<3>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 3, + offset); + + // fit quads + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + + if (fit5as4_) { + // fit penta (only first 4) + kernel_BLFastFit<4>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + kernel_BLFit<4>(tupleMultiplicity_, + 
bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } else { + // fit penta (all 5) + kernel_BLFastFit<5>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + + kernel_BLFit<5>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu new file mode 100644 index 0000000000000..d2ca583e86bd0 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -0,0 +1,85 @@ +#include "BrokenLineFitOnGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + + // Fit internals + auto hitsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + kernel_BLFastFit<3><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<3><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + 
fast_fit_resultsGPU_.get(), + 3, + offset); + cudaCheck(cudaGetLastError()); + + // fit quads + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 4, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // fit penta (only first 4) + kernel_BLFastFit<4><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<4><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + cudaCheck(cudaGetLastError()); + } else { + // fit penta (all 5) + kernel_BLFastFit<5><<>>( + tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<5><<>>(tupleMultiplicity_, + bField_, + outputSoa_, + hitsGPU_.get(), + hits_geGPU_.get(), + fast_fit_resultsGPU_.get(), + 5, + offset); + cudaCheck(cudaGetLastError()); + } + + } // loop on concurrent fits +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h new file mode 100644 index 0000000000000..ee5065e81fc45 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -0,0 +1,184 @@ +// +// Author: Felice Pantaleo, CERN +// + +// #define BROKENLINE_DEBUG + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include 
"RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" + +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +// #define BL_DUMP_HITS + +template +__global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, + caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(hhp); + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef BROKENLINE_DEBUG + if (0 == local_start) { + printf("%d total Ntuple\n", foundNtuplets->nbins()); + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + } +#endif + + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + +#ifdef BL_DUMP_HITS + __shared__ int done; + done = 0; + __syncthreads(); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); +#endif + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float 
ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); +#ifdef BL_DUMP_HITS + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + tkid, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + tkid, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); + } +#endif + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + brokenline::fastFit(hits, fast_fit); + + // no NaN here.... + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHits, + uint32_t offset) { + assert(N <= nHits); + + assert(results); + assert(pfast_fit); + + // same as above... 
+ + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + brokenline::PreparedBrokenLineData data; + + brokenline::karimaki_circle_fit circle; + riemannFit::LineFit line; + + brokenline::prepareBrokenLineData(hits, fast_fit, bField, data); + brokenline::lineFit(hits_ge, fast_fit, bField, data, line); + brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); + + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + +#ifdef BROKENLINE_DEBUG + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! 
%f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); +#endif + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index f76451675de59..3a54cd1134bc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,10 +1,14 @@ + + + + + + + - + - - - diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h new file mode 100644 index 0000000000000..5342141d2c9e4 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -0,0 +1,83 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h + +#include + +#include + +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +//#define ONLY_PHICUT + +// Cellular automaton constants +namespace caConstants { + + // constants +#ifdef ONLY_PHICUT + constexpr uint32_t maxCellNeighbors = 64; + constexpr uint32_t maxCellTracks = 64; + constexpr uint32_t maxNumberOfTuples = 48 * 1024; + constexpr uint32_t maxNumberOfDoublets = 2 * 1024 * 1024; + constexpr uint32_t maxCellsPerHit = 8 * 128; +#else // ONLY_PHICUT + constexpr uint32_t maxCellNeighbors = 36; + constexpr uint32_t maxCellTracks = 48; +#ifdef 
GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumberOfTuples = 3 * 1024; + constexpr uint32_t maxNumberOfDoublets = 128 * 1024; + constexpr uint32_t maxCellsPerHit = 128 / 2; +#else // GPU_SMALL_EVENTS + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumberOfTuples = 24 * 1024; + constexpr uint32_t maxNumberOfDoublets = 512 * 1024; + constexpr uint32_t maxCellsPerHit = 128; +#endif // GPU_SMALL_EVENTS +#endif // ONLY_PHICUT + constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; + constexpr uint32_t maxNumberOfQuadruplets = maxNumberOfTuples; + + constexpr uint32_t maxNumberOfLayerPairs = 20; + constexpr uint32_t maxNumberOfLayers = 10; + constexpr uint32_t maxTuples = maxNumberOfTuples; + + // Modules constants + constexpr uint32_t max_ladder_bpx0 = 12; + constexpr uint32_t first_ladder_bpx0 = 0; + constexpr float module_length_bpx0 = 6.7f; + constexpr float module_tolerance_bpx0 = 0.4f; // projection to cylinder is inaccurate on BPIX1 + constexpr uint32_t max_ladder_bpx4 = 64; + constexpr uint32_t first_ladder_bpx4 = 84; + constexpr float radius_even_ladder = 15.815f; + constexpr float radius_odd_ladder = 16.146f; + constexpr float module_length_bpx4 = 6.7f; + constexpr float module_tolerance_bpx4 = 0.2f; + constexpr float barrel_z_length = 26.f; + constexpr float forward_z_begin = 32.f; + + // Last indexes + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + + // types + using hindex_type = uint32_t; // FIXME from siPixelRecHitsHeterogeneousProduct + using tindex_type = uint16_t; // for tuples + + using CellNeighbors = cms::cuda::VecArray; + using CellTracks = cms::cuda::VecArray; + + using CellNeighborsVector = cms::cuda::SimpleVector; + using CellTracksVector = cms::cuda::SimpleVector; + + using OuterHitOfCell = cms::cuda::VecArray; + using TuplesContainer = cms::cuda::OneToManyAssoc; + using HitToTuple = + cms::cuda::OneToManyAssoc; // 3.5 
should be enough + using TupleMultiplicity = cms::cuda::OneToManyAssoc; + +} // namespace caConstants + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc new file mode 100644 index 0000000000000..beba54c33f513 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -0,0 +1,83 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +#include "CAHitNtupletGeneratorOnGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +class CAHitNtupletCUDA : public edm::global::EDProducer<> { +public: + explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); + ~CAHitNtupletCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& 
iSetup) const override; + + bool onGPU_; + + edm::EDGetTokenT> tokenHitGPU_; + edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT tokenHitCPU_; + edm::EDPutTokenT tokenTrackCPU_; + + CAHitNtupletGeneratorOnGPU gpuAlgo_; +}; + +CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) + : onGPU_(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { + if (onGPU_) { + tokenHitGPU_ = + consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackGPU_ = produces>(); + } else { + tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackCPU_ = produces(); + } +} + +void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("onGPU", true); + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); + + CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); + descriptions.add("caHitNtupletCUDA", desc); +} + +void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { + auto bf = 1. 
/ PixelRecoUtilities::fieldInInvGev(es); + + if (onGPU_) { + auto hHits = iEvent.getHandle(tokenHitGPU_); + + cms::cuda::ScopedContextProduce ctx{*hHits}; + auto const& hits = ctx.get(*hHits); + + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + } else { + auto const& hits = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + } +} + +DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc new file mode 100644 index 0000000000000..c4b8a5a54847f --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -0,0 +1,184 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + +template <> +void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { + kernel_printCounters(counters); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { + kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... 
+ // overkill to use template here (std::make_unique would suffice) + // device_isOuterHitOfCell_ = Traits:: template make_unique(cs, std::max(1U,nhits), stream); + device_isOuterHitOfCell_.reset( + (GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell))); + assert(device_isOuterHitOfCell_.get()); + + cellStorage_.reset((unsigned char *)malloc(caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks))); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); + + gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); + + // device_theCells_ = Traits:: template make_unique(cs, m_params.maxNumberOfDoublets_, stream); + device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * params_.maxNumberOfDoublets_)); + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!params_.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (params_.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto *tuples_d 
= &tracks_d->hitIndices; + auto *quality_d = tracks_d->qualityData(); + + assert(tuples_d && quality_d); + + // zero tuples + cms::cuda::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying combinatorial cleaning such as fishbone at this stage is too expensive + // + + kernel_connect(device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_isOuterHitOfCell_.get(), + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); + + if (nhits > 1 && params_.earlyFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + } + + kernel_find_ntuplets(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_.get(), + tuples_d, + device_hitTuple_apc_, + quality_d, + params_.minHitsPerNtuplet_); + if (params_.doStats_) + kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); + + cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); + + kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + + if (nhits > 1 && params_.lateFishbone_) { + gpuPixelDoublets::fishbone( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + } + + if (params_.doStats_)
{ + kernel_checkOverflows(tuples_d, + device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + params_.maxNumberOfDoublets_, + counters_); + } +} + +template <> +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = tracks_d->qualityData(); + + // classify tracks based on kinematics + kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d); + + if (params_.lateFishbone_) { + // apply fishbone cleaning to good tracks + kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d); + } + + // remove duplicates (tracks that share a doublet) + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + + // fill hit->track "map" + kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + + // remove duplicates (tracks that share a hit) + kernel_tripletCleaner(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + + if (params_.doStats_) { + // counters (add flag???) 
+ kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_); + kernel_doStatsForTracks(tuples_d, quality_d, counters_); + } + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu new file mode 100644 index 0000000000000..96639e98939f9 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -0,0 +1,308 @@ +#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + +template <> +void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { + auto blockSize = 128; + auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + kernel_fillHitDetIndices<<>>( + &tracks_d->hitIndices, hv, &tracks_d->detIndices); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! 
+ auto *tuples_d = &tracks_d->hitIndices; + auto *quality_d = tracks_d->qualityData(); + + // zero tuples + cms::cuda::launchZero(tuples_d, cudaStream); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying combinatorial cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + auto numberOfBlocks = nDoubletBlocks(blockSize); + auto rescale = numberOfBlocks / 65536; // enlarge the blocks when the block count would reach 65536 (asserted below) + blockSize *= (rescale + 1); + numberOfBlocks = nDoubletBlocks(blockSize); + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); // blocks are laid out along y + dim3 thrs(stride, blockSize, 1); // x carries `stride` threads, y carries `blockSize` threads + + kernel_connect<<>>( + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_isOuterHitOfCell_.get(), + params_.hardCurvCut_, + params_.ptmin_, + params_.CAThetaCutBarrel_, + params_.CAThetaCutForward_, + params_.dcaCutInnerTriplet_, + params_.dcaCutOuterTriplet_); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && params_.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + gpuPixelDoublets::fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + cudaCheck(cudaGetLastError()); + } + + blockSize = 64; + numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_.get(), + tuples_d, + device_hitTuple_apc_, + quality_d, +
params_.minHitsPerNtuplet_); + cudaCheck(cudaGetLastError()); + + if (params_.doStats_) + kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = nDoubletBlocks(blockSize); + kernel_earlyDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + cudaCheck(cudaGetLastError()); + + blockSize = 128; + numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); + kernel_fillMultiplicity<<>>( + tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && params_.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + gpuPixelDoublets::fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + cudaCheck(cudaGetLastError()); + } + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // free space asap + // device_isOuterHitOfCell_.reset(); +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we 
can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); + assert(device_isOuterHitOfCell_.get()); + + cellStorage_ = cms::cuda::make_device_unique( + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), + stream); + device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); + device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * + sizeof(GPUCACell::CellNeighbors)); + + { + int threadsPerBlock = 128; + // at least one block! + int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_.get(), + device_theCellNeighborsContainer_, + device_theCellTracks_.get(), + device_theCellTracksContainer_); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs = gpuPixelDoublets::nPairs; + if (!params_.includeJumpingForwardDoublets_) + nActualPairs = 15; + if (params_.minHitsPerNtuplet_ > 3) { + nActualPairs = 13; + } + + assert(nActualPairs <= gpuPixelDoublets::nPairs); + int stride = 4; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + 
params_.idealConditions_, + params_.doClusterCut_, + params_.doZ0Cut_, + params_.doPtCut_, + params_.maxNumberOfDoublets_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { + // these are pointer on GPU! + auto const *tuples_d = &tracks_d->hitIndices; + auto *quality_d = tracks_d->qualityData(); + + auto blockSize = 64; + + // classify tracks based on kinematics + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_classifyTracks<<>>(tuples_d, tracks_d, params_.cuts_, quality_d); + cudaCheck(cudaGetLastError()); + + if (params_.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = nDoubletBlocks(blockSize); + kernel_fishboneCleaner<<>>( + device_theCells_.get(), device_nCells_, quality_d); + cudaCheck(cudaGetLastError()); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = nDoubletBlocks(blockSize); + kernel_fastDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + cudaCheck(cudaGetLastError()); + + if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) { + // fill hit->track "map" + numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countHitInTracks<<>>( + tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + if (params_.minHitsPerNtuplet_ < 4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_tripletCleaner<<>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + + if 
(params_.doStats_) { + auto nhits = hh.nHits(); + numberOfBlocks = (std::max(nhits, params_.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitToTuple_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_.get(), + device_theCellTracks_.get(), + device_isOuterHitOfCell_.get(), + nhits, + params_.maxNumberOfDoublets_, + counters_); + cudaCheck(cudaGetLastError()); + } + + if (params_.doStats_) { + // counters (add flag???) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); + cudaCheck(cudaGetLastError()); + numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); +#endif +} + +template <> +void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { + kernel_printCounters<<<1, 1>>>(counters); +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h new file mode 100644 index 0000000000000..d1a9f3d13a67f --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -0,0 +1,223 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "GPUCACell.h" + +// #define DUMP_GPU_TK_TUPLES + 
+namespace cAHitNtupletGenerator { + + // counters + struct Counters { + unsigned long long nEvents; + unsigned long long nHits; + unsigned long long nCells; + unsigned long long nTuples; + unsigned long long nFitTracks; + unsigned long long nGoodTracks; + unsigned long long nUsedHits; + unsigned long long nDupHits; + unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; + }; + + using HitsView = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAView; + + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + struct QualityCuts { + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + }; + + // params + struct Params { + Params(bool onGPU, + uint32_t minHitsPerNtuplet, + uint32_t maxNumberOfDoublets, + bool useRiemannFit, + bool fit5as4, + bool includeJumpingForwardDoublets, + bool earlyFishbone, + bool lateFishbone, + bool idealConditions, + bool doStats, + bool doClusterCut, + bool doZ0Cut, + bool doPtCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float hardCurvCut, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet, + QualityCuts const& cuts) + : onGPU_(onGPU), + minHitsPerNtuplet_(minHitsPerNtuplet), + maxNumberOfDoublets_(maxNumberOfDoublets), + useRiemannFit_(useRiemannFit), + fit5as4_(fit5as4), + includeJumpingForwardDoublets_(includeJumpingForwardDoublets), + earlyFishbone_(earlyFishbone), + lateFishbone_(lateFishbone), + idealConditions_(idealConditions), + doStats_(doStats), + doClusterCut_(doClusterCut), + 
doZ0Cut_(doZ0Cut), + doPtCut_(doPtCut), + ptmin_(ptmin), + CAThetaCutBarrel_(CAThetaCutBarrel), + CAThetaCutForward_(CAThetaCutForward), + hardCurvCut_(hardCurvCut), + dcaCutInnerTriplet_(dcaCutInnerTriplet), + dcaCutOuterTriplet_(dcaCutOuterTriplet), + cuts_(cuts) {} + + const bool onGPU_; + const uint32_t minHitsPerNtuplet_; + const uint32_t maxNumberOfDoublets_; + const bool useRiemannFit_; + const bool fit5as4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; + const bool doClusterCut_; + const bool doZ0Cut_; + const bool doPtCut_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + + // quality cuts + QualityCuts cuts_{// polynomial coefficients for the pT-dependent chi2 cut + {0.68177776, 0.74609577, -0.08035491, 0.00315399}, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + }}; + + }; // Params + +} // namespace cAHitNtupletGenerator + +template +class CAHitNtupletGeneratorKernels { +public: + using Traits = TTraits; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + + template + using unique_ptr = typename Traits::template unique_ptr; + + using HitsView = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DHeterogeneous; + + using HitToTuple = caConstants::HitToTuple; + using TupleMultiplicity = caConstants::TupleMultiplicity; + + using Quality = 
pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + CAHitNtupletGeneratorKernels(Params const& params) + : params_(params), paramsMaxDoubletes3Quarters_(3 * params.maxNumberOfDoublets_ / 4) {} + ~CAHitNtupletGeneratorKernels() = default; + + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } + + void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + + void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + + void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); + + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void allocateOnGPU(cudaStream_t stream); + void cleanup(cudaStream_t cudaStream); + + static void printCounters(Counters const* counters); + void setCounters(Counters* counters) { counters_ = counters; } + +private: + Counters* counters_ = nullptr; + + // workspace + unique_ptr cellStorage_; + unique_ptr device_theCellNeighbors_; + caConstants::CellNeighbors* device_theCellNeighborsContainer_; + unique_ptr device_theCellTracks_; + caConstants::CellTracks* device_theCellTracksContainer_; + + unique_ptr device_theCells_; + unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; + + unique_ptr device_hitToTuple_; + cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + + cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; + + unique_ptr device_tupleMultiplicity_; + + unique_ptr device_storage_; + // params + Params const& params_; + /// Intermediate result avoiding repeated computations. + const uint32_t paramsMaxDoubletes3Quarters_; + /// Compute the number of doublet blocks for block size + inline uint32_t nDoubletBlocks(uint32_t blockSize) { + // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. 
+ return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; + } + + /// Compute the number of quadruplet blocks for block size + inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + } +}; + +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc new file mode 100644 index 0000000000000..96381673388ca --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -0,0 +1 @@ +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu new file mode 100644 index 0000000000000..96381673388ca --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu @@ -0,0 +1 @@ +#include "CAHitNtupletGeneratorKernelsAlloc.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h new file mode 100644 index 0000000000000..1d19aa43d6e1b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -0,0 +1,35 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "CAHitNtupletGeneratorKernels.h" + +template <> +#ifdef __CUDACC__ +void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(cudaStream_t stream) { +#else +void 
CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { +#endif + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + ////////////////////////////////////////////////////////// + + device_theCellNeighbors_ = Traits::template make_unique(stream); + device_theCellTracks_ = Traits::template make_unique(stream); + + device_hitToTuple_ = Traits::template make_unique(stream); + + device_tupleMultiplicity_ = Traits::template make_unique(stream); + + device_storage_ = Traits::template make_unique(3, stream); + + device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; + device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + + if constexpr (std::is_same::value) { + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); + } else { + *device_nCells_ = 0; + } + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... 
+} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h new file mode 100644 index 0000000000000..7c0cec51b8057 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -0,0 +1,593 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +// #define NTUPLE_DEBUG + +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "CAConstants.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "gpuFishbone.h" +#include "gpuPixelDoublets.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnCPU = TrackingRecHit2DCUDA; + +using HitToTuple = caConstants::HitToTuple; +using TupleMultiplicity = caConstants::TupleMultiplicity; + +using Quality = pixelTrack::Quality; +using TkSoA = pixelTrack::TrackSoA; +using HitContainer = pixelTrack::HitContainer; + +__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + caConstants::TupleMultiplicity const *tupleMultiplicity, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple, + cms::cuda::AtomicPairCounter *apc, + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, + gpuPixelDoublets::CellTracksVector const *cellTracks, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + uint32_t nHits, + uint32_t maxNumberOfDoublets, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == first) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + 
atomicAdd(&c.nFitTracks, tupleMultiplicity->size()); + } + +#ifdef NTUPLE_DEBUG + if (0 == first) { + printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits); + if (apc->get().m < caConstants::maxNumberOfQuadruplets()) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } + } + + for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(*ih < nHits); + } +#endif + + if (0 == first) { + if (apc->get().m >= caConstants::maxNumberOfQuadruplets) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow\n"); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + } + + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) + atomicAdd(&c.nKilledCells, 1); + if (thisCell.unused()) + atomicAdd(&c.nEmptyCells, 1); + if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id())) + atomicAdd(&c.nZeroTrackCells, 1); + } + + for (int idx = first, nt = nHits; idx < nt; idx += gridDim.x * blockDim.x) { + if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } +} + 
+__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) { + constexpr auto bad = pixelTrack::Quality::bad; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (!thisCell.isKilled()) + continue; + + for (auto it : thisCell.tracks()) + quality[it] = bad; + } +} + +__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { + // constexpr auto bad = trackQuality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + // constexpr auto loose = trackQuality::loose; + + assert(nCells); + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + + if (thisCell.tracks().size() < 2) + continue; + //if (0==thisCell.theUsed) continue; + // if (thisCell.theDoubletId < 0) continue; + + uint32_t maxNh = 0; + + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + } +} + +__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + HitContainer const *__restrict__ foundNtuplets, + TkSoA *__restrict__ tracks) { + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + constexpr auto loose = pixelTrack::Quality::loose; + + assert(nCells); + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() < 2) + 
continue; + // if (thisCell.theDoubletId < 0) continue; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min score + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } + } +} + +__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, + cms::cuda::AtomicPairCounter *apc2, // just to zero them, + GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *cells, + uint32_t const *__restrict__ nCells, + gpuPixelDoublets::CellNeighborsVector *cellNeighbors, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + float hardCurvCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet) { + auto const &hh = *hhp; + + auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (firstCellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + + auto ro = thisCell.outer_r(hh); + auto zo = thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex; + + for (int j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto &oc = cells[otherCell]; + 
auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = GPUCACell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.setUsedBit(1); + oc.setUsedBit(1); + } + } // loop on inner cells + } // loop on outer cells +} + +__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells, + gpuPixelDoublets::CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + cms::cuda::AtomicPairCounter *apc, + Quality *__restrict__ quality, + unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto const &thisCell = cells[idx]; + if (thisCell.isKilled()) + continue; // cut by earlyFishbone + + auto pid = thisCell.layerPairId(); + auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets(hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + assert(stack.empty()); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } +} + +__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { + auto first = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.setUsedBit(2); + } +} + +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::dup) + continue; + assert(quality[it] == pixelTrack::Quality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); + tupleMultiplicity->countDirect(nhits); + } +} + +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + continue; + if (quality[it] == pixelTrack::Quality::dup) + continue; + assert(quality[it] == pixelTrack::Quality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + assert(nhits < 8); + tupleMultiplicity->fillDirect(nhits, it); + } +} + +__global__ void 
kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const *__restrict__ tracks, + CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, + Quality *__restrict__ quality) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int it = first, nt = tuples->nbins(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tuples->size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (quality[it] == pixelTrack::Quality::dup) + continue; + + assert(quality[it] == pixelTrack::Quality::bad); + + // mark doublets as bad + if (nhits < 3) + continue; + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); +#endif + continue; + } + + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f * tracks->chi2(it) >= chi2Cut) { +#ifdef NTUPLE_DEBUG + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); +#endif + continue; + } + + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see 
CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) + quality[it] = pixelTrack::Quality::loose; + } +} + +__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] != pixelTrack::Quality::loose) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } +} + +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + if (quality[idx] != pixelTrack::Quality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(*h); + } +} + +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + break; // guard + if (quality[idx] != pixelTrack::Quality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(*h, idx); + } +} + +__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + TrackingRecHit2DSOAView const 
*__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const &hh = *hhp; + auto nhits = hh.nHits(); + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->bins[idx] < nhits); + hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } +} + +__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, + CAHitNtupletGeneratorKernelsGPU::Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } +} + +__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = pixelTrack::Quality::bad; + constexpr auto dup = pixelTrack::Quality::dup; + // constexpr auto loose = trackQuality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = 
foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh > 3) + continue; + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + } // loop over hits +} + +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const *__restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, + int iev) { + auto const &foundNtuplets = *ptuples; + auto const &tracks = *ptracks; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i < np; i += blockDim.x * gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh < 3) + continue; + printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", + 10000 * iev + i, + int(quality[i]), + nh, + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), + // asinhf(fit_results[i].par(3)), + tracks.chi2(i), + *foundNtuplets.begin(i), + *(foundNtuplets.begin(i) + 1), + *(foundNtuplets.begin(i) + 2), + nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1, + nh > 4 ? 
int(*(foundNtuplets.begin(i) + 4)) : -1); + } +} + +__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | " + "nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nGoodTracks, + c.nFitTracks, + c.nUsedHits, + c.nDupHits, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nKilledCells / double(c.nEvents), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc new file mode 100644 index 0000000000000..c2c7c2b869752 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -0,0 +1,229 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDMException.h" +#include "FWCore/Utilities/interface/isFinite.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include 
"TrackingTools/DetLayers/interface/BarrelDetLayer.h" + +#include "CAHitNtupletGeneratorOnGPU.h" + +namespace { + + template + T sqr(T x) { + return x * x; + } + + cAHitNtupletGenerator::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + if (coeff.size() != 4) { + throw edm::Exception(edm::errors::Configuration, + "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); + } + return cAHitNtupletGenerator::QualityCuts{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], (float)coeff[2], (float)coeff[3]}, + // max pT used to determine the chi2 cut + (float)pset.getParameter("chi2MaxPt"), + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; + } + +} // namespace + +using namespace std; + +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC) + : m_params(cfg.getParameter("onGPU"), + cfg.getParameter("minHitsPerNtuplet"), + cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fit5as4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("idealConditions"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZ0Cut"), + cfg.getParameter("doPtCut"), + cfg.getParameter("ptmin"), + cfg.getParameter("CAThetaCutBarrel"), + cfg.getParameter("CAThetaCutForward"), + cfg.getParameter("hardCurvCut"), + 
cfg.getParameter("dcaCutInnerTriplet"), + cfg.getParameter("dcaCutOuterTriplet"), + makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", + "qual", + "nh", + "charge", + "pt", + "eta", + "phi", + "tip", + "zip", + "chi2", + "h1", + "h2", + "h3", + "h4", + "h5"); +#endif + + if (m_params.onGPU_) { + // allocate the counters in device memory (cudaMalloc) only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + } + } else { + m_counters = new Counters(); + memset(m_counters, 0, sizeof(Counters)); + } +} + +CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU() { + if (m_params.onGPU_) { + // print the gpu statistics and free the device memory holding the counters only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + if (m_params.doStats_) { + // crash on multi-gpu processes + CAHitNtupletGeneratorKernelsGPU::printCounters(m_counters); + } + cudaFree(m_counters); + } + } else { + if (m_params.doStats_) { + CAHitNtupletGeneratorKernelsCPU::printCounters(m_counters); + } + delete m_counters; + } +} + +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc) { + // 87 cm/GeV = 1/(3.8T * 0.3) + // take less than radius given by the hardPtCut and reject everything below + // auto hardCurvCut = 1.f/(0.35 * 87.f); + desc.add("ptmin", 0.9f)->setComment("Cut on minimum pt"); + desc.add("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel"); + desc.add("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward"); + desc.add("hardCurvCut", 1.f / (0.35 * 87.f))->setComment("Cut on minimum curvature"); + desc.add("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1"); + desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit 
is on BPix1"); + desc.add("earlyFishbone", true); + desc.add("lateFishbone", false); + desc.add("idealConditions", true); + desc.add("fillStatistics", false); + desc.add("minHitsPerNtuplet", 4); + desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); + desc.add("includeJumpingForwardDoublets", false); + desc.add("fit5as4", true); + desc.add("doClusterCut", true); + desc.add("doZ0Cut", true); + desc.add("doPtCut", true); + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", {0.68177776, 0.74609577, -0.08035491, 0.00315399}) + ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut"); + trackQualityCuts.add("chi2Scale", 30.) + ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann " + "fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); +} + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cudaStream_t stream) const { + 
PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); + + auto* soa = tracks.get(); + + CAHitNtupletGeneratorKernelsGPU kernels(m_params); + kernels.setCounters(m_counters); + + kernels.allocateOnGPU(stream); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream); + kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + } else { + fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + } + kernels.classifyTuples(hits_d, soa, stream); + + return tracks; +} + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { + PixelTrackHeterogeneous tracks(std::make_unique()); + + auto* soa = tracks.get(); + assert(soa); + + CAHitNtupletGeneratorKernelsCPU kernels(m_params); + kernels.setCounters(m_counters); + kernels.allocateOnGPU(nullptr); + + kernels.buildDoublets(hits_d, nullptr); + kernels.launchKernels(hits_d, soa, nullptr); + kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" + + if (0 == hits_d.nHits()) + return tracks; + + // now fit + HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + } else { + fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + } + + kernels.classifyTuples(hits_d, soa, nullptr); + + return tracks; +} diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h new file mode 100644 index 0000000000000..564a870f54796 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -0,0 +1,65 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h + +#include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" + +#include "CAHitNtupletGeneratorKernels.h" +#include "HelixFitOnGPU.h" + +#include "GPUCACell.h" + +namespace edm { + class Event; + class EventSetup; + class ParameterSetDescription; +} // namespace edm + +class CAHitNtupletGeneratorOnGPU { +public: + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; + + using Quality = pixelTrack::Quality; + using OutputSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + using Tuple = HitContainer; + + using QualityCuts = cAHitNtupletGenerator::QualityCuts; + using Params = cAHitNtupletGenerator::Params; + using Counters = cAHitNtupletGenerator::Counters; + +public: + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) + : CAHitNtupletGeneratorOnGPU(cfg, iC) {} + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); + + ~CAHitNtupletGeneratorOnGPU(); + + static void fillDescriptions(edm::ParameterSetDescription& desc); + static const char* fillDescriptionsLabel() { return 
"caHitNtupletOnGPU"; } + + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const; + + PixelTrackHeterogeneous makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const; + +private: + void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream) const; + + void hitNtuplets(HitsOnCPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + + void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + + Params m_params; + + Counters* m_counters = nullptr; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h new file mode 100644 index 0000000000000..0fd514e26d223 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -0,0 +1,347 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h +#define RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h + +// +// Author: Felice Pantaleo, CERN +// + +// #define ONLY_TRIPLETS_IN_HOLE + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CAConstants.h" + +class GPUCACell { +public: + using PtrAsInt = unsigned long long; + + static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit; + using OuterHitOfCell = caConstants::OuterHitOfCell; + using CellNeighbors = caConstants::CellNeighbors; + using CellTracks = caConstants::CellTracks; + using CellNeighborsVector = caConstants::CellNeighborsVector; + using CellTracksVector = 
caConstants::CellTracksVector; + + using Hits = TrackingRecHit2DSOAView; + using hindex_type = Hits::hindex_type; + + using TmpTuple = cms::cuda::VecArray; + + using HitContainer = pixelTrack::HitContainer; + using Quality = pixelTrack::Quality; + static constexpr auto bad = pixelTrack::Quality::bad; + + GPUCACell() = default; + + __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, + CellTracksVector& cellTracks, + Hits const& hh, + int layerPairId, + int doubletId, + hindex_type innerHitId, + hindex_type outerHitId) { + theInnerHitId = innerHitId; + theOuterHitId = outerHitId; + theDoubletId_ = doubletId; + theLayerPairId_ = layerPairId; + theUsed_ = 0; + + // cache z and r of the inner hit in the cell (optimization that depends on access pattern) + theInnerZ = hh.zGlobal(innerHitId); + theInnerR = hh.rGlobal(innerHitId); + + // link to default empty containers (element 0 of each vector) + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } + + __device__ __forceinline__ int addOuterNeighbor(CellNeighbors::value_t t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(); // maybe wasted.... + if (i > 0) { /* any i <= 0 takes the else branch below and returns -1 */ + cellNeighbors[i].reset(); +#ifdef __CUDACC__ + auto zero = (PtrAsInt)(&cellNeighbors[0]); /* address of the default empty container: the value the CAS expects to find */ + atomicCAS((PtrAsInt*)(&theOuterNeighbors), + zero, + (PtrAsInt)(&cellNeighbors[i])); // if fails we cannot give "i" back... +#else + theOuterNeighbors = &cellNeighbors[i]; +#endif + } else + return -1; + } + __threadfence(); + return outerNeighbors().push_back(t); + } + + __device__ __forceinline__ int addTrack(CellTracks::value_t t, CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(); // maybe wasted.... + if (i > 0) { /* any i <= 0 takes the else branch and returns -1 */ + cellTracks[i].reset(); +#ifdef __CUDACC__ + auto zero = (PtrAsInt)(&cellTracks[0]); /* address of the default empty container: the value the CAS expects to find */ + atomicCAS((PtrAsInt*)(&theTracks), zero, (PtrAsInt)(&cellTracks[i])); // if fails we cannot give "i" back...
+#else + theTracks = &cellTracks[i]; +#endif + } else + return -1; + } + __threadfence(); + return tracks().push_back(t); + } + + __device__ __forceinline__ CellTracks& tracks() { return *theTracks; } + __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } + __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } + __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } + __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } /* returns the value cached by init(); hh is not read */ + // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } /* returns the value cached by init(); hh is not read */ + // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + + __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } + __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + + __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } /* NOTE(review): returned as float, yet fishbone stores the result in a uint16_t array -- confirm float is intended */ + __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } /* NOTE(review): same float return type as inner_detIndex -- confirm */ + + constexpr unsigned int inner_hit_id() const { return theInnerHitId; } + constexpr unsigned int outer_hit_id() const { return theOuterHitId; } + +
__device__ void print_cell() const { + printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", + theDoubletId_, + theLayerPairId_, + theInnerHitId, + theOuterHitId); + } + + __device__ bool check_alignment(Hits const& hh, + GPUCACell const& otherCell, + const float ptmin, + const float hardCurvCut, + const float caThetaCutBarrel, + const float caThetaCutForward, + const float dcaCutInnerTriplet, + const float dcaCutOuterTriplet) const { /* true when otherCell's inner hit plus this cell's inner and outer hits pass both areAlignedRZ and dcaCut */ + // detIndex of the layerStart for the Phase1 Pixel Detector: + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + auto ri = inner_r(hh); + auto zi = inner_z(hh); + + auto ro = outer_r(hh); + auto zo = outer_z(hh); + + auto r1 = otherCell.inner_r(hh); + auto z1 = otherCell.inner_z(hh); + auto isBarrel = otherCell.outer_detIndex(hh) < caConstants::last_barrel_detIndex; /* decided from otherCell's outer hit; selects which theta cut is used */ + bool aligned = areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? caThetaCutBarrel : caThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + return (aligned && dcaCut(hh, + otherCell, + otherCell.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ?
dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)); // FIXME tune cuts + } + + __device__ __forceinline__ static bool areAlignedRZ( + float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) { + float radius_diff = std::abs(r1 - ro); + float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); + + float pMin = ptmin * std::sqrt(distance_13_squared); // this needs to be divided by + // radius_diff later + + float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); + return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; /* the division of pMin by radius_diff is applied here as a multiplication of the right-hand side */ + } + + __device__ inline bool dcaCut(Hits const& hh, + GPUCACell const& otherCell, + const float region_origin_radius_plus_tolerance, + const float maxCurv) const { + auto x1 = otherCell.inner_x(hh); + auto y1 = otherCell.inner_y(hh); + + auto x2 = inner_x(hh); + auto y2 = inner_y(hh); + + auto x3 = outer_x(hh); + auto y3 = outer_y(hh); + + CircleEq eq(x1, y1, x2, y2, x3, y3); /* built from otherCell's inner hit (1) and this cell's inner (2) and outer (3) hits */ + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + __device__ __forceinline__ static bool dcaCutH(float x1, + float y1, + float x2, + float y2, + float x3, + float y3, + const float region_origin_radius_plus_tolerance, + const float maxCurv) { /* same test as dcaCut, with the three points passed directly as coordinates */ + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + __device__ inline bool hole0(Hits const& hh, GPUCACell const& innerCell) const { + using caConstants::first_ladder_bpx0; + using caConstants::max_ladder_bpx0; + using caConstants::module_length_bpx0; + using caConstants::module_tolerance_bpx0; + int p = innerCell.inner_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); +
p %= max_ladder_bpx0; + auto il = first_ladder_bpx0 + p; + auto r0 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); + auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); /* z at radius r0 on the straight r-z line through (ri, zi) and (ro, zo) */ + auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0); /* remainder of z_in_ladder after whole multiples of module_length_bpx0 */ + auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0); + return gap; + } + + __device__ inline bool hole4(Hits const& hh, GPUCACell const& innerCell) const { + using caConstants::first_ladder_bpx4; + using caConstants::max_ladder_bpx4; + using caConstants::module_length_bpx4; + using caConstants::module_tolerance_bpx4; + int p = outer_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx4; + auto il = first_ladder_bpx4 + p; + auto r4 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); + auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); /* z at radius r4 on the straight r-z line through (ri, zi) and (ro, zo) */ + auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4); /* remainder of z_in_ladder after whole multiples of module_length_bpx4 */ + auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4); + auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; + auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; + return gap || holeP || holeN; + } + + // trying to free the track building process from hardcoded layers, leaving + // the visit of the graph based on the neighborhood connections between cells.
+ __device__ inline void find_ntuplets(Hits const& hh, + GPUCACell* __restrict__ cells, + CellTracksVector& cellTracks, + HitContainer& foundNtuplets, + cms::cuda::AtomicPairCounter& apc, + Quality* __restrict__ quality, + TmpTuple& tmpNtuplet, + const unsigned int minHitsPerNtuplet, + bool startAt0) const { + // the building process for a track ends if: + // it has no right neighbor + // it has no compatible neighbor + // the ntuplet is then saved if the number of hits it contains is at least + // the threshold (minHitsPerNtuplet) + + tmpNtuplet.push_back_unsafe(theDoubletId_); + assert(tmpNtuplet.size() <= 4); + + bool last = true; + for (unsigned int otherCell : outerNeighbors()) { + if (cells[otherCell].theDoubletId_ < 0) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].find_ntuplets( + hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); + } + if (last) { // if long enough save... + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { /* tmpNtuplet holds cells: k cells carry k + 1 hits, hence the - 1 */ +#ifdef ONLY_TRIPLETS_IN_HOLE + // triplets accepted only pointing to the hole + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) +#endif + { + hindex_type hits[6]; + auto nh = 0U; + for (auto c : tmpNtuplet) { + hits[nh++] = cells[c].theInnerHitId; + } + hits[nh] = theOuterHitId; /* the inner hit of every cell in the chain, then the outer hit of this (last) cell */ + auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); + if (it >= 0) { // if negative is overflow....
+ for (auto c : tmpNtuplet) + cells[c].addTrack(it, cellTracks); + quality[it] = bad; // initialize to bad + } + } + } + } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size() < 4); + } + + // Cell status management: a negative theDoubletId_ marks the cell as killed + __device__ __forceinline__ void kill() { theDoubletId_ = -1; } + __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; } + + __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; } + + __device__ __forceinline__ bool unused() const { return !theUsed_; } + __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; } + +private: + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; + + int32_t theDoubletId_; + int16_t theLayerPairId_; + uint16_t theUsed_; // tbd + + float theInnerZ; + float theInnerR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc new file mode 100644 index 0000000000000..880bdb47dfb5c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -0,0 +1,16 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HelixFitOnGPU.h" + +void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, + TupleMultiplicity const *tupleMultiplicity, + OutputSoA *helix_fit_results) { /* only stores the three caller-provided pointers and asserts they are non-null; no memory is allocated here */ + tuples_ = tuples; + tupleMultiplicity_ = tupleMultiplicity; + outputSoa_ = helix_fit_results; + + assert(tuples_); + assert(tupleMultiplicity_); + assert(outputSoa_); +} + +void HelixFitOnGPU::deallocateOnGPU() {} /* nothing to release: allocateOnGPU only stores pointers passed in by the caller */ diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h new file mode 100644 index 0000000000000..938994840f8c0 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -0,0 +1,68 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h
+#define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" + +#include "CAConstants.h" + +namespace riemannFit { + // in case of memory issue can be made smaller + constexpr uint32_t maxNumberOfConcurrentFits = caConstants::maxNumberOfTuples; + constexpr uint32_t stride = maxNumberOfConcurrentFits; /* used in the Eigen::Stride arguments of the Map aliases below */ + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride, stride> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride, stride> >; + // fast fit + using Map4d = Eigen::Map >; + +} // namespace riemannFit + +class HelixFitOnGPU { +public: + using HitsView = TrackingRecHit2DSOAView; + + using Tuples = pixelTrack::HitContainer; + using OutputSoA = pixelTrack::TrackSoA; + + using TupleMultiplicity = caConstants::TupleMultiplicity; + + explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} + ~HelixFitOnGPU() { deallocateOnGPU(); } + + void setBField(double bField) { bField_ = bField; } /* the double argument is stored in the float member bField_ */ + void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + + void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void
deallocateOnGPU(); + +private: + static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits; + + // forwarded: pointers set by allocateOnGPU + Tuples const *tuples_ = nullptr; + TupleMultiplicity const *tupleMultiplicity_ = nullptr; + OutputSoA *outputSoa_; /* NOTE(review): no default initializer, unlike the two pointers above */ + float bField_; + + const bool fit5as4_; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc new file mode 100644 index 0000000000000..491dd0df2004f --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -0,0 +1,113 @@ +#include "RiemannFitOnGPU.h" + +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples) { + assert(tuples_); + + // Fit internals: host scratch buffers sized for maxNumberOfConcurrentFits_ fits + auto hitsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); + auto hits_geGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); + auto fast_fit_resultsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + auto circle_fit_resultsGPU_holder = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit)); + riemannFit::CircleFit *circle_fit_resultsGPU = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets: the kernels are called as plain functions here (no launch configuration) + kernel_FastFit<3>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + + kernel_CircleFit<3>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + kernel_LineFit<3>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), +
circle_fit_resultsGPU, + offset); + + // quads + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + + kernel_CircleFit<4>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + kernel_LineFit<4>(tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + if (fit5as4_) { + // penta: the 5-hit multiplicity bin is processed with the 4-hit kernel instantiation + kernel_FastFit<4>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + + kernel_CircleFit<4>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + kernel_LineFit<4>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + } else { + // penta all 5: the 5-hit multiplicity bin is processed with the 5-hit kernel instantiation + kernel_FastFit<5>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + + kernel_CircleFit<5>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + + kernel_LineFit<5>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU, + offset); + } + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu new file mode 100644 index 0000000000000..90af2ac13730b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -0,0 +1,131 @@ +#include "RiemannFitOnGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, + uint32_t nhits, + uint32_t
maxNumberOfTuples, + cudaStream_t stream) { + assert(tuples_); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; /* maxNumberOfConcurrentFits_ / blockSize, rounded up */ + + // Fit internals: scratch buffers obtained from cms::cuda::make_device_unique on `stream` + auto hitsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); + auto hits_geGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); + auto fast_fit_resultsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + auto circle_fit_resultsGPU_holder = + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit), stream); + riemannFit::CircleFit *circle_fit_resultsGPU_ = (riemannFit::CircleFit *)(circle_fit_resultsGPU_holder.get()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets; every launch is followed by cudaCheck(cudaGetLastError()) + kernel_FastFit<3><<>>( + tuples_, tupleMultiplicity_, 3, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + cudaCheck(cudaGetLastError()); + + kernel_CircleFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernel_LineFit<3><<>>(tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + // quads + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 4, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + cudaCheck(cudaGetLastError()); + + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 4, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernel_LineFit<4><<>>(tupleMultiplicity_, + 4, + bField_, +
outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + if (fit5as4_) { + // penta: the 5-hit multiplicity bin is processed with the 4-hit kernel instantiation + kernel_FastFit<4><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + cudaCheck(cudaGetLastError()); + + kernel_CircleFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernel_LineFit<4><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } else { + // penta all 5: the 5-hit multiplicity bin is processed with the 5-hit kernel instantiation + kernel_FastFit<5><<>>( + tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset); + cudaCheck(cudaGetLastError()); + + kernel_CircleFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + + kernel_LineFit<5><<>>(tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + circle_fit_resultsGPU_, + offset); + cudaCheck(cudaGetLastError()); + } + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h new file mode 100644 index 0000000000000..5b661bc3be028 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -0,0 +1,187 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include
"RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +#include "HelixFitOnGPU.h" + +using HitsOnGPU = TrackingRecHit2DSOAView; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; + +template +__global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, + caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + HitsOnGPU const *__restrict__ hhp, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) { + constexpr uint32_t hitsInFit = N; + + assert(hitsInFit <= nHits); + + assert(pfast_fit); + assert(foundNtuplets); + assert(tupleMultiplicity); + + // look in bin for this hit multiplicity (nHits) + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + +#ifdef RIEMANN_DEBUG + if (0 == local_start) + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); +#endif + + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; /* local_idx addresses the scratch buffers, tuple_idx the entry within the multiplicity bin */ + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + assert(tkid < foundNtuplets->nbins()); + + assert(foundNtuplets->size(tkid) == nHits); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + // Prepare data structure: only the first hitsInFit hits of the tuple are copied + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + float ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0,
hhp->yerrLocal(hit), ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + riemannFit::fastFit(hits, fast_fit); + + // no NaN here.... (x == x is false only for NaN) + assert(fast_fit(0) == fast_fit(0)); + assert(fast_fit(1) == fast_fit(1)); + assert(fast_fit(2) == fast_fit(2)); + assert(fast_fit(3) == fast_fit(3)); + } +} + +template +__global__ void kernel_CircleFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *circle_fit, + uint32_t offset) { + assert(circle_fit); + assert(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); /* per-hit norm of the first two rows of hits */ + + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(hits_ge, hits_cov); + + circle_fit[local_idx] = riemannFit::circleFit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true); + +#ifdef RIEMANN_DEBUG +// auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, +// circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2)); +#endif + } +} + +template +__global__ void
kernel_LineFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + OutputSoA *results, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *__restrict__ circle_fit, + uint32_t offset) { + assert(results); + assert(circle_fit); + assert(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + auto local_start = (blockIdx.x * blockDim.x + threadIdx.x); + for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; + local_idx += gridDim.x * blockDim.x) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it for the ntuple container (one to one to helix); tkid indexes the output SoA below + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + auto const &line_fit = riemannFit::lineFit(hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true); + + riemannFit::fromCircleToPerigee(circle_fit[local_idx]); /* presumably converts circle_fit[local_idx] in place before it is copied out -- TODO confirm */ + + results->stateAtBS.copyFromCircle( + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); + results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); /* sum of both chi2 divided by 2 * N - 5 (presumably the number of degrees of freedom -- TODO confirm) */ + +#ifdef RIEMANN_DEBUG + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, +
circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); +#endif + } +} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h new file mode 100644 index 0000000000000..09cd5c18e65ae --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -0,0 +1,91 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h + +#include +#include +#include +#include +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "GPUCACell.h" + +namespace gpuPixelDoublets { + + // __device__ + // __forceinline__ + __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp, + GPUCACell* cells, + uint32_t const* __restrict__ nCells, + GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell, + uint32_t nHits, + bool checkTrack) { + constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit; + + auto const& hh = *hhp; + + // x runs faster: hits are distributed along y, the cells of one hit along x + auto firstY = threadIdx.y + blockIdx.y * blockDim.y; + auto firstX = threadIdx.x; + + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; /* scratch filled in the loop below: inner-hit offsets from the outer hit (x, y, z) and their squared norm (n) */ + uint16_t d[maxCellsPerHit]; // uint8_t l[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; + + for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) { + auto const& vc = isOuterHitOfCell[idy]; + auto size = vc.size(); + if (size < 2) + continue; + // if aligned kill one of the two. + // in principle one could try to relax the cut (only in r-z?)
for jumping-doublets + auto const& c0 = cells[vc[0]]; + auto xo = c0.outer_x(hh); + auto yo = c0.outer_y(hh); + auto zo = c0.outer_z(hh); + auto sg = 0; + for (int32_t ic = 0; ic < size; ++ic) { + auto& ci = cells[vc[ic]]; + if (ci.unused()) + continue; // for triplets equivalent to next + if (checkTrack && ci.tracks().empty()) + continue; + cc[sg] = vc[ic]; + d[sg] = ci.inner_detIndex(hh); + x[sg] = ci.inner_x(hh) - xo; + y[sg] = ci.inner_y(hh) - yo; + z[sg] = ci.inner_z(hh) - zo; + n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; + ++sg; + } + if (sg < 2) + continue; + // here we parallelize: threads along x take different ic + for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) { + auto& ci = cells[cc[ic]]; + for (auto jc = ic + 1; jc < sg; ++jc) { + auto& cj = cells[cc[jc]]; + // must be different detectors (in the same layer) + // if (d[ic]==d[jc]) continue; + // || l[ic]!=l[jc]) continue; + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; /* dot product of the two offset vectors; with n the squared norms, the test below reads cos^2 >= 0.99999 */ + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) { + // aligned: kill farthest (prefer consecutive layers) + if (n[ic] > n[jc]) { + ci.kill(); + break; + } else { + cj.kill(); + } + } + } //cj + } // ci + } // hits + } +} // namespace gpuPixelDoublets + +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h new file mode 100644 index 0000000000000..6de3f1a51acaa --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -0,0 +1,130 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h +#define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h + +#include "RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h" + +#define CONSTANT_VAR __constant__ + +namespace gpuPixelDoublets { + + constexpr int nPairs = 13 + 2 + 4; + static_assert(nPairs <= caConstants::maxNumberOfLayerPairs); + + // start constants + //
  // clang-format off

  // Flat table of layer pairs: entry 2*k is the inner layer and entry 2*k+1 the outer layer of pair k
  // (this is how doubletsFromHisto indexes it). The number in parentheses on each row is the running
  // count of pairs defined so far.
  CONSTANT_VAR const uint8_t layerPairs[2 * nPairs] = {
      0, 1, 0, 4, 0, 7,              // BPIX1 (3)
      1, 2, 1, 4, 1, 7,              // BPIX2 (5)
      4, 5, 7, 8,                    // FPIX1 (8)
      2, 3, 2, 4, 2, 7, 5, 6, 8, 9,  // BPIX3 & FPIX2 (13)
      0, 2, 1, 3,                    // Jumping Barrel (15)
      0, 5, 0, 8,                    // Jumping Forward (BPIX1,FPIX2)
      4, 6, 7, 9                     // Jumping Forward (19)
  };

  // phi windows expressed in the 16-bit integer phi representation
  constexpr int16_t phi0p05 = 522;  // round(521.52189...) = phi2short(0.05);
  constexpr int16_t phi0p06 = 626;  // round(625.82270...) = phi2short(0.06);
  constexpr int16_t phi0p07 = 730;  // round(730.12648...) = phi2short(0.07);

  // Maximum |delta phi| between inner and outer hit, one entry per layer pair (same order as layerPairs)
  CONSTANT_VAR const int16_t phicuts[nPairs]{phi0p05,
                                             phi0p07,
                                             phi0p07,
                                             phi0p05,
                                             phi0p06,
                                             phi0p06,
                                             phi0p05,
                                             phi0p05,
                                             phi0p06,
                                             phi0p06,
                                             phi0p06,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05,
                                             phi0p05};
  //   phi0p07, phi0p07, phi0p06,phi0p06, phi0p06,phi0p06};  // relaxed cuts

  // Per layer pair: allowed global-z window of the inner hit (minz, maxz) and maximum radial distance
  // between outer and inner hit (maxr), as applied by doubletsFromHisto.
  CONSTANT_VAR float const minz[nPairs] = {
      -20., 0., -30., -22., 10., -30., -70., -70., -22., 15., -30, -70., -70., -20., -22., 0, -30., -70., -70.};
  CONSTANT_VAR float const maxz[nPairs] = {
      20., 30., 0., 22., 30., -10., 70., 70., 22., 30., -15., 70., 70., 20., 22., 30., 0., 70., 70.};
  CONSTANT_VAR float const maxr[nPairs] = {
      20., 9., 9., 20., 7., 7., 5., 5., 20., 6., 6., 5., 5., 20., 20., 9., 9., 9., 9.};

  // end constants
  // clang-format on

  using CellNeighbors = caConstants::CellNeighbors;
  using CellTracks = caConstants::CellTracks;
  using CellNeighborsVector = caConstants::CellNeighborsVector;
  using CellTracksVector = caConstants::CellTracksVector;

  // Prepare the per-event containers used while building doublets.
  //
  // Every thread resets a strided subset of the first nHits entries of isOuterHitOfCell. The thread with
  // global index 0 additionally binds the two vectors to their storage (capacity
  // caConstants::maxNumOfActiveDoublets) and claims and resets element 0 of each.
  // NOTE(review): element 0 presumably serves as the shared "empty" slot for cells without
  // neighbours/tracks - confirm against GPUCACell.
  __global__ void initDoublets(GPUCACell::OuterHitOfCell* isOuterHitOfCell,
                               int nHits,
                               CellNeighborsVector* cellNeighbors,
                               CellNeighbors* cellNeighborsContainer,
                               CellTracksVector* cellTracks,
                               CellTracks* cellTracksContainer) {
    assert(isOuterHitOfCell);
    int first = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = first; i < nHits; i += gridDim.x * blockDim.x)
      isOuterHitOfCell[i].reset();

    // one single thread sets up the two vectors
    if (0 == first) {
      cellNeighbors->construct(caConstants::maxNumOfActiveDoublets, cellNeighborsContainer);
      cellTracks->construct(caConstants::maxNumOfActiveDoublets, cellTracksContainer);
      // the vectors were just constructed, so the first extend() must hand out index 0
      auto i = cellNeighbors->extend();
      assert(0 == i);
      (*cellNeighbors)[0].reset();
      i = cellTracks->extend();
      assert(0 == i);
      (*cellTracks)[0].reset();
    }
  }

  // launch bounds of getDoubletsFromHisto
  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
  constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

  // Kernel entry point of the doublet finder: dereferences the hit view and forwards all arguments to
  // doubletsFromHisto together with the constant tables defined in this file (layerPairs, phicuts,
  // minz, maxz, maxr).
  //
  //   nActualPairs      number of layer pairs (leading entries of layerPairs) to process
  //   ideal_cond, doClusterCut, doZ0Cut, doPtCut   switches passed through unchanged
  //   maxNumOfDoublets  passed through as the upper limit on the number of cells
  __global__
#ifdef __CUDACC__
  __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP)
#endif
      void getDoubletsFromHisto(GPUCACell* cells,
                                uint32_t* nCells,
                                CellNeighborsVector* cellNeighbors,
                                CellTracksVector* cellTracks,
                                TrackingRecHit2DSOAView const* __restrict__ hhp,
                                GPUCACell::OuterHitOfCell* isOuterHitOfCell,
                                int nActualPairs,
                                bool ideal_cond,
                                bool doClusterCut,
                                bool doZ0Cut,
                                bool doPtCut,
                                uint32_t maxNumOfDoublets) {
    auto const& __restrict__ hh = *hhp;
    doubletsFromHisto(layerPairs,
                      nActualPairs,
                      cells,
                      nCells,
                      cellNeighbors,
                      cellTracks,
                      hh,
                      isOuterHitOfCell,
                      phicuts,
                      minz,
                      maxz,
                      maxr,
                      ideal_cond,
                      doClusterCut,
                      doZ0Cut,
                      doPtCut,
                      maxNumOfDoublets);
  }

}  // namespace gpuPixelDoublets
namespace gpuPixelDoublets {

  using CellNeighbors = caConstants::CellNeighbors;
  using CellTracks = caConstants::CellTracks;
  using CellNeighborsVector = caConstants::CellNeighborsVector;
  using CellTracksVector = caConstants::CellTracksVector;

  // Build hit doublets (cells) for the first nPairs layer pairs.
  //
  // Each inner hit of each layer pair is handled by one y-index of the launch grid; the candidate outer
  // hits, taken from the phi histogram of the outer layer, are distributed over threadIdx.x. A candidate
  // that survives all enabled cuts is stored as a new cell and registered in isOuterHitOfCell.
  //
  //   layerPairs        flat (inner, outer) layer indices, two entries per pair
  //   nPairs            number of pairs to process (must not exceed caConstants::maxNumberOfLayerPairs)
  //   cells, nCells     output cells and their counter (incremented atomically)
  //   cellNeighbors, cellTracks   containers handed to GPUCACell::init
  //   hh                hit view (positions, detector index, cluster size, phi histogram)
  //   isOuterHitOfCell  per outer hit, list of the cells ending on it (filled here)
  //   phicuts           per pair, maximum |delta phi| in the 16-bit integer phi representation
  //   minz, maxz        per pair, allowed global z window of the inner hit
  //   maxr              per pair, maximum radial distance between outer and inner hit
  //   ideal_cond        if true every inner ladder is treated as an outer ladder in the cluster cut
  //   doClusterCut, doZ0Cut, doPtCut   enable the corresponding cuts
  //   maxNumOfDoublets  maximum number of cells that may be created
  __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs,
                                                    uint32_t nPairs,
                                                    GPUCACell* cells,
                                                    uint32_t* nCells,
                                                    CellNeighborsVector* cellNeighbors,
                                                    CellTracksVector* cellTracks,
                                                    TrackingRecHit2DSOAView const& __restrict__ hh,
                                                    GPUCACell::OuterHitOfCell* isOuterHitOfCell,
                                                    int16_t const* __restrict__ phicuts,
                                                    float const* __restrict__ minz,
                                                    float const* __restrict__ maxz,
                                                    float const* __restrict__ maxr,
                                                    bool ideal_cond,
                                                    bool doClusterCut,
                                                    bool doZ0Cut,
                                                    bool doPtCut,
                                                    uint32_t maxNumOfDoublets) {
    // ysize cuts (z in the barrel)  times 8
    // these are used if doClusterCut is true
    constexpr int minYsizeB1 = 36;
    constexpr int minYsizeB2 = 28;
    constexpr int maxDYsize12 = 28;
    constexpr int maxDYsize = 20;
    constexpr int maxDYPred = 20;
    constexpr float dzdrFact = 8 * 0.0285 / 0.015;  // from dz/dr to "DY"

    bool isOuterLadder = ideal_cond;

    using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;

    auto const& __restrict__ phiBinner = hh.phiBinner();
    uint32_t const* __restrict__ offsets = hh.hitsLayerStart();
    assert(offsets);

    // number of hits on layer li, from the per-layer start offsets
    auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; };

    // nPairsMax to be optimized later (originally was 64).
    // If it should be much bigger, consider using a block-wide parallel prefix scan,
    // e.g. see  https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
    const int nPairsMax = caConstants::maxNumberOfLayerPairs;
    assert(nPairs <= nPairsMax);
    // inclusive running sum of the inner-layer sizes of the pairs, computed once per block by thread (0,0)
    __shared__ uint32_t innerLayerCumulativeSize[nPairsMax];
    __shared__ uint32_t ntot;
    if (threadIdx.y == 0 && threadIdx.x == 0) {
      innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
      for (uint32_t i = 1; i < nPairs; ++i) {
        innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]);
      }
      ntot = innerLayerCumulativeSize[nPairs - 1];
    }
    // make the shared table visible to all threads of the block before it is read
    __syncthreads();

    // x runs faster
    auto idy = blockIdx.y * blockDim.y + threadIdx.y;
    auto first = threadIdx.x;
    auto stride = blockDim.x;

    // j enumerates (pair, inner hit) combinations; since j only grows, the pair index is never rewound
    uint32_t pairLayerId = 0;  // cannot go backward
    for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) {
      // advance to the first pair whose cumulative size exceeds j (the loop overshoots by one)
      while (j >= innerLayerCumulativeSize[pairLayerId++])
        ;
      --pairLayerId;  // move to lower_bound ??

      assert(pairLayerId < nPairs);
      assert(j < innerLayerCumulativeSize[pairLayerId]);
      assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]);

      uint8_t inner = layerPairs[2 * pairLayerId];
      uint8_t outer = layerPairs[2 * pairLayerId + 1];
      assert(outer > inner);

      // offset of the outer layer's bins inside the phi histogram
      auto hoff = PhiBinner::histOff(outer);

      // index of the inner hit: position within the pair, shifted by the start of the inner layer
      auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1];
      i += offsets[inner];

      // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);

      assert(i >= offsets[inner]);
      assert(i < offsets[inner + 1]);

      // found hit corresponding to our cuda thread, now do the job
      auto mi = hh.detectorIndex(i);
      if (mi > gpuClustering::maxNumModules)
        continue;  // invalid

      /* maybe clever, not effective when zoCut is on
      auto bpos = (mi%8)/4;  // if barrel is 1 for z>0
      auto fpos = (outer>3) & (outer<7);
      if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue;
      */

      auto mez = hh.zGlobal(i);

      // z window of the inner hit for this layer pair
      if (mez < minz[pairLayerId] || mez > maxz[pairLayerId])
        continue;

      // cluster size in y of the inner hit; -1 disables the size-based cuts below
      int16_t mes = -1;  // make compiler happy
      if (doClusterCut) {
        // if ideal treat inner ladder as outer
        if (inner == 0)
          assert(mi < 96);
        isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2;  // only for B1/B2/B3 B4 is opposite, FPIX:noclue...

        // in any case we always test mes>0 ...
        mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1;

        if (inner == 0 && outer > 3)  // B1 and F1
          if (mes > 0 && mes < minYsizeB1)
            continue;                 // only long cluster  (5*8)
        if (inner == 1 && outer > 3)  // B2 and F1
          if (mes > 0 && mes < minYsizeB2)
            continue;
      }
      auto mep = hh.iphi(i);
      auto mer = hh.rGlobal(i);

      // all cuts: true if fails
      constexpr float z0cut = 12.f;      // cm
      constexpr float hardPtCut = 0.5f;  // GeV
      // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
      constexpr float minRadius = hardPtCut * 87.78f;
      constexpr float minRadius2T4 = 4.f * minRadius * minRadius;
      // pt cut: true when outer hit j, at integer phi distance idphi, fails the comparison against
      // minRadius2T4 (derived above from hardPtCut)
      auto ptcut = [&](int j, int16_t idphi) {
        auto r2t4 = minRadius2T4;
        auto ri = mer;
        auto ro = hh.rGlobal(j);
        auto dphi = short2phi(idphi);
        return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri);
      };
      // z0 cut: true when the outer hit is not at larger radius, is radially farther than maxr, or the
      // term |mez * ro - mer * zo| exceeds z0cut * dr
      auto z0cutoff = [&](int j) {
        auto zo = hh.zGlobal(j);
        auto ro = hh.rGlobal(j);
        auto dr = ro - mer;
        return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr;
      };

      // cluster-size compatibility cut: true when the pair fails
      auto zsizeCut = [&](int j) {
        auto onlyBarrel = outer < 4;
        auto so = hh.clusterSizeY(j);
        auto dy = inner == 0 ? maxDYsize12 : maxDYsize;
        // in the barrel cut on difference in size
        // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well)
        // FIXME move pred cut to z0cutoff to optimize loading of and computation ...
        auto zo = hh.zGlobal(j);
        auto ro = hh.rGlobal(j);
        return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy
                          : (inner < 4) && mes > 0 &&
                                std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred;
      };

      auto iphicut = phicuts[pairLayerId];

      // range of phi bins covering [mep - iphicut, mep + iphicut]; the int16_t casts wrap around in phi
      auto kl = PhiBinner::bin(int16_t(mep - iphicut));
      auto kh = PhiBinner::bin(int16_t(mep + iphicut));
      // circular increment over the phi bins
      auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };

#ifdef GPU_DEBUG
      int tot = 0;
      int nmin = 0;
      int tooMany = 0;
#endif

      // khh is one past the last bin, so that bin kh itself is visited
      auto khh = kh;
      incr(khh);
      for (auto kk = kl; kk != khh; incr(kk)) {
#ifdef GPU_DEBUG
        if (kk != kl && kk != kh)
          nmin += phiBinner.size(kk + hoff);
#endif
        auto const* __restrict__ p = phiBinner.begin(kk + hoff);
        auto const* __restrict__ e = phiBinner.end(kk + hoff);
        // the content of the bin is shared among the threads along x
        p += first;
        for (; p < e; p += stride) {
          auto oi = __ldg(p);
          assert(oi >= offsets[outer]);
          assert(oi < offsets[outer + 1]);
          auto mo = hh.detectorIndex(oi);
          if (mo > gpuClustering::maxNumModules)
            continue;  //    invalid

          if (doZ0Cut && z0cutoff(oi))
            continue;

          // exact |delta phi| check (the bin range above is only a pre-selection)
          auto mop = hh.iphi(oi);
          uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop)));
          if (idphi > iphicut)
            continue;

          if (doClusterCut && zsizeCut(oi))
            continue;
          if (doPtCut && ptcut(oi, idphi))
            continue;

          // reserve a slot for the new cell; if the limit is exceeded give the slot back and stop
          // scanning this bin
          auto ind = atomicAdd(nCells, 1);
          if (ind >= maxNumOfDoublets) {
            atomicSub(nCells, 1);
            break;
          }  // move to SimpleVector??
          // int layerPairId, int doubletId, int innerHitId, int outerHitId)
          cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi);
          isOuterHitOfCell[oi].push_back(ind);
#ifdef GPU_DEBUG
          if (isOuterHitOfCell[oi].full())
            ++tooMany;
          ++tot;
#endif
        }
      }
#ifdef GPU_DEBUG
      if (tooMany > 0)
        printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany);
#endif
    }  // loop in block...
  }

}  // namespace gpuPixelDoublets
// Reference circle through three points, stored as centre and radius.
// Used by the test as the straightforward implementation to compare against.
struct OriCircle {
  using T = float;

  float radius = 0;
  float x_center = 0;
  float y_center = 0;

  // Build the circle passing through (x1,y1), (x2,y2) and (x3,y3).
  constexpr OriCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); }

  // Signed distance of closest approach to the origin (negative when the origin is inside the circle).
  constexpr T dca0() const {
    auto const centreDist = std::sqrt(x_center * x_center + y_center * y_center);
    return centreDist - radius;
  }

  // Signed distance of closest approach to the point (x, y).
  constexpr T dca(T x, T y) const {
    auto const dx = x - x_center;
    auto const dy = y - y_center;
    return std::sqrt(dx * dx + dy * dy) - radius;
  }

  // Recompute centre and radius from three points.
  // The points must not be collinear: the determinant is inverted without any check.
  constexpr void compute(T x1, T y1, T x2, T y2, T x3, T y3) {
    // coordinate differences between consecutive points
    auto const dx12 = x1 - x2;
    auto const dy12 = y1 - y2;
    auto const dx23 = x2 - x3;
    auto const dy23 = y2 - y3;

    auto const det = dx12 * dy23 - dx23 * dy12;

    // half differences of the squared distances from the origin, relative to the middle point
    auto const norm2 = x2 * x2 + y2 * y2;
    auto const half12 = (x1 * x1 + y1 * y1 - norm2) * 0.5f;
    auto const half23 = (norm2 - x3 * x3 - y3 * y3) * 0.5f;

    auto const invDet = 1.f / det;

    x_center = (half12 * dy23 - half23 * dy12) * invDet;
    y_center = (half23 * dx12 - half12 * dx23) * invDet;

    // the radius is measured from the middle point
    auto const rx = x2 - x_center;
    auto const ry = y2 - y_center;
    radius = std::sqrt(rx * rx + ry * ry);
  }
};
// Circle through three points in the parameterization
//
//   C * [(X - Xp)^2 + (Y - Yp)^2] - 2 * alpha * (X - Xp) - 2 * beta * (Y - Yp) = 0
//
// where (Xp, Yp) is a point on the circle (here always the first input point), C is the signed
// curvature and (alpha, beta) are the direction cosines of the radial vector at (Xp, Yp).
//
// NOTE(review): the template header and argument list were missing in the text this block was rebuilt
// from; `template <typename T>` is inferred from the use of T in the members.
template <typename T>
class FastCircle {
public:
  // A default-constructed circle has all parameters set to zero until compute() is called.
  FastCircle() = default;

  // Build the circle through (x1,y1), (x2,y2), (x3,y3); see compute().
  FastCircle(T x1, T y1, T x2, T y2, T x3, T y3) { compute(x1, y1, x2, y2, x3, y3); }

  void compute(T x1, T y1, T x2, T y2, T x3, T y3);

  // zero-initialised so that a default-constructed object never exposes indeterminate values
  T m_xp = 0;     // reference point on the circle, x
  T m_yp = 0;     // reference point on the circle, y
  T m_c = 0;      // signed curvature
  T m_alpha = 0;  // direction cosine of the radial vector at the reference point, x
  T m_beta = 0;   // direction cosine of the radial vector at the reference point, y
};

// Compute the circle parameters from three points; the first point becomes the reference point.
//
// The three points must not be collinear: in that case the determinant below is zero and the results
// are not finite (no special handling is attempted).
template <typename T>
void FastCircle<T>::compute(T x1, T y1, T x2, T y2, T x3, T y3) {
  // work in the frame where the roles of x and y are exchanged when the chord 1->3 is mostly along x
  bool flip = std::abs(x3 - x1) > std::abs(y3 - y1);

  // coordinates of points 1 and 3 relative to point 2, and their squared distances from it
  auto x1p = x1 - x2;
  auto y1p = y1 - y2;
  auto d12 = x1p * x1p + y1p * y1p;
  auto x3p = x3 - x2;
  auto y3p = y3 - y2;
  auto d32 = x3p * x3p + y3p * y3p;

  if (flip) {
    std::swap(x1p, y1p);
    std::swap(x3p, y3p);
  }

  auto num = x1p * y3p - y1p * x3p;  // num also gives correct sign for CT
  auto det = d12 * y3p - d32 * y1p;  // zero for collinear points, see the note above
  auto ct = num / det;
  auto sn = det > 0 ? T(1.) : T(-1.);
  auto st2 = (d12 * x3p - d32 * x1p) / det;
  auto seq = T(1.) + st2 * st2;
  auto al2 = sn / std::sqrt(seq);
  auto be2 = -st2 * al2;
  ct *= T(2.) * al2;

  if (flip) {
    // go back to the original frame
    std::swap(x1p, y1p);
    std::swap(al2, be2);
    al2 = -al2;
    be2 = -be2;
    ct = -ct;
  }

  // move the reference point from point 2 to point 1
  m_xp = x1;
  m_yp = y1;
  m_c = ct;
  m_alpha = al2 - ct * x1p;
  m_beta = be2 - ct * y1p;
}
edm::EventSetup& iSetup) const override; + + bool m_OnGPU; + + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; + + const gpuVertexFinder::Producer m_gpuAlgo; + + // Tracking cuts before sending tracks to vertex algo + const float m_ptMin; +}; + +PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) + : m_OnGPU(conf.getParameter("onGPU")), + m_gpuAlgo(conf.getParameter("oneKernel"), + conf.getParameter("useDensity"), + conf.getParameter("useDBSCAN"), + conf.getParameter("useIterative"), + conf.getParameter("minT"), + conf.getParameter("eps"), + conf.getParameter("errmax"), + conf.getParameter("chi2max")), + m_ptMin(conf.getParameter("PtMin")) // 0.5 GeV +{ + if (m_OnGPU) { + tokenGPUTrack_ = + consumes>(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces(); + } else { + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUVertex_ = produces(); + } +} + +void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + // Only one of these three algos can be used at once. 
+ // Maybe this should become a Plugin Factory + desc.add("onGPU", true); + desc.add("oneKernel", true); + desc.add("useDensity", true); + desc.add("useDBSCAN", false); + desc.add("useIterative", false); + + desc.add("minT", 2); // min number of neighbours to be "core" + desc.add("eps", 0.07); // max absolute distance to cluster + desc.add("errmax", 0.01); // max error to be "seed" + desc.add("chi2max", 9.); // max normalized distance to cluster + + desc.add("PtMin", 0.5); + desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA")); + + auto label = "pixelVertexCUDA"; + descriptions.add(label, desc); +} + +void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + if (m_OnGPU) { + edm::Handle> hTracks; + iEvent.getByToken(tokenGPUTrack_, hTracks); + + cms::cuda::ScopedContextProduce ctx{*hTracks}; + auto const* tracks = ctx.get(*hTracks).get(); + + assert(tracks); + + ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin)); + + } else { + auto const* tracks = iEvent.get(tokenCPUTrack_).get(); + assert(tracks); + + /* + auto const & tsoa = *tracks; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits==int(tsoa.hitIndices.size(it))); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
+ nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; + */ + + iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin)); + } +} + +DEFINE_FWK_MODULE(PixelVertexProducerCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc new file mode 100644 index 0000000000000..e642e3fd734f9 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -0,0 +1,175 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" + +#include "DataFormats/VertexReco/interface/Vertex.h" +#include "DataFormats/VertexReco/interface/VertexFwd.h" + +class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { +public: + using IndToEdm = std::vector; + + explicit PixelVertexProducerFromSoA(const 
edm::ParameterSet &iConfig); + ~PixelVertexProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenBeamSpot_; + edm::EDGetTokenT tokenTracks_; + edm::EDGetTokenT tokenIndToEdm_; +}; + +PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) + : tokenVertex_(consumes(conf.getParameter("src"))), + tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + produces(); +} + +void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + + desc.add("TrackCollection", edm::InputTag("pixelTracks")); + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelVertexSoA")); + + descriptions.add("pixelVertexFromSoA", desc); +} + +void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const { + auto vertexes = std::make_unique(); + + edm::Handle trackCollection; + iEvent.getByToken(tokenTracks_, trackCollection); + auto const &tracks = *(trackCollection.product()); + edm::Handle indToEdmH; + iEvent.getByToken(tokenIndToEdm_, indToEdmH); + auto const &indToEdm = *indToEdmH; + + edm::Handle bsHandle; + iEvent.getByToken(tokenBeamSpot_, bsHandle); + + float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0; + std::vector itrk; + if (!bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot found. 
returning vertexes with (0,0,Z) "; + } else { + const reco::BeamSpot &bs = *bsHandle; + x0 = bs.x0(); + y0 = bs.y0(); + z0 = bs.z0(); + dxdz = bs.dxdz(); + dydz = bs.dydz(); + } + + auto const &soa = *(iEvent.get(tokenVertex_).get()); + + int nv = soa.nvFinal; + + // std::cout << "converting " << nv << " vertices " << " from " << indToEdm.size() << " tracks" << std::endl; + + std::set uind; // fort verifing index consistency + for (int j = nv - 1; j >= 0; --j) { + auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + assert(i < nv); + uind.insert(i); + assert(itrk.empty()); + auto z = soa.zv[i]; + auto x = x0 + dxdz * z; + auto y = y0 + dydz * z; + z += z0; + reco::Vertex::Error err; + err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) *= 2.; // artifically inflate error + //Copy also the tracks (no intention to be efficient....) + for (auto k = 0U; k < indToEdm.size(); ++k) { + if (soa.idv[k] == int16_t(i)) + itrk.push_back(k); + } + auto nt = itrk.size(); + if (nt == 0) { + std::cout << "vertex " << i << " with no tracks..." << std::endl; + continue; + } + if (nt < 2) { + itrk.clear(); + continue; + } // remove outliers + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + auto &v = (*vertexes).back(); + for (auto it : itrk) { + assert(it < int(indToEdm.size())); + auto k = indToEdm[it]; + if (k > tracks.size()) { + edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k; + continue; + } + auto tk = reco::TrackRef(trackCollection, k); + v.add(reco::TrackBaseRef(tk)); + } + itrk.clear(); + } + + LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n"; + for (unsigned int i = 0; i < vertexes->size(); ++i) { + LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize() + << " tracks with a position of " << (*vertexes)[i].z() << " +- " + << std::sqrt((*vertexes)[i].covariance(2, 2)); + } + + // legacy logic.... 
+ if (vertexes->empty() && bsHandle.isValid()) { + const reco::BeamSpot &bs = *bsHandle; + + GlobalError bse(bs.rotatedCovariance3D()); + if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) { + AlgebraicSymMatrix33 we; + we(0, 0) = 10000; + we(1, 1) = 10000; + we(2, 2) = 10000; + vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix() + << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } else { + vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } + } else if (vertexes->empty() && !bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. 
No vertex returned."; + } + + iEvent.put(std::move(vertexes)); +} + +DEFINE_FWK_MODULE(PixelVertexProducerFromSoA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc new file mode 100644 index 0000000000000..0cadf24580cf7 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -0,0 +1,65 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +class PixelVertexSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig); + ~PixelVertexSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cms::cuda::host::unique_ptr m_soa; +}; + 
+PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) /* registers the input read from the src parameter and the output this module produces */ + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} + +void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("src", edm::InputTag("pixelVertexCUDA")); + descriptions.add("pixelVertexSoA", desc); +} + +void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); /* issued on the stream held by ctx; the result is kept in m_soa until produce() */ +} + +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // No copies: m_soa is moved into the emplaced ZVertexHeterogeneous + iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); +} + +DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h new file mode 100644 index 0000000000000..b32c7d5b613db --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -0,0 +1,234 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have + // + // based on Rodriguez & Laio algo: each track is linked to its closest neighbour that has more neighbours, + // and the roots of the resulting chains with at least minT neighbours number the proto-vertices + __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace gpuVertexFinder; + constexpr bool verbose = false; // in principle the compiler should optimize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; /* compared below against the squared errors in ezt2 */ + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; /* data.ndof is used here as the per-track neighbour counter */ + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; /* every track starts as its own cluster */ + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by 
prefix scan... + __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours within eps that also pass the chi2max cut + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; /* tracks with too large an error keep nn == 0 */ + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find closest above me .... (we ignore the possibility of two j at same distance from i) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // (break natural order???) + mdist = dist; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + +#ifdef GPU_DEBUG + __syncthreads(); + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } +#endif + +#ifdef GPU_DEBUG + // and verify that we did not split any cluster... 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto minJ = i; + auto mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + mdist = dist; + minJ = j; + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + // should belong to the same cluster... + assert(iv[i] == iv[minJ]); + assert(nn[i] <= nn[iv[i]]); + } + __syncthreads(); +#endif + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a track with iv[i] == i and density larger than threshold; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; /* becomes 9997 after the final -iv[i]-1 remapping */ + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a non-negative value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + + __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); /* kernel entry point: forwards to the __device__ function above */ + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksByDensity_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h new file mode 100644 index 0000000000000..ffd7fdc948bf8 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -0,0 +1,242 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have; tracks with at least minT neighbours within eps are "core", the others are attached to the closest core track + __global__ void clusterTracksDBSCAN(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster (only applied when collecting edges) + ) { + constexpr bool verbose = false; // in principle the compiler should optimize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; /* compared below against the squared errors in ezt2 */ + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; /* data.ndof is used here as the per-track neighbour counter */ + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; /* every track starts as its own cluster */ + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... 
+ __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours within eps (no chi2 cut in this pass) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; /* tracks with too large an error keep nn == 0 */ + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find NN with smaller z... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + float mz = zt[i]; + auto loop = [&](uint32_t j) { + if (zt[j] >= mz) + return; + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + mz = zt[j]; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + +#ifdef GPU_DEBUG + // and verify that we did not split any cluster... 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + assert(zt[iv[i]] <= zt[i]); + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + // they should belong to the same cluster, shouldn't they? + if (iv[i] != iv[j]) { + printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); + printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); + ; + } + assert(iv[i] == iv[j]); + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + __syncthreads(); +#endif + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a track with iv[i] == i; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; /* becomes 9997 after the final -iv[i]-1 remapping */ + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a non-negative value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksDBSCAN_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h new file mode 100644 index 0000000000000..49da86e941867 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -0,0 +1,213 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have; core tracks are merged by repeatedly propagating the smallest cluster index between neighbours until no index changes + __global__ void clusterTracksIterative(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + constexpr bool verbose = false; // in principle the compiler should optimize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; /* compared below against the squared errors in ezt2 */ + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; /* data.ndof is used here as the per-track neighbour counter */ + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; /* every track starts as its own cluster */ + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... 
+ __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours within eps that also pass the chi2max cut + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; /* tracks with too large an error keep nn == 0 */ + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ int nloops; + nloops = 0; + + __syncthreads(); + + // cluster seeds only: odd passes flatten the iv chains, even passes push the smaller id across neighbouring core tracks + bool more = true; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + } else { + more = false; + for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin() + k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + if (nn[i] < minT) + continue; // DBSCAN core rule + auto loop = [&](uint32_t j) { + assert(i != j); + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + auto old = atomicMin(&iv[j], iv[i]); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&iv[i], old); + }; + ++p; + for (; p < hist.end(be); ++p) + loop(*p); + } // for i + } + if (threadIdx.x == 0) + ++nloops; + } // while + + // collect edges (assign to closest cluster of closest point??? 
here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](int j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a track with iv[i] == i; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; /* becomes 9997 after the final -iv[i]-1 remapping */ + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a non-negative value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuClusterTracksIterative_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h new file mode 100644 index 0000000000000..4487cb12ea17b --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -0,0 +1,113 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void fitVertices(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection: tracks with a larger chi2 contribution get iv = 9999 + ) { /* computes per vertex the 1/ezt2-weighted mean z (zv), the summed weight (wv), chi2 and ndof (data.ndof) */ + constexpr bool verbose = false; // in principle the compiler should optimize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float* __restrict__ chi2 = data.chi2; + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + assert(nvFinal <= 
nvIntermediate); + nvFinal = nvIntermediate; + auto foundClusters = nvFinal; + + // zero + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + } + + // only for test + __shared__ int noise; + if (verbose && 0 == threadIdx.x) + noise = 0; + + __syncthreads(); + + // compute cluster location + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) { /* ids above 9990 are skipped here and only counted as noise */ + if (verbose) + atomicAdd(&noise, 1); + continue; + } + assert(iv[i] >= 0); + assert(iv[i] < int(foundClusters)); + auto w = 1.f / ezt2[i]; + atomicAdd(&zv[iv[i]], zt[i] * w); + atomicAdd(&wv[iv[i]], w); + } + + __syncthreads(); + // reuse nn + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + assert(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + } + __syncthreads(); + + // compute chi2 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + + auto c2 = zv[iv[i]] - zt[i]; + c2 *= c2 / ezt2[i]; + if (c2 > chi2Max) { + iv[i] = 9999; + continue; + } + atomicAdd(&chi2[iv[i]], c2); + atomicAdd(&nn[iv[i]], 1); + } + __syncthreads(); + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) + if (nn[i] > 0) + wv[i] *= float(nn[i]) / chi2[i]; /* rescales the weight by ndof / chi2 */ + + if (verbose && 0 == threadIdx.x) + printf("found %d proto clusters ", foundClusters); + if (verbose && 0 == threadIdx.x) + printf("and %d noise\n", noise); + } + + __global__ void fitVerticesKernel(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection + ) { + fitVertices(pdata, pws, chi2Max); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuFitVertices_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h new file mode 100644 index 0000000000000..89cc9a3844f76 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -0,0 +1,73 @@ +#ifndef 
RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#ifdef __CUDA_ARCH__ +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" +#endif + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { /* copies iv into data.idv, sums ptt2 of each vertex's tracks into ptv2 and fills sortInd from ptv2 */ + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ ptt2 = ws.ptt2; + uint32_t const& nvFinal = data.nvFinal; + + int32_t const* __restrict__ iv = ws.iv; + float* __restrict__ ptv2 = data.ptv2; + uint16_t* __restrict__ sortInd = data.sortInd; + + // if (threadIdx.x == 0) + // printf("sorting %d vertices\n",nvFinal); + + if (nvFinal < 1) + return; + + // fill indexing + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + data.idv[ws.itrk[i]] = iv[i]; + } + + // can be done asynchronously at the end of previous event + for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { + ptv2[i] = 0; + } + __syncthreads(); + + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + if (1 == nvFinal) { + if (threadIdx.x == 0) + sortInd[0] = 0; + return; + } +#ifdef __CUDA_ARCH__ + __shared__ uint16_t sws[1024]; + // sort using only 16 bits + radixSort(ptv2, sortInd, sws, nvFinal); +#else + for (uint16_t i = 0; i < nvFinal; ++i) + sortInd[i] = i; + std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; }); /* host branch: ascending ptv2 */ +#endif + } + + __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSortByPt2_h diff --git 
a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h new file mode 100644 index 0000000000000..694915ab02157 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -0,0 +1,139 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { /* tries to split in two every vertex with ndof >= 4 and chi2 >= maxChi2 * ndof */ + constexpr bool verbose = false; // in principle the compiler should optimize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float const* __restrict__ chi2 = data.chi2; + uint32_t& nvFinal = data.nvFinal; + + int32_t const* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + // one vertex per block + for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { + if (nn[kv] < 4) + continue; + if (chi2[kv] < maxChi2 * float(nn[kv])) + continue; + + constexpr int MAXTK = 512; + assert(nn[kv] < MAXTK); + if (nn[kv] >= MAXTK) + continue; // too bad FIXME + __shared__ uint32_t it[MAXTK]; // track index + __shared__ float zz[MAXTK]; // z pos + __shared__ uint8_t newV[MAXTK]; // 0 or 1 + __shared__ float ww[MAXTK]; // z weight + + __shared__ uint32_t nq; // number of track for this vertex + nq = 0; /* NOTE(review): no barrier separates this reset from the reads of nq at the end of the previous kv iteration - confirm this is safe when a block handles more than one vertex */ + __syncthreads(); + + // copy to local + for (auto k = threadIdx.x; k < nt; k += blockDim.x) { + if (iv[k] == int(kv)) { + auto old = atomicInc(&nq, MAXTK); + 
zz[old] = zt[k] - zv[kv]; + newV[old] = zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } + } + + __shared__ float znew[2], wnew[2]; // the new vertices + + __syncthreads(); + assert(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... + bool more = true; + while (__syncthreads_or(more)) { + more = false; + if (0 == threadIdx.x) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto i = newV[k]; + atomicAdd(&znew[i], zz[k] * ww[k]); + atomicAdd(&wnew[i], ww[k]); + } + __syncthreads(); + if (0 == threadIdx.x) { + znew[0] /= wnew[0]; /* NOTE(review): wnew can be 0 when one side is empty; that case is only rejected after the loop - confirm */ + znew[1] /= wnew[1]; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 0 : 1; + more |= newer != newV[k]; + newV[k] = newer; + } + --maxiter; + if (maxiter <= 0) + more = false; + } + + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; + + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + + if (verbose && 0 == threadIdx.x) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + + if (chi2Dist < 4) + continue; + + // get a new global vertex + __shared__ uint32_t igv; + if (0 == threadIdx.x) + igv = atomicAdd(&ws.nvIntermediate, 1); + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + if (1 == newV[k]) + iv[it[k]] = igv; /* tracks of the second group move to the new vertex, the others keep kv */ + } + + } // loop on vertices + } + + __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + splitVertices(pdata, pws, maxChi2); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_src_gpuSplitVertices_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc new file mode 100644 
// Declarations shared by the pixel vertex finder: the per-event scratch
// WorkSpace, the kernel that resets it, and the Producer that drives the
// clustering / fitting / splitting / sorting steps.
namespace gpuVertexFinder {

  using ZVertices = ZVertexSoA;
  using TkSoA = pixelTrack::TrackSoA;

  // Workspace used in the vertex reco algos.
  // All per-track arrays are indexed by the "selected track" index, i.e. the
  // slot handed out while filling ntrks, not by the original track index.
  struct WorkSpace {
    static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
    static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;

    uint32_t ntrks;            // number of "selected tracks"
    uint16_t itrk[MAXTRACKS];  // index of original track
    float zt[MAXTRACKS];       // input track z at the beam spot
    float ezt2[MAXTRACKS];     // input error^2 on zt
    float ptt2[MAXTRACKS];     // input track pt^2
    uint8_t izt[MAXTRACKS];    // integerized (binned) z-position of input tracks
    int32_t iv[MAXTRACKS];     // vertex index for each associated track

    uint32_t nvIntermediate;  // the number of vertices after splitting, pruning, etc.

    // Resets only the two counters; the arrays are left untouched and are
    // meaningful only for indices below the corresponding counter.
    __host__ __device__ void init() {
      ntrks = 0;
      nvIntermediate = 0;
    }
  };

  // Resets the output vertex SoA and the workspace by calling their init().
  __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {
    pdata->init();
    pws->init();
  }

  // Holds the configuration of the vertex finder and exposes the two entry
  // points, makeAsync (taking a CUDA stream) and make (no stream).
  class Producer {
  public:
    using ZVertices = ZVertexSoA;
    using WorkSpace = gpuVertexFinder::WorkSpace;
    using TkSoA = pixelTrack::TrackSoA;

    // oneKernel is honoured only when neither useDBSCAN nor useIterative is
    // set: the stored oneKernel_ flag is forced to false otherwise.
    Producer(bool oneKernel,
             bool useDensity,
             bool useDBSCAN,
             bool useIterative,
             int iminT,      // min number of neighbours to be "core"
             float ieps,     // max absolute distance to cluster
             float ierrmax,  // max error to be "seed"
             float ichi2max  // max normalized distance to cluster
             )
        : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),
          useDensity_(useDensity),
          useDBSCAN_(useDBSCAN),
          useIterative_(useIterative),
          minT(iminT),
          eps(ieps),
          errmax(ierrmax),
          chi2max(ichi2max) {}

    ~Producer() = default;

    // Both take the input tracks and the minimum pt a track must have to be
    // selected, and return the reconstructed vertices.
    ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
    ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;

  private:
    // algorithm selection flags
    const bool oneKernel_;
    const bool useDensity_;
    const bool useDBSCAN_;
    const bool useIterative_;

    // clustering parameters, forwarded unchanged to the clustering algorithm
    int minT;       // min number of neighbours to be "core"
    float eps;      // max absolute distance to cluster
    float errmax;   // max error to be "seed"
    float chi2max;  // max normalized distance to cluster
  };

}  // namespace gpuVertexFinder
"gpuSortByPt2.h" +#include "gpuSplitVertices.h" + +namespace gpuVertexFinder { + + __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { + assert(ptracks); + assert(soa); + auto const& tracks = *ptracks; + auto const& fit = tracks.stateAtBS; + auto const* quality = tracks.qualityData(); + + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... + soa->idv[idx] = -1; + + if (nHits < 4) + continue; // no triplets + if (quality[idx] != pixelTrack::Quality::loose) + continue; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + continue; + + auto& data = *pws; + auto it = atomicAdd(&data.ntrks, 1); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + } + } + +// #define THREE_KERNELS +#ifndef THREE_KERNELS + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); + } +#else + __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, 
minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + } + + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); + } +#endif + +#ifdef __CUDACC__ + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on GPU" << std::endl; + ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); +#else + ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { + // std::cout << "producing Vertices on CPU" << std::endl; + ZVertexHeterogeneous vertices(std::make_unique()); +#endif + assert(tksoa); + auto* soa = vertices.get(); + assert(soa); + +#ifdef __CUDACC__ + auto ws_d = cms::cuda::make_device_unique(stream); +#else + auto ws_d = std::make_unique(); +#endif + +#ifdef __CUDACC__ + init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + auto blockSize = 128; + auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; + loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin); + cudaCheck(cudaGetLastError()); +#else + init(soa, ws_d.get()); + loadTracks(tksoa, soa, ws_d.get(), ptMin); +#endif + +#ifdef __CUDACC__ + if (oneKernel_) { + // implemented only for density clustesrs +#ifndef THREE_KERNELS + vertexFinderOneKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); +#else + vertexFinderKernel1<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudaCheck(cudaGetLastError()); + // one block per vertex... 
+ splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); + cudaCheck(cudaGetLastError()); + vertexFinderKernel2<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); +#endif + } else { // five kernels + if (useDensity_) { + clusterTracksByDensityKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 50.); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<<1024, 128, 0, stream>>>(soa, ws_d.get(), 9.f); + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get(), 5000.); + cudaCheck(cudaGetLastError()); + sortByPt2Kernel<<<1, 1024 - 256, 0, stream>>>(soa, ws_d.get()); + } + cudaCheck(cudaGetLastError()); +#else // __CUDACC__ + if (useDensity_) { + clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + // std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + fitVertices(soa, ws_d.get(), 50.); + // one block per vertex! 
+ splitVertices(soa, ws_d.get(), 9.f); + fitVertices(soa, ws_d.get(), 5000.); + sortByPt2(soa, ws_d.get()); +#endif + + return vertices; + } + +} // namespace gpuVertexFinder + +#undef FROM diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py index 77a9f367b9d9b..903c2a894ff86 100644 --- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py +++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py @@ -18,5 +18,3 @@ refToPSet_ = cms.string('pvClusterComparer') ) ) - - diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml index 0f4f4dee63832..f5c154b298574 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml @@ -2,8 +2,41 @@ - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h new file mode 100644 index 0000000000000..e3298f8c5761b --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#ifdef USE_DBSCAN +#include "../plugins/gpuClusterTracksDBSCAN.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN +#elif USE_ITERATIVE +#include "../plugins/gpuClusterTracksIterative.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative +#else +#include "../plugins/gpuClusterTracksByDensity.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel +#endif +#include "../plugins/gpuFitVertices.h" +#include "../plugins/gpuSortByPt2.h" +#include 
"../plugins/gpuSplitVertices.h" + +#ifdef ONE_KERNEL +#ifdef __CUDACC__ +__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, +) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); +} +#endif +#endif + +struct Event { + std::vector zvert; + std::vector itrack; + std::vector ztrack; + std::vector eztrack; + std::vector pttrack; + std::vector ivert; +}; + +struct ClusterGenerator { + explicit ClusterGenerator(float nvert, float ntrack) + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {} + + void operator()(Event& ev) { + int nclus = clusGen(reng); + ev.zvert.resize(nclus); + ev.itrack.resize(nclus); + for (auto& z : ev.zvert) { + z = 3.5f * gauss(reng); + } + + ev.ztrack.clear(); + ev.eztrack.clear(); + ev.ivert.clear(); + for (int iv = 0; iv < nclus; ++iv) { + auto nt = trackGen(reng); + ev.itrack[nclus] = nt; + for (int it = 0; it < nt; ++it) { + auto err = errgen(reng); // reality is not flat.... + ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(iv); + ev.pttrack.push_back((iv == 5 ? 
1.f : 0.5f) + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + // add noise + auto nt = 2 * trackGen(reng); + for (int it = 0; it < nt; ++it) { + auto err = 0.03f; + ev.ztrack.push_back(rgen(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(9999); + ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + + std::mt19937 reng; + std::uniform_real_distribution rgen; + std::uniform_real_distribution errgen; + std::poisson_distribution clusGen; + std::poisson_distribution trackGen; + std::normal_distribution gauss; + std::exponential_distribution ptGen; +}; + +// a macro SORRY +#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) +#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) + +__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { + auto const& __restrict__ data = *pdata; + auto const& __restrict__ ws = *pws; + printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + + auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto onGPU_d = std::make_unique(); + auto ws_d = std::make_unique(); +#endif + + Event ev; + + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); + + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param + + gen(ev); + +#ifdef __CUDACC__ + init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); +#else + onGPU_d->init(); + ws_d->init(); +#endif + + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + auto nt = ev.ztrack.size(); +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), 
sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); +#else + ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); + ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); +#endif + + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; + + uint32_t nv = 0; +#ifdef __CUDACC__ + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + +#ifdef ONE_KERNEL + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#else + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#endif + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + +#else + print(onGPU_d.get(), ws_d.get()); + CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.get(), ws_d.get()); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + + float* zv = nullptr; + float* wv = nullptr; + float* ptv2 = nullptr; + int32_t* nn = nullptr; + uint16_t* ind = nullptr; + + // keep chi2 separated... + float chi2[2 * nv]; // make space for splitting... + +#ifdef __CUDACC__ + float hzv[2 * nv]; + float hwv[2 * nv]; + float hptv2[2 * nv]; + int32_t hnn[2 * nv]; + uint16_t hind[2 * nv]; + + zv = hzv; + wv = hwv; + ptv2 = hptv2; + nn = hnn; + ind = hind; +#else + zv = onGPU_d->zv; + wv = onGPU_d->wv; + ptv2 = onGPU_d->ptv2; + nn = onGPU_d->ndof; + ind = onGPU_d->sortInd; +#endif + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + // one vertex per block!!! 
+ cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + splitVertices(onGPU_d.get(), ws_d.get(), 9.f); + nv = ws_d->nvIntermediate; +#endif + std::cout << "after split " << nv << std::endl; + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cudaCheck(cudaGetLastError()); + + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); + sortByPt2(onGPU_d.get(), ws_d.get()); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); +#endif + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + + { + auto mx = std::minmax_element(wv, wv + nv); + std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. 
/ std::sqrt(*mx.second) << std::endl; + } + + { + auto mx = std::minmax_element(ptv2, ptv2 + nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' ' + << ind[nv - 1] << std::endl; + } + + float dd[nv]; + for (auto kv = 0U; kv < nv; ++kv) { + auto zr = zv[kv]; + auto md = 500.0f; + for (auto zs : ev.ztrack) { + auto d = std::abs(zr - zs); + md = std::min(d, md); + } + dd[kv] = md; + } + if (i == 6) { + for (auto d : dd) + std::cout << d << ' '; + std::cout << std::endl; + } + auto mx = std::minmax_element(dd, dd + nv); + float rms = 0; + for (auto d : dd) + rms += d * d; + rms = std::sqrt(rms) / (nv - 1); + std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + } // loop on events + } // lopp on ave vert + + return 0; +} diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp new file mode 100644 index 0000000000000..a7906fe0d03f5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp @@ -0,0 +1 @@ +#include "VertexFinder_t.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu new file mode 100644 index 0000000000000..a7906fe0d03f5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -0,0 +1 @@ +#include "VertexFinder_t.h" diff --git a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py new file mode 100644 index 0000000000000..24774bbda649c --- /dev/null +++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py @@ -0,0 +1,59 @@ +import FWCore.ParameterSet.Config as cms + +# Customise the Pixel-only reconstruction to run on GPU +# +# Run the unpacker, clustering, 
# ntuplets, track fit and vertex reconstruction on GPU.
def customizePixelOnlyForProfilingGPUOnly(process):
    """Schedule a consumer of the CUDA pixel track and vertex products.

    Adds a ``GenericConsumer`` analyzer that reads ``caHitNtupletCUDA`` and
    ``pixelVertexCUDA``, puts it on an end path, and replaces the schedule
    with ``raw2digi_step``, ``reconstruction_step`` and the new end path.
    ``process`` must already define the first two steps.

    Returns the same ``process`` object, modified in place.
    """
    consumed = cms.untracked.vstring('caHitNtupletCUDA', 'pixelVertexCUDA')
    process.consumer = cms.EDAnalyzer("GenericConsumer", eventProducts=consumed)
    process.consume_step = cms.EndPath(process.consumer)

    steps = (process.raw2digi_step, process.reconstruction_step, process.consume_step)
    process.schedule = cms.Schedule(*steps)
    return process


def customizePixelOnlyForProfilingGPUWithHostCopy(process):
    """Schedule a consumer of the pixel track and vertex SoA products.

    Same structure as the GPU-only customisation, but the consumer reads
    ``pixelTrackSoA`` and ``pixelVertexSoA``, i.e. the products in SoA format
    without conversion to legacy format.  As noted by the authors, this can
    also be applied to the SoA CPU workflow.

    Returns the same ``process`` object, modified in place.
    """
    #? process.siPixelRecHitSoAFromLegacy.convertToLegacy = False

    consumed = cms.untracked.vstring('pixelTrackSoA', 'pixelVertexSoA')
    process.consumer = cms.EDAnalyzer("GenericConsumer", eventProducts=consumed)
    process.consume_step = cms.EndPath(process.consumer)

    steps = (process.raw2digi_step, process.reconstruction_step, process.consume_step)
    process.schedule = cms.Schedule(*steps)
    return process


# Customise the Pixel-only reconstruction to run on GPU, copy the data to the host,
# and convert to legacy format
#
# Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU;
# copy all the products to the host in SoA format; and convert them to legacy format.
#
# The same customisation can be also used on the CPU workflow, running up to the
# tracks and vertices on the CPU.
def customizePixelOnlyForProfiling(process):
    """Schedule a consumer of the legacy-format pixel tracks and vertices.

    Adds a ``GenericConsumer`` analyzer that reads ``pixelTracks`` and
    ``pixelVertices``, puts it on an end path, and replaces the schedule with
    ``raw2digi_step``, ``reconstruction_step`` and the new end path.
    ``process`` must already define the first two steps.

    Returns the same ``process`` object, modified in place.
    """
    consumed = cms.untracked.vstring('pixelTracks', 'pixelVertices')
    process.consumer = cms.EDAnalyzer("GenericConsumer", eventProducts=consumed)
    process.consume_step = cms.EndPath(process.consumer)

    steps = (process.raw2digi_step, process.reconstruction_step, process.consume_step)
    process.schedule = cms.Schedule(*steps)
    return process
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/CommonDetUnit/interface/GeomDet.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h" +#include "TrackingTools/Records/interface/TrackingComponentsRecord.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h" + +/* + produces seeds directly from cuda produced tuples +*/ +class SeedProducerFromSoA : public edm::global::EDProducer<> { +public: + explicit SeedProducerFromSoA(const edm::ParameterSet& iConfig); + ~SeedProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + // Event data tokens + const edm::EDGetTokenT tBeamSpot_; + const edm::EDGetTokenT tokenTrack_; + // Event setup tokens + const edm::ESGetToken idealMagneticFieldToken_; + const edm::ESGetToken trackerDigiGeometryToken_; + const edm::ESGetToken trackerPropagatorToken_; + int32_t minNumberOfHits_; +}; + +SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet& iConfig) + : tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + 
tokenTrack_(consumes(iConfig.getParameter("src"))), + idealMagneticFieldToken_(esConsumes()), + trackerDigiGeometryToken_(esConsumes()), + trackerPropagatorToken_(esConsumes(edm::ESInputTag("PropagatorWithMaterial"))), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) + +{ + produces(); +} + +void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelTrackSoA")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + +void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + // std::cout << "Converting gpu helix to trajectory seed" << std::endl; + auto result = std::make_unique(); + + auto const& fieldESH = iSetup.getHandle(idealMagneticFieldToken_); + auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_); + auto const& dus = tracker->detUnits(); + + auto const& propagatorHandle = iSetup.getHandle(trackerPropagatorToken_); + const Propagator* propagator = &(*propagatorHandle); + + const auto& bsh = iEvent.get(tBeamSpot_); + // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + const auto& tsoa = *(iEvent.get(tokenTrack_)); + + auto const* quality = tsoa.qualityData(); + auto const& fit = tsoa.stateAtBS; + auto const& detIndices = tsoa.detIndices; + auto maxTracks = tsoa.stride(); + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... 
+ + auto q = quality[it]; + if (q != pixelTrack::Quality::loose) + continue; // FIXME + if (nHits < minNumberOfHits_) + continue; + ++nt; + + // fill hits with invalid just to hold the detId + auto b = detIndices.begin(it); + edm::OwnVector hits; + for (int iHit = 0; iHit < nHits; ++iHit) { + auto const* det = dus[*(b + iHit)]; + // FIXME at some point get a proper type ... + hits.push_back(new InvalidTrackingRecHit(*det, TrackingRecHit::bad)); + } + + // mind: this values are respect the beamspot! + + float phi = tsoa.phi(it); + + riemannFit::Vector5d ipar, opar; + riemannFit::Matrix5d icov, ocov; + fit.copyToDense(ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); + + LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); + AlgebraicSymMatrix55 m; + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = ocov(i, j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); + + Plane impPointPlane(bs, rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()), + lpar.charge(), + fieldESH.product()); + + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, *fieldESH.product()); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); + + FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo)); + + auto const& lastHit = hits.back(); + + TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface()); + + if (!outerState.isValid()) { + edm::LogError("SeedFromGPU") << " was trying to create a seed from:\n" + << fts << "\n propagating to: " << lastHit.geographicalId().rawId(); + continue; + } + + auto const& pTraj = trajectoryStateTransform::persistentState(outerState, lastHit.geographicalId().rawId()); + + result->emplace_back(pTraj, hits, alongMomentum); + } + + iEvent.put(std::move(result)); +} + 
+DEFINE_FWK_MODULE(SeedProducerFromSoA); diff --git a/SimTracker/TrackerHitAssociation/BuildFile.xml b/SimTracker/TrackerHitAssociation/BuildFile.xml index aa66f443cabb9..5ea8794eda917 100644 --- a/SimTracker/TrackerHitAssociation/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/BuildFile.xml @@ -5,6 +5,7 @@ + @@ -18,6 +19,7 @@ + diff --git a/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h b/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h new file mode 100644 index 0000000000000..86fe89f05b7d2 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h @@ -0,0 +1,69 @@ +#ifndef SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h +#define SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace trackerHitAssociationHeterogeneous { + + struct ClusterSLView { + using Clus2TP = std::array; + + Clus2TP* links_d; + uint32_t* tkId_d; + uint32_t* tkId2_d; + uint32_t* n1_d; + uint32_t* n2_d; + }; + + template + class Product { + public: + template + using unique_ptr = typename Traits::template unique_ptr; + + Product() = default; + ~Product() = default; + Product(Product const&) = delete; + Product(Product&&) = default; + + Product(int nlinks, int nhits, cudaStream_t stream); + + ClusterSLView& view() { return m_view; } + ClusterSLView const& view() const { return m_view; } + + int nLinks() const { return m_nLinks; } + int nHits() const { return m_nHits; } + + private: + static constexpr uint32_t n32 = 4; + + unique_ptr m_storeTP; //! + unique_ptr m_store32; //! + + ClusterSLView m_view; //! 
+ + int m_nLinks; + int m_nHits; + }; + + template + Product::Product(int nlinks, int nhits, cudaStream_t stream) : m_nLinks(nlinks), m_nHits(nhits) { + m_storeTP = Traits::template make_device_unique(m_nLinks * 7, stream); + m_store32 = Traits::template make_device_unique(m_nHits * n32, stream); + + auto get32 = [&](int i) { return m_store32.get() + i * m_nHits; }; + + m_view.links_d = (ClusterSLView::Clus2TP*)(m_storeTP.get()); + m_view.tkId_d = get32(0); + m_view.tkId2_d = get32(1); + m_view.n1_d = get32(2); + m_view.n2_d = get32(3); + } + + using ProductCUDA = Product; + +} // namespace trackerHitAssociationHeterogeneous + +#endif // SimTracker_TrackerHitAssociation_plugins_trackerHitAssociationHeterogeneousProduct_h diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index ecda84011006b..186f04cbd611d 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,5 +1,10 @@ + + + + + - + diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu new file mode 100644 index 0000000000000..0aab26d9cc091 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -0,0 +1,224 @@ +#include +#include +#include + +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "ClusterSLOnGPU.h" + +using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView; +using Clus2TP = ClusterSLView::Clus2TP; + +// #define DUMP_TK2 + +__global__ void simLink(const SiPixelDigisCUDA::DeviceConstView* dd, + uint32_t 
ndigis, + TrackingRecHit2DSOAView const* hhp, + ClusterSLView sl, + uint32_t n) { + constexpr uint32_t invTK = 0; // std::numeric_limits::max(); + using gpuClustering::invalidModuleId; + using gpuClustering::maxNumModules; + + auto const& hh = *hhp; + auto i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= ndigis) + return; + + auto id = dd->moduleInd(i); + if (invalidModuleId == id) + return; + assert(id < maxNumModules); + + auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); + auto first = hh.hitsModuleStart(id); + auto cl = first + dd->clus(i); + assert(cl < maxNumModules * blockDim.x); + + const Clus2TP me{{id, ch, 0, 0, 0, 0, 0}}; + + auto less = [] __host__ __device__(Clus2TP const& a, Clus2TP const& b) -> bool { + // in this context we do not care of [2] + return a[0] < b[0] or ((not(b[0] < a[0])) and (a[1] < b[1])); + }; + + auto equal = [] __host__ __device__(Clus2TP const& a, Clus2TP const& b) -> bool { + // in this context we do not care of [2] + return a[0] == b[0] and a[1] == b[1]; + }; + + auto const* b = sl.links_d; + auto const* e = b + n; + + auto p = cuda_std::lower_bound(b, e, me, less); + int32_t j = p - sl.links_d; + assert(j >= 0); + + auto getTK = [&](int i) { + auto const& l = sl.links_d[i]; + return l[2]; + }; + + j = std::min(int(j), int(n - 1)); + if (equal(me, sl.links_d[j])) { + auto const itk = j; + auto const tk = getTK(j); + auto old = atomicCAS(&sl.tkId_d[cl], invTK, itk); + if (invTK == old or tk == getTK(old)) { + atomicAdd(&sl.n1_d[cl], 1); + } else { + auto old = atomicCAS(&sl.tkId2_d[cl], invTK, itk); + if (invTK == old or tk == getTK(old)) + atomicAdd(&sl.n2_d[cl], 1); + } + } +} + +__global__ void doZero(uint32_t nhits, ClusterSLView sl) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= nhits) // each per-hit array holds nhits entries: valid indices are [0, nhits) + return; + + sl.tkId_d[i] = 0; + sl.n1_d[i] = 0; + sl.tkId2_d[i] = 0; + sl.n2_d[i] = 0; +} + +__global__ void dumpLink(int first, int ev, TrackingRecHit2DSOAView const* hhp, uint32_t nhits, ClusterSLView 
sl) { + auto i = first + blockIdx.x * blockDim.x + threadIdx.x; + if (i >= nhits) // valid hit indices are [0, nhits) + return; + + auto const& hh = *hhp; + + auto const& tk1 = sl.links_d[sl.tkId_d[i]]; + +#ifdef DUMP_TK2 + auto const& tk2 = sl.links_d[sl.tkId2_d[i]]; + + printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", +#else + printf("HIT: %d %d %d %d %.4f %.4f %.4f %.4f %d %d %d %d %d %d %d %d %d\n", +#endif + ev, + i, + hh.detectorIndex(i), + hh.charge(i), + hh.xGlobal(i), + hh.yGlobal(i), + hh.zGlobal(i), + hh.rGlobal(i), + hh.iphi(i), + hh.clusterSizeX(i), + hh.clusterSizeY(i), + tk1[2], + tk1[3], + tk1[4], + tk1[5], + tk1[6], + sl.n1_d[i] +#ifdef DUMP_TK2 + , + tk2[2], + tk2[3], + tk2[4], + tk2[5], + tk2[6], + sl.n2_d[i] +#endif + ); +} + +namespace clusterSLOnGPU { + + void printCSVHeader() { +#ifdef DUMP_TK2 + printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", +#else + printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", +#endif + "ev", + "ind", + "det", + "charge", + "xg", + "yg", + "zg", + "rg", + "iphi", + "xsize", + "ysize", + "tkId", + "pt", + "eta", + "z0", + "r0", + "n1" +#ifdef DUMP_TK2 + , + "tkId2", + "pt2", + "eta2", + "z02", + "r02", + "n2" +#endif + ); + } + + std::atomic evId(0); + std::once_flag doneCSVHeader; + + Kernel::Kernel(bool dump) : doDump(dump) { + if (doDump) + std::call_once(doneCSVHeader, printCSVHeader); + } + + trackerHitAssociationHeterogeneous::ProductCUDA Kernel::makeAsync(SiPixelDigisCUDA const& dd, + uint32_t ndigis, + HitsOnCPU const& hh, + Clus2TP const* digi2tp, + uint32_t nhits, + uint32_t nlinks, + cudaStream_t stream) const { + trackerHitAssociationHeterogeneous::ProductCUDA product(nlinks, nhits, stream); + auto& csl = product.view(); + + cudaCheck(cudaMemcpyAsync(csl.links_d, digi2tp, sizeof(Clus2TP) * nlinks, cudaMemcpyDefault, stream)); + + if (0 == nhits) + return product; + + int ev = ++evId; + int threadsPerBlock = 256; + + int blocks = (nhits + 
threadsPerBlock - 1) / threadsPerBlock; + doZero<<>>(nhits, csl); + cudaCheck(cudaGetLastError()); + + blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; + simLink<<>>(dd.view(), ndigis, hh.view(), csl, nlinks); + cudaCheck(cudaGetLastError()); + + if (doDump) { + cudaCheck(cudaStreamSynchronize(stream)); // flush previous printf; report errors from the kernels above + // one line == 200B so each kernel can print only 5K lines.... + blocks = 16; + for (int first = 0; first < int(nhits); first += blocks * threadsPerBlock) { + dumpLink<<>>(first, ev, hh.view(), nhits, csl); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaStreamSynchronize(stream)); // do not silently drop a failure of the dump kernel + } + } + cudaCheck(cudaGetLastError()); + + return product; + } + +} // namespace clusterSLOnGPU diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h new file mode 100644 index 0000000000000..3109e6ed45a76 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h @@ -0,0 +1,36 @@ +#ifndef SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h +#define SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h + +#include + +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h" + +namespace clusterSLOnGPU { + + using ClusterSLView = trackerHitAssociationHeterogeneous::ClusterSLView; + using Clus2TP = ClusterSLView::Clus2TP; + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + class Kernel { + public: + explicit Kernel(bool dump); + ~Kernel() {} + trackerHitAssociationHeterogeneous::ProductCUDA makeAsync(SiPixelDigisCUDA const& dd, + uint32_t ndigis, + HitsOnCPU const& hh, + Clus2TP const* digi2tp, + uint32_t nhits, + uint32_t nlinks, + cudaStream_t stream) const; + + private: + public: 
+ bool doDump; + }; +} // namespace clusterSLOnGPU + +#endif // SimTracker_TrackerHitAssociation_plugins_ClusterSLOnGPU_h diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc new file mode 100644 index 0000000000000..35337151eda91 --- /dev/null +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc @@ -0,0 +1,227 @@ +#include +#include +#include + +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/DetSetVectorNew.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/Phase2TrackerCluster/interface/Phase2TrackerCluster1D.h" +#include "DataFormats/Phase2TrackerDigi/interface/Phase2TrackerDigi.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiPixelDetId/interface/PixelChannelIdentifier.h" +#include "DataFormats/SiStripCluster/interface/SiStripCluster.h" +#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include 
"Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "SimDataFormats/Track/interface/SimTrackContainer.h" +#include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" +#include "SimDataFormats/TrackerDigiSimLink/interface/StripDigiSimLink.h" +#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticle.h" +#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticleFwd.h" +#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h" + +#include "ClusterSLOnGPU.h" + +class ClusterTPAssociationProducerCUDA : public edm::global::EDProducer<> { +public: + typedef std::vector OmniClusterCollection; + + using ClusterSLGPU = trackerHitAssociationHeterogeneous::ClusterSLView; + using Clus2TP = ClusterSLGPU::Clus2TP; + using ProductCUDA = trackerHitAssociationHeterogeneous::ProductCUDA; + + explicit ClusterTPAssociationProducerCUDA(const edm::ParameterSet &); + ~ClusterTPAssociationProducerCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + std::map, TrackingParticleRef> makeMap(const edm::Event &iEvent) const; + + template + std::vector> getSimTrackId(const edm::Handle> &simLinks, + const DetId &detId, + uint32_t channel) const; + + edm::EDGetTokenT> sipixelSimLinksToken_; + edm::EDGetTokenT> sistripSimLinksToken_; + edm::EDGetTokenT> siphase2OTSimLinksToken_; + edm::EDGetTokenT> pixelClustersToken_; + edm::EDGetTokenT> stripClustersToken_; + edm::EDGetTokenT> phase2OTClustersToken_; + edm::EDGetTokenT trackingParticleToken_; + + edm::EDGetTokenT> tGpuDigis; + edm::EDGetTokenT> tGpuHits; + + edm::EDPutTokenT> tokenGPUProd_; + + clusterSLOnGPU::Kernel m_gpuAlgo; +}; + 
+ClusterTPAssociationProducerCUDA::ClusterTPAssociationProducerCUDA(const edm::ParameterSet &cfg) + : sipixelSimLinksToken_( + consumes>(cfg.getParameter("pixelSimLinkSrc"))), + sistripSimLinksToken_( + consumes>(cfg.getParameter("stripSimLinkSrc"))), + siphase2OTSimLinksToken_( + consumes>(cfg.getParameter("phase2OTSimLinkSrc"))), + pixelClustersToken_( + consumes>(cfg.getParameter("pixelClusterSrc"))), + stripClustersToken_( + consumes>(cfg.getParameter("stripClusterSrc"))), + phase2OTClustersToken_(consumes>( + cfg.getParameter("phase2OTClusterSrc"))), + trackingParticleToken_( + consumes(cfg.getParameter("trackingParticleSrc"))), + tGpuDigis(consumes>( + cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuHits(consumes>( + cfg.getParameter("heterogeneousPixelRecHitSrc"))), + m_gpuAlgo(cfg.getParameter("dumpCSV")) { + tokenGPUProd_ = produces>(); +} + +void ClusterTPAssociationProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("simTrackSrc", edm::InputTag("g4SimHits")); + desc.add("pixelSimLinkSrc", edm::InputTag("simSiPixelDigis")); + desc.add("stripSimLinkSrc", edm::InputTag("simSiStripDigis")); + desc.add("phase2OTSimLinkSrc", edm::InputTag("simSiPixelDigis", "Tracker")); + desc.add("pixelClusterSrc", edm::InputTag("siPixelClusters")); + desc.add("stripClusterSrc", edm::InputTag("siStripClusters")); + desc.add("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters")); + desc.add("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth")); + desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersPreSplittingCUDA")); + desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA")); + + desc.add("dumpCSV", false); + + descriptions.add("tpClusterProducerCUDADefault", desc); +} + +std::map, TrackingParticleRef> ClusterTPAssociationProducerCUDA::makeMap( + const edm::Event &iEvent) const { + // TrackingParticle + edm::Handle 
TPCollectionH; + iEvent.getByToken(trackingParticleToken_, TPCollectionH); + + // prepare temporary map between SimTrackId and TrackingParticle index + std::map, TrackingParticleRef> mapping; + for (TrackingParticleCollection::size_type itp = 0; itp < TPCollectionH.product()->size(); ++itp) { + TrackingParticleRef trackingParticle(TPCollectionH, itp); + + // SimTracks inside TrackingParticle + EncodedEventId eid(trackingParticle->eventId()); + for (auto itrk = trackingParticle->g4Track_begin(); itrk != trackingParticle->g4Track_end(); ++itrk) { + std::pair trkid(itrk->trackId(), eid); + //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl; + mapping.insert(std::make_pair(trkid, trackingParticle)); + } + } + return mapping; +} + +void ClusterTPAssociationProducerCUDA::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { + edm::ESHandle geom; + iSetup.get().get(geom); + + // Pixel DigiSimLink + edm::Handle> sipixelSimLinks; + // iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks); + iEvent.getByToken(sipixelSimLinksToken_, sipixelSimLinks); + + // TrackingParticle + edm::Handle TPCollectionH; + iEvent.getByToken(trackingParticleToken_, TPCollectionH); + + auto mapping = makeMap(iEvent); + + edm::Handle> gd; + iEvent.getByToken(tGpuDigis, gd); + edm::Handle> gh; + iEvent.getByToken(tGpuHits, gh); + + cms::cuda::ScopedContextProduce ctx{*gd}; + auto const &gDigis = ctx.get(*gd); + auto const &gHits = ctx.get(*gh); + auto ndigis = gDigis.nDigis(); + auto nhits = gHits.nHits(); + + std::vector digi2tp; + digi2tp.push_back({{0, 0, 0, 0, 0, 0, 0}}); // put at 0 0 + for (auto const &links : *sipixelSimLinks) { + DetId detId(links.detId()); + const GeomDetUnit *genericDet = geom->idToDetUnit(detId); + uint32_t gind = genericDet->index(); + for (auto const &link : links) { + if (link.fraction() < 0.5f) { + continue; + } + auto tkid = std::make_pair(link.SimTrackId(), 
link.eventId()); + auto ipos = mapping.find(tkid); + if (ipos != mapping.end()) { + uint32_t pt = 1000 * (*ipos).second->pt(); + uint32_t eta = int32_t(10000 * (*ipos).second->eta()); // may be negative: go through a signed type + uint32_t z0 = int32_t(10000 * (*ipos).second->vz()); // in um; may be negative: go through a signed type + uint32_t r0 = 10000 * std::sqrt((*ipos).second->vx() * (*ipos).second->vx() + + (*ipos).second->vy() * (*ipos).second->vy()); // in um + digi2tp.push_back({{gind, uint32_t(link.channel()), (*ipos).second.key(), pt, eta, z0, r0}}); + } + } + } + + std::sort(digi2tp.begin(), digi2tp.end()); + + ctx.emplace(iEvent, + tokenGPUProd_, + m_gpuAlgo.makeAsync(gDigis, ndigis, gHits, digi2tp.data(), nhits, digi2tp.size(), ctx.stream())); +} + +template +std::vector> +//std::pair +ClusterTPAssociationProducerCUDA::getSimTrackId(const edm::Handle> &simLinks, + const DetId &detId, + uint32_t channel) const { + //std::pair simTrkId; + std::vector> simTrkId; + auto isearch = simLinks->find(detId); + if (isearch != simLinks->end()) { + // Loop over DigiSimLink in this det unit + edm::DetSet link_detset = (*isearch); + for (typename edm::DetSet::const_iterator it = link_detset.data.begin(); it != link_detset.data.end(); ++it) { + if (channel == it->channel()) { + simTrkId.push_back(std::make_pair(it->SimTrackId(), it->eventId())); + } + } + } + return simTrkId; +} + +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +DEFINE_FWK_MODULE(ClusterTPAssociationProducerCUDA); diff --git a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py index 8757a67226fb8..890d05c4fc093 100644 --- a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py +++ b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py @@ -18,3 +18,6 @@ stripSimLinkSrc = "mixData:StripDigiSimLink", phase2OTSimLinkSrc = "mixData:Phase2OTDigiSimLink", ) + +from SimTracker.TrackerHitAssociation.tpClusterProducerCUDADefault_cfi import 
tpClusterProducerCUDADefault as _tpClusterProducerCUDA +tpClusterProducerCUDA = _tpClusterProducerCUDA.clone() diff --git a/SimTracker/TrackerHitAssociation/src/classes.h b/SimTracker/TrackerHitAssociation/src/classes.h index 457b6683d5cea..c8f98cd38ca81 100644 --- a/SimTracker/TrackerHitAssociation/src/classes.h +++ b/SimTracker/TrackerHitAssociation/src/classes.h @@ -5,6 +5,8 @@ #include "DataFormats/Common/interface/AssociationMap.h" #include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" #include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h" #include "DataFormats/Common/interface/AssociationMap.h" namespace SimTracker_TrackerHitAssociation { diff --git a/SimTracker/TrackerHitAssociation/src/classes_def.xml b/SimTracker/TrackerHitAssociation/src/classes_def.xml index f801d25b176e0..e9701e768fe75 100644 --- a/SimTracker/TrackerHitAssociation/src/classes_def.xml +++ b/SimTracker/TrackerHitAssociation/src/classes_def.xml @@ -20,4 +20,9 @@ + + + + + diff --git a/SimTracker/TrackerHitAssociation/test/BuildFile.xml b/SimTracker/TrackerHitAssociation/test/BuildFile.xml index a0dc6b61844a0..df2be2331d810 100644 --- a/SimTracker/TrackerHitAssociation/test/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/test/BuildFile.xml @@ -2,12 +2,14 @@ + + diff --git a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc new file mode 100644 index 0000000000000..9c7a2e3e4828b --- /dev/null +++ b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc @@ -0,0 +1,66 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" 
+#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h" + +class ClusterTPCUDAdump : public edm::global::EDAnalyzer<> { +public: + using ClusterSLGPU = trackerHitAssociationHeterogeneous::ClusterSLView; + using Clus2TP = ClusterSLGPU::Clus2TP; + using ProductCUDA = trackerHitAssociationHeterogeneous::ProductCUDA; + + explicit ClusterTPCUDAdump(const edm::ParameterSet& iConfig); + ~ClusterTPCUDAdump() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tokenGPU_; +}; + +ClusterTPCUDAdump::ClusterTPCUDAdump(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tokenGPU_ = consumes>(iConfig.getParameter("clusterTP")); + } else { + } +} + +void ClusterTPCUDAdump::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const& hctp = iEvent.get(tokenGPU_); + cms::cuda::ScopedContextProduce ctx{hctp}; + + auto const& ctp = ctx.get(hctp); + auto const& soa = ctp.view(); + 
assert(soa.links_d); + } else { + } +} + +void ClusterTPCUDAdump::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("onGPU", true); + desc.add("clusterTP", edm::InputTag("tpClusterProducerCUDAPreSplitting")); + descriptions.add("clusterTPCUDAdump", desc); +} + +DEFINE_FWK_MODULE(ClusterTPCUDAdump); diff --git a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py index df73b303d5061..54fa0364fe239 100644 --- a/Validation/RecoTrack/python/PostProcessorTracker_cfi.py +++ b/Validation/RecoTrack/python/PostProcessorTracker_cfi.py @@ -350,9 +350,9 @@ def _addNoFlow(module): postProcessorTrackTrackingOnly = postProcessorTrack.clone() -postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*","Tracking/TrackSeeding/*", "Tracking/PixelTrack/*"]) +postProcessorTrackTrackingOnly.subDirs.extend(["Tracking/TrackBHadron/*", "Tracking/TrackSeeding/*", "Tracking/PixelTrack/*", "Tracking/PixelTrackFromPV/*", "Tracking/PixelTrackFromPVAllTP/*", "Tracking/PixelTrackBHadron/*"]) postProcessorTrackSummaryTrackingOnly = postProcessorTrackSummary.clone() -postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron","Tracking/TrackSeeding", "Tracking/PixelTrack"]) +postProcessorTrackSummaryTrackingOnly.subDirs.extend(["Tracking/TrackBHadron", "Tracking/TrackSeeding", "Tracking/PixelTrack", "Tracking/PixelTrackFromPV", "Tracking/PixelTrackFromPVAllTP", "Tracking/PixelTrackBHadron"]) postProcessorTrackSequenceTrackingOnly = cms.Sequence( postProcessorTrackTrackingOnly+ diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py index 9882bf2a9ca7f..21678c35e4b44 100644 --- a/Validation/RecoTrack/python/TrackValidation_cff.py +++ b/Validation/RecoTrack/python/TrackValidation_cff.py @@ -522,6 +522,11 @@ def _getMVASelectors(postfix): # Built tracks, in the standard sequence mainly 
for monitoring the track selection MVA tpClusterProducerPreSplitting = tpClusterProducer.clone(pixelClusterSrc = "siPixelClustersPreSplitting") quickTrackAssociatorByHitsPreSplitting = quickTrackAssociatorByHits.clone(cluster2TPSrc = "tpClusterProducerPreSplitting") + +tpClusterProducerCUDAPreSplitting = tpClusterProducerCUDA.clone( + pixelClusterSrc = "siPixelClustersPreSplitting" +) + _trackValidatorSeedingBuilding = trackValidator.clone( # common for built tracks and seeds (in trackingOnly) associators = ["quickTrackAssociatorByHits"], UseAssociators = True, @@ -701,6 +706,16 @@ def _uniqueFirstLayers(layerList): VertexAssociatorByPositionAndTracks, trackingParticleNumberOfLayersProducer ) + +# With the gpu process modifier, add the CUDA cluster-to-TrackingParticle producer (pre-splitting clusters) to the truth task +from Configuration.ProcessModifiers.gpu_cff import gpu +tpClusterProducerPreSplittingCUDA = cms.Task( + tpClusterProducerCUDAPreSplitting +) +_tracksValidationTruth_gpu = tracksValidationTruth.copy() +_tracksValidationTruth_gpu.add(tpClusterProducerPreSplittingCUDA) +gpu.toReplaceWith(tracksValidationTruth,_tracksValidationTruth_gpu) + fastSim.toModify(tracksValidationTruth, lambda x: x.remove(tpClusterProducer)) tracksPreValidation = cms.Task( @@ -974,9 +989,17 @@ def _uniqueFirstLayers(layerList): trackAssociation = "trackingParticlePixelTrackAsssociation" ) +_pixelTracksCustom = dict( + src = "pixelTracks", + vertexTag = "pixelVertices", +) +pixelTracksPt09 = generalTracksPt09.clone(quality = ["undefQuality"], **_pixelTracksCustom) +pixelTracksFromPV = generalTracksFromPV.clone(quality = "undefQuality", **_pixelTracksCustom) +pixelTracksFromPVPt09 = pixelTracksPt09.clone(src = "pixelTracksFromPV") + trackValidatorPixelTrackingOnly = trackValidator.clone( dirName = "Tracking/PixelTrack/", - label = ["pixelTracks"], + label = ["pixelTracks", "pixelTracksPt09"], doResolutionPlotsForLabels = [], trackCollectionForDrCalculation = "pixelTracks", associators = ["trackingParticlePixelTrackAsssociation"], @@ -985,16 +1008,59 @@ def _uniqueFirstLayers(layerList): 
dodEdxPlots = False, cores = cms.InputTag(""), ) +trackValidatorFromPVPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackFromPV/", + label = ["pixelTracksFromPV", "pixelTracksFromPVPt09"], + label_tp_effic = "trackingParticlesSignal", + label_tp_fake = "trackingParticlesSignal", + label_tp_effic_refvector = True, + label_tp_fake_refvector = True, + trackCollectionForDrCalculation = "pixelTracksFromPV", + doPlotsOnlyForTruePV = True, + doPVAssociationPlots = False, + doResolutionPlotsForLabels = ["disabled"], +) +trackValidatorFromPVAllTPPixelTrackingOnly = trackValidatorFromPVPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackFromPVAllTP/", + label_tp_effic = trackValidatorPixelTrackingOnly.label_tp_effic.value(), + label_tp_fake = trackValidatorPixelTrackingOnly.label_tp_fake.value(), + label_tp_effic_refvector = False, + label_tp_fake_refvector = False, + doSimPlots = False, + doSimTrackPlots = False, +) +trackValidatorBHadronPixelTrackingOnly = trackValidatorPixelTrackingOnly.clone( + dirName = "Tracking/PixelTrackBHadron/", + label_tp_effic = "trackingParticlesBHadron", + label_tp_effic_refvector = True, + doSimPlots = True, + doRecoTrackPlots = False, # Fake rate is defined wrt. 
all TPs, and that is already included in trackValidator + dodEdxPlots = False, +) + tracksValidationTruthPixelTrackingOnly = tracksValidationTruth.copy() tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation) tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks) +tracksValidationTruthPixelTrackingOnly.add(trackingParticlesBHadron) + +tracksPreValidationPixelTrackingOnly = cms.Task( + tracksValidationTruthPixelTrackingOnly, + trackingParticlesSignal, + pixelTracksPt09, + pixelTracksFromPV, + pixelTracksFromPVPt09, +) tracksValidationPixelTrackingOnly = cms.Sequence( - trackValidatorPixelTrackingOnly, - tracksValidationTruthPixelTrackingOnly + trackValidatorPixelTrackingOnly + + trackValidatorFromPVPixelTrackingOnly + + trackValidatorFromPVAllTPPixelTrackingOnly + + trackValidatorBHadronPixelTrackingOnly, + tracksPreValidationPixelTrackingOnly ) + ### Lite mode (only generalTracks and HP) trackValidatorLite = trackValidator.clone( label = ["generalTracks", "cutsRecoTracksHp"] diff --git a/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py b/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py index c020d894c8d4b..04f9e52ee18a7 100644 --- a/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py +++ b/Validation/RecoTrack/python/TrackingParticleSelectionsForEfficiency_cff.py @@ -12,7 +12,7 @@ ptMin = cms.double(0.9), ptMax = cms.double(1e100), maxRapidity = cms.double(2.5), - tip = cms.double(3.5), + tip = cms.double(2.0), minPhi = cms.double(-3.2), maxPhi = cms.double(3.2), invertRapidityCut = cms.bool(False) diff --git a/Validation/RecoTrack/python/plotting/html.py b/Validation/RecoTrack/python/plotting/html.py index a9fed5cc12975..3985f8edc9abf 100644 --- a/Validation/RecoTrack/python/plotting/html.py +++ b/Validation/RecoTrack/python/plotting/html.py @@ -63,8 
+63,14 @@ def _allToHP(s): return s.replace("All", "High purity") def _allToBTV(s): return s.replace("All", "BTV-like") +def _allPtCut(s): + return s.replace("All tracks", "Tracks pT > 0.9 GeV") def _ptCut(s): return s.replace("Tracks", "Tracks pT > 0.9 GeV").replace("tracks", "tracks pT > 0.9 GeV") +def _allToPixel(s): + return s.replace("All", "Pixel") +def _toPixel(s): + return s.replace("Tracks", "Pixel tracks") _trackQualityNameOrder = collections.OrderedDict([ ("seeding_seeds", "Seeds"), ("seeding_seedsa", "Seeds A"), @@ -75,8 +81,8 @@ def _ptCut(s): ("building_", "Built tracks"), ("", _allName), ("highPurity", _allToHP(_allName)), - ("Pt09", "Tracks pT > 0.9 GeV"), - ("highPurityPt09", "High purity tracks pT > 0.9 GeV"), + ("Pt09", _allPtCut(_allName)), + ("highPurityPt09", _ptCut(_allToHP(_allName))), ("ByOriginalAlgo", _toOriAlgo(_allName)), ("highPurityByOriginalAlgo", _toOriAlgo(_toHP(_allName))), ("ByAlgoMask", _toAlgoMask(_allName)), @@ -120,6 +126,15 @@ def _ptCut(s): ("displaced_highPurityByOriginalAlgo", _toOriAlgo(_allToHP(_displacedName))), ("displaced_ByAlgoMask", _toAlgoMask(_displacedName)), ("displaced_highPurityByAlgoMask", _toAlgoMask(_allToHP(_displacedName))), + # Pixel tracks + ("pixel_", _allToPixel(_allName)), + ("pixel_Pt09", _ptCut(_allToPixel(_allName))), + ("pixelFromPV_", _toPixel(_fromPVName)), + ("pixelFromPV_Pt09", _ptCut(_toPixel(_fromPVName))), + ("pixelFromPVAllTP_", _toPixel(_fromPVAllTPName)), + ("pixelFromPVAllTP_Pt09", _ptCut(_toPixel(_fromPVAllTPName))), + ("pixelbhadron_", _allToPixel(_bhadronName)), + ("pixelbhadron_Pt09", _ptCut(_allToPixel(_bhadronName))), ]) _trackAlgoName = { @@ -134,6 +149,7 @@ def _ptCut(s): "iter7" : "Iterative Step 7", "iter9" : "Iterative Step 9", "iter10": "Iterative Step 10", + "pixel": "Pixel tracks", } _trackAlgoOrder = [ @@ -169,6 +185,7 @@ def _ptCut(s): 'iter7', 'iter9', 'iter10', + "pixel", ] _pageNameMap = { @@ -186,10 +203,10 @@ def _ptCut(s): # These are for the summary page 
("seeding_seeds", "Seeds"), ("building", "Built tracks"), - ("", "All tracks"), - ("Pt09", "All tracks (pT>0.9 GeV)"), - ("highPurity", "High purity tracks"), - ("highPurityPt09", "High purity tracks (pT>0.9 GeV)"), + ("", _allName), + ("Pt09", _allPtCut(_allName)), + ("highPurity", _allToHP(_allName)), + ("highPurityPt09", _ptCut(_allToHP(_allName))), ("tpPtLess09", _tpPtLess09Name), ("tpPtLess09_highPurity", _allToHP(_tpPtLess09Name)), ("tpEtaGreater2p7", _tpEtaGreater2p7Name), @@ -209,7 +226,14 @@ def _ptCut(s): ("displaced", _displacedName), ("displaced_highPurity", _allToHP(_displacedName)), # Pixel tracks - ("pixel", "Pixel tracks"), + ("pixel", _allToPixel(_allName)), + ("pixelPt09", _ptCut(_allToPixel(_allName))), + ("pixelFromPV", _toPixel(_fromPVName)), + ("pixelFromPVPt09", _ptCut(_toPixel(_fromPVName))), + ("pixelFromPVAllTP", _toPixel(_fromPVAllTPName)), + ("pixelFromPVAllTPPt09", _ptCut(_toPixel(_fromPVAllTPName))), + ("pixelbhadron", _allToPixel(_bhadronName)), + ("pixelbhadronPt09", _ptCut(_allToPixel(_bhadronName))), # These are for vertices ("genvertex", "Gen vertices"), ("pixelVertices", "Pixel vertices"), @@ -233,6 +257,7 @@ def _ptCut(s): _fromPVAllTP2Legend = "Tracks from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)" _fromPVAllTPPt2Legend = "Tracks (pT > 0.9 GeV) from reco PV (another method), fake rate numerator contains all TrackingParticles (separates fake tracks from pileup tracks)" _bhadronLegend = "All tracks, efficiency denominator contains only TrackingParticles from B-hadron decays" +_bhadronPtLegend = "Tracks (pT > 0.9 GeV), efficiency denominator contains only TrackingParticles from B-hadron decays" def _sectionNameLegend(): return { @@ -258,6 +283,12 @@ def _sectionNameLegend(): "bhadron_": _bhadronLegend, "bhadron_highPurity": _allToHP(_bhadronLegend), "bhadron_btvLike": _bhadronLegend.replace("All tracks", _btvLegend), + "pixelFromPV_": _fromPVLegend, + 
"pixelFromPV_Pt09": _fromPVPtLegend, + "pixelFromPVAllTP_": _fromPVAllTPLegend, + "pixelFromPVAllTP_Pt09": _fromPVAllTPPtLegend, + "pixelbhadron_": _bhadronLegend, + "pixelbhadron_Pt09": _bhadronPtLegend, } class Table: @@ -701,7 +732,7 @@ def __init__(self, sample, title, fastVsFull, pileupComparison): self._timingPage = PageSet(*params) self._pfPages = PageSet(*params) self._hltPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0]) - self._pixelPages = PageSet(*params, dqmSubFolderTranslatedToSectionName=lambda algoQuality: algoQuality[0]) + self._pixelPages = TrackingPageSet(*params) self._otherPages = PageSet(*params) self._purposePageMap = { diff --git a/Validation/RecoTrack/python/plotting/trackingPlots.py b/Validation/RecoTrack/python/plotting/trackingPlots.py index 7f5d84738ee81..fda80114d7e8f 100644 --- a/Validation/RecoTrack/python/plotting/trackingPlots.py +++ b/Validation/RecoTrack/python/plotting/trackingPlots.py @@ -615,6 +615,8 @@ def _mapCollectionToAlgoQuality(collName): prefixes = ["cutsreco", "cutsrecofrompv", "cutsrecofrompv2", "cutsrecofrompvalltp", "cutsrecoetagreater2p7"] if collNameLow in ["general", "generalfrompv", "generaletagreater2p7"]+prefixes: algo = "ootb" + elif collNameLow in ["pixel", "pixelfrompv", "pixelfrompvalltp"]: + algo = "pixel" else: def testColl(coll): for pfx in prefixes: @@ -939,6 +941,7 @@ class HighPurityPt09: pass class BTVLike: pass class AK4PFJets: pass class Pixel: pass + class PixelPt09: pass def __init__(self, section, collection=GeneralTracks): self._collection = collection @@ -981,6 +984,8 @@ def _getN(hname): return _getAlgoQuality(data, "ak4PFJets", "") elif self._collection == TrackingSummaryTable.Pixel: return _getAlgoQuality(data, "pixel", "") + elif self._collection == TrackingSummaryTable.PixelPt09: + return _getAlgoQuality(data, "pixel", "Pt09") else: raise Exception("Collection not recognized, %s" % str(self._collection)) def _formatOrNone(num, func): @@ 
-1354,11 +1359,21 @@ def _appendTrackingPlots(lastDirName, name, algoPlots, onlyForPileup=False, only _appendTrackingPlots("TrackBHadron", "bhadron", _simBasedPlots+_recoBasedPlots, onlyForBHadron=True) _appendTrackingPlots("TrackDisplaced", "displaced", _simBasedPlots+_recoBasedPlots) # Pixel tracks -_common = dict(purpose=PlotPurpose.Pixel, page="pixel") -plotter.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common)) -plotterExt.append("pixelTrack", _trackingFolders("PixelTrack"), TrackingPlotFolder(*_extendedPlots, **_common)) -plotter.append("pixelTrack_summary", _trackingFolders("PixelTrack"), PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section="pixel")) -plotter.appendTable("pixelTrack_summary", _trackingFolders("PixelTrack"), TrackingSummaryTable(section="pixel", collection=TrackingSummaryTable.Pixel)) +def _appendPixelTrackingPlots(lastDirName, name): + _common = dict(purpose=PlotPurpose.Pixel, page="pixel") + _folders = _trackingFolders(lastDirName) + + plotter.append(name, _folders, TrackingPlotFolder(*(_simBasedPlots+_recoBasedPlots), **_common)) + plotterExt.append(name, _folders, TrackingPlotFolder(*_extendedPlots, **_common)) + + plotter.append(name+"_summary", _folders, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=name)) + plotter.append(name+"_summary", _folders, PlotFolder(_summaryRaw, _summaryRawN, loopSubFolders=False, purpose=PlotPurpose.TrackingSummary, page="summary", section=name+"Pt09")) + plotter.appendTable(name+"_summary", _folders, TrackingSummaryTable(section=name, collection=TrackingSummaryTable.Pixel)) + plotter.appendTable(name+"_summary", _folders, TrackingSummaryTable(section=name+"Pt09", collection=TrackingSummaryTable.PixelPt09)) +_appendPixelTrackingPlots("PixelTrack", "pixel") +_appendPixelTrackingPlots("PixelTrackFromPV", 
"pixelFromPV") +_appendPixelTrackingPlots("PixelTrackFromPVAllTP", "pixelFromPVAllTP") +_appendPixelTrackingPlots("PixelTrackBHadron", "pixelbhadron") # MiniAOD