cms-sw · fwyzard · Mar 23, 2021 · smuzaffar · Apr 7, 2020 · fwyzard
diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="cuda"/>
+<use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -0,0 +1,9 @@
+#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
+#define CUDADataFormats_Track_PixelTrackHeterogeneous_h
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+
+using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -0,0 +1,73 @@
+#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H
+#define CUDADataFormats_Track_TrackHeterogeneousT_H
+
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+
+namespace pixelTrack {
+  enum class Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity };
+}
+
+template <int32_t S>
+class TrackSoAHeterogeneousT {
+public:
+  static constexpr int32_t stride() { return S; }
+
+  using Quality = pixelTrack::Quality;
+  using hindex_type = uint32_t;
+  using HitContainer = cms::cuda::OneToManyAssoc<hindex_type, S, 5 * S>;
+
+  // Always check quality is at least loose!
+  // CUDA does not support enums  in __lgc ...
+private:
+  eigenSoA::ScalarSoA<uint8_t, S> quality_;
+
+public:
+  constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); }
+  constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); }
+  constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); }
+  constexpr Quality *qualityData() { return (Quality *)(quality_.data()); }
+
+  // this is chi2/ndof as not necessarely all hits are used in the fit
+  eigenSoA::ScalarSoA<float, S> chi2;
+
+  constexpr int nHits(int i) const { return detIndices.size(i); }
+
+  // State at the Beam spot
+  // phi,tip,1/pt,cotan(theta),zip
+  TrajectoryStateSoAT<S> stateAtBS;
+  eigenSoA::ScalarSoA<float, S> eta;
+  eigenSoA::ScalarSoA<float, S> pt;
+  constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); }
+  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
+  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
+  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
+
+  // state at the detector of the outermost hit
+  // representation to be decided...
+  // not yet filled on GPU
+  // TrajectoryStateSoA<S> stateAtOuterDet;
+
+  HitContainer hitIndices;
+  HitContainer detIndices;
+};
+
+namespace pixelTrack {
+
+#ifdef GPU_SMALL_EVENTS
+  // kept for testing and debugging
+  constexpr uint32_t maxNumber() { return 2 * 1024; }
+#else
+  // tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumber() { return 32 * 1024; }
+#endif
+
+  using TrackSoA = TrackSoAHeterogeneousT<maxNumber()>;
+  using TrajectoryState = TrajectoryStateSoAT<maxNumber()>;
+  using HitContainer = TrackSoA::HitContainer;
+
+}  // namespace pixelTrack
+
+#endif  // CUDADataFormats_Track_TrackHeterogeneousT_H
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h
@@ -0,0 +1,59 @@
+#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H
+#define CUDADataFormats_Track_TrajectoryStateSOAT_H
+
+#include <Eigen/Dense>
+#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+
+template <int32_t S>
+struct TrajectoryStateSoAT {
+  using Vector5f = Eigen::Matrix<float, 5, 1>;
+  using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+  using Vector5d = Eigen::Matrix<double, 5, 1>;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+  static constexpr int32_t stride() { return S; }
+
+  eigenSoA::MatrixSoA<Vector5f, S> state;
+  eigenSoA::MatrixSoA<Vector15f, S> covariance;
+
+  template <typename V3, typename M3, typename V2, typename M2>
+  __host__ __device__ inline void copyFromCircle(
+      V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) {
+    state(i) << cp.template cast<float>(), lp.template cast<float>();
+    state(i)(2) *= b;
+    auto cov = covariance(i);
+    cov(0) = ccov(0, 0);
+    cov(1) = ccov(0, 1);
+    cov(2) = b * float(ccov(0, 2));
+    cov(4) = cov(3) = 0;
+    cov(5) = ccov(1, 1);
+    cov(6) = b * float(ccov(1, 2));
+    cov(8) = cov(7) = 0;
+    cov(9) = b * b * float(ccov(2, 2));
+    cov(11) = cov(10) = 0;
+    cov(12) = lcov(0, 0);
+    cov(13) = lcov(0, 1);
+    cov(14) = lcov(1, 1);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) {
+    state(i) = v.template cast<float>();
+    for (int j = 0, ind = 0; j < 5; ++j)
+      for (auto k = j; k < 5; ++k)
+        covariance(i)(ind++) = cov(j, k);
+  }
+
+  template <typename V5, typename M5>
+  __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const {
+    v = state(i).template cast<typename V5::Scalar>();
+    for (int j = 0, ind = 0; j < 5; ++j) {
+      cov(j, j) = covariance(i)(ind++);
+      for (auto k = j + 1; k < 5; ++k)
+        cov(k, j) = cov(j, k) = covariance(i)(ind++);
+    }
+  }
+};
+
+#endif  // CUDADataFormats_Track_TrajectoryStateSOAT_H
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
@@ -0,0 +1,9 @@
+#ifndef CUDADataFormats_Track_src_classes_h
+#define CUDADataFormats_Track_src_classes_h
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  // CUDADataFormats_Track_src_classes_h
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="edm::Wrapper<cms::cuda::Product<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
+  <class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
+  <class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
@@ -0,0 +1,13 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
+<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
@@ -0,0 +1,75 @@
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+__host__ __device__ Matrix5d loadCov(Vector5d const& e) {
+  Matrix5d cov;
+  for (int i = 0; i < 5; ++i)
+    cov(i, i) = e(i) * e(i);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j));  // this makes the matrix pos defined
+      cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v;
+      cov(j, i) = cov(i, j);
+    }
+  }
+  return cov;
+}
+
+using TS = TrajectoryStateSoAT<128>;
+
+__global__ void testTSSoA(TS* pts, int n) {
+  assert(n <= 128);
+
+  Vector5d par0;
+  par0 << 0.2, 0.1, 3.5, 0.8, 0.1;
+  Vector5d e0;
+  e0 << 0.01, 0.01, 0.035, -0.03, -0.01;
+  auto cov0 = loadCov(e0);
+
+  TS& ts = *pts;
+
+  int first = threadIdx.x + blockIdx.x * blockDim.x;
+
+  for (int i = first; i < n; i += blockDim.x * gridDim.x) {
+    ts.copyFromDense(par0, cov0, i);
+    Vector5d par1;
+    Matrix5d cov1;
+    ts.copyToDense(par1, cov1, i);
+    Vector5d delV = par1 - par0;
+    Matrix5d delM = cov1 - cov0;
+    for (int j = 0; j < 5; ++j) {
+      assert(std::abs(delV(j)) < 1.e-5);
+      for (auto k = j; k < 5; ++k) {
+        assert(cov0(k, j) == cov0(j, k));
+        assert(cov1(k, j) == cov1(j, k));
+        assert(std::abs(delM(k, j)) < 1.e-5);
+      }
+    }
+  }
+}
+
+#ifdef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#endif
+
+int main() {
+#ifdef __CUDACC__
+  cms::cudatest::requireDevices();
+#endif
+
+  TS ts;
+
+#ifdef __CUDACC__
+  TS* ts_d;
+  cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
+  testTSSoA<<<1, 64>>>(ts_d, 128);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
+  cudaCheck(cudaDeviceSynchronize());
+#else
+  testTSSoA(&ts, 128);
+#endif
+}
diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="cuda"/>
+<use name="rootcore"/>
+<use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
+#define CUDADataFormatsVertexZVertexHeterogeneous_H
+
+#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+#ifndef __CUDACC__
+#include "CUDADataFormats/Common/interface/Product.h"
+using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
+#endif
+
+#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -0,0 +1,26 @@
+#ifndef CUDADataFormatsVertexZVertexSoA_H
+#define CUDADataFormatsVertexZVertexSoA_H
+
+#include <cstdint>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+// SOA for vertices
+// These vertices are clusterized and fitted only along the beam line (z)
+// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well)
+struct ZVertexSoA {
+  static constexpr uint32_t MAXTRACKS = 32 * 1024;
+  static constexpr uint32_t MAXVTX = 1024;
+
+  int16_t idv[MAXTRACKS];    // vertex index for each associated (original) track  (-1 == not associate)
+  float zv[MAXVTX];          // output z-posistion of found vertices
+  float wv[MAXVTX];          // output weight (1/error^2) on the above
+  float chi2[MAXVTX];        // vertices chi2
+  float ptv2[MAXVTX];        // vertices pt^2
+  int32_t ndof[MAXTRACKS];   // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
+  uint16_t sortInd[MAXVTX];  // sorted index (by pt2)  ascending
+  uint32_t nvFinal;          // the number of vertices
+
+  __host__ __device__ void init() { nvFinal = 0; }
+};
+
+#endif  // CUDADataFormatsVertexZVertexSoA.H
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats__src_classes_h
+#define CUDADataFormats__src_classes_h
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="cms::cuda::Product<ZVertexHeterogeneous>" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexCUDAProduct>" persistent="false"/>
+  <class name="ZVertexHeterogeneous" persistent="false"/>
+  <class name="edm::Wrapper<ZVertexHeterogeneous>" persistent="false"/>
+</lcgdict>
diff --git a/Configuration/PyReleaseValidation/python/relval_2017.py b/Configuration/PyReleaseValidation/python/relval_2017.py
@@ -5,7 +5,7 @@
 # here only define the workflows as a combination of the steps defined above:
 workflows = Matrix()
 
-# each workflow defines a name and a list of steps to be done. 
+# each workflow defines a name and a list of steps to be done.
 # if no explicit name/label given for the workflow (first arg),
 # the name of step1 will be used
 
@@ -24,16 +24,16 @@
 #        (HE collapse: TTbar, TTbar PU, TTbar design)
 #        (ParkingBPH: TTbar)
 #        (TTbar PU with JME NanoAOD)
-#        (Patatrack pixel-only: ZMM - on CPU)
-#        (Patatrack pixel-only: TTbar - on CPU)
+#        (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets)
+#        (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets)
 #        (Patatrack ECAL-only: TTbar - on CPU)
 #        (Patatrack HCAL-only: TTbar - on CPU)
 #   2021 (DD4HEP: TTbar, ZMM)
 #        (ele guns 10, 35, 1000; pho guns 10, 35; mu guns 1, 10, 100, 1000, QCD 3TeV, QCD Flat)
 #        (ZMM, TTbar, ZEE, MinBias, TTbar PU, TTbar PU premix, ZEE PU, TTbar design)
 #        (TTbar trackingOnly, pixelTrackingOnly, trackingMkFit, trackdnn)
-#        (Patatrack pixel-only: ZMM - on CPU)
-#        (Patatrack pixel-only: TTbar - on CPU)
+#        (Patatrack pixel-only: ZMM - on CPU: quadruplets, triplets)
+#        (Patatrack pixel-only: TTbar - on CPU: quadruplets, triplets)
 #        (Patatrack ECAL-only: TTbar - on CPU)
 #        (Patatrack HCAL-only: TTbar - on CPU)
 #        (TTbar 0T, TTbar PU 0T)
@@ -51,16 +51,16 @@
            10824.6,11024.6,11224.6,
            10824.8,
            11024.15,
-           10842.501,
-           10824.501,
+           10842.501,10842.505,
+           10824.501,10824.505,
            10824.511,
            10824.521,
            11634.911, 11650.911,
            11601.0,11602.0,11603.0,11604.0,11605.0,11606.0,11607.0,11608.0,11609.0,11630.0,11643.0,
            11650.0,11634.0,11646.0,11640.0,11834.0,11834.99,11846.0,12024.0,
            11634.1,11634.5,11634.7,11634.91,
-           11650.501,
-           11634.501,
+           11650.501,11650.505,
+           11634.501,11634.505,
            11634.511,
            11634.521,
            11634.24,11834.24,