diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
index baa2e7f64c613..c398b29a64c69 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -8,14 +8,6 @@
 #include <atomic>
 #endif // __CUDA_ARCH__
 
-#ifdef __CUDACC__
-#include <cuda_runtime.h>
-#else
-#define __device__
-#define __global__
-#define __host__
-#endif // __CUDACC__
-
 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
 
 #ifdef __CUDACC__
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
index 4bfa4e694d9dc..c107f4b228539 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -1,6 +1,9 @@
 #ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 #define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
 
+#include <cuda_runtime.h>
+
+
 #include <utility>
 
 // reimplementation of std algorithms able to compile with CUDA and run on GPUs,
@@ -10,6 +13,7 @@ namespace cuda_std  {
 
   template<typename T = void>
   struct less {
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const T &rhs) const {
       return lhs < rhs;
     }
@@ -18,10 +22,12 @@ namespace cuda_std  {
   template<>
   struct less<void> {
     template<typename T, typename U>
+    __device__ __host__
     constexpr bool operator()(const T &lhs, const U &rhs ) const { return lhs < rhs;}
   };
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt lower_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -43,6 +49,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = less<T>>
+  __device__ __host__
   constexpr
   RandomIt upper_bound(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
@@ -64,6 +71,7 @@ namespace cuda_std  {
   }
 
   template<typename RandomIt, typename T, typename Compare = cuda_std::less<T>>
+  __device__ __host__
   constexpr
   RandomIt binary_find(RandomIt first, RandomIt last, const T& value, Compare comp={})
   {
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index 1c1d28dbf3761..ab97c243c4385 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -1,31 +1,31 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
 <bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector"/>
 
-<bin file="cudastdAlgorithm_t.cpp"/>
+<bin file="cudastdAlgorithm_t.cpp">
+</bin>
+
 <bin file="cudastdAlgorithm_t.cu" name="gpuCudastdAlgorithm_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
 <bin file="radixSort_t.cu" name="gpuRadixSort_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
+<bin file="HistoContainer_t.cpp">
+</bin>
+
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_t">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
 </bin>
 
-<bin file="HistoContainer_t.cpp"/>
 <bin file="HistoContainer_t.cu" name="gpuHistoContainer_debug">
-  <use name="cuda"/>
   <use name="cuda-api-wrappers"/>
   <flags CUDA_FLAGS="-g -G"/>
 </bin>
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 29e5e82049b5c..88004245dddae 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -38,7 +38,7 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
+  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
     int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
@@ -90,6 +90,13 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
+    gpuProduct = getProduct();
+    assert(xx_d==gpuProduct.xx_d);
+      
+    cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
+
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -111,6 +118,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
+
+    cudaCheck(cudaFree(gpuProduct_d));
+
+
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 2b0b205c9f536..bb7768ebf5a76 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -144,12 +144,22 @@ namespace pixelgpudetails {
            (adc << thePacking.adc_shift);
   }
 
+  constexpr
+  uint32_t pixelToChannel( int row, int col) {
+    constexpr Packing thePacking = packing();
+    return (row << thePacking.column_width) | col;
+  }
+
+
   using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;
 
 
   class SiPixelRawToClusterGPUKernel {
   public:
-    SiPixelRawToClusterGPUKernel();
+
+    using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+
+    SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
     ~SiPixelRawToClusterGPUKernel();
 
     
@@ -170,8 +180,9 @@ namespace pixelgpudetails {
     auto getProduct() const {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
-        nDigis, nModulesActive,
-        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
+        gpuProduct_d,
+        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
+        nDigis, nModulesActive
       };
     }
 
@@ -181,6 +192,11 @@ namespace pixelgpudetails {
     unsigned char *fedId_h = nullptr;    // to hold fed index for each word
 
     // output
+    GPUProduct gpuProduct;
+    GPUProduct * gpuProduct_d;
+
+    // FIXME cleanup all these are in the gpuProduct above...
+    
     uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr;                   // host copy of output
     uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
     pixelgpudetails::error_obj *data_h = nullptr;
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
index 196a08b85d861..e2c2dcf828685 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
@@ -425,7 +425,7 @@ void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, c
 // -----------------------------------------------------------------------------
 void SiPixelRawToClusterHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
   // Allocate GPU resources here
-  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>();
+  gpuAlgo_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel>(cudaStream);
   gpuModulesToUnpack_ = std::make_unique<SiPixelFedCablingMapGPUWrapper::ModulesToUnpack>();
 }
 
@@ -523,6 +523,7 @@ void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEv
 
 void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) {
   auto output = std::make_unique<GPUProduct>(gpuAlgo_->getProduct());
+  assert(output->me_d);
   ev.put<Output>(std::move(output), [this](const GPUProduct& gpu, CPUProduct& cpu) {
       this->convertGPUtoCPU(gpu, cpu);
     });
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
index 6d6da10934532..5c0d027fe57b5 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h
@@ -36,6 +36,7 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     { }
   };
 
+  // FIXME split in two 
   struct GPUProduct {
     // Needed for digi and cluster CPU output
     uint32_t const * pdigi_h = nullptr;
@@ -44,9 +45,9 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     uint16_t const * adc_h = nullptr;
     GPU::SimpleVector<error_obj> const * error_h = nullptr;
 
+    GPUProduct const * me_d = nullptr;
+
     // Needed for GPU rechits
-    uint32_t nDigis;
-    uint32_t nModules;
     uint16_t const * xx_d;
     uint16_t const * yy_d;
     uint16_t const * adc_d;
@@ -55,6 +56,10 @@ namespace siPixelRawToClusterHeterogeneousProduct {
     int32_t const *  clus_d;
     uint32_t const * clusInModule_d;
     uint32_t const * moduleId_d;
+
+    uint32_t nDigis;
+    uint32_t nModules;
+
   };
 
   using HeterogeneousDigiCluster = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 999fdcd6eff19..1bf3c7dce3d94 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,9 +36,10 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
-//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
+    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
 
     cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
+    gpu_.me_d = gpu_d;
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
 
   }
@@ -59,7 +60,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
     cudaCheck(cudaFree(gpu_.mc_d));
-    // cudaCheck(cudaFree(gpu_.hist_d));
+    cudaCheck(cudaFree(gpu_.hist_d));
 
     cudaCheck(cudaFree(gpu_d));
   }
@@ -119,7 +120,7 @@ namespace pixelgpudetails {
     // for timing test
     // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
 
-    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
+    cudautils::fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,stream.id());
 
 
   }
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 6ccd435faa7ba..79608c13713d7 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -8,7 +8,7 @@
 #include <cstdint>
 #include <vector>
 
-// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
 
 namespace siPixelRecHitsHeterogeneousProduct {
@@ -31,22 +31,26 @@ namespace siPixelRecHitsHeterogeneousProduct {
      uint16_t * mr_d;
      uint16_t * mc_d;
 
-     // using Hist = HistoContainer<int16_t,7,8>;
-     // Hist * hist_d;
+     using Hist = HistoContainer<int16_t,7,8>;
+     Hist * hist_d;
+    
+     HitsOnGPU const * me_d=nullptr;
   };
 
 
   struct HitsOnCPU {
     HitsOnCPU() = default;
     explicit HitsOnCPU(uint32_t nhits) :
-      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits){}
+      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits), mc(nhits),
+      nHits(nhits){}
     uint32_t hitsModuleStart[2001];
     std::vector<int32_t> charge;
     std::vector<float> xl, yl;
     std::vector<float> xe, ye;
     std::vector<uint16_t> mr;
     std::vector<uint16_t> mc;
-
+ 
+    uint32_t nHits;
     HitsOnGPU const * gpu_d=nullptr;
   };
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
index 186a84617e8d3..e9873247a119f 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
@@ -18,6 +18,11 @@
 
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+// gpu
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
+
 namespace {
 void fillNtuplets(RegionsSeedingHitSets::RegionFiller &seedingHitSetsFiller,
                   const OrderedHitSeeds &quadruplets) {
@@ -31,6 +36,10 @@ class CAHitNtupletHeterogeneousEDProducer
     : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
           heterogeneous::GPUCuda, heterogeneous::CPU>> {
 public:
+
+  using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
+
+
   CAHitNtupletHeterogeneousEDProducer(const edm::ParameterSet &iConfig);
   ~CAHitNtupletHeterogeneousEDProducer() = default;
 
@@ -50,6 +59,9 @@ class CAHitNtupletHeterogeneousEDProducer
 private:
   edm::EDGetTokenT<IntermediateHitDoublets> doubletToken_;
 
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
+
+
   edm::RunningAverage localRA_;
   CAHitQuadrupletGeneratorGPU GPUGenerator_;
   CAHitQuadrupletGenerator CPUGenerator_;
@@ -63,6 +75,7 @@ CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer(
     : HeterogeneousEDProducer(iConfig),
       doubletToken_(consumes<IntermediateHitDoublets>(
           iConfig.getParameter<edm::InputTag>("doublets"))),
+      tGpuHits(consumesHeterogeneous(iConfig.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
       GPUGenerator_(iConfig, consumesCollector()),
       CPUGenerator_(iConfig, consumesCollector()) {
   produces<RegionsSeedingHitSets>();
@@ -73,6 +86,9 @@ void CAHitNtupletHeterogeneousEDProducer::fillDescriptions(
   edm::ParameterSetDescription desc;
 
   desc.add<edm::InputTag>("doublets", edm::InputTag("hitPairEDProducer"));
+
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
+
   CAHitQuadrupletGeneratorGPU::fillDescriptions(desc);
   HeterogeneousEDProducer::fillPSetDescription(desc);
   auto label = "caHitQuadrupletHeterogeneousEDProducer";
@@ -106,6 +122,13 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
 
   seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();
 
+  edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
+  iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
+  auto const & gHits = *gh;
+//  auto nhits = gHits.nHits;
+
+  GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id());
+
   if (regionDoublets.empty()) {
     emptyRegionDoublets = true;
   } else {
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
index 4a7f3ce5aa5f1..b94aad94bf16b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -6,6 +6,8 @@
 #include "GPUCACell.h"
 #include "CAHitQuadrupletGeneratorGPU.h"
 
+#include "gpuPixelDoublets.h"
+
 __global__ void
 kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_,
              const GPULayerDoublets *gpuDoublets,
@@ -490,3 +492,18 @@ CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cud
   }
   return quadsInterface;
 }
+
+
+
+
+void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) {
+   auto nhits = hh.nHits;
+
+  float phiCut=0.06;
+  int threadsPerBlock = 256;
+  int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+
+  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(hh.gpu_d,phiCut);
+
+
+}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
index a6e9db46e68e7..717164c9cff62 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
@@ -3,6 +3,10 @@
 
 #include <cuda_runtime.h>
 
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+
+
 #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Utilities/interface/EDGetToken.h"
@@ -32,6 +36,11 @@ namespace edm {
 
 class CAHitQuadrupletGeneratorGPU {
 public:
+
+    using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+    using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+
+
     typedef LayerHitMapCache LayerCacheType;
 
     static constexpr unsigned int minLayers = 4;
@@ -49,6 +58,8 @@ class CAHitQuadrupletGeneratorGPU {
 
     void initEvent(const edm::Event& ev, const edm::EventSetup& es);
 
+    void buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream);
+
     void hitNtuplets(const IntermediateHitDoublets& regionDoublets,
                      const edm::EventSetup& es,
                      const SeedingLayerSetsHits& layers, cudaStream_t stream);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
new file mode 100644
index 0000000000000..019e775209f96
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -0,0 +1,161 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDoublets_h
+#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+#include <cmath>
+#include<algorithm>
+
+#include "DataFormats/Math/interface/approx_atan2.h"
+
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+namespace gpuPixelDoublets {
+
+
+  __device__
+  std::pair<int,int> 
+  findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) {
+
+  assert(iphicut>0);
+  
+  // find extreemes in top
+  int16_t minPhi = phiMe-iphicut;
+  int16_t maxPhi = phiMe+iphicut;
+
+  // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl;
+
+  // guess and adjust
+  auto findLimit = [&](int16_t mPhi) { 
+    int jm = float(0.5f*size)*(1.f+float(mPhi)/float(std::numeric_limits<short>::max()));
+    // std::cout << "jm for " << mPhi << ' ' << jm << std::endl;
+    jm = std::min(size-1,std::max(0,jm));
+    bool notDone=true;
+    while(jm>0 && mPhi<iphi[index[--jm]]){notDone=false;}
+    if (notDone) while(jm<size && mPhi>iphi[index[++jm]]){}
+    jm = std::min(size-1,std::max(0,jm));
+    return jm;
+  };
+ 
+  auto jmin = findLimit(minPhi);  
+  auto jmax = findLimit(maxPhi);
+  
+
+  /*
+  std::cout << "j min/max " << jmin << ' ' << jmax << std::endl;
+  std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl;  
+  std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl;
+  std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl;
+  */
+
+  return std::make_pair(jmin,jmax);
+  }
+
+
+  __global__
+  void getDoubletsFromSorted(int16_t * iphi, uint16_t * index, uint32_t * offsets, float phiCut) {
+
+  auto iphicut = phi2short(phiCut);
+
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>=offsets[9]) return; // get rid of last layer
+
+  assert(0==offsets[0]);
+  int top = (i>offsets[5]) ? 5: 0;
+  while (i>=offsets[++top]){};
+  assert(top<10);
+  auto bottom = top-1;
+  if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already)
+  assert(i>=offsets[bottom]);
+  assert(i<offsets[top]);
+
+  if (index[i]>= (offsets[top]-offsets[bottom])) {
+    printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]);
+    return;
+  }   
+
+  assert(index[i]<offsets[top]-offsets[bottom]);
+
+  int16_t phiMe = iphi[offsets[bottom]+index[i]];
+  
+  auto size = offsets[top+1] - offsets[top]; 
+  assert(size<std::numeric_limits<uint16_t>::max());
+
+  auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut);
+
+  auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) {
+    auto topPhi = iphi+offsets[top];
+    uint16_t imax =  std::numeric_limits<uint16_t>::max();
+    uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0;
+    int n=0;
+    for (uint16_t i = mymin+offset; i!=mymax; i++) {
+      assert(i<=imax);
+      uint16_t k = (i>mymax) ? i-offset : i;
+      assert(k<mysize);
+      assert(k>=mymin || k<mymax);
+      if (int16_t(topPhi[k]-phiMe)>2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) 
+         printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut);
+      n++;
+    }
+    int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin;
+    assert(n==tot);
+  };
+
+  slidingWindow(size,jLimits.first,jLimits.second);
+ 
+  }
+
+  template<typename Hist>
+  __device__
+  void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) {
+
+  auto iphicut = phi2short(phiCut);
+
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>=offsets[9]) return; // get rid of last layer
+
+  assert(0==offsets[0]);
+  int top = (i>offsets[5]) ? 5: 0;
+  while (i>=offsets[++top]){};
+  assert(top<10);
+  auto bottom = top-1;
+  if (bottom==3 || bottom==6) return; // do not have UP... (9 we got rid already)
+  assert(i>=offsets[bottom]);
+  assert(i<offsets[top]);
+
+  auto mep = iphi[i];
+
+  auto kl = hist[top].bin(mep-iphicut);
+  auto kh = hist[top].bin(mep+iphicut);
+
+  auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();};
+
+  int tot=0;
+  int nmin=0;
+  auto khh=kh;
+  incr(khh); 
+  for (auto kk=kl; kk!=khh; incr(kk)) {
+     if (kk!=kl && kk!=kh) nmin+=hist[top].size(kk);
+     for (auto p=hist[top].begin(kk); p<hist[top].end(kk); ++p) {
+        if ( std::min(std::abs(int16_t(iphi[*p]-mep)), std::abs(int16_t(mep-iphi[*p]))) > iphicut ) continue;
+        ++tot;
+     }
+  }
+  if (0==hist[top].nspills) assert(tot>=nmin);
+  // look in spill bin as well....
+
+  }
+
+  void 
+  __global__
+  getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) {
+    auto const & hh = *hhp;
+    doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut);
+
+  }
+
+} // namespace end
+
+#endif
diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
index 7cc02d9313e26..c767b1e68936a 100644
--- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
+++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml
@@ -1,5 +1,14 @@
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/Producer"/>
+<use name="HeterogeneousCore/Product"/>
 <use name="SimTracker/TrackerHitAssociation" />
 <use name="SimDataFormats/TrackingAnalysis" />
-<library file="*.cc" name="SimTrackerTrackerHitAssociationPlugins">
+
+<use name="Geometry/TrackerGeometryBuilder" />
+<use name="Geometry/Records" />
+
+<library file="*.cc *.cu" name="SimTrackerTrackerHitAssociationPlugins">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
new file mode 100644
index 0000000000000..7a3e4eb83161d
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
@@ -0,0 +1,220 @@
+#include "ClusterSLOnGPU.h"
+
+// for the "packing"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
+#include <limits>
+#include<cassert>
+#include<atomic>
+#include <mutex>
+
+
+using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
+
+__global__
+void simLink(clusterSLOnGPU::DigisOnGPU const * ddp, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) {
+
+  assert(slp==slp->me_d);
+
+  constexpr int32_t invTK = 0; // std::numeric_limits<int32_t>::max();
+ 
+  constexpr uint16_t InvId=9999; // must be > MaxNumModules
+  
+  auto const & dd = *ddp;
+  auto const & hh = *hhp;
+  auto const & sl = *slp;
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  
+  if (i>ndigis) return;
+
+  auto id = dd.moduleInd_d[i];
+  if (InvId==id) return;
+  assert(id<2000);
+
+  auto ch = pixelgpudetails::pixelToChannel(dd.xx_d[i], dd.yy_d[i]);
+  auto first = hh.hitsModuleStart_d[id];
+  auto cl = first + dd.clus_d[i];
+  assert(cl<256*2000);
+  
+  const std::array<uint32_t,4> me{{id,ch,0,0}};
+
+  auto less = [] __device__ __host__ (std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+     return a[0]<b[0] || ( !(b[0]<a[0]) && a[1]<b[1]); // in this context we do not care of [2] 
+  };
+
+  auto equal = [] __device__ __host__ (std::array<uint32_t,4> const & a, std::array<uint32_t,4> const & b)->bool {
+     return a[0]==b[0] && a[1]==b[1]; // in this context we do not care of [2]
+  };
+
+  auto const * b = sl.links_d;
+  auto const * e = b+n;
+
+  auto p = cuda_std::lower_bound(b,e,me,less);
+  int32_t j = p-sl.links_d;
+  assert(j>=0);
+
+  auto getTK = [&](int i) { auto const & l = sl.links_d[i]; return l[2];};
+
+  j = std::min(int(j),int(n-1));
+  if (equal(me,sl.links_d[j])) {
+    auto const itk = j;
+    auto const tk = getTK(j);
+    auto old = atomicCAS(&sl.tkId_d[cl],invTK,itk);
+    if (invTK==old || tk==getTK(old)) { 
+       atomicAdd(&sl.n1_d[cl],1);
+//         sl.n1_d[cl] = tk;
+    } else {
+      auto old = atomicCAS(&sl.tkId2_d[cl],invTK,itk);
+      if (invTK==old || tk==getTK(old)) atomicAdd(&sl.n2_d[cl],1);
+    }
+//    if (92==tk) printf("TK3: %d %d %d  %d: %d,%d ?%d?%d\n", j, cl, id, i, dd.xx_d[i], dd.yy_d[i], hh.mr_d[cl], hh.mc_d[cl]);
+  } 
+  /*
+  else {
+    auto const & k=sl.links_d[j];
+    auto const & kk = j+1<n ? sl.links_d[j+1] : k;
+    printf("digi not found %d:%d closest %d:%d:%d, %d:%d:%d\n",id,ch, k[0],k[1],k[2], kk[0],kk[1],kk[2]);
+  }
+  */
+
+}
+
+__global__
+void verifyZero(int ev, clusterSLOnGPU::DigisOnGPU const * ddp, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
+  auto i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>nhits) return;
+
+//  auto const & dd = *ddp;
+//  auto const & hh = *hhp;
+  auto const & sl = *slp;
+
+  assert(sl.tkId_d[i]==0);
+  auto const & tk = sl.links_d[0];
+  assert(tk[0]==0);
+  assert(tk[1]==0);
+  assert(tk[2]==0);
+  assert(tk[3]==0);
+
+  // if (i==0) printf("xx_d gpu %x\n",dd.xx_d);
+
+}
+
+
+__global__
+void dumpLink(int first, int ev, clusterSLOnGPU::HitsOnGPU const * hhp, uint32_t nhits, ClusterSLGPU const * slp) {
+  auto i = first + blockIdx.x*blockDim.x + threadIdx.x;
+  if (i>nhits) return;
+
+  auto const & hh = *hhp;
+  auto const & sl = *slp;
+
+  auto const & tk1 = sl.links_d[sl.tkId_d[i]];
+  auto const & tk2 = sl.links_d[sl.tkId2_d[i]];
+
+  printf("HIT: %d %d %d %d %f %f %f %f %d %d %d %d %d %d %d\n",ev, i, 
+         hh.detInd_d[i], hh.charge_d[i], 
+         hh.xg_d[i],hh.yg_d[i],hh.zg_d[i],hh.rg_d[i],hh.iphi_d[i], 
+         tk1[2],tk1[3],sl.n1_d[i],
+         tk2[2],tk2[3],sl.n2_d[i]
+        );
+
+}
+
+
+
+namespace clusterSLOnGPU {
+
+ constexpr uint32_t invTK = 0; // std::numeric_limits<int32_t>::max();
+
+ void printCSVHeader() {
+      printf("HIT: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", "ev", "ind",
+         "det", "charge",	
+         "xg","yg","zg","rg","iphi", 
+         "tkId","pt","n1","tkId2","pt2","n2" 
+        );
+     }
+
+
+  std::atomic<int> evId(0);
+  std::once_flag doneCSVHeader;
+
+  Kernel::Kernel(cuda::stream_t<>& stream, bool dump) : doDump(dump) {
+    if (doDump) std::call_once(doneCSVHeader,printCSVHeader);
+    alloc(stream);
+  }
+
+
+  void
+  Kernel::alloc(cuda::stream_t<>& stream) {
+   cudaCheck(cudaMalloc((void**) & slgpu.links_d,(ClusterSLGPU::MAX_DIGIS)*sizeof(std::array<uint32_t,4>)));
+
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.tkId2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n1_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+   cudaCheck(cudaMalloc((void**) & slgpu.n2_d,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t)));
+
+
+   cudaCheck(cudaMalloc((void**) & slgpu.me_d, sizeof(ClusterSLGPU)));
+   cudaCheck(cudaMemcpyAsync(slgpu.me_d, &slgpu, sizeof(ClusterSLGPU), cudaMemcpyDefault, stream.id()));
+  }
+
+ void
+  Kernel::deAlloc() {
+   cudaCheck(cudaFree(slgpu.links_d));
+   cudaCheck(cudaFree(slgpu.tkId_d));
+   cudaCheck(cudaFree(slgpu.tkId2_d));
+   cudaCheck(cudaFree(slgpu.n1_d));
+   cudaCheck(cudaFree(slgpu.n2_d));
+   cudaCheck(cudaFree(slgpu.me_d));
+}
+
+
+  void
+  Kernel::zero(cudaStream_t stream) {
+   cudaCheck(cudaMemsetAsync(slgpu.tkId_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.tkId2_d,invTK,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n1_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+   cudaCheck(cudaMemsetAsync(slgpu.n2_d,0,(ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream));
+  }
+
+
+  void 
+  Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) {
+    
+    /*
+    size_t pfs = 16*1024*1024;
+    // cudaDeviceSetLimit(cudaLimitPrintfFifoSize,pfs);
+    cudaDeviceGetLimit(&pfs,cudaLimitPrintfFifoSize);
+    std::cout << "cudaLimitPrintfFifoSize " << pfs << std::endl;
+    */
+
+    zero(stream.id());
+
+    ClusterSLGPU const & sl = slgpu;
+
+    int ev = ++evId;
+    int threadsPerBlock = 256;
+
+    int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
+    verifyZero<<<blocks, threadsPerBlock, 0, stream.id()>>>(ev, dd.me_d, hh.gpu_d, nhits, sl.me_d);
+
+
+    blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock;
+
+    assert(sl.me_d);
+    simLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(dd.me_d,ndigis, hh.gpu_d, sl.me_d,n);
+
+    if (doDump) {
+      cudaStreamSynchronize(stream.id());	// flush previous printf
+      // one line == 200B so each kernel can print only 5K lines....
+      blocks = 16; // (nhits + threadsPerBlock - 1) / threadsPerBlock;
+      for (int first=0; first<int(nhits); first+=blocks*threadsPerBlock) {
+        dumpLink<<<blocks, threadsPerBlock, 0, stream.id()>>>(first, ev, hh.gpu_d, nhits, sl.me_d);
+        cudaStreamSynchronize(stream.id());
+      }
+    }
+    cudaCheck(cudaGetLastError());
+
+  }
+
+}
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
new file mode 100644
index 0000000000000..352337e783105
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h
@@ -0,0 +1,44 @@
+#ifndef SimTrackerTrackerHitAssociationClusterSLOnGPU_H
+#define SimTrackerTrackerHitAssociationClusterSLOnGPU_H
+
+#include <cuda_runtime.h>
+#include <cuda/api_wrappers.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+
+#include "trackerHitAssociationHeterogeneousProduct.h"
+
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h"
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
+
+
+
+
+namespace clusterSLOnGPU {
+
+  using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
+  using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct;
+
+  using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+  using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+  using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+
+
+  class Kernel {
+  public:
+    Kernel(cuda::stream_t<>& stream, bool dump);
+    ~Kernel() {deAlloc();}
+    void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream);
+    GPUProduct getProduct() { return GPUProduct{slgpu.me_d};}
+    
+  private:
+     void alloc(cuda::stream_t<>& stream);
+     void deAlloc(); 
+     void zero(cudaStream_t stream);
+  public:
+     ClusterSLGPU slgpu; 
+     bool doDump;
+  };
+}
+
+#endif
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
new file mode 100644
index 0000000000000..6cffed8ad0cd5
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc
@@ -0,0 +1,442 @@
+#include <memory>
+#include <vector>
+#include <utility>
+
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+
+
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "DataFormats/Common/interface/DetSetVector.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "DataFormats/DetId/interface/DetId.h"
+#include "DataFormats/SiPixelDetId/interface/PixelChannelIdentifier.h"
+#include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/SiStripCluster/interface/SiStripCluster.h"
+#include "DataFormats/Phase2TrackerCluster/interface/Phase2TrackerCluster1D.h"
+
+#include "SimDataFormats/Track/interface/SimTrackContainer.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/StripDigiSimLink.h"
+#include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h"
+#include "DataFormats/Phase2TrackerDigi/interface/Phase2TrackerDigi.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticle.h"
+#include "SimDataFormats/TrackingAnalysis/interface/TrackingParticleFwd.h"
+#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+
+
+// gpu
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h"
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "ClusterSLOnGPU.h"
+
+class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer<heterogeneous::HeterogeneousDevices<
+          heterogeneous::GPUCuda, heterogeneous::CPU>>
+{
+public:
+  typedef std::vector<OmniClusterRef> OmniClusterCollection;
+
+  using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU;
+  using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct;
+  using CPUProduct = trackerHitAssociationHeterogeneousProduct::CPUProduct;
+  using Output = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct;
+
+  using PixelDigiClustersH = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster;
+  using PixelRecHitsH = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit;
+
+  explicit ClusterTPAssociationHeterogeneous(const edm::ParameterSet&);
+  ~ClusterTPAssociationHeterogeneous() = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+
+  void beginStreamGPUCuda(edm::StreamID streamId,
+                          cuda::stream_t<> &cudaStream) override;
+
+  void acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) override;
+  void produceGPUCuda(edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) override;
+  void produceCPU(edm::HeterogeneousEvent &iEvent,
+                  const edm::EventSetup &iSetup) override;
+
+  void makeMap(const edm::HeterogeneousEvent &iEvent);
+  std::unique_ptr<CPUProduct> produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es);
+
+  template <typename T>
+  std::vector<std::pair<uint32_t, EncodedEventId> >
+  getSimTrackId(const edm::Handle<edm::DetSetVector<T> >& simLinks, const DetId& detId, uint32_t channel) const;
+
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<StripDigiSimLink> > sistripSimLinksToken_;
+  edm::EDGetTokenT<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinksToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiPixelCluster> > pixelClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<SiStripCluster> > stripClustersToken_;
+  edm::EDGetTokenT<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClustersToken_;
+  edm::EDGetTokenT<TrackingParticleCollection> trackingParticleToken_;
+
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuDigis;
+  edm::EDGetTokenT<HeterogeneousProduct> tGpuHits;
+
+  std::unique_ptr<clusterSLOnGPU::Kernel> gpuAlgo;
+
+  std::map<std::pair<size_t, EncodedEventId>, TrackingParticleRef> mapping;
+ 
+  std::vector<std::array<uint32_t,4>> digi2tp;
+
+  bool doDump;
+
+};
+
+ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm::ParameterSet & cfg)
+  : HeterogeneousEDProducer(cfg),
+    sipixelSimLinksToken_(consumes<edm::DetSetVector<PixelDigiSimLink> >(cfg.getParameter<edm::InputTag>("pixelSimLinkSrc"))),
+    sistripSimLinksToken_(consumes<edm::DetSetVector<StripDigiSimLink> >(cfg.getParameter<edm::InputTag>("stripSimLinkSrc"))),
+    siphase2OTSimLinksToken_(consumes<edm::DetSetVector<PixelDigiSimLink> >(cfg.getParameter<edm::InputTag>("phase2OTSimLinkSrc"))),
+    pixelClustersToken_(consumes<edmNew::DetSetVector<SiPixelCluster> >(cfg.getParameter<edm::InputTag>("pixelClusterSrc"))),
+    stripClustersToken_(consumes<edmNew::DetSetVector<SiStripCluster> >(cfg.getParameter<edm::InputTag>("stripClusterSrc"))),
+    phase2OTClustersToken_(consumes<edmNew::DetSetVector<Phase2TrackerCluster1D> >(cfg.getParameter<edm::InputTag>("phase2OTClusterSrc"))),
+    trackingParticleToken_(consumes<TrackingParticleCollection>(cfg.getParameter<edm::InputTag>("trackingParticleSrc"))),
+    tGpuDigis(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelDigiClusterSrc"))),
+    tGpuHits(consumesHeterogeneous(cfg.getParameter<edm::InputTag>("heterogeneousPixelRecHitSrc"))),
+    doDump(cfg.getParameter<bool>("dumpCSV"))
+{
+  produces<HeterogeneousProduct>();
+}
+
+void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("simTrackSrc",     edm::InputTag("g4SimHits"));
+  desc.add<edm::InputTag>("pixelSimLinkSrc", edm::InputTag("simSiPixelDigis"));
+  desc.add<edm::InputTag>("stripSimLinkSrc", edm::InputTag("simSiStripDigis"));
+  desc.add<edm::InputTag>("phase2OTSimLinkSrc", edm::InputTag("simSiPixelDigis","Tracker"));
+  desc.add<edm::InputTag>("pixelClusterSrc", edm::InputTag("siPixelClusters"));
+  desc.add<edm::InputTag>("stripClusterSrc", edm::InputTag("siStripClusters"));
+  desc.add<edm::InputTag>("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters"));
+  desc.add<edm::InputTag>("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth"));
+  desc.add<edm::InputTag>("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersHeterogeneous"));
+  desc.add<edm::InputTag>("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitHeterogeneous"));
+
+  desc.add<bool>("dumpCSV",false);
+
+  HeterogeneousEDProducer::fillPSetDescription(desc);
+
+  descriptions.add("tpClusterProducerHeterogeneousDefault", desc);
+}
+
+
+void ClusterTPAssociationHeterogeneous::beginStreamGPUCuda(edm::StreamID streamId,
+                          cuda::stream_t<> &cudaStream) {
+
+   gpuAlgo = std::make_unique<clusterSLOnGPU::Kernel>(cudaStream,doDump);
+
+}
+
+void ClusterTPAssociationHeterogeneous::makeMap(const edm::HeterogeneousEvent &iEvent) {
+    // TrackingParticle
+    edm::Handle<TrackingParticleCollection>  TPCollectionH;
+    iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+
+
+    // prepare temporary map between SimTrackId and TrackingParticle index
+    mapping.clear();
+    for (TrackingParticleCollection::size_type itp = 0;
+                                             itp < TPCollectionH.product()->size(); ++itp) {
+      TrackingParticleRef trackingParticle(TPCollectionH, itp);
+
+      // SimTracks inside TrackingParticle
+      EncodedEventId eid(trackingParticle->eventId());
+      for (auto itrk  = trackingParticle->g4Track_begin();
+                itrk != trackingParticle->g4Track_end(); ++itrk) {
+        std::pair<uint32_t, EncodedEventId> trkid(itrk->trackId(), eid);
+        //std::cout << "creating map for id: " << trkid.first << " with tp: " << trackingParticle.key() << std::endl;
+        mapping.insert(std::make_pair(trkid, trackingParticle));
+      }
+    }
+
+
+}
+
+
+void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
+
+    edm::ESHandle<TrackerGeometry> geom;
+    iSetup.get<TrackerDigiGeometryRecord>().get( geom );
+
+    // Pixel DigiSimLink
+    edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
+    //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+    iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
+
+    // TrackingParticle
+    edm::Handle<TrackingParticleCollection>  TPCollectionH;
+    iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+
+    makeMap(iEvent);
+   
+    //  gpu stuff ------------------------
+
+    // std::cout << "In tpsimlink " << mapping.size() << std::endl;
+
+    edm::Handle<siPixelRawToClusterHeterogeneousProduct::GPUProduct> gd;
+    edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
+    iEvent.getByToken<siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster>(tGpuDigis, gd);
+    iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
+    auto const & gDigis = *gd;
+    auto const & gHits = *gh;
+    auto ndigis = gDigis.nDigis;
+    auto nhits = gHits.nHits;
+
+    uint32_t nn=0, ng=0, ng10=0;
+    digi2tp.clear();
+    {std::array<uint32_t,4> a{{0,0,0,0}}; digi2tp.push_back(a);} // put at 0 0
+    for (auto const & links : *sipixelSimLinks) {
+      DetId detId(links.detId());
+      const GeomDetUnit * genericDet = geom->idToDetUnit(detId);
+      uint32_t gind = genericDet->index();
+      for (auto const & link : links) {
+        ++ng;
+        if (link.fraction() > 0.3f) ++ng10;
+        if (link.fraction() < 0.5f) continue;
+        auto tkid = std::make_pair(link.SimTrackId(), link.eventId());
+        auto ipos = mapping.find(tkid);
+        if (ipos != mapping.end()) {
+            uint32_t pt = 1000*(*ipos).second->pt();
+            ++nn;
+            std::array<uint32_t,4> a{{gind,uint32_t(link.channel()),(*ipos).second.key(),pt}};
+            digi2tp.push_back(a);
+        }
+      }
+    }
+    std::sort(digi2tp.begin(),digi2tp.end());
+
+    // std::cout << "In tpsimlink found " << nn << " valid link out of " << ng << '/' << ng10 << ' ' << digi2tp.size() << std::endl;
+
+    cudaCheck(cudaMemcpyAsync(gpuAlgo->slgpu.links_d, digi2tp.data(), sizeof(std::array<uint32_t,4>)*digi2tp.size(), cudaMemcpyDefault, cudaStream.id()));
+    gpuAlgo->algo(gDigis, ndigis, gHits, nhits, digi2tp.size(),cudaStream);
+
+  //  end gpu stuff ---------------------
+
+
+}
+
+void ClusterTPAssociationHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent &iEvent,
+                      const edm::EventSetup &iSetup,
+                      cuda::stream_t<> &cudaStream) {
+
+  auto output = std::make_unique<GPUProduct>(gpuAlgo->getProduct());
+
+  auto legacy = produceLegacy(iEvent,iSetup).release();
+
+  iEvent.put<Output>(std::move(output), [legacy](const GPUProduct& hits, CPUProduct& cpu) {
+                       cpu = *legacy; delete legacy;
+                     });
+
+}
+ 
+		
+void ClusterTPAssociationHeterogeneous::produceCPU(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+
+   makeMap(iEvent);
+   iEvent.put(std::move(produceLegacy(iEvent,es)));
+
+}
+
+std::unique_ptr<ClusterTPAssociationHeterogeneous::CPUProduct> 
+ClusterTPAssociationHeterogeneous::produceLegacy(edm::HeterogeneousEvent &iEvent, const edm::EventSetup& es) {
+
+
+  // Pixel DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > sipixelSimLinks;
+  //  iEvent.getByLabel(_pixelSimLinkSrc, sipixelSimLinks);
+  iEvent.getByToken(sipixelSimLinksToken_,sipixelSimLinks);
+
+  // SiStrip DigiSimLink
+  edm::Handle<edm::DetSetVector<StripDigiSimLink> > sistripSimLinks;
+  iEvent.getByToken(sistripSimLinksToken_,sistripSimLinks);
+
+  // Phase2 OT DigiSimLink
+  edm::Handle<edm::DetSetVector<PixelDigiSimLink> > siphase2OTSimLinks;
+  iEvent.getByToken(siphase2OTSimLinksToken_, siphase2OTSimLinks);
+
+  // Pixel Cluster
+  edm::Handle<edmNew::DetSetVector<SiPixelCluster> > pixelClusters;
+  bool foundPixelClusters = iEvent.getByToken(pixelClustersToken_,pixelClusters);
+
+  // Strip Cluster
+  edm::Handle<edmNew::DetSetVector<SiStripCluster> > stripClusters;
+  bool foundStripClusters = iEvent.getByToken(stripClustersToken_,stripClusters);
+
+  // Phase2 Cluster
+  edm::Handle<edmNew::DetSetVector<Phase2TrackerCluster1D> > phase2OTClusters;
+  bool foundPhase2OTClusters = iEvent.getByToken(phase2OTClustersToken_, phase2OTClusters);
+
+  // TrackingParticle
+  edm::Handle<TrackingParticleCollection>  TPCollectionH;
+  iEvent.getByToken(trackingParticleToken_,TPCollectionH);
+
+  auto output = std::make_unique<CPUProduct>(TPCollectionH);
+  auto & clusterTPList = output->collection;
+
+  if ( foundPixelClusters ) {
+    // Pixel Clusters 
+    for (edmNew::DetSetVector<SiPixelCluster>::const_iterator iter  = pixelClusters->begin(); 
+                                                            iter != pixelClusters->end(); ++iter) {
+      uint32_t detid = iter->id(); 
+      DetId detId(detid);
+      edmNew::DetSet<SiPixelCluster> link_pixel = (*iter);
+      for (edmNew::DetSet<SiPixelCluster>::const_iterator di  = link_pixel.begin(); 
+	   di != link_pixel.end(); ++di) {
+	const SiPixelCluster& cluster = (*di);
+	edm::Ref<edmNew::DetSetVector<SiPixelCluster>, SiPixelCluster> c_ref = 
+	  edmNew::makeRefTo(pixelClusters, di);
+	
+	std::set<std::pair<uint32_t, EncodedEventId> > simTkIds; 
+	for (int irow = cluster.minPixelRow(); irow <= cluster.maxPixelRow(); ++irow) {
+	  for (int icol = cluster.minPixelCol(); icol <= cluster.maxPixelCol(); ++icol) {
+	    uint32_t channel = PixelChannelIdentifier::pixelToChannel(irow, icol);
+	    std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<PixelDigiSimLink>(sipixelSimLinks, detId, channel));
+	    if (trkid.empty()) continue; 
+	    simTkIds.insert(trkid.begin(),trkid.end());
+	  }
+	}
+	for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin(); 
+	     iset != simTkIds.end(); iset++) {
+	  auto ipos = mapping.find(*iset);
+	  if (ipos != mapping.end()) {
+	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
+	    clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
+	  }
+	}
+      }
+    }
+  }
+
+  if ( foundStripClusters ) {
+    // Strip Clusters
+    for (edmNew::DetSetVector<SiStripCluster>::const_iterator iter  = stripClusters->begin(false), eter = stripClusters->end(false); 
+	 iter != eter; ++iter) {
+      if (!(*iter).isValid()) continue;
+      uint32_t detid = iter->id();  
+      DetId detId(detid);
+      edmNew::DetSet<SiStripCluster> link_strip = (*iter);
+      for (edmNew::DetSet<SiStripCluster>::const_iterator di  = link_strip.begin(); 
+	   di != link_strip.end(); di++) {
+	const SiStripCluster& cluster = (*di);
+	edm::Ref<edmNew::DetSetVector<SiStripCluster>, SiStripCluster> c_ref = 
+	  edmNew::makeRefTo(stripClusters, di);
+	
+	std::set<std::pair<uint32_t, EncodedEventId> > simTkIds; 
+	int first  = cluster.firstStrip();     
+	int last   = first + cluster.amplitudes().size();
+	
+	for (int istr = first; istr < last; ++istr) {
+	  std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<StripDigiSimLink>(sistripSimLinks, detId, istr));
+	  if (trkid.empty()) continue; 
+	  simTkIds.insert(trkid.begin(),trkid.end());
+	}
+	for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin(); 
+	     iset != simTkIds.end(); iset++) {
+	  auto ipos = mapping.find(*iset);
+	  if (ipos != mapping.end()) {
+	    //std::cout << "cluster in detid: " << detid << " from tp: " << ipos->second.key() << " " << iset->first << std::endl;
+	    clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
+	  } 
+	}
+      } 
+    }
+  }
+
+  if ( foundPhase2OTClusters ) {
+
+    // Phase2 Clusters
+    if(phase2OTClusters.isValid()){
+      for (edmNew::DetSetVector<Phase2TrackerCluster1D>::const_iterator iter  = phase2OTClusters->begin(false), eter = phase2OTClusters->end(false);
+                                                                iter != eter; ++iter) {
+        if (!(*iter).isValid()) continue;
+        uint32_t detid = iter->id();
+        DetId detId(detid);
+        edmNew::DetSet<Phase2TrackerCluster1D> link_phase2 = (*iter);
+        for (edmNew::DetSet<Phase2TrackerCluster1D>::const_iterator di  = link_phase2.begin();
+             di != link_phase2.end(); di++) {
+          const Phase2TrackerCluster1D& cluster = (*di);
+          edm::Ref<edmNew::DetSetVector<Phase2TrackerCluster1D>, Phase2TrackerCluster1D> c_ref =
+            edmNew::makeRefTo(phase2OTClusters, di);
+    
+          std::set<std::pair<uint32_t, EncodedEventId> > simTkIds;
+    
+          for (unsigned int istr(0); istr < cluster.size(); ++istr) {
+            uint32_t channel = Phase2TrackerDigi::pixelToChannel(cluster.firstRow() + istr, cluster.column());
+            std::vector<std::pair<uint32_t, EncodedEventId> > trkid(getSimTrackId<PixelDigiSimLink>(siphase2OTSimLinks, detId, channel));
+            if (trkid.empty()) continue;
+            simTkIds.insert(trkid.begin(),trkid.end());
+          }
+    
+          for (std::set<std::pair<uint32_t, EncodedEventId> >::const_iterator iset  = simTkIds.begin();
+                                                                              iset != simTkIds.end(); iset++) {
+            auto ipos = mapping.find(*iset);
+            if (ipos != mapping.end()) {
+              clusterTPList.emplace_back(OmniClusterRef(c_ref), ipos->second);
+            }
+          }
+        }
+      }
+    }
+
+  }
+
+
+  clusterTPList.sortAndUnique();
+
+  return output;
+  mapping.clear();
+}
+
+template <typename T>
+std::vector<std::pair<uint32_t, EncodedEventId> >
+//std::pair<uint32_t, EncodedEventId>
+ClusterTPAssociationHeterogeneous::getSimTrackId(const edm::Handle<edm::DetSetVector<T> >& simLinks,
+                                            const DetId& detId, uint32_t channel) const 
+{
+  //std::pair<uint32_t, EncodedEventId> simTrkId;
+  std::vector<std::pair<uint32_t, EncodedEventId> > simTrkId;
+  auto isearch = simLinks->find(detId);
+  if (isearch != simLinks->end()) {
+    // Loop over DigiSimLink in this det unit
+    edm::DetSet<T> link_detset = (*isearch);
+    for (typename edm::DetSet<T>::const_iterator it  = link_detset.data.begin(); 
+                                                 it != link_detset.data.end(); ++it) {
+      if (channel == it->channel()) {
+        simTrkId.push_back(std::make_pair(it->SimTrackId(), it->eventId()));
+      } 
+    }
+  }
+  return simTrkId;
+}
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneous);
diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
new file mode 100644
index 0000000000000..d8b9e099c3474
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneousConverter.cc
@@ -0,0 +1,59 @@
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
+
+#include "trackerHitAssociationHeterogeneousProduct.h"
+
+
+class ClusterTPAssociationHeterogeneousConverter: public edm::global::EDProducer<> {
+public:
+
+  using Input = trackerHitAssociationHeterogeneousProduct::ClusterTPAHeterogeneousProduct;
+  using Product = ClusterTPAssociation;
+
+  explicit ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig);
+  ~ClusterTPAssociationHeterogeneousConverter() = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  edm::EDGetTokenT<HeterogeneousProduct> token_;
+};
+
+ClusterTPAssociationHeterogeneousConverter::ClusterTPAssociationHeterogeneousConverter(edm::ParameterSet const& iConfig):
+  token_(consumes<HeterogeneousProduct>(iConfig.getParameter<edm::InputTag>("src")))
+{
+  produces<Product>(); 
+}
+
+void ClusterTPAssociationHeterogeneousConverter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag("tpClusterProducerHeterogeneos"));
+
+  descriptions.add("tpClusterHeterogeneousConverter",desc);
+}
+
+namespace {
+  template <typename T>
+  auto copy_unique(const T& t) {
+    return std::make_unique<T>(t);
+  }
+}
+
+void ClusterTPAssociationHeterogeneousConverter::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  edm::Handle<HeterogeneousProduct> hinput;
+  iEvent.getByToken(token_, hinput);
+
+  const auto& input = hinput->get<Input>().getProduct<HeterogeneousDevice::kCPU>();
+
+  iEvent.put(copy_unique(input.collection));
+}
+
+
+DEFINE_FWK_MODULE(ClusterTPAssociationHeterogeneousConverter);
diff --git a/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
new file mode 100644
index 0000000000000..c3f86ddf855c8
--- /dev/null
+++ b/SimTracker/TrackerHitAssociation/plugins/trackerHitAssociationHeterogeneousProduct.h
@@ -0,0 +1,47 @@
+#ifndef SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
+#define SimTrackerTrackerHitAssociationClusterHeterogeneousProduct_H
+
+#ifndef	__CUDACC__
+#include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h"
+#endif
+
+#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h"
+
+
+namespace trackerHitAssociationHeterogeneousProduct {
+
+#ifndef __CUDACC__
+  struct CPUProduct {
+    CPUProduct() = default;
+    template<typename T> 
+    explicit CPUProduct(T const & t) : collection(t){}
+    ClusterTPAssociation collection;
+  };
+#endif
+
+  struct ClusterSLGPU {
+
+   ClusterSLGPU * me_d=nullptr;
+   std::array<uint32_t,4> * links_d;
+   uint32_t * tkId_d;
+   uint32_t * tkId2_d;
+   uint32_t * n1_d;
+   uint32_t * n2_d;
+
+   static constexpr uint32_t MAX_DIGIS = 2000*150;
+   static constexpr uint32_t MaxNumModules = 2000;
+
+  };
+
+  struct GPUProduct {
+     ClusterSLGPU *  gpu_d=nullptr;
+  };
+
+#ifndef	__CUDACC__
+   using ClusterTPAHeterogeneousProduct = HeterogeneousProductImpl<heterogeneous::CPUProduct<CPUProduct>,
+                                                                   heterogeneous::GPUCudaProduct<GPUProduct> >;
+#endif
+
+}
+
+#endif
diff --git a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
index 8757a67226fb8..e8330c074dfac 100644
--- a/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
+++ b/SimTracker/TrackerHitAssociation/python/tpClusterProducer_cfi.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 from SimTracker.TrackerHitAssociation.tpClusterProducerDefault_cfi import tpClusterProducerDefault as _tpClusterProducerDefault
 
@@ -18,3 +19,12 @@
     stripSimLinkSrc = "mixData:StripDigiSimLink",
     phase2OTSimLinkSrc = "mixData:Phase2OTDigiSimLink",
 )
+
+
+from SimTracker.TrackerHitAssociation.tpClusterProducerHeterogeneousDefault_cfi import tpClusterProducerHeterogeneousDefault as _tpClusterProducerHeterogeneous
+tpClusterProducerHeterogeneous = _tpClusterProducerHeterogeneous.clone()
+
+from SimTracker.TrackerHitAssociation.tpClusterHeterogeneousConverter_cfi import tpClusterHeterogeneousConverter as _tpHeterogeneousConverter
+
+gpu.toReplaceWith(tpClusterProducer, _tpHeterogeneousConverter.clone())
+
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index b6251473fcab4..dff571862bb46 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -709,9 +709,14 @@ def _uniqueFirstLayers(layerList):
 
 
 ### Pixel tracking only mode (placeholder for now)
-tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone(
-    pixelClusterSrc = "siPixelClustersPreSplitting"
+
+tpClusterProducerHeterogeneousPixelTrackingOnly = tpClusterProducerHeterogeneous.clone(
+   pixelClusterSrc = "siPixelClustersPreSplitting"
 )
+tpClusterProducerPixelTrackingOnly = tpClusterProducer.clone()
+# Need to use the modifier to customize because the exact EDProducer type depends on the modifier
+gpu.toModify(tpClusterProducerPixelTrackingOnly, src = "tpClusterProducerHeterogeneousPixelTrackingOnly")
+
 quickTrackAssociatorByHitsPixelTrackingOnly = quickTrackAssociatorByHits.clone(
     cluster2TPSrc = "tpClusterProducerPixelTrackingOnly"
 )
@@ -739,12 +744,18 @@ def _uniqueFirstLayers(layerList):
 tracksValidationTruthPixelTrackingOnly.replace(quickTrackAssociatorByHits, quickTrackAssociatorByHitsPixelTrackingOnly)
 tracksValidationTruthPixelTrackingOnly.replace(trackingParticleRecoTrackAsssociation, trackingParticlePixelTrackAsssociation)
 tracksValidationTruthPixelTrackingOnly.replace(VertexAssociatorByPositionAndTracks, PixelVertexAssociatorByPositionAndTracks)
+
+_tracksValidationTruthPixelTrackingOnlyGPU = tracksValidationTruthPixelTrackingOnly.copy()
+_tracksValidationTruthPixelTrackingOnlyGPU.insert(0, tpClusterProducerHeterogeneousPixelTrackingOnly)
+gpu.toReplaceWith(tracksValidationTruthPixelTrackingOnly, _tracksValidationTruthPixelTrackingOnlyGPU)
+
 tracksValidationPixelTrackingOnly = cms.Sequence(
     tracksValidationTruthPixelTrackingOnly +
     trackValidatorPixelTrackingOnly
 )
 
 
+
 ### Lite mode (only generalTracks and HP)
 trackValidatorLite = trackValidator.clone(
     label = ["generalTracks", "cutsRecoTracksHp"]
diff --git a/Validation/RecoTrack/python/associators_cff.py b/Validation/RecoTrack/python/associators_cff.py
index 73c6f416e509c..8c2c8b7c7c85e 100644
--- a/Validation/RecoTrack/python/associators_cff.py
+++ b/Validation/RecoTrack/python/associators_cff.py
@@ -3,7 +3,8 @@
 #### TrackAssociation
 import SimTracker.TrackAssociatorProducers.quickTrackAssociatorByHits_cfi
 import SimTracker.TrackAssociatorProducers.trackAssociatorByPosition_cfi
-from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer
+# from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducer as _tpClusterProducer
+from SimTracker.TrackerHitAssociation.tpClusterProducer_cfi import tpClusterProducerHeterogeneous as _tpClusterProducer
 from SimTracker.TrackAssociation.trackingParticleRecoTrackAsssociation_cfi import trackingParticleRecoTrackAsssociation as _trackingParticleRecoTrackAsssociation
 
 hltTPClusterProducer = _tpClusterProducer.clone(