diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 29e5e82049b5c..7eb90cffa2d77 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -38,7 +38,7 @@
 
 namespace pixelgpudetails {
 
-  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel() {
+  SiPixelRawToClusterGPUKernel::SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream) {
     int WSIZE = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
     cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
     cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
@@ -90,6 +90,12 @@ namespace pixelgpudetails {
     cudaCheck(cudaMalloc((void**) & moduleStart_d, (MaxNumModules+1)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & clusInModule_d,(MaxNumModules)*sizeof(uint32_t) ));
     cudaCheck(cudaMalloc((void**) & moduleId_d,    (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & gpuProduct_d, sizeof(GPUProduct)));
+    gpuProduct = getProduct();
+    assert(xx_d==gpuProduct.xx_d);
+
+    cudaCheck(cudaMemcpyAsync(gpuProduct_d, &gpuProduct, sizeof(GPUProduct), cudaMemcpyDefault,cudaStream.id()));
   }
 
   SiPixelRawToClusterGPUKernel::~SiPixelRawToClusterGPUKernel() {
@@ -111,6 +117,7 @@ namespace pixelgpudetails {
     cudaCheck(cudaFree(clus_d));
     cudaCheck(cudaFree(clusInModule_d));
     cudaCheck(cudaFree(moduleId_d));
+    cudaCheck(cudaFree(gpuProduct_d));
   }
 
   void SiPixelRawToClusterGPUKernel::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
@@ -478,7 +485,7 @@ namespace pixelgpudetails {
           XX[gIndex]    = 0;  // 0 is an indicator of a noise/dead channel
           YY[gIndex]    = 0; // skip these pixels during clusterization
           ADC[gIndex]   = 0;
-          continue ; // 0: bad word
+          continue; // 0: bad word
         }
 
         uint32_t link  = getLink(ww);            // Extract link
@@ -521,9 +528,9 @@ namespace pixelgpudetails {
           // endcap ids
           layer = 0;
           panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
-          //disk  = (rawId >> diskStartBit_)  & diskMask_ ;
+          //disk  = (rawId >> diskStartBit_) & diskMask_;
           side  = (panel == 1)? -1 : 1;
-          //blade = (rawId>>bladeStartBit_) & bladeMask_;
+          //blade = (rawId >> bladeStartBit_) & bladeMask_;
         }
 
         // ***special case of layer to 1 be handled here
@@ -558,8 +565,8 @@ namespace pixelgpudetails {
         }
 
         pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
-        XX[gIndex]    = globalPix.row  ; // origin shifting by 1 0-159
-        YY[gIndex]    = globalPix.col ; // origin shifting by 1 0-415
+        XX[gIndex]    = globalPix.row;  // origin shifting by 1 0-159
+        YY[gIndex]    = globalPix.col;  // origin shifting by 1 0-415
         ADC[gIndex]   = getADC(ww);
         pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]);
         moduleId[gIndex] = detId.moduleId;
@@ -583,7 +590,6 @@ namespace pixelgpudetails {
     const int threadsPerBlock = 512;
     const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
 
-
     assert(0 == wordCounter%2);
     // wordCounter is the total no of words in each event to be trasfered on device
     cudaCheck(cudaMemcpyAsync(&word_d[0],     &word[0],     wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
@@ -630,7 +636,6 @@ namespace pixelgpudetails {
       int threadsPerBlock = 256;
       int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
 
-
       gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream.id()>>>(
           moduleInd_d,
           xx_d, yy_d, adc_d,
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 2b0b205c9f536..2f7436052902b 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -83,7 +83,7 @@ namespace pixelgpudetails {
   class Packing {
   public:
     using PackedDigiType = uint32_t;
-      
+
     // Constructor: pre-computes masks and shifts from field widths
     __host__ __device__
     inline
@@ -144,22 +144,32 @@ namespace pixelgpudetails {
            (adc << thePacking.adc_shift);
   }
 
+  constexpr
+  uint32_t pixelToChannel( int row, int col) {
+    constexpr Packing thePacking = packing();
+    return (row << thePacking.column_width) | col;
+  }
+
+
   using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj;
 
 
   class SiPixelRawToClusterGPUKernel {
   public:
-    SiPixelRawToClusterGPUKernel();
+
+    using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct;
+
+    SiPixelRawToClusterGPUKernel(cuda::stream_t<>& cudaStream);
     ~SiPixelRawToClusterGPUKernel();
 
-    
+
     SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete;
     SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete;
     SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete;
     SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete;
 
     void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
-    
+
     // Not really very async yet...
     void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
                            const SiPixelGainForHLTonGPU *gains,
@@ -170,8 +180,9 @@ namespace pixelgpudetails {
     auto getProduct() const {
       return siPixelRawToClusterHeterogeneousProduct::GPUProduct{
         pdigi_h, rawIdArr_h, clus_h, adc_h, error_h,
-        nDigis, nModulesActive,
-        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d
+        gpuProduct_d,
+        xx_d, yy_d, adc_d, moduleInd_d, moduleStart_d,clus_d, clusInModule_d, moduleId_d,
+        nDigis, nModulesActive
       };
     }
 
@@ -181,6 +192,11 @@ namespace pixelgpudetails {
     unsigned char *fedId_h = nullptr;    // to hold fed index for each word
 
     // output
+    GPUProduct gpuProduct;
+    GPUProduct * gpuProduct_d;
+
+    // FIXME cleanup all these are in the gpuProduct above...
+
     uint32_t *pdigi_h = nullptr, *rawIdArr_h = nullptr;                   // host copy of output
     uint16_t *adc_h = nullptr; int32_t *clus_h = nullptr; // host copy of calib&clus output
     pixelgpudetails::error_obj *data_h = nullptr;
@@ -209,7 +225,7 @@ namespace pixelgpudetails {
     uint32_t * clusInModule_d;
     uint32_t * moduleId_d;
   };
-  
+
   // configuration and memory buffers alocated on the GPU
   struct context {
     uint32_t * word_d;
@@ -223,7 +239,7 @@ namespace pixelgpudetails {
 
     GPU::SimpleVector<error_obj> * error_d;
     error_obj * data_d;
-      
+
     // these are for the clusterizer (to be moved)
     uint32_t * moduleStart_d;
     int32_t *  clus_d;