Add missing summary metrics (#302)

* Add missing read-level and run-level cluster count metrics
* Fix cluster count logic
* Update version
* Fix bug and update tests
* Fix warning
* Remove comment
Illumina · Oct 25, 2022 · 3178e08 · 3178e08
1 parent 09a5b40
commit 3178e08
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 24 deletions.
diff --git a/docs/src/changes.md b/docs/src/changes.md
@@ -1,5 +1,11 @@
 # Changes                                               {#changes}
 
+## v1.2.0
+
+| Date       | Description                                                |
+|------------|------------------------------------------------------------|
+| 2022-10-21 | Add missing read-level and run-level cluster count metrics |
+
 
 ## v1.1.28
 

diff --git a/interop/logic/summary/summary_statistics.h b/interop/logic/summary/summary_statistics.h
@@ -233,6 +233,17 @@ namespace illumina { namespace interop { namespace logic { namespace summary
         static const float eps = 1e-9f;
         return (div < eps) ? 0 : num / div;
     }
+    /** Safe divide (double precision)
+     *
+     * @param num numerator
+     * @param div divisor
+     * @return num / div, or 0 when div is less than 1e-9
+     */
+    inline double divide(const double num, const double div)
+    {
+        static const double eps = 1e-9;
+        return (div < eps) ? 0 : num / div;
+    }
 
     namespace op
     {

diff --git a/interop/logic/summary/tile_summary.h b/interop/logic/summary/tile_summary.h
@@ -89,12 +89,12 @@ namespace illumina { namespace interop { namespace logic { namespace summary
                   util::op::const_member_function_less(&model::metrics::tile_metric::percent_pf),
                   skip_median);
         stat_summary.percent_pf(stat);
-        stat_summary.reads(std::accumulate(tile_data.begin(),
+        stat_summary.reads(nan_accumulate(tile_data.begin(),
                                            tile_data.end(),
                                            uint64_t(0),
                                            util::op::const_member_function(
                                                    &model::metrics::tile_metric::cluster_count)));
-        stat_summary.reads_pf(std::accumulate(tile_data.begin(),
+        stat_summary.reads_pf(nan_accumulate(tile_data.begin(),
                                              tile_data.end(),
                                               uint64_t(0),
                                              util::op::const_member_function(
@@ -254,6 +254,10 @@ namespace illumina { namespace interop { namespace logic { namespace summary
         size_t total = 0;
         float percent_aligned_nonindex = 0;
         size_t total_nonindex = 0;
+        double cluster_count_raw = 0;
+        double cluster_count_pf = 0;
+        uint64_t total_reads_raw = 0;
+        uint64_t total_reads_pf = 0;
         for (size_t read = 0; read < run.size(); ++read)
         {
             INTEROP_ASSERT(read < run.size());
@@ -262,6 +266,11 @@ namespace illumina { namespace interop { namespace logic { namespace summary
             for (size_t lane = 0; lane < run[read].size(); ++lane)
             {
                 INTEROP_ASSERT(lane < run[0].size());
+                if(read == 0)
+                {
+                    cluster_count_pf += run[read][lane].reads_pf();
+                    cluster_count_raw += run[read][lane].reads();
+                }
                 const size_t non_nan = update_read_summary(read_data_by_lane_read(read, lane),
                                                            run[read][lane],
                                                            skip_median);
@@ -277,6 +286,12 @@ namespace illumina { namespace interop { namespace logic { namespace summary
                                      skip_median);
                 }
             }
+            run[read].summary().reads(static_cast<uint64_t>(cluster_count_raw));
+            run[read].summary().reads_pf(static_cast<uint64_t>(cluster_count_pf));
+            run[read].summary().cluster_count(cluster_count_raw);
+            run[read].summary().cluster_count_pf(cluster_count_pf);
+            total_reads_raw = static_cast<uint64_t>(cluster_count_raw);
+            total_reads_pf = static_cast<uint64_t>(cluster_count_pf);
             run[read].summary().percent_aligned(divide(percent_aligned_by_read, float(total_by_read)));
             percent_aligned += percent_aligned_by_read;
             total += total_by_read;
@@ -288,7 +303,14 @@ namespace illumina { namespace interop { namespace logic { namespace summary
         }
         run.nonindex_summary().percent_aligned(divide(percent_aligned_nonindex, static_cast<float>(total_nonindex)));
         run.total_summary().percent_aligned(divide(percent_aligned, static_cast<float>(total)));
-
+        run.nonindex_summary().reads(total_reads_raw);
+        run.total_summary().reads(total_reads_raw);
+        run.nonindex_summary().reads_pf(total_reads_pf);
+        run.total_summary().reads_pf(total_reads_pf);
+        run.nonindex_summary().cluster_count(cluster_count_raw);
+        run.total_summary().cluster_count(cluster_count_raw);
+        run.nonindex_summary().cluster_count_pf(cluster_count_pf);
+        run.total_summary().cluster_count_pf(cluster_count_pf);
     }
 
     /** Summarize a collection extended tile metrics
@@ -332,8 +354,8 @@ namespace illumina { namespace interop { namespace logic { namespace summary
 
 
         model::summary::metric_stat count_stat;
-        float total_cluster_occupied = 0;
-        float total_cluster_count = 0;
+        double total_cluster_occupied = 0;
+        double total_cluster_count = 0;
         const bool skip_median=false;
         for (size_t lane = 0; lane < run[0].size(); ++lane)
         {
@@ -375,10 +397,10 @@ namespace illumina { namespace interop { namespace logic { namespace summary
         }
         for (size_t read = 0; read < run.size(); ++read)
         {
-            run[read].summary().percent_occupied(divide(total_cluster_occupied, total_cluster_count)*100);
+            run[read].summary().percent_occupied(static_cast<float>(divide(total_cluster_occupied, total_cluster_count))*100);
         }
-        run.nonindex_summary().percent_occupied(divide(total_cluster_occupied, total_cluster_count)*100);
-        run.total_summary().percent_occupied(divide(total_cluster_occupied, total_cluster_count)*100);
+        run.nonindex_summary().percent_occupied(static_cast<float>(divide(total_cluster_occupied, total_cluster_count))*100);
+        run.total_summary().percent_occupied(static_cast<float>(divide(total_cluster_occupied, total_cluster_count))*100);
     }
 
 }}}}

diff --git a/interop/model/summary/metric_summary.h b/interop/model/summary/metric_summary.h
@@ -27,7 +27,11 @@ namespace illumina { namespace interop { namespace model { namespace summary {
                 m_yield_g(std::numeric_limits<float>::quiet_NaN()),
                 m_projected_yield_g(0),
                 m_percent_occupied(std::numeric_limits<float>::quiet_NaN()),
-                m_percent_occupancy_proxy(std::numeric_limits<float>::quiet_NaN())
+                m_percent_occupancy_proxy(std::numeric_limits<float>::quiet_NaN()),
+                m_reads_raw(0),
+                m_reads_pf(0),
+                m_cluster_count_raw(std::numeric_limits<float>::quiet_NaN()),
+                m_cluster_count_pf(std::numeric_limits<float>::quiet_NaN())
 
         {}
     public:
@@ -114,6 +118,39 @@ namespace illumina { namespace interop { namespace model { namespace summary {
         {
             return m_percent_occupancy_proxy;
         }
+        /** Get the raw cluster count stored in this summary
+         *
+         * @return raw cluster count (value of m_cluster_count_raw)
+         */
+        double cluster_count()const
+        {
+            return m_cluster_count_raw;
+        }
+
+        /** Get the PF cluster count stored in this summary
+         *
+         * @return PF cluster count (value of m_cluster_count_pf)
+         */
+        double cluster_count_pf()const
+        {
+            return m_cluster_count_pf;
+        }
+        /** Get the number of reads stored in this summary
+         *
+         * @return number of reads (raw count, not the PF count)
+         */
+        uint64_t reads()const
+        {
+            return m_reads_raw;
+        }
+        /** Get the number of PF reads stored in this summary
+         *
+         * @return number of PF reads (value of m_reads_pf)
+         */
+        uint64_t reads_pf()const
+        {
+            return m_reads_pf;
+        }
         /** @} */
         /** Set the first cycle intensity
          *
@@ -189,6 +226,42 @@ namespace illumina { namespace interop { namespace model { namespace summary {
         {
         }
 
+        /** Set the raw cluster count stored in this summary
+         *
+         * @param val raw cluster count to store
+         */
+        void cluster_count(const double val)
+        {
+            m_cluster_count_raw = val;
+        }
+
+        /** Set the PF cluster count stored in this summary
+         *
+         * @param val PF cluster count to store
+         */
+        void cluster_count_pf(const double val)
+        {
+            m_cluster_count_pf = val;
+        }
+
+        /** Set the number of reads stored in this summary
+         *
+         * @param val number of reads (raw count) to store
+         */
+        void reads(const uint64_t val)
+        {
+            m_reads_raw = val;
+        }
+
+        /** Set the number of PF reads stored in this summary
+         *
+         * @param val number of PF reads to store
+         */
+        void reads_pf(const uint64_t val)
+        {
+            m_reads_pf = val;
+        }
+
     private:
         float m_error_rate;
         float m_percent_aligned;
@@ -198,6 +271,10 @@ namespace illumina { namespace interop { namespace model { namespace summary {
         float m_projected_yield_g;
         float m_percent_occupied;
         float m_percent_occupancy_proxy;
+        uint64_t m_reads_raw;
+        uint64_t m_reads_pf;
+        double m_cluster_count_raw;
+        double m_cluster_count_pf;
         template<class MetricType, int Version>
         friend struct io::generic_layout;
     };

diff --git a/src/ext/python/core.py b/src/ext/python/core.py
@@ -20,8 +20,8 @@
 
 >>> from interop import summary
 >>> summary(run_metrics_example)
-array([(0.36666667, 6.6666665, 0.)],
-      dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4')])
+array([(0.36666667, 6.6666665, 0., 0., 0.)],
+      dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4'), ('Reads', '<f4'), ('Reads Pf', '<f4')])
 
 >>> from interop import indexing
 >>> indexing(run_metrics_with_indexing)
@@ -233,21 +233,21 @@ def summary(run_metrics, level='Total', columns=None, dtype='f4', ignore_missing
 
 
     >>> summary(run_metrics_example)
-    array([(0.36666667, 6.6666665, 0.)],
-          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4')])
+    array([(0.36666667, 6.6666665, 0., 0., 0.)],
+          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4'), ('Reads', '<f4'), ('Reads Pf', '<f4')])
 
     >>> summary(run_metrics_example, 'Total')
-    array([(0.36666667, 6.6666665, 0.)],
-          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4')])
+    array([(0.36666667, 6.6666665, 0., 0., 0.)],
+          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4'), ('Reads', '<f4'), ('Reads Pf', '<f4')])
 
     >>> summary(run_metrics_example, 'NonIndex')
-    array([(0.2, 10., 0.)],
-          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4')])
+    array([(0.2, 10., 0., 0., 0.)],
+          dtype=[('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4'), ('Reads', '<f4'), ('Reads Pf', '<f4')])
 
     >>> summary(run_metrics_example, 'Read')
-    array([(1, 78, 0.2, 10., 0.), (2, 89, 0.4,  5., 0.),
-           (3, 89, 0.5,  5., 0.)],
-          dtype=[('ReadNumber', '<u2'), ('IsIndex', 'u1'), ('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4')])
+    array([(1, 78, 0.2, 10., 0., 0., 0.), (2, 89, 0.4,  5., 0., 0., 0.),
+           (3, 89, 0.5,  5., 0., 0., 0.)],
+          dtype=[('ReadNumber', '<u2'), ('IsIndex', 'u1'), ('Error Rate', '<f4'), ('First Cycle Intensity', '<f4'), ('Projected Yield G', '<f4'), ('Reads', '<f4'), ('Reads Pf', '<f4')])
 
     >>> summary(run_metrics_example, 'Lane')
     array([(1, 78, 1, 0.2, 10., 0., 0., 0., 1.),
@@ -443,13 +443,13 @@ def summary_columns(level='Total', ret_dict=False):
 
     The default columns are for the Run/Read level
     >>> summary_columns()
-    ('Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Yield G')
+    ('Cluster Count', 'Cluster Count Pf', 'Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Reads', 'Reads Pf', 'Yield G')
     >>> summary_columns(level='Total')
-    ('Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Yield G')
+    ('Cluster Count', 'Cluster Count Pf', 'Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Reads', 'Reads Pf', 'Yield G')
     >>> summary_columns(level='NonIndex')
-    ('Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Yield G')
+    ('Cluster Count', 'Cluster Count Pf', 'Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Reads', 'Reads Pf', 'Yield G')
     >>> summary_columns(level='Read')
-    ('Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Yield G')
+    ('Cluster Count', 'Cluster Count Pf', 'Error Rate', 'First Cycle Intensity', '% Aligned', '% >= Q30', '% Occupancy Proxy', '% Occupied', 'Projected Yield G', 'Reads', 'Reads Pf', 'Yield G')
 
     The lane/surface level give another set of columns for the summary table
     >>> summary_columns(level='Lane')