ROCm · bpickrel · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 18, 2024
@@ -697,8 +697,11 @@ Operator Support Matrix
 |                          |           |                 | functions are                |
 |                          |           |                 | not enabled                  |
 +--------------------------+-----------+-----------------+------------------------------+
-| RoiAlign                 | ✅        | FP8, FP16,      |                              |
-|                          |           | FP32, FP64      |                              |
+| RoiAlign                 | ✅        | FP8, FP16,      | ``X``,                       |
+|                          |           | FP32, FP64,     | ``ROI`` take any floating-   |
+|                          |           | UINT8, UINT16,  | point type;                  |
+|                          |           | UINT32, UINT64  | ``batch_indices``            |
+|                          |           |                 | takes any integral type      |
 +--------------------------+-----------+-----------------+------------------------------+
 | Round                    | ✅        | FP8, FP16,      |                              |
 |                          |           | FP32, FP64      |                              |

@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -74,6 +74,15 @@ struct roialign
         auto type     = inputs.at(0).type();
 
         // check input correct
+        if(shape::is_integral(type))
+            MIGRAPHX_THROW("ROIALIGN: incorrect type for input data! (should be non-integer)");
+        if(shape::is_integral(inputs.at(1).type()))
+            MIGRAPHX_THROW("ROIALIGN: incorrect data type for rois! (should be non-integer)");
+        if(!shape::is_integral(inputs.at(2).type()))
+            MIGRAPHX_THROW(
+                "ROIALIGN: incorrect datatype for roi indices! (should be an integral type)");
+        if(x_lens.size() != 4)
+            MIGRAPHX_THROW("ROIALIGN: data input must have 4 dimensions n, c, h, w");
         if(bi_lens.size() != 1)
         {
             MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
@@ -100,12 +109,18 @@ struct roialign
 
     struct pos_weight
     {
-        // neighbor indices for the bilinear interpolation
+        // neighbor indices for the bilinear interpolation, i.e. the flattened
+        // offsets of the four integer-coordinate corners surrounding a point.
         std::array<std::size_t, 4> pos = {0, 0, 0, 0};
-        // neighbor weights for the bilinear interpolation
+        // neighbor weights for the bilinear interpolation.  Each weight is the
+        // product of the point's fractional distances (in x and in y) from the
+        // surrounding integer coordinates, stored in the same order as pos.
         std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
     };
 
+    // Precalculate the indices/weights used in bilinear interpolation.
+    // These depend only on the ROI/batch layer pair, and are reused
+    // across all channels c.
     auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
                          const shape& comp_s,
                          const std::array<float, 2>& roi_start,
@@ -115,17 +130,22 @@ struct roialign
         std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
                                         output_width);
         shape_for_each(comp_s, [&](const auto& idx_v, size_t index) {
-            std::array<std::size_t, 2> p = {idx_v[0], idx_v[1]};
-            std::array<std::size_t, 2> i = {idx_v[2], idx_v[3]};
+            // The p and i indexes correspond to nested looping parameters in ORT that go in y, x
+            // order.  The i[x] value is least significant and iterates the fastest.
+            std::array<std::size_t, 2> p = {idx_v[1], idx_v[0]};
+            std::array<std::size_t, 2> i = {idx_v[3], idx_v[2]}; // these are always equal
 
+            // xy is the scaled coordinate of the current sampling point inside the ROI
             std::array<float, 2> xy{};
+            // low, high are floor and ceiling of the xy value (i.e. the bounds of the pixel it lies
+            // inside) from which we will interpolate.
             std::array<int64_t, 2> low{};
             std::array<int64_t, 2> high{};
             for(auto ii : range(p.size()))
             {
                 xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
                          (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
-                xy[ii] = (coord_trans_mode == "half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+
                 if(xy[ii] < -1.0 or xy[ii] > dims[ii])
                 {
                     results[index] = pos_weight{};
@@ -140,21 +160,18 @@ struct roialign
                     xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
                 }
             }
+            results[index].pos = {low[1] * dims[0] + low[0],
+                                  low[1] * dims[0] + high[0],
+                                  high[1] * dims[0] + low[0],
+                                  high[1] * dims[0] + high[0]};
 
-            results[index].pos = {low[0] * dims[1] + low[1],
-                                  low[0] * dims[1] + high[1],
-                                  high[0] * dims[1] + low[1],
-                                  high[0] * dims[1] + high[1]};
-
-            float ly = xy[0] - low[0];
-            float lx = xy[1] - low[1];
+            float lx = xy[0] - low[0];
+            float ly = xy[1] - low[1];
             float hy = 1.0f - ly;
             float hx = 1.0f - lx;
-
-            // save weights and indeces
+            // save weights and indices
             results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
         });
-
         return results;
     }
 
@@ -176,11 +193,12 @@ struct roialign
         double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
     };
 
+    // Calculate a pooling value for 1 block of bin_grid_size*bin_grid_size weights
     template <class T, class Op>
     std::tuple<double, int64_t> calc_pooling(const T& data,
                                              const std::array<std::size_t, 2>& bin_grid_size,
                                              const std::vector<pos_weight>& pos_weights,
-                                             int64_t index,
+                                             int64_t index, // index to c
                                              Op op) const
     {
         double output_val   = op.init();
@@ -203,41 +221,49 @@ struct roialign
 
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        // argument result{output_shape};
         const auto& out_lens = output_shape.lens();
         int64_t n_rois       = out_lens[0];
         std::size_t channels = out_lens[1];
+
         // output dims of height and width, in all 2-dim arrays, the first dim
-        // is for height and second dim is for width
-        std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
+        // is for width and second dim is for height i.e. (x, y) order
+        std::array<std::size_t, 2> out_dims = {out_lens[3], out_lens[2]};
         const auto& x_lens                  = args.at(0).get_shape().lens();
         // input dims of height and width
-        std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
+        std::array<std::size_t, 2> in_dims = {x_lens[3], x_lens[2]};
         auto roi_s                         = args.at(1).get_shape();
 
-        visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
+        // The internal computations are done on a shape whose last two
+        // dimensions are swapped relative to the eventual output.
+        shape visit_shape{output_shape.type(),
+                          {out_lens[0], out_lens[1], out_lens[3], out_lens[2]}};
+        argument visit_result({visit_shape});
+        visit_all(visit_result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
             const auto* batch_indices = args.at(2).cast<int64_t>();
             par_for(n_rois, [&](auto n) {
                 const auto bottom_data   = x.begin();
                 const auto roi_batch_ind = batch_indices[n];
-                // Do not using rounding; this implementation detail is critical
+                // Do not use rounding here even if data is a quantized type; this
+                // implementation detail is critical
+                const float offset              = (coord_trans_mode == "half_pixel") ? 0.5 : 0.0;
                 std::array<float, 2> roi_starts = {
-                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale - offset)};
                 std::array<float, 2> roi_ends = {
-                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
-                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
+                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale - offset),
+                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale - offset)};
 
-                // Force malformed ROIs to be 1x1
+                // Force malformed ROIs to be 1x1, if in output_half_pixel transform mode
                 std::array<float, 2> roi_size{};
                 std::array<float, 2> bin_size{};
                 std::array<std::size_t, 2> bin_grid_size{};
 
                 for(auto ii : range(roi_size.size()))
                 {
                     roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-                    roi_size[ii] = std::max(roi_size[ii], 1.0f);
-
+                    if(coord_trans_mode != "half_pixel")
+                        roi_size[ii] = std::max(roi_size[ii], 1.0f);
                     bin_size[ii]      = roi_size[ii] / out_dims[ii];
                     bin_grid_size[ii] = (sampling_ratio > 0)
                                             ? sampling_ratio
@@ -247,22 +273,24 @@ struct roialign
                 // we want to precalculate indices and weights shared by all channels,
                 // this is the key point of optimization
                 std::vector<std::size_t> comp_lens = {
-                    out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
+                    out_dims[1], out_dims[0], bin_grid_size[1], bin_grid_size[0]};
                 shape comp_s{shape::float_type, comp_lens};
                 auto pre_calc =
                     this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
 
                 std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
                 shape comp_s1{migraphx::shape::float_type, comp_lens1};
                 std::vector<int64_t> vec_index(channels, 0);
+
                 shape_for_each(comp_s1, [&](const auto& idx) {
-                    auto c  = idx[0];
+                    auto c  = idx[0]; // channel index
                     auto ph = idx[1];
                     auto pw = idx[2];
 
                     const auto offset_bottom_data =
                         bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
                                                            in_dims[0] * in_dims[1]);
+
                     double output_val;
                     std::tie(output_val, vec_index[c]) =
                         (mode == migraphx::op::pooling_mode::average)
@@ -281,7 +309,8 @@ struct roialign
             });
         });
 
-        return result;
+        // reshape visit_result to the final result
+        return visit_result.reshape(output_shape);
     }
 };