Add XeTile block operation fallback pass (#991)

Certain block operations that are legal at the XeTile dialect level cannot be supported by the matching XeGPU dialect ops because they do not meet HW restrictions. This PR adds a new pass that provides a fallback for some of these cases. The pass can be invoked with the command line argument --xetile-blockop-fallback.
The cases covered are: the source of the tile is a static shaped, row major memref, but
- the pitch is not a multiple of 16 bytes or is less than 64 bytes,
- or the memory space indicates SLM memory.
For such fitting cases, this pass turns
- block tile into scatter tile,
- load_tile into load,
- store_tile into store,
- update_tile_offset to use tile shaped indices instead of X, Y offsets,
- impacted scf.for arguments from block tile type into scatter tile type.
intel · Dec 20, 2024 · ff51594 · ff51594
1 parent 43a7d7c
commit ff51594
Show file tree

Hide file tree

Showing 11 changed files with 1,109 additions and 2 deletions.
diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.h b/include/imex/Dialect/XeTile/Transforms/Passes.h
@@ -40,6 +40,7 @@ std::unique_ptr<mlir::Pass>
 createXeTileBlockingPass(const std::string &device = "pvc");
 std::unique_ptr<mlir::Pass> createXeTileWgToSgPass();
 std::unique_ptr<mlir::Pass> createXeTileCanonicalizationPass();
+std::unique_ptr<mlir::Pass> createXeTileBlockOpFallbackPass();
 
 #define GEN_PASS_DECL_XETILEBLOCKING
 #define GEN_PASS_DECL_XETILECANONICALIZATION

diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.td b/include/imex/Dialect/XeTile/Transforms/Passes.td
@@ -96,4 +96,21 @@ def XeTileBlocking : Pass<"xetile-blocking", "::mlir::gpu::GPUModuleOp">{
 }
 
 
+def XeTileBlockOpFallback : Pass<"xetile-blockop-fallback", "::mlir::gpu::GPUModuleOp">{
+  let summary = "Transform unsuitable block ops to fallback scattered ops";
+
+  let description = [{
+    This transform pass transforms XeTile block ops that are not suitable due to HW restrictions,
+    to scattered XeTile ops.
+  }];
+
+  let constructor = "imex::createXeTileBlockOpFallbackPass()";
+  let dependentDialects = ["imex::xetile::XeTileDialect",
+                           "mlir::arith::ArithDialect",
+                           "mlir::gpu::GPUDialect",
+                           "mlir::index::IndexDialect",
+                           "mlir::memref::MemRefDialect",
+                           "mlir::vector::VectorDialect"];
+}
+
 #endif // _XeTile_PASSES_TD_INCLUDED_
diff --git a/lib/Dialect/XeTile/Transforms/BlockOpFallback.cpp b/lib/Dialect/XeTile/Transforms/BlockOpFallback.cpp
diff --git a/lib/Dialect/XeTile/Transforms/CMakeLists.txt b/lib/Dialect/XeTile/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_imex_dialect_library(IMEXXeTileTransforms
   Blocking.cpp
   BlockingAnalysis.cpp
+  BlockOpFallback.cpp
   InitDuplicate.cpp
   Canonicalization.cpp
   WgToSg.cpp

diff --git a/lib/Transforms/RemoveSingleElemVector.cpp b/lib/Transforms/RemoveSingleElemVector.cpp
@@ -253,7 +253,9 @@ struct RemoveSingleElemVectorPass final
         });
 
     mlir::RewritePatternSet patterns(context);
-    patterns.add<VectorExtractStridedSliceConversion, VectorizableOpPattern,
+    // Disable VectorExtractStridedSliceConversion for now as it interferes with
+    // the xetile-blockop-fallback pass.
+    patterns.add</*VectorExtractStridedSliceConversion,*/ VectorizableOpPattern,
                  VectorShffleOpConversion, VectorInterleaveOpConversion,
                  VectorSplatOpConversion, VectorExtractElementOpConversion>(
         typeConverter, context);

diff --git a/test/Dialect/XeTile/Transforms/block_op_fallback.mlir b/test/Dialect/XeTile/Transforms/block_op_fallback.mlir
diff --git a/test/Integration/Dialect/XeTile/fallback/narrow_tile_one_elem_wide.mlir b/test/Integration/Dialect/XeTile/fallback/narrow_tile_one_elem_wide.mlir
@@ -0,0 +1,64 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                       --runner imex-cpu-runner -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                        --runner imex-cpu-runner -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @narrow_tile attributes {gpu.container_module} {
+  func.func @test(%A: memref<64x1xf32>) -> memref<64x1xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %A_gpu = gpu.alloc  host_shared() : memref<64x1xf32>
+    memref.copy %A, %A_gpu : memref<64x1xf32> to memref<64x1xf32>
+    %B_gpu = gpu.alloc  host_shared() : memref<64x1xf32>
+    gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x1xf32>, %B_gpu : memref<64x1xf32>)
+    %B = memref.alloc() : memref<64x1xf32>
+    memref.copy %B_gpu, %B : memref<64x1xf32> to memref<64x1xf32>
+    gpu.dealloc %A_gpu : memref<64x1xf32>
+    gpu.dealloc %B_gpu : memref<64x1xf32>
+    return %B : memref<64x1xf32>
+  }
+  gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_scf_for(%arg0: memref<64x1xf32>, %arg1: memref<64x1xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %cst0 = arith.constant 0 : index
+      %cst16 = arith.constant 16 : index
+      %cst64 = arith.constant 64 : index
+      %0 = xetile.init_tile %arg0 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+      %1 = xetile.init_tile %arg1 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+      %out:2 = scf.for %k = %cst0 to %cst64 step %cst16
+        iter_args(%a_tile = %0, %b_tile = %1)
+        -> (!xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>) {
+        %a_value = xetile.load_tile %a_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x1xf32>
+        xetile.store_tile %a_value, %b_tile : vector<16x1xf32>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+        %a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+        %b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+        scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
+      }
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+
+    %A = memref.alloc() : memref<64x1xf32>
+    scf.for %arg0 = %c0 to %c64 step %c1 {
+      %0 = index.castu %arg0 : index to i32
+      %val = arith.uitofp %0 : i32 to f32
+      memref.store %val, %A[%arg0, %c0] : memref<64x1xf32>
+    }
+    %C = call @test(%A) : (memref<64x1xf32>) -> memref<64x1xf32>
+    %cast_A = memref.cast %A : memref<64x1xf32> to memref<*xf32>
+    %cast_C = memref.cast %C : memref<64x1xf32> to memref<*xf32>
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  //func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeTile/fallback/narrow_tile_two_elem_wide.mlir b/test/Integration/Dialect/XeTile/fallback/narrow_tile_two_elem_wide.mlir
@@ -0,0 +1,65 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                       --runner imex-cpu-runner -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                        --runner imex-cpu-runner -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @narrow_tile attributes {gpu.container_module} {
+  func.func @test(%A: memref<64x2xf32>) -> memref<64x2xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %A_gpu = gpu.alloc  host_shared() : memref<64x2xf32>
+    memref.copy %A, %A_gpu : memref<64x2xf32> to memref<64x2xf32>
+    %B_gpu = gpu.alloc  host_shared() : memref<64x2xf32>
+    gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x2xf32>, %B_gpu : memref<64x2xf32>)
+    %B = memref.alloc() : memref<64x2xf32>
+    memref.copy %B_gpu, %B : memref<64x2xf32> to memref<64x2xf32>
+    gpu.dealloc %A_gpu : memref<64x2xf32>
+    gpu.dealloc %B_gpu : memref<64x2xf32>
+    return %B : memref<64x2xf32>
+  }
+  gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_scf_for(%arg0: memref<64x2xf32>, %arg1: memref<64x2xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %cst0 = arith.constant 0 : index
+      %cst16 = arith.constant 16 : index
+      %cst64 = arith.constant 64 : index
+      %0 = xetile.init_tile %arg0 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+      %1 = xetile.init_tile %arg1 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+      %out:2 = scf.for %k = %cst0 to %cst64 step %cst16
+        iter_args(%a_tile = %0, %b_tile = %1)
+        -> (!xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>) {
+        %a_value = xetile.load_tile %a_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x2xf32>
+        xetile.store_tile %a_value, %b_tile : vector<16x2xf32>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+        %a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+        %b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+        scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
+      }
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+
+    %A = memref.alloc() : memref<64x2xf32>
+    scf.for %arg0 = %c0 to %c64 step %c1 {
+      %0 = index.castu %arg0 : index to i32
+      %val = arith.uitofp %0 : i32 to f32
+      memref.store %val, %A[%arg0, %c0] : memref<64x2xf32>
+      memref.store %val, %A[%arg0, %c1] : memref<64x2xf32>
+    }
+    %C = call @test(%A) : (memref<64x2xf32>) -> memref<64x2xf32>
+    %cast_A = memref.cast %A : memref<64x2xf32> to memref<*xf32>
+    %cast_C = memref.cast %C : memref<64x2xf32> to memref<*xf32>
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  //func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeTile/fallback/slm.mlir b/test/Integration/Dialect/XeTile/fallback/slm.mlir
@@ -0,0 +1,80 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                       --runner imex-cpu-runner -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
+// RUN:                                        --runner imex-cpu-runner -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @narrow_tile attributes {gpu.container_module} {
+  func.func @test(%A: memref<32x32xf32>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %A_gpu = gpu.alloc  host_shared() : memref<32x32xf32>
+    memref.copy %A, %A_gpu : memref<32x32xf32> to memref<32x32xf32>
+    %B_gpu = gpu.alloc  host_shared() : memref<32x32xf32>
+    gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<32x32xf32>, %B_gpu : memref<32x32xf32>)
+    %B = memref.alloc() : memref<32x32xf32>
+    memref.copy %B_gpu, %B : memref<32x32xf32> to memref<32x32xf32>
+    gpu.dealloc %A_gpu : memref<32x32xf32>
+    gpu.dealloc %B_gpu : memref<32x32xf32>
+    return %B : memref<32x32xf32>
+  }
+  gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_scf_for(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %cst0 = arith.constant 0 : index
+      %cst8 = arith.constant 8 : index
+      %cst16 = arith.constant 16 : index
+      %cst32 = arith.constant 32 : index
+      %0 = xetile.init_tile %arg0 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+      %1 = xetile.init_tile %arg1 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+      %slm = memref.alloc() : memref<8x16xf32, 3>
+      %slm_tile = xetile.init_tile %slm [0, 0] : memref<8x16xf32, 3> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
+      %out:2 = scf.for %j = %cst0 to %cst32 step %cst8
+        iter_args(%a_tile = %0, %b_tile = %1)
+        -> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
+        %out:2 = scf.for %k = %cst0 to %cst32 step %cst16
+          iter_args(%c_tile = %a_tile, %d_tile = %b_tile)
+          -> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
+          %c_value = xetile.load_tile %c_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<8x16xf32>
+          xetile.store_tile %c_value, %slm_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
+          %d_value = xetile.load_tile %slm_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>> -> vector<8x16xf32>
+          xetile.store_tile %d_value, %d_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+          %c_next_tile = xetile.update_tile_offset %c_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+          %d_next_tile = xetile.update_tile_offset %d_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+          scf.yield %c_next_tile, %d_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+        }
+        %a_next_tile = xetile.update_tile_offset %a_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+        %b_next_tile = xetile.update_tile_offset %b_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+        scf.yield %a_next_tile, %b_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
+      }
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+
+    %A = memref.alloc() : memref<32x32xf32>
+    scf.for %arg0 = %c0 to %c32 step %c1 {
+      scf.for %arg1 = %c0 to %c32 step %c1 {
+        %0 = index.castu %arg0 : index to i32
+        %1 = index.castu %arg1 : index to i32
+        %2 = arith.addi %0, %1 : i32
+        %val = arith.uitofp %2 : i32 to f32
+        memref.store %val, %A[%arg0, %arg1] : memref<32x32xf32>
+      }
+    }
+    %C = call @test(%A) : (memref<32x32xf32>) -> memref<32x32xf32>
+    %cast_A = memref.cast %A : memref<32x32xf32> to memref<*xf32>
+    %cast_C = memref.cast %C : memref<32x32xf32> to memref<*xf32>
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
+    //call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+  //func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeTile/fallback/xetile-fallback-to-func-vc.pp b/test/Integration/Dialect/XeTile/fallback/xetile-fallback-to-func-vc.pp
@@ -0,0 +1,41 @@
+builtin.module(
+    cse
+    gpu.module(xetile-init-duplicate
+        xetile-canonicalization
+        xetile-blockop-fallback
+        xetile-blocking
+	cse
+        convert-xetile-to-xegpu
+	cse
+        imex-xegpu-hoist-transpose
+        imex-xegpu-apply-vnni-transformation
+        imex-xegpu-optimize-transpose)
+    cse
+    imex-vector-linearize
+    cse
+    imex-remove-single-elem-vector
+    canonicalize
+    cse
+    gpu.module(convert-xegpu-to-vc)
+    reconcile-unrealized-casts
+    bf16-to-gpu
+    cse
+    imex-convert-gpu-to-spirv
+    spirv.module(spirv-lower-abi-attrs
+             spirv-update-vce)
+    func.func(llvm-request-c-wrappers)
+    serialize-spirv
+    convert-vector-to-scf
+    convert-gpu-to-gpux
+    convert-scf-to-cf
+    expand-strided-metadata
+    finalize-memref-to-llvm
+    convert-cf-to-llvm
+    convert-vector-to-llvm
+    convert-index-to-llvm
+    convert-arith-to-llvm
+    convert-func-to-llvm
+    convert-math-to-llvm
+    convert-gpux-to-llvm
+    lower-affine
+    reconcile-unrealized-casts)
diff --git a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir
@@ -61,7 +61,9 @@ module {
       %34 = arith.remsi %11, %c4 : index
       %35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) {
         %39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32>
-        //CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
+        // Removal of single-element vector.extract_strided_slice is disabled for now.
+        // DISABLE-CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
+        // CHECK-COUNT-8: vector.extract_strided_slice
         %40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
         %41 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
         %42 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>