Skip to content

Commit

Permalink
Add XeTile block operation fallback pass (#991)
Browse files Browse the repository at this point in the history
Certain block operations that are legal at the XeTile dialect level cannot
be supported by a matching XeGPU dialect op because they do not meet HW
restrictions.
This PR adds a new pass that provides a fallback for some such cases.
Pass can be called with command line arg --xetile-blockop-fallback
The cases covered are:
The source of the tile is a statically shaped, row-major memref, but
- the pitch is not a multiple of 16 bytes, or is less than 64 bytes,
- or the memory space indicates SLM memory.

For such fitting case, this pass turns
- block tile to scatter tile
- load_tile to load
- store_tile to store
- update_tile_offset to use tile shaped indices instead of X, Y offset
- impacted scf.for arguments from block tile type to scatter tile type
  • Loading branch information
silee2 authored Dec 20, 2024
1 parent 43a7d7c commit ff51594
Show file tree
Hide file tree
Showing 11 changed files with 1,109 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/imex/Dialect/XeTile/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ std::unique_ptr<mlir::Pass>
createXeTileBlockingPass(const std::string &device = "pvc");
std::unique_ptr<mlir::Pass> createXeTileWgToSgPass();
std::unique_ptr<mlir::Pass> createXeTileCanonicalizationPass();
std::unique_ptr<mlir::Pass> createXeTileBlockOpFallbackPass();

#define GEN_PASS_DECL_XETILEBLOCKING
#define GEN_PASS_DECL_XETILECANONICALIZATION
Expand Down
17 changes: 17 additions & 0 deletions include/imex/Dialect/XeTile/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,21 @@ def XeTileBlocking : Pass<"xetile-blocking", "::mlir::gpu::GPUModuleOp">{
}


// Fallback pass for XeTile block ops that cannot be lowered to matching XeGPU
// block ops because of HW restrictions (per the commit description: pitch not a
// multiple of 16 bytes or below 64 bytes, or the tile lives in SLM memory).
// Such block ops are rewritten into their scattered-op equivalents instead.
// Runs on gpu.module so only device code is transformed.
def XeTileBlockOpFallback : Pass<"xetile-blockop-fallback", "::mlir::gpu::GPUModuleOp">{
let summary = "Transform unsuitable block ops to fallback scattered ops";

let description = [{
This transform pass transforms XeTile block ops that are not suitable due to HW restrictions,
to scattered XeTile ops.
}];

let constructor = "imex::createXeTileBlockOpFallbackPass()";
// The fallback rewrite materializes index arithmetic, scattered loads/stores,
// and vector ops, so all of these dialects may appear in the output IR.
let dependentDialects = ["imex::xetile::XeTileDialect",
"mlir::arith::ArithDialect",
"mlir::gpu::GPUDialect",
"mlir::index::IndexDialect",
"mlir::memref::MemRefDialect",
"mlir::vector::VectorDialect"];
}

#endif // _XeTile_PASSES_TD_INCLUDED_
443 changes: 443 additions & 0 deletions lib/Dialect/XeTile/Transforms/BlockOpFallback.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/Dialect/XeTile/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
add_imex_dialect_library(IMEXXeTileTransforms
Blocking.cpp
BlockingAnalysis.cpp
BlockOpFallback.cpp
InitDuplicate.cpp
Canonicalization.cpp
WgToSg.cpp
Expand Down
4 changes: 3 additions & 1 deletion lib/Transforms/RemoveSingleElemVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,9 @@ struct RemoveSingleElemVectorPass final
});

mlir::RewritePatternSet patterns(context);
patterns.add<VectorExtractStridedSliceConversion, VectorizableOpPattern,
// Disable VectorExtractStridedSliceConversion for now as it interferes with
// the xetile-blockop-fallback pass.
patterns.add</*VectorExtractStridedSliceConversion,*/ VectorizableOpPattern,
VectorShffleOpConversion, VectorInterleaveOpConversion,
VectorSplatOpConversion, VectorExtractElementOpConversion>(
typeConverter, context);
Expand Down
391 changes: 391 additions & 0 deletions test/Dialect/XeTile/Transforms/block_op_fallback.mlir

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test for the xetile-blockop-fallback pass, narrow-pitch case:
// a row of memref<64x1xf32> is a single f32, i.e. a 4-byte pitch, which (per
// the pass description) is neither a multiple of 16 bytes nor at least
// 64 bytes, so the block ops below must be rewritten to scattered ops.
// The kernel copies A into B tile-by-tile; the test passes when B ~= A.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stages %A into device-visible memory, launches the kernel,
// and returns a freshly allocated host copy of the kernel's output buffer.
func.func @test(%A: memref<64x1xf32>) -> memref<64x1xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<64x1xf32>
memref.copy %A, %A_gpu : memref<64x1xf32> to memref<64x1xf32>
%B_gpu = gpu.alloc host_shared() : memref<64x1xf32>
// Single workgroup, single thread: the kernel's scf.for walks all tiles.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x1xf32>, %B_gpu : memref<64x1xf32>)
%B = memref.alloc() : memref<64x1xf32>
memref.copy %B_gpu, %B : memref<64x1xf32> to memref<64x1xf32>
gpu.dealloc %A_gpu : memref<64x1xf32>
gpu.dealloc %B_gpu : memref<64x1xf32>
return %B : memref<64x1xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: copies %arg0 to %arg1 in 16x1 tiles. The scf.for iter_args carry
// the tiles, so the fallback pass must also retype the loop-carried values.
gpu.func @test_scf_for(%arg0: memref<64x1xf32>, %arg1: memref<64x1xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst16 = arith.constant 16 : index
%cst64 = arith.constant 64 : index
// 16x1 block tiles over a 1-column memref: the unsupported-pitch shape
// this test exists to exercise.
%0 = xetile.init_tile %arg0 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%out:2 = scf.for %k = %cst0 to %cst64 step %cst16
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>) {
%a_value = xetile.load_tile %a_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x1xf32>
xetile.store_tile %a_value, %b_tile : vector<16x1xf32>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
// Advance both tiles 16 rows down for the next iteration.
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Host driver: fills A with A[i][0] = i, runs the kernel, and checks the
// output is element-wise close to the input.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

%A = memref.alloc() : memref<64x1xf32>
scf.for %arg0 = %c0 to %c64 step %c1 {
%0 = index.castu %arg0 : index to i32
%val = arith.uitofp %0 : i32 to f32
memref.store %val, %A[%arg0, %c0] : memref<64x1xf32>
}
%C = call @test(%A) : (memref<64x1xf32>) -> memref<64x1xf32>
%cast_A = memref.cast %A : memref<64x1xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<64x1xf32> to memref<*xf32>
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test for the xetile-blockop-fallback pass, narrow-pitch case:
// a row of memref<64x2xf32> is two f32s, i.e. an 8-byte pitch, which (per the
// pass description) is neither a multiple of 16 bytes nor at least 64 bytes,
// so the block ops below must be rewritten to scattered ops. Companion to the
// 64x1 test; this one covers a multi-column (but still too narrow) tile.
// The kernel copies A into B tile-by-tile; the test passes when B ~= A.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stages %A into device-visible memory, launches the kernel,
// and returns a freshly allocated host copy of the kernel's output buffer.
func.func @test(%A: memref<64x2xf32>) -> memref<64x2xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<64x2xf32>
memref.copy %A, %A_gpu : memref<64x2xf32> to memref<64x2xf32>
%B_gpu = gpu.alloc host_shared() : memref<64x2xf32>
// Single workgroup, single thread: the kernel's scf.for walks all tiles.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x2xf32>, %B_gpu : memref<64x2xf32>)
%B = memref.alloc() : memref<64x2xf32>
memref.copy %B_gpu, %B : memref<64x2xf32> to memref<64x2xf32>
gpu.dealloc %A_gpu : memref<64x2xf32>
gpu.dealloc %B_gpu : memref<64x2xf32>
return %B : memref<64x2xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: copies %arg0 to %arg1 in 16x2 tiles. The scf.for iter_args carry
// the tiles, so the fallback pass must also retype the loop-carried values.
gpu.func @test_scf_for(%arg0: memref<64x2xf32>, %arg1: memref<64x2xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst16 = arith.constant 16 : index
%cst64 = arith.constant 64 : index
// 16x2 block tiles over a 2-column memref: the unsupported-pitch shape
// this test exists to exercise.
%0 = xetile.init_tile %arg0 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%out:2 = scf.for %k = %cst0 to %cst64 step %cst16
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>) {
%a_value = xetile.load_tile %a_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x2xf32>
xetile.store_tile %a_value, %b_tile : vector<16x2xf32>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
// Advance both tiles 16 rows down for the next iteration.
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Host driver: fills both columns of row i with the value i, runs the kernel,
// and checks the output is element-wise close to the input.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

%A = memref.alloc() : memref<64x2xf32>
scf.for %arg0 = %c0 to %c64 step %c1 {
%0 = index.castu %arg0 : index to i32
%val = arith.uitofp %0 : i32 to f32
memref.store %val, %A[%arg0, %c0] : memref<64x2xf32>
memref.store %val, %A[%arg0, %c1] : memref<64x2xf32>
}
%C = call @test(%A) : (memref<64x2xf32>) -> memref<64x2xf32>
%cast_A = memref.cast %A : memref<64x2xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<64x2xf32> to memref<*xf32>
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
80 changes: 80 additions & 0 deletions test/Integration/Dialect/XeTile/fallback/slm.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test for the xetile-blockop-fallback pass, SLM case: the
// intermediate tile lives in memory space 3 (SLM, per the pass description),
// which block loads/stores cannot target, so those ops must be rewritten to
// scattered ops. The kernel copies A into B one 8x16 tile at a time, bouncing
// each tile through an SLM staging buffer; the test passes when B ~= A.
// NOTE(review): module name @narrow_tile looks copy-pasted from the
// narrow-pitch tests — @slm would better describe this file; confirm.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stages %A into device-visible memory, launches the kernel,
// and returns a freshly allocated host copy of the kernel's output buffer.
func.func @test(%A: memref<32x32xf32>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<32x32xf32>
memref.copy %A, %A_gpu : memref<32x32xf32> to memref<32x32xf32>
%B_gpu = gpu.alloc host_shared() : memref<32x32xf32>
// Single workgroup, single thread: the kernel's nested loops walk all tiles.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<32x32xf32>, %B_gpu : memref<32x32xf32>)
%B = memref.alloc() : memref<32x32xf32>
memref.copy %B_gpu, %B : memref<32x32xf32> to memref<32x32xf32>
gpu.dealloc %A_gpu : memref<32x32xf32>
gpu.dealloc %B_gpu : memref<32x32xf32>
return %B : memref<32x32xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: nested loops tile the 32x32 buffers into 8x16 tiles (outer loop
// over rows, inner over columns). Each tile is stored to and reloaded from
// an SLM buffer between the global load and the global store, exercising
// the SLM fallback in both the store and the load direction.
gpu.func @test_scf_for(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst8 = arith.constant 8 : index
%cst16 = arith.constant 16 : index
%cst32 = arith.constant 32 : index
%0 = xetile.init_tile %arg0 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
// Staging tile in memory space 3 (SLM) — the trigger for the fallback.
%slm = memref.alloc() : memref<8x16xf32, 3>
%slm_tile = xetile.init_tile %slm [0, 0] : memref<8x16xf32, 3> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
%out:2 = scf.for %j = %cst0 to %cst32 step %cst8
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
%out:2 = scf.for %k = %cst0 to %cst32 step %cst16
iter_args(%c_tile = %a_tile, %d_tile = %b_tile)
-> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
// global -> SLM -> global round trip for one tile.
%c_value = xetile.load_tile %c_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<8x16xf32>
xetile.store_tile %c_value, %slm_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
%d_value = xetile.load_tile %slm_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>> -> vector<8x16xf32>
xetile.store_tile %d_value, %d_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
// Inner loop advances 16 columns to the right.
%c_next_tile = xetile.update_tile_offset %c_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%d_next_tile = xetile.update_tile_offset %d_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %c_next_tile, %d_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
}
// Outer loop advances 8 rows down.
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Host driver: fills A[i][j] = i + j, runs the kernel, and checks the output
// is element-wise close to the input.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index

%A = memref.alloc() : memref<32x32xf32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c1 {
%0 = index.castu %arg0 : index to i32
%1 = index.castu %arg1 : index to i32
%2 = arith.addi %0, %1 : i32
%val = arith.uitofp %2 : i32 to f32
memref.store %val, %A[%arg0, %arg1] : memref<32x32xf32>
}
}
%C = call @test(%A) : (memref<32x32xf32>) -> memref<32x32xf32>
%cast_A = memref.cast %A : memref<32x32xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<32x32xf32> to memref<*xf32>
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
builtin.module(
cse
gpu.module(xetile-init-duplicate
xetile-canonicalization
xetile-blockop-fallback
xetile-blocking
cse
convert-xetile-to-xegpu
cse
imex-xegpu-hoist-transpose
imex-xegpu-apply-vnni-transformation
imex-xegpu-optimize-transpose)
cse
imex-vector-linearize
cse
imex-remove-single-elem-vector
canonicalize
cse
gpu.module(convert-xegpu-to-vc)
reconcile-unrealized-casts
bf16-to-gpu
cse
imex-convert-gpu-to-spirv
spirv.module(spirv-lower-abi-attrs
spirv-update-vce)
func.func(llvm-request-c-wrappers)
serialize-spirv
convert-vector-to-scf
convert-gpu-to-gpux
convert-scf-to-cf
expand-strided-metadata
finalize-memref-to-llvm
convert-cf-to-llvm
convert-vector-to-llvm
convert-index-to-llvm
convert-arith-to-llvm
convert-func-to-llvm
convert-math-to-llvm
convert-gpux-to-llvm
lower-affine
reconcile-unrealized-casts)
4 changes: 3 additions & 1 deletion test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ module {
%34 = arith.remsi %11, %c4 : index
%35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) {
%39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32>
//CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
// Disabling removal of single-element vector.extract_strided_slice for now.
// DISABLE-CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
// CHECK-COUNT-8: vector.extract_strided_slice
%40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
%41 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
%42 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
Expand Down

0 comments on commit ff51594

Please sign in to comment.