-
Notifications
You must be signed in to change notification settings - Fork 6
Fusion benchmark
Takuro IIZUKA edited this page Aug 20, 2024
·
2 revisions
This page illustrates how much filter fusion improves the end-to-end performance of a pipeline. Fusion merges the separate loops over the image that a sequence of filter applications would otherwise require into a single loop nest, while keeping each filter modular and composable.
Before fusion: |A| -> |B| -> |C|
for y in 0..480:
    for x in 0..640:
        // A
for y in 0..480:
    for x in 0..640:
        // B
for y in 0..480:
    for x in 0..640:
        // C
After fusion: |A+B+C|
for y in 0..480:
    for x in 0..640:
        // A+B+C
Target | Fusion | Execution Time (us) |
---|---|---|
CPU (i7-13700K) | No | 3707.19 |
CPU (i7-13700K) | Yes | 276.088 |
GPU (RTX 4060 Ti) | No | 14.5658 |
GPU (RTX 4060 Ti) | Yes | 1.4592 |
#include "ion/ion.h"
using namespace ion;
/// Building block that adds 1 to every element of a 3-dimensional uint8_t function.
///
/// Ports:
///   - input  ("input"):  3-dimensional uint8_t function, indexed as (c, x, y).
///   - output ("output"): 3-dimensional uint8_t function, indexed as (c, x, y).
/// Parameter:
///   - enable_fuse ("enable_fuse", default false): when false, the output is
///     scheduled with compute_root() (and tiled for the GPU if the target has a
///     GPU feature); when true, no explicit schedule is applied to this stage.
class Inc : public BuildingBlock<Inc> {
public:
    Input<Halide::Func> input{"input", type_of<uint8_t>(), 3};
    Output<Halide::Func> output{"output", type_of<uint8_t>(), 3};
    BuildingBlockParam<bool> enable_fuse{"enable_fuse", false};

    /// Defines the algorithm: output(c, x, y) is input(c, x, y) plus one.
    void generate() {
        Halide::Expr incremented = input(c, x, y) + 1;
        output(c, x, y) = incremented;
    }

    /// Applies the schedule selected by `enable_fuse`.
    void schedule() {
        // With fusion requested this stage is deliberately left unscheduled.
        const bool keep_separate = !enable_fuse;
        if (!keep_separate) {
            return;
        }

        // Give this stage its own loop nest.
        output.compute_root();

        if (!get_target().has_gpu_feature()) {
            return;
        }

        // On GPU targets, split x/y into 16x16 tiles (outer vars first, then inner vars).
        Halide::Var block_x, block_y, thread_x, thread_y;
        output.gpu_tile(x, y, block_x, block_y, thread_x, thread_y, 16, 16);
    }

private:
    // Pure variables shared by generate() and schedule().
    Halide::Var c, x, y;
};
ION_REGISTER_BUILDING_BLOCK(Inc, inc);
void run_experiments(const Target& target, bool control_fuse_) {
static int ex_cnt = 0;
int32_t w = 640;
int32_t h = 480;
Builder b;
b.set_target(target);
ion::Buffer<uint8_t> src{3, w, h};
src.fill(0);
Param control_fuse{"enable_fuse", control_fuse_};
int num_inc = 10;
Node n;
n = b.add("inc")(src).set_params(control_fuse); // 1
n = b.add("inc")(n["output"]).set_params(control_fuse); // 2
n = b.add("inc")(n["output"]).set_params(control_fuse); // 3
n = b.add("inc")(n["output"]).set_params(control_fuse); // 4
n = b.add("inc")(n["output"]).set_params(control_fuse); // 5
n = b.add("inc")(n["output"]).set_params(control_fuse); // 6
n = b.add("inc")(n["output"]).set_params(control_fuse); // 7
n = b.add("inc")(n["output"]).set_params(control_fuse); // 8
n = b.add("inc")(n["output"]).set_params(control_fuse); // 9
n = b.add("inc")(n["output"]).set_params(Param{"enable_fuse", false}); // 10
ion::Buffer<uint8_t> dst{3, w, h};
n["output"].bind(dst);
int num_try = 100;
// Warmup
b.run();
auto s = std::chrono::high_resolution_clock::now();
for (int i=0; i<num_try; ++i) {
b.run();
}
auto e = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> elapsed = e - s;
std::cout << "## Experiment #" << ex_cnt++ << ":" << target.to_string() << std::endl;
std::cout << "Executgion:" << elapsed.count() / num_try << " us" << std::endl;
std::cout << "Loop nest:";
b.print_loop_nest();
std::cout << "====" << std::endl;
dst.copy_to_host();
for (int y=0; y<h; ++y) {
for (int x=0; x<w; ++x) {
if (dst(x, y) != (src(x, y) + num_inc)) {
throw std::runtime_error("Invalid result");
}
}
}
return;
}
int main()
{
try {
run_experiments(get_host_target(), false);
run_experiments(get_host_target(), true);
run_experiments(get_host_target().with_feature(Halide::Target::CUDA), false);
run_experiments(get_host_target().with_feature(Halide::Target::CUDA), true);
} catch (Halide::Error& e) {
std::cerr << e.what() << std::endl;
return 1;
} catch (const std::exception& e) {
std::cerr << e.what() << std::endl;
return 1;
}
std::cout << "Passed" << std::endl;
return 0;
}