Fusion benchmark

Overview

This benchmark illustrates how much filter fusion improves the end-to-end performance of a pipeline. Fusion merges the per-filter loops over the image, for a sequence of filter applications, into a single loop nest, while keeping each filter modular and composable.

Before fusion: |A| -> |B| -> |C|

for y in 0..480:
 for x in 0..640:
  // A
for y in 0..480:
 for x in 0..640:
  // B
for y in 0..480:
 for x in 0..640:
  // C

After fusion: |A+B+C|

for y in 0..480:
 for x in 0..640:
  // A+B+C

Results

Target	Fusion	Execution Time (us)
CPU (i7-13700K)	No	3707.19
CPU (i7-13700K)	Yes	276.088
GPU (RTX 4060 Ti)	No	14.5658
GPU (RTX 4060 Ti)	Yes	1.4592

Source code

#include <chrono>
#include <cstdint>
#include <iostream>
#include <stdexcept>

#include "ion/ion.h"

using namespace ion;

/**
 * Building block that adds 1 to every element of its input.
 *
 * Both `input` and `output` are declared as 3-dimensional uint8_t functions
 * and are indexed as (c, x, y). The `enable_fuse` parameter (default: false)
 * decides whether schedule() applies any scheduling to `output`.
 */
class Inc : public BuildingBlock<Inc> {
public:
    Input<Halide::Func> input{"input", type_of<uint8_t>(), 3};
    Output<Halide::Func> output{"output", type_of<uint8_t>(), 3};
    BuildingBlockParam<bool> enable_fuse{"enable_fuse", false};

    /// Defines the algorithm: output = input + 1 at every (c, x, y).
    void generate() {
        output(c, x, y) = input(c, x, y) + 1;
    }

    /// Applies the schedule. When `enable_fuse` is set nothing is scheduled;
    /// otherwise `output` is computed at root and, on targets that report a
    /// GPU feature, tiled over x/y in 16x16 tiles.
    void schedule() {
        const bool fused = enable_fuse;
        if (fused) {
            return;
        }

        output.compute_root();

        if (!get_target().has_gpu_feature()) {
            return;
        }

        Halide::Var block_x, block_y, thread_x, thread_y;
        output.gpu_tile(x, y, block_x, block_y, thread_x, thread_y, 16, 16);
    }

private:
    // Pure variables shared by generate() and schedule().
    Halide::Var c, x, y;
};

// Register the Inc building block under the name `inc`; run_experiments()
// adds nodes by that name via b.add("inc").
ION_REGISTER_BUILDING_BLOCK(Inc, inc);

/**
 * Builds a chain of `inc` nodes for `target`, runs it, reports the average
 * execution time and the loop nest, and verifies the result.
 *
 * @param target        Target the pipeline is built for.
 * @param control_fuse_ Value of the `enable_fuse` parameter passed to every
 *                      node except the last one.
 * @throws std::runtime_error if any output element differs from the
 *                      expected `input + num_inc`.
 *
 * A function-local static counter numbers the experiments across calls.
 */
void run_experiments(const Target& target, bool control_fuse_) {
    static int ex_cnt = 0;

    const int32_t channels = 3;
    const int32_t w = 640;
    const int32_t h = 480;

    // Number of chained `inc` nodes; also the expected per-element increment.
    const int num_inc = 10;

    Builder b;
    b.set_target(target);

    ion::Buffer<uint8_t> src{channels, w, h};
    src.fill(0);

    Param control_fuse{"enable_fuse", control_fuse_};

    // First node reads the source buffer; the intermediate nodes follow the
    // requested fuse setting.
    Node n;
    n = b.add("inc")(src).set_params(control_fuse);
    for (int i = 1; i < num_inc - 1; ++i) {
        n = b.add("inc")(n["output"]).set_params(control_fuse);
    }
    // The last node is never fused, so Inc::schedule() always schedules the
    // final output stage.
    n = b.add("inc")(n["output"]).set_params(Param{"enable_fuse", false});

    ion::Buffer<uint8_t> dst{channels, w, h};
    n["output"].bind(dst);

    const int num_try = 100;

    // Warmup
    b.run();

    // NOTE(review): nothing synchronizes the device before the end timestamp
    // is taken. If b.run() returns before GPU work has completed, the GPU
    // figures exclude kernel execution time - consider dst.device_sync()
    // after the loop; confirm against the Halide/ion run semantics.
    const auto s = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < num_try; ++i) {
        b.run();
    }

    const auto e = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::micro> elapsed = e - s;
    std::cout << "## Experiment #" << ex_cnt++ << ":" << target.to_string() << std::endl;
    std::cout << "Execution:" << elapsed.count() / num_try << " us" << std::endl;
    std::cout << "Loop nest:";
    b.print_loop_nest();
    std::cout << "====" << std::endl;

    dst.copy_to_host();

    // The buffers are 3-dimensional (c, x, y); index every dimension so that
    // each element is checked at its real coordinate.
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            for (int c = 0; c < channels; ++c) {
                // Inc computes in uint8_t, so compare in uint8_t as well.
                const auto expected = static_cast<uint8_t>(src(c, x, y) + num_inc);
                if (dst(c, x, y) != expected) {
                    throw std::runtime_error("Invalid result");
                }
            }
        }
    }
}

int main()
{
    try {
        run_experiments(get_host_target(), false);
        run_experiments(get_host_target(), true);
        run_experiments(get_host_target().with_feature(Halide::Target::CUDA), false);
        run_experiments(get_host_target().with_feature(Halide::Target::CUDA), true);
    } catch (Halide::Error& e) {
        std::cerr << e.what() << std::endl;
        return 1;
    } catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    std::cout << "Passed" << std::endl;

    return 0;
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fusion benchmark

Overview

Results

Source code

Clone this wiki locally