From beaa4bf781cd7585253997b54f325fdd63becd03 Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Thu, 11 May 2023 12:50:38 +0000
Subject: [PATCH 01/11] using nvcc compiler

---
 cinn/backends/nvrtc/nvcc_util.cc | 99 ++++++++++++++++++++++++++++++++
 cinn/backends/nvrtc/nvcc_util.h  | 38 ++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 cinn/backends/nvrtc/nvcc_util.cc
 create mode 100644 cinn/backends/nvrtc/nvcc_util.h

diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc
new file mode 100644
index 0000000000..b35f02c926
--- /dev/null
+++ b/cinn/backends/nvrtc/nvcc_util.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/backends/nvrtc/nvcc_util.h"
+
+#include
+
+#include
+#include
+#include
+
+#include "cinn/common/common.h"
+
+#ifdef CINN_WITH_CUDA
+
+namespace cinn {
+namespace backends {
+namespace nvrtc {
+
+std::string NvccCompiler::operator()(const std::string& cuda_c) {
+  // read dir source
+  std::string dir = "./source";
+  if (_access(dir.c_str(), 0) == -1) {
+    CHECK(mkdir(dir.c_str()) != -1) << "Fail to mkdir " << dir;
+  }
+
+  // get unqiue prefix name
+  auto prefix_name = dir + common::UniqName("cuda_c");
+
+  auto cuda_c_file = prefix_name + ".cu";
+  std::ofstream ofs(cuda_c_file, std::ios::out);
+  CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file;
+  ofs << cuda_c;
+  ofs.close();
+
+  CompileToPtx(prefix_name);
+  CompileToCubin(prefix_name);
+
+  // open cubin file
+  auto cuda_cubin_file = prefix_name + ".cubin";
+  std::ifstream ifs(cuda_cubin_file, std::ios::in | std::ios::binary);
+  CHECK(ifs.is_open()) << "Fail to open file " << cuda_cubin_file;
+  ifs.seekg(std::ios::end);
+  auto len = ifs.tellg();
+  ifs.seekg(0);
+
+  // read cubin file
+  std::string cubin(len,'');
+  ifs.read(&ubin.data(), len);
+  ifs.close();
+  return cubin;
+}
+
+void NvccCompiler::CompileToPtx(const std::string& prefix_name) {
+  auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3";
+  options += " -arch=" + GetDeviceArch;
+  options += " -o " + prefix_name + ".ptx";
+  options += " " + prefix_name + ".cu";
+
+  CHECK(system(options) == 0) << options;
+}
+
+void NvccCompiler::CompileToCubin(const std::string& prefix_name) {
+  auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3";
+  options += " -arch=" + GetDeviceArch;
+  options += " -o " + prefix_name + ".cubin";
+  options += " " + prefix_name + ".ptx";
+
+  CHECK(system(options) == 0) << options;
+}
+
+std::string NvccCompiler::GetDeviceArch() {
+  int major = 0, minor = 0;
+  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess &&
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) {
+    return "sm" + std::to_string(major) + std::to_string(minor);
+  } else {
+    LOG(WARNING) << "cannot detect compute capability from your device, "
+                 << "fall back to compute_30.";
+    return "sm_30"
+  }
+}
+
+}  // namespace nvrtc
+}  // namespace backends
+}  // namespace cinn
+
+#endif  // CINN_WITH_CUDA
diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h
new file mode 100644
index 0000000000..d3049b48a3
--- /dev/null
+++ b/cinn/backends/nvrtc/nvcc_util.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#ifdef CINN_WITH_CUDA
+
+namespace cinn {
+namespace backends {
+namespace nvrtc {
+
+class NvccCompiler {
+ public:
+  NvccCompiler(){};
+  ~NvccCompiler(){};
+  std::string operator()(const std::string&);
+
+ private:
+  void CompileToPtx(const std::string&);
+  void CompileToCubin(const std::string&);
+  std::string GetDeviceArch();
+};
+
+}  // namespace nvrtc
+}  // namespace backends
+}  // namespace cinn
+
+#endif  // CINN_WITH_CUDA

From a4bc137f57ac7d18b1c82e128b1001849090130d Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Thu, 11 May 2023 13:10:22 +0000
Subject: [PATCH 02/11] using nvcc compiler

---
 cinn/backends/compiler.cc                |  7 ++--
 cinn/backends/nvrtc/CMakeLists.txt       |  1 +
 cinn/backends/nvrtc/nvcc_util.cc         | 44 ++++++++++++++----------
 cinn/backends/nvrtc/nvcc_util.h          |  3 ++
 cinn/hlir/framework/parallel_compiler.cc | 10 +++---
 5 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc
index 880441ff63..1c23cab3cd 100644
--- a/cinn/backends/compiler.cc
+++ b/cinn/backends/compiler.cc
@@ -22,6 +22,7 @@
 #include "cinn/backends/codegen_cuda_dev.h"
 #include "cinn/backends/codegen_cuda_host.h"
 #include "cinn/backends/codegen_cuda_util.h"
+#include "cinn/backends/nvrtc/nvcc_util.h"
 #include "cinn/backends/nvrtc/nvrtc_util.h"
 #include "cinn/runtime/cuda/cuda_module.h"
 #include "cinn/runtime/cuda/cuda_util.h"
@@ -123,13 +124,15 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code)
   SourceCodePrint::GetInstance()->write(source_code);
 
   using runtime::cuda::CUDAModule;
+  /*
   backends::nvrtc::Compiler compiler;
 
   auto ptx = compiler(source_code);
   CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code;
+  */
+  backends::nvrtc::NvccCompiler compiler;
 
-  cuda_module_.reset(
-      new CUDAModule(ptx, compiler.compile_to_cubin() ?
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); RuntimeSymbols symbols; diff --git a/cinn/backends/nvrtc/CMakeLists.txt b/cinn/backends/nvrtc/CMakeLists.txt index a344b65ca9..b67a2b6413 100644 --- a/cinn/backends/nvrtc/CMakeLists.txt +++ b/cinn/backends/nvrtc/CMakeLists.txt @@ -3,6 +3,7 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS header_generator.cc nvrtc_util.cc + nvcc_util.cc ) nv_test(test_nvrtc_util SRCS nvrtc_util_test.cc DEPS cinncore) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index b35f02c926..993c79dab2 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -14,16 +14,17 @@ #include "cinn/backends/nvrtc/nvcc_util.h" +#include "cinn/common/common.h" + +#ifdef CINN_WITH_CUDA + +#include #include #include #include #include -#include "cinn/common/common.h" - -#ifdef CINN_WITH_CUDA - namespace cinn { namespace backends { namespace nvrtc { @@ -36,7 +37,7 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { } // get unqiue prefix name - auto prefix_name = dir + common::UniqName("cuda_c"); + auto prefix_name = dir + "/" + common::UniqName("rtc_tmp"); auto cuda_c_file = prefix_name + ".cu"; std::ofstream ofs(cuda_c_file, std::ios::out); @@ -47,21 +48,11 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { CompileToPtx(prefix_name); CompileToCubin(prefix_name); - // open cubin file - auto cuda_cubin_file = prefix_name + ".cubin"; - std::ifstream ifs(cuda_cubin_file, std::ios::in | std::ios::binary); - CHECK(ifs.is_open()) << "Fail to open file " << cuda_cubin_file; - ifs.seekg(std::ios::end); - auto len = ifs.tellg(); - ifs.seekg(0); - - // read cubin file - std::string cubin(len,''); - ifs.read(&ubin.data(), len); - ifs.close(); - return cubin; + return ReadFile(prefix_name + ".cubin", std::ios::in | std::ios::binary); } +std::string GetPtx() { return ReadFile(prefix_name + ".ptx", std::ios::in); } + void NvccCompiler::CompileToPtx(const std::string& prefix_name) { auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; options += " -arch=" + GetDeviceArch; @@ -84,7 +75,7 @@ std::string NvccCompiler::GetDeviceArch() { int major = 0, minor = 0; if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { - return "sm" + std::to_string(major) + std::to_string(minor); + return "sm_" + std::to_string(major) + std::to_string(minor); } else { LOG(WARNING) << "cannot detect compute capability from your device, " << "fall back to compute_30."; @@ -92,6 +83,21 @@ std::string NvccCompiler::GetDeviceArch() { } } +std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { + // open cubin file + std::ifstream ifs(file_name, mode); + CHECK(ifs.is_open()) << "Fail to open file " << file_name; + ifs.seekg(std::ios::end); + auto len = ifs.tellg(); + ifs.seekg(0); + + // read cubin file + std::string file_data(len,''); + ifs.read(&file_data.data(), len); + ifs.close(); + return file_data; +} + } // namespace nvrtc } // namespace backends } // namespace cinn diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h index d3049b48a3..62b2b00919 100644 --- a/cinn/backends/nvrtc/nvcc_util.h +++ b/cinn/backends/nvrtc/nvcc_util.h @@ -24,11 +24,14 @@ class NvccCompiler { NvccCompiler(){}; ~NvccCompiler(){}; std::string 
operator()(const std::string&); + std::string GetPtx(); private: void CompileToPtx(const std::string&); void CompileToCubin(const std::string&); std::string GetDeviceArch(); + + std::string ReadFile(const std::string&, std::ios_base::openmode); }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index aa22dfce65..56cb449e9a 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -175,13 +175,11 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - backends::nvrtc::Compiler compiler; - auto ptx = compiler(cuda_c); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - graph->SavePTXCode(ptx); + backends::nvrtc::NvccCompiler compiler; + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); - // load cumodule - cumodule.reset(new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + // save ptx + graph->SavePTXCode(compiler.GetPtx()); // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { From ff9dc17e21db5a441a1be3acfea0ed079df4bf12 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Thu, 11 May 2023 13:47:35 +0000 Subject: [PATCH 03/11] compile pass --- cinn/backends/compiler.cc | 2 +- cinn/backends/nvrtc/nvcc_util.cc | 55 ++++++++++++------------ cinn/backends/nvrtc/nvcc_util.h | 9 +++- cinn/hlir/framework/parallel_compiler.cc | 3 +- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 1c23cab3cd..50feb6fd4a 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -130,7 +130,7 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) auto ptx = compiler(source_code); CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; */ - backends::nvrtc::NvccCompiler compiler; + nvrtc::NvccCompiler compiler; cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index 993c79dab2..25fd1f138b 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -19,11 +19,12 @@ #ifdef CINN_WITH_CUDA #include -#include +#include +#include +#include -#include +#include #include -#include namespace cinn { namespace backends { @@ -32,43 +33,43 @@ namespace nvrtc { std::string NvccCompiler::operator()(const std::string& cuda_c) { // read dir source std::string dir = "./source"; - if (_access(dir.c_str(), 0) == -1) { - CHECK(mkdir(dir.c_str()) != -1) << "Fail to mkdir " << dir; + if (access(dir.c_str(), 0) == -1) { + CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; } // get unqiue prefix name - auto prefix_name = dir + "/" + common::UniqName("rtc_tmp"); + prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); - auto cuda_c_file = prefix_name + ".cu"; + auto cuda_c_file = prefix_name_ + ".cu"; std::ofstream ofs(cuda_c_file, std::ios::out); CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; ofs << cuda_c; ofs.close(); - CompileToPtx(prefix_name); - CompileToCubin(prefix_name); + CompileToPtx(); + CompileToCubin(); - return ReadFile(prefix_name + ".cubin", std::ios::in | std::ios::binary); + return ReadFile(prefix_name_ + ".cubin", std::ios::in | std::ios::binary); } -std::string GetPtx() { return 
ReadFile(prefix_name + ".ptx", std::ios::in); } +std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } -void NvccCompiler::CompileToPtx(const std::string& prefix_name) { - auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; - options += " -arch=" + GetDeviceArch; - options += " -o " + prefix_name + ".ptx"; - options += " " + prefix_name + ".cu"; +void NvccCompiler::CompileToPtx() { + std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".ptx"; + options += " " + prefix_name_ + ".cu"; - CHECK(system(options) == 0) << options; + CHECK(system(options.c_str()) == 0) << options; } -void NvccCompiler::CompileToCubin(const std::string& prefix_name) { - auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; - options += " -arch=" + GetDeviceArch; - options += " -o " + prefix_name + ".cubin"; - options += " " + prefix_name + ".ptx"; +void NvccCompiler::CompileToCubin() { + std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".cubin"; + options += " " + prefix_name_ + ".ptx"; - CHECK(system(options) == 0) << options; + CHECK(system(options.c_str()) == 0) << options; } std::string NvccCompiler::GetDeviceArch() { @@ -79,7 +80,7 @@ std::string NvccCompiler::GetDeviceArch() { } else { LOG(WARNING) << "cannot detect compute capability from your device, " << "fall back to compute_30."; - return "sm_30" + return "sm_30"; } } @@ -92,10 +93,10 @@ std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base:: ifs.seekg(0); // read cubin file - std::string file_data(len,''); - ifs.read(&file_data.data(), len); + std::string file_data(len, ' '); + ifs.read(&file_data[0], len); ifs.close(); - return file_data; + return std::move(file_data); } } // namespace nvrtc diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h index 62b2b00919..c3db6d151a 100644 --- a/cinn/backends/nvrtc/nvcc_util.h +++ b/cinn/backends/nvrtc/nvcc_util.h @@ -13,6 +13,9 @@ // limitations under the License. 
#pragma once +#include +#include + #ifdef CINN_WITH_CUDA namespace cinn { @@ -27,11 +30,13 @@ class NvccCompiler { std::string GetPtx(); private: - void CompileToPtx(const std::string&); - void CompileToCubin(const std::string&); + void CompileToPtx(); + void CompileToCubin(); std::string GetDeviceArch(); std::string ReadFile(const std::string&, std::ios_base::openmode); + + std::string prefix_name_{""}; }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index 56cb449e9a..c4c221d439 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -24,6 +24,7 @@ #include "cinn/backends/compiler.h" #include "cinn/backends/llvm/codegen_x86.h" #include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" @@ -176,7 +177,7 @@ void ParallelCompiler::Task::CodegenAndJit() { using runtime::cuda::CUDAModule; backends::nvrtc::NvccCompiler compiler; - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); // save ptx graph->SavePTXCode(compiler.GetPtx()); From 1d20fc382858e4cda96f113564a87c60912dbf07 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 03:30:35 +0000 Subject: [PATCH 04/11] add include dir in nvcc option --- cinn/backends/nvrtc/nvcc_util.cc | 15 +++++++++++++-- cinn/runtime/cuda/cuda_module.cc | 7 ++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index 25fd1f138b..a968f76d1b 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -49,13 +49,24 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { CompileToPtx(); CompileToCubin(); - return ReadFile(prefix_name_ + ".cubin", std::ios::in | std::ios::binary); + return prefix_name_ + ".cubin"; } std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } void NvccCompiler::CompileToPtx() { - std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; + auto include_dir = common::Context::Global().runtime_include_dir(); + std::string include_dir_str = ""; + for (auto dir : include_dir) { + if (include_dir_str.empty()) { + include_dir_str = dir; + } else { + include_dir_str += ":" + dir; + } + } + + std::string options = + std::string("export PATH=/usr/local/cuda/bin:$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".ptx"; options += " " + prefix_name_ + ".cu"; diff --git a/cinn/runtime/cuda/cuda_module.cc b/cinn/runtime/cuda/cuda_module.cc index 0ec5aa0bfe..2c80998897 100644 --- a/cinn/runtime/cuda/cuda_module.cc +++ b/cinn/runtime/cuda/cuda_module.cc @@ -103,8 +103,9 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) jit_options[4] = CU_JIT_GENERATE_LINE_INFO; jit_opt_vals[4] = reinterpret_cast(value); - CUresult status = cuModuleLoadDataEx( - &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); + // CUresult status = cuModuleLoadDataEx( + // &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); + CUresult status = 
cuModuleLoad(&module_per_card_[device_id], data_.c_str()); if (CUDA_SUCCESS != status) { RAW_LOG(ERROR, "PTX JIT ERROR LOG: %s\n.", log_buffer.data()); @@ -124,7 +125,7 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) CUdeviceptr CUDAModule::GetGlobal(int device_id, const std::string& name, size_t nbytes) { if (!module_per_card_[device_id]) { std::lock_guard lock(mutex_); - CUDA_DRIVER_CALL(cuModuleLoadData(&module_per_card_[device_id], data_.c_str())); + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); } CUdeviceptr global; From 6f490743b6c7b0fa9633a6d610221e3d5e0bc01f Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:09:40 +0000 Subject: [PATCH 05/11] check using nvcc --- cinn/backends/compiler.cc | 20 ++++++++++--------- cinn/hlir/framework/parallel_compiler.cc | 19 ++++++++++++++---- cinn/runtime/cuda/cuda_module.cc | 25 ++++++++++++------------ cinn/runtime/flags.cc | 8 ++++++++ cinn/runtime/flags.h | 2 ++ 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 50feb6fd4a..08be15db33 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -26,6 +26,7 @@ #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/runtime/cuda/cuda_module.h" #include "cinn/runtime/cuda/cuda_util.h" +#include "cinn/runtime/flags.h" #endif DECLARE_string(cinn_source_code_save_path); @@ -124,15 +125,16 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) SourceCodePrint::GetInstance()->write(source_code); using runtime::cuda::CUDAModule; - /* - backends::nvrtc::Compiler compiler; - - auto ptx = compiler(source_code); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; - */ - nvrtc::NvccCompiler compiler; - - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + if (runtime::CanUseNvccCompiler()) { + nvrtc::NvccCompiler compiler; + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + } else { + nvrtc::Compiler compiler; + auto ptx = compiler(source_code); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; + cuda_module_.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? 
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + } RuntimeSymbols symbols; diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index c4c221d439..cc3ac7e140 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -29,6 +29,7 @@ #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" #include "cinn/ir/module.h" +#include "cinn/runtime/flags.h" DECLARE_int32(cinn_parallel_compile_size); DECLARE_int32(cinn_parallel_compile_thread); @@ -176,11 +177,21 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - backends::nvrtc::NvccCompiler compiler; - cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); + if (runtime::CanUseNvccCompiler()) { + backends::nvrtc::NvccCompiler compiler; + // load cumodule + cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); + graph->SavePTXCode(compiler.GetPtx()); + } else { + backends::nvrtc::Compiler compiler; + auto ptx = compiler(cuda_c); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; + // load cumodule + cumodule.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + graph->SavePTXCode(ptx); + } - // save ptx - graph->SavePTXCode(compiler.GetPtx()); // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { diff --git a/cinn/runtime/cuda/cuda_module.cc b/cinn/runtime/cuda/cuda_module.cc index 2c80998897..5a90d0cccb 100644 --- a/cinn/runtime/cuda/cuda_module.cc +++ b/cinn/runtime/cuda/cuda_module.cc @@ -25,6 +25,7 @@ #include "cinn/backends/cuda_util.h" #include "cinn/runtime/cuda/cuda_util.h" +#include "cinn/runtime/flags.h" namespace cinn { namespace runtime { @@ -103,17 +104,11 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) jit_options[4] = CU_JIT_GENERATE_LINE_INFO; jit_opt_vals[4] = reinterpret_cast(value); - // CUresult status = cuModuleLoadDataEx( - // &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); - CUresult status = cuModuleLoad(&module_per_card_[device_id], data_.c_str()); - - if (CUDA_SUCCESS != status) { - RAW_LOG(ERROR, "PTX JIT ERROR LOG: %s\n.", log_buffer.data()); - const char* name; - cuGetErrorName(status, &name); - const char* msg; - cuGetErrorString(status, &msg); - RAW_LOG(FATAL, "The error `%s` occurs while compiling the ptx! 
And its message is `%s`.", name, msg); + if (runtime::CanUseNvccCompiler()) { + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + } else { + CUDA_DRIVER_CALL(cuModuleLoadDataEx( + &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data())); } } @@ -125,11 +120,15 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) CUdeviceptr CUDAModule::GetGlobal(int device_id, const std::string& name, size_t nbytes) { if (!module_per_card_[device_id]) { std::lock_guard lock(mutex_); - CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + if (runtime::CanUseNvccCompiler()) { + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + } else { + CUDA_DRIVER_CALL(cuModuleLoadData(&module_per_card_[device_id], data_.c_str())); + } } - CUdeviceptr global; size_t _nbytes; + CUdeviceptr global; CUDA_DRIVER_CALL(cuModuleGetGlobal(&global, &_nbytes, module_per_card_[device_id], name.c_str())); return global; } diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index a567a6a59e..af2433121c 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -16,6 +16,9 @@ #include #include +#include +#include +#include #include @@ -173,5 +176,10 @@ unsigned long long RandomSeed::Clear() { return old_seed; } +bool CanUseNvccCompiler() { + std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; + return access(nvcc_dir.c_str(), 0) == -1 ? false : true; +} + } // namespace runtime } // namespace cinn diff --git a/cinn/runtime/flags.h b/cinn/runtime/flags.h index 59ebdba4e1..4491097c42 100644 --- a/cinn/runtime/flags.h +++ b/cinn/runtime/flags.h @@ -25,6 +25,8 @@ bool CheckStringFlagFalse(const std::string &flag); void SetCinnCudnnDeterministic(bool state); bool GetCinnCudnnDeterministic(); +bool CanUseNvccCompiler(); + class RandomSeed { public: static unsigned long long GetOrSet(unsigned long long seed = 0); From e5aaf6e416fe1ae2996805d9e4951151b2fb2cbf Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:31:37 +0000 Subject: [PATCH 06/11] flag compile with nvrtc --- cinn/backends/compiler.cc | 1 - cinn/runtime/flags.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 08be15db33..d9d63cda39 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -137,7 +137,6 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) } RuntimeSymbols symbols; - for (auto& fn : device_module.functions()) { std::string kernel_fn_name = fn->name; auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name); diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index af2433121c..0a66bc991f 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -83,9 +83,13 @@ DEFINE_bool(cinn_use_dense_merge_pass, "Whether use dense merge pass."); DEFINE_bool(nvrtc_compile_to_cubin, - BoolFromEnv("FLAGS_nvrtc_compile_to_cubin", false), + BoolFromEnv("FLAGS_nvrtc_compile_to_cubin", true), "Whether nvrtc compile cuda source into cubin instead of ptx (only works after cuda-11.1)."); +DEFINE_bool(cinn_compile_with_nvrtc, + BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", false), + "Whether nvrtc compile cuda source with nvrtc(default nvcc)."); + // FLAGS for performance analysis and accuracy debug DEFINE_bool(cinn_sync_run, BoolFromEnv("FLAGS_cinn_sync_run", false), @@ -178,7 +182,7 @@ unsigned long long RandomSeed::Clear() { bool 
CanUseNvccCompiler() { std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; - return access(nvcc_dir.c_str(), 0) == -1 ? false : true; + return (access(nvcc_dir.c_str(), 0) == -1 ? false : true) && (!FLAGS_cinn_compile_with_nvrtc); } } // namespace runtime From c851b685319ef333c12056b09d63afff21272624 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:53:09 +0000 Subject: [PATCH 07/11] add nvcc cmd path --- cinn/backends/nvrtc/nvcc_util.cc | 11 ++++++++--- cinn/runtime/flags.cc | 5 ++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index a968f76d1b..4264444656 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -26,6 +26,8 @@ #include #include +DECLARE_string(cinn_nvcc_cmd_path); + namespace cinn { namespace backends { namespace nvrtc { @@ -65,21 +67,24 @@ void NvccCompiler::CompileToPtx() { } } - std::string options = - std::string("export PATH=/usr/local/cuda/bin:$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; + std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + + std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".ptx"; options += " " + prefix_name_ + ".cu"; + VLOG(2) << "Nvcc Compile Options : " << options; CHECK(system(options.c_str()) == 0) << options; } void NvccCompiler::CompileToCubin() { - std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; + std::string options = + std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".cubin"; options += " " + prefix_name_ + ".ptx"; + VLOG(2) << "Nvcc Compile Options : " << options; CHECK(system(options.c_str()) == 0) << options; } diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index 0a66bc991f..7238c9e59f 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -36,6 +36,9 @@ using ::GFLAGS_NAMESPACE::Int64FromEnv; using ::GFLAGS_NAMESPACE::StringFromEnv; DEFINE_string(cinn_x86_builtin_code_root, StringFromEnv("FLAGS_cinn_x86_builtin_code_root", ""), ""); +DEFINE_string(cinn_nvcc_cmd_path, + StringFromEnv("FLAGS_cinn_nvcc_cmd_path", "/usr/local/cuda/bin"), + "Setting nvcc default path!"); DEFINE_int32(cinn_parallel_compile_size, Int32FromEnv("FLAGS_cinn_parallel_compile_size", 16), @@ -181,7 +184,7 @@ unsigned long long RandomSeed::Clear() { } bool CanUseNvccCompiler() { - std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; + std::string nvcc_dir = FLAGS_cinn_nvcc_cmd_path + "/nvcc"; return (access(nvcc_dir.c_str(), 0) == -1 ? 
false : true) && (!FLAGS_cinn_compile_with_nvrtc); } From a0c35dea8550938b583bbc99045cab6fe845cc23 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Mon, 15 May 2023 13:26:35 +0000 Subject: [PATCH 08/11] using nvcc in nvrtc --- cinn/backends/compiler.cc | 16 +-- cinn/backends/nvrtc/CMakeLists.txt | 1 - cinn/backends/nvrtc/nvcc_util.cc | 122 ----------------------- cinn/backends/nvrtc/nvcc_util.h | 46 --------- cinn/backends/nvrtc/nvrtc_util.cc | 94 +++++++++++++++++ cinn/backends/nvrtc/nvrtc_util.h | 13 +++ cinn/hlir/framework/parallel_compiler.cc | 20 +--- 7 files changed, 117 insertions(+), 195 deletions(-) delete mode 100644 cinn/backends/nvrtc/nvcc_util.cc delete mode 100644 cinn/backends/nvrtc/nvcc_util.h diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index d9d63cda39..798b0a96a2 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -22,7 +22,6 @@ #include "cinn/backends/codegen_cuda_dev.h" #include "cinn/backends/codegen_cuda_host.h" #include "cinn/backends/codegen_cuda_util.h" -#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/runtime/cuda/cuda_module.h" #include "cinn/runtime/cuda/cuda_util.h" @@ -125,16 +124,11 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) SourceCodePrint::GetInstance()->write(source_code); using runtime::cuda::CUDAModule; - if (runtime::CanUseNvccCompiler()) { - nvrtc::NvccCompiler compiler; - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); - } else { - nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; - cuda_module_.reset( - new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); - } + nvrtc::Compiler compiler; + auto ptx = compiler(source_code); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; + cuda_module_.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); RuntimeSymbols symbols; for (auto& fn : device_module.functions()) { diff --git a/cinn/backends/nvrtc/CMakeLists.txt b/cinn/backends/nvrtc/CMakeLists.txt index b67a2b6413..a344b65ca9 100644 --- a/cinn/backends/nvrtc/CMakeLists.txt +++ b/cinn/backends/nvrtc/CMakeLists.txt @@ -3,7 +3,6 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS header_generator.cc nvrtc_util.cc - nvcc_util.cc ) nv_test(test_nvrtc_util SRCS nvrtc_util_test.cc DEPS cinncore) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc deleted file mode 100644 index 4264444656..0000000000 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cinn/backends/nvrtc/nvcc_util.h" - -#include "cinn/common/common.h" - -#ifdef CINN_WITH_CUDA - -#include -#include -#include -#include - -#include -#include - -DECLARE_string(cinn_nvcc_cmd_path); - -namespace cinn { -namespace backends { -namespace nvrtc { - -std::string NvccCompiler::operator()(const std::string& cuda_c) { - // read dir source - std::string dir = "./source"; - if (access(dir.c_str(), 0) == -1) { - CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; - } - - // get unqiue prefix name - prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); - - auto cuda_c_file = prefix_name_ + ".cu"; - std::ofstream ofs(cuda_c_file, std::ios::out); - CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; - ofs << cuda_c; - ofs.close(); - - CompileToPtx(); - CompileToCubin(); - - return prefix_name_ + ".cubin"; -} - -std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } - -void NvccCompiler::CompileToPtx() { - auto include_dir = common::Context::Global().runtime_include_dir(); - std::string include_dir_str = ""; - for (auto dir : include_dir) { - if (include_dir_str.empty()) { - include_dir_str = dir; - } else { - include_dir_str += ":" + dir; - } - } - - std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + - std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; - options += " -arch=" + GetDeviceArch(); - options += " -o " + prefix_name_ + ".ptx"; - options += " " + prefix_name_ + ".cu"; - - VLOG(2) << "Nvcc Compile Options : " << options; - CHECK(system(options.c_str()) == 0) << options; -} - -void NvccCompiler::CompileToCubin() { - std::string options = - std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); - options += " -arch=" + GetDeviceArch(); - options += " -o " + prefix_name_ + ".cubin"; - options += " " + prefix_name_ + ".ptx"; - - VLOG(2) << "Nvcc Compile Options : " << options; - CHECK(system(options.c_str()) == 0) << options; -} - -std::string NvccCompiler::GetDeviceArch() { - int major = 0, minor = 0; - if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { - return "sm_" + std::to_string(major) + std::to_string(minor); - } else { - LOG(WARNING) << "cannot detect compute capability from your device, " - << "fall back to compute_30."; - return "sm_30"; - } -} - -std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { - // open cubin file - std::ifstream ifs(file_name, mode); - CHECK(ifs.is_open()) << "Fail to open file " << file_name; - ifs.seekg(std::ios::end); - auto len = ifs.tellg(); - ifs.seekg(0); - - // read cubin file - std::string file_data(len, ' '); - ifs.read(&file_data[0], len); - ifs.close(); - return std::move(file_data); -} - -} // namespace nvrtc -} // namespace backends -} // namespace cinn - -#endif // CINN_WITH_CUDA diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h deleted file mode 100644 index c3db6d151a..0000000000 --- a/cinn/backends/nvrtc/nvcc_util.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include - -#ifdef CINN_WITH_CUDA - -namespace cinn { -namespace backends { -namespace nvrtc { - -class NvccCompiler { - public: - NvccCompiler(){}; - ~NvccCompiler(){}; - std::string operator()(const std::string&); - std::string GetPtx(); - - private: - void CompileToPtx(); - void CompileToCubin(); - std::string GetDeviceArch(); - - std::string ReadFile(const std::string&, std::ios_base::openmode); - - std::string prefix_name_{""}; -}; - -} // namespace nvrtc -} // namespace backends -} // namespace cinn - -#endif // CINN_WITH_CUDA diff --git a/cinn/backends/nvrtc/nvrtc_util.cc b/cinn/backends/nvrtc/nvrtc_util.cc index 101406984e..4598054701 100644 --- a/cinn/backends/nvrtc/nvrtc_util.cc +++ b/cinn/backends/nvrtc/nvrtc_util.cc @@ -17,12 +17,20 @@ #include #include #include +#include +#include +#include + +#include +#include #include "cinn/backends/cuda_util.h" #include "cinn/backends/nvrtc/header_generator.h" #include "cinn/common/common.h" +#include "cinn/runtime/flags.h" #include "cinn/utils/string.h" +DECLARE_string(cinn_nvcc_cmd_path); DECLARE_bool(nvrtc_compile_to_cubin); namespace cinn { @@ -30,6 +38,9 @@ namespace backends { namespace nvrtc { std::string Compiler::operator()(const std::string& code, bool include_headers) { + if (runtime::CanUseNvccCompiler()) { + return CompileWithNvcc(code); + } return CompileCudaSource(code, include_headers); } @@ -140,6 +151,89 @@ std::string Compiler::CompileCudaSource(const std::string& code, bool include_he return data; } +std::string Compiler::CompileWithNvcc(const std::string& cuda_c) { + // read dir source + std::string dir = "./source"; + if (access(dir.c_str(), 0) == -1) { + CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; + } + + // get unqiue prefix name + prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); + + auto cuda_c_file = prefix_name_ + ".cu"; + std::ofstream ofs(cuda_c_file, std::ios::out); + CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; + ofs << cuda_c; + ofs.close(); + + CompileToPtx(); + CompileToCubin(); + + return prefix_name_ + ".cubin"; +} + +// std::string Compiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } + +void Compiler::CompileToPtx() { + auto include_dir = common::Context::Global().runtime_include_dir(); + std::string include_dir_str = ""; + for (auto dir : include_dir) { + if (include_dir_str.empty()) { + include_dir_str = dir; + } else { + include_dir_str += ":" + dir; + } + } + + std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + + std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".ptx"; + options += " " + prefix_name_ + ".cu"; + + VLOG(2) << "Nvcc Compile Options : " << options; + CHECK(system(options.c_str()) == 0) << options; +} + +void Compiler::CompileToCubin() { + std::string options = + std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".cubin"; + 
options += " " + prefix_name_ + ".ptx"; + + VLOG(2) << "Nvcc Compile Options : " << options; + CHECK(system(options.c_str()) == 0) << options; +} + +std::string Compiler::GetDeviceArch() { + int major = 0, minor = 0; + if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { + return "sm_" + std::to_string(major) + std::to_string(minor); + } else { + LOG(WARNING) << "cannot detect compute capability from your device, " + << "fall back to compute_30."; + return "sm_30"; + } +} + +std::string Compiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { + // open cubin file + std::ifstream ifs(file_name, mode); + CHECK(ifs.is_open()) << "Fail to open file " << file_name; + ifs.seekg(std::ios::end); + auto len = ifs.tellg(); + ifs.seekg(0); + + // read cubin file + std::string file_data(len, ' '); + ifs.read(&file_data[0], len); + ifs.close(); + return std::move(file_data); +} + } // namespace nvrtc } // namespace backends } // namespace cinn diff --git a/cinn/backends/nvrtc/nvrtc_util.h b/cinn/backends/nvrtc/nvrtc_util.h index a5f8424a31..b13c24c550 100644 --- a/cinn/backends/nvrtc/nvrtc_util.h +++ b/cinn/backends/nvrtc/nvrtc_util.h @@ -70,6 +70,19 @@ class Compiler { * whether to compile the source code into cubin, only works with cuda version > 11.1 */ bool compile_to_cubin_{false}; + + // compile with nvcc + std::string CompileWithNvcc(const std::string&); + + // compile to ptx + void CompileToPtx(); + // compile to cubin + void CompileToCubin(); + std::string GetDeviceArch(); + + std::string ReadFile(const std::string&, std::ios_base::openmode); + + std::string prefix_name_{""}; }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index cc3ac7e140..ede13cab04 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -24,7 +24,6 @@ #include "cinn/backends/compiler.h" #include "cinn/backends/llvm/codegen_x86.h" #include "cinn/backends/llvm/runtime_symbol_registry.h" -#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" @@ -177,20 +176,11 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - if (runtime::CanUseNvccCompiler()) { - backends::nvrtc::NvccCompiler compiler; - // load cumodule - cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); - graph->SavePTXCode(compiler.GetPtx()); - } else { - backends::nvrtc::Compiler compiler; - auto ptx = compiler(cuda_c); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - // load cumodule - cumodule.reset( - new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); - graph->SavePTXCode(ptx); - } + backends::nvrtc::Compiler compiler; + auto ptx = compiler(cuda_c); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; + // load cumodule + cumodule.reset(new CUDAModule(ptx, compiler.compile_to_cubin() ? 
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); // register kernel backends::RuntimeSymbols symbols; From c67842fe37061fde5302cce18627ecae0c303b72 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Mon, 15 May 2023 13:27:57 +0000 Subject: [PATCH 09/11] default with nvrtc --- cinn/runtime/flags.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index 7238c9e59f..b6aff4213d 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -90,7 +90,7 @@ DEFINE_bool(nvrtc_compile_to_cubin, "Whether nvrtc compile cuda source into cubin instead of ptx (only works after cuda-11.1)."); DEFINE_bool(cinn_compile_with_nvrtc, - BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", false), + BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", true), "Whether nvrtc compile cuda source with nvrtc(default nvcc)."); // FLAGS for performance analysis and accuracy debug From af08e4b86dde406c4b85a8495f1c810456f8d6cb Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Tue, 16 May 2023 01:39:33 +0000 Subject: [PATCH 10/11] remove used unit test --- cinn/hlir/pe/pe_transform_test.cc | 711 ------------------------------ 1 file changed, 711 deletions(-) diff --git a/cinn/hlir/pe/pe_transform_test.cc b/cinn/hlir/pe/pe_transform_test.cc index 58f0c109f2..ce65e95f95 100644 --- a/cinn/hlir/pe/pe_transform_test.cc +++ b/cinn/hlir/pe/pe_transform_test.cc @@ -224,717 +224,6 @@ TEST(Concat, ConcatCase0) { #endif } -TEST(Reduce, Reduce_Test_0) { - int m = 128; - int n = 128; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - Placeholder B("B", {M, N}); - - auto C = hlir::pe::Add(A.tensor(), B.tensor()); - auto D = hlir::pe::ReduceSum(C, {0}); - auto stages = CreateStages({C, D}); - stages[C]->SetBuffer("local"); - stages[C]->Reorder({1, 0}); - stages[D]->Bind(0, "threadIdx.x"); - stages[C]->SimpleComputeAt(stages[D], 1); - - auto func = Lower("fn", stages, {A, B, D}); - LOG(INFO) << "func:\n" << func; - -#ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -#endif -} - -#ifdef CINN_WITH_CUDA -void CudaReduceReorder(poly::StageMap stages, ir::Tensor input, const std::vector &axes) { - auto &shape = input->shape; - std::vector order; - for (int idx = 0; idx < shape.size(); ++idx) { - if (std::find(axes.begin(), axes.end(), idx) == axes.end()) { - order.push_back(idx); - } - } - for (auto axis : axes) { - order.push_back(axis); - } - stages[input]->Reorder(order); - - int last_dimension_num = shape.size() - axes.back() - 1; - int index = shape.size() - last_dimension_num - axes.size(); - for (auto idx = index; idx < index + last_dimension_num - 1; ++idx) { - stages[input]->Fuse(index, index + 1); - } - - if (stages[input]->GetDimRange(index) > 1024) { - stages[input]->Split(index, 1024); - } - - for (int idx = 0; idx < index - 1; ++idx) { - stages[input]->Fuse(0, 1); - } -} - -TEST(Reduce, Reduce_Test_1) { - int m = 128; - int n = 128; - Expr 
M(m), N(n); - - Placeholder A("A", {M, M, M, N, N}); - Placeholder B("B", {M, M, M, N, N}); - - auto C = hlir::pe::Add(A.tensor(), B.tensor()); - auto D = hlir::pe::ReduceSum(C, {0, 2}); - auto stages = CreateStages({C, D}); - hlir::pe::CudaReduceSchedule(stages, D, 2, common::DefaultNVGPUTarget()); - CudaReduceReorder(stages, C, {0, 2}); - stages[C]->SetBuffer("local"); - stages[C]->SimpleComputeAt(stages[D], stages[D]->n_out_dims() - 1); - // stages[C]->ComputeInline(); - - auto func = Lower("fn", stages, {A, B, D}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -} - -TEST(Reduce, Reduce_Test_2) { - int m = 10201; - int n = 50; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - - auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false); - CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3"; - auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]}); - - CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget()); - - auto func = Lower("fn", stages, {A, reduce_out[0]}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -} - -TEST(Reduce, Reduce_Test_2_1) { - int m = 10240; - int n = 64; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - - auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false); - CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3"; - auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]}); - - CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget()); - - auto func = Lower("fn", stages, {A, reduce_out[0]}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - 
backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_2_2) {
-  int m = 10240;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M, N});
-
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_2_3) {
-  int m = 10240;
-  int n = 16;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N, N});
-
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3) {
-  int m = 10201;
-  Expr M(m);
-
-  Placeholder<float> A("A", {M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3_1) {
-  int m = 10240;
-  Expr M(m);
-
-  Placeholder<float> A("A", {M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3_2) {
-  int m = 10240;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_4) {
-  int m = 10201;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_5) {
-  int m = 32;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 2) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[1], reduce_out[0]});
-
-  CudaBlockReduceInternalSchedule(stages, reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_6) {
-  int m = 32;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, N, M, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[1], reduce_out[0]});
-
-  CudaBlockReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_7) {
-  int m = 10201;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_8) {
-  int m = 128;
-  int n = 112;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_9) {
-  int m = 128;
-  int n = 56;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_10) {
-  int m = 128;
-  int n = 128;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N});
-  Placeholder<float> B("B", {M, N});
-
-  auto c = hlir::pe::Add(A.tensor(), B.tensor());
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(c, {0}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, B, c, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  stages[c]->Split(0, 8);
-  stages[c]->Fuse(1, 2);
-  stages[c]->Reorder({1, 0});
-  stages[c]->SetBuffer("local");
-  stages[c]->SimpleComputeAt(stages[reduce_out[1]], 1);
-  stages[reduce_out[2]]->ComputeInline();
-  stages[reduce_out[1]]->Bind(0, "threadIdx.x");
-  stages[reduce_out[1]]->SetBuffer("shared");
-  stages[reduce_out[0]]->Bind(0, "threadIdx.x");
-
-  auto func = Lower("fn", stages, {A, B, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_11) {
-  int m = 10;
-  int n = 10;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_12) {
-  int m = 10;
-  int n = 10;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_13) {
-  int m = 16;
-  int n = 16;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
 #endif
 
 }  // namespace pe

From aa248521fbe62bb71e588509e14860817becdfb7 Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Tue, 16 May 2023 04:55:42 +0000
Subject: [PATCH 11/11] fix pe transform test

---
 cinn/hlir/pe/pe_transform_test.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cinn/hlir/pe/pe_transform_test.cc b/cinn/hlir/pe/pe_transform_test.cc
index ce65e95f95..f5a76014e8 100644
--- a/cinn/hlir/pe/pe_transform_test.cc
+++ b/cinn/hlir/pe/pe_transform_test.cc
@@ -224,8 +224,6 @@ TEST(Concat, ConcatCase0) {
 #endif
 }
 
-#endif
-
 }  // namespace pe
 }  // namespace hlir
 }  // namespace cinn