From beaa4bf781cd7585253997b54f325fdd63becd03 Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Thu, 11 May 2023 12:50:38 +0000
Subject: [PATCH 01/11] using nvcc compiler

---
 cinn/backends/nvrtc/nvcc_util.cc | 99 ++++++++++++++++++++++++++++++++
 cinn/backends/nvrtc/nvcc_util.h  | 38 ++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 cinn/backends/nvrtc/nvcc_util.cc
 create mode 100644 cinn/backends/nvrtc/nvcc_util.h

diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc
new file mode 100644
index 0000000000..b35f02c926
--- /dev/null
+++ b/cinn/backends/nvrtc/nvcc_util.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/backends/nvrtc/nvcc_util.h"
+
+#include
+
+#include
+#include
+#include
+
+#include "cinn/common/common.h"
+
+#ifdef CINN_WITH_CUDA
+
+namespace cinn {
+namespace backends {
+namespace nvrtc {
+
+std::string NvccCompiler::operator()(const std::string& cuda_c) {
+  // read dir source
+  std::string dir = "./source";
+  if (_access(dir.c_str(), 0) == -1) {
+    CHECK(mkdir(dir.c_str()) != -1) << "Fail to mkdir " << dir;
+  }
+
+  // get unqiue prefix name
+  auto prefix_name = dir + common::UniqName("cuda_c");
+
+  auto cuda_c_file = prefix_name + ".cu";
+  std::ofstream ofs(cuda_c_file, std::ios::out);
+  CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file;
+  ofs << cuda_c;
+  ofs.close();
+
+  CompileToPtx(prefix_name);
+  CompileToCubin(prefix_name);
+
+  // open cubin file
+  auto cuda_cubin_file = prefix_name + ".cubin";
+  std::ifstream ifs(cuda_cubin_file, std::ios::in | std::ios::binary);
+  CHECK(ifs.is_open()) << "Fail to open file " << cuda_cubin_file;
+  ifs.seekg(std::ios::end);
+  auto len = ifs.tellg();
+  ifs.seekg(0);
+
+  // read cubin file
+  std::string cubin(len,'');
+  ifs.read(&ubin.data(), len);
+  ifs.close();
+  return cubin;
+}
+
+void NvccCompiler::CompileToPtx(const std::string& prefix_name) {
+  auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3";
+  options += " -arch=" + GetDeviceArch;
+  options += " -o " + prefix_name + ".ptx";
+  options += " " + prefix_name + ".cu";
+
+  CHECK(system(options) == 0) << options;
+}
+
+void NvccCompiler::CompileToCubin(const std::string& prefix_name) {
+  auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3";
+  options += " -arch=" + GetDeviceArch;
+  options += " -o " + prefix_name + ".cubin";
+  options += " " + prefix_name + ".ptx";
+
+  CHECK(system(options) == 0) << options;
+}
+
+std::string NvccCompiler::GetDeviceArch() {
+  int major = 0, minor = 0;
+  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess &&
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) {
+    return "sm" + std::to_string(major) + std::to_string(minor);
+  } else {
+    LOG(WARNING) << "cannot detect compute capability from your device, "
+                 << "fall back to compute_30.";
+    return "sm_30"
+  }
+}
+
+}  // namespace nvrtc
+}  // namespace backends
+}  // namespace cinn
+
+#endif  // CINN_WITH_CUDA
diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h
new file mode 100644
index 0000000000..d3049b48a3
--- /dev/null
+++ b/cinn/backends/nvrtc/nvcc_util.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#ifdef CINN_WITH_CUDA
+
+namespace cinn {
+namespace backends {
+namespace nvrtc {
+
+class NvccCompiler {
+ public:
+  NvccCompiler(){};
+  ~NvccCompiler(){};
+  std::string operator()(const std::string&);
+
+ private:
+  void CompileToPtx(const std::string&);
+  void CompileToCubin(const std::string&);
+  std::string GetDeviceArch();
+};
+
+}  // namespace nvrtc
+}  // namespace backends
+}  // namespace cinn
+
+#endif  // CINN_WITH_CUDA

From a4bc137f57ac7d18b1c82e128b1001849090130d Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Thu, 11 May 2023 13:10:22 +0000
Subject: [PATCH 02/11] using nvcc compiler

---
 cinn/backends/compiler.cc                |  7 ++--
 cinn/backends/nvrtc/CMakeLists.txt       |  1 +
 cinn/backends/nvrtc/nvcc_util.cc         | 44 ++++++++++++++----------
 cinn/backends/nvrtc/nvcc_util.h          |  3 ++
 cinn/hlir/framework/parallel_compiler.cc | 10 +++---
 5 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc
index 880441ff63..1c23cab3cd 100644
--- a/cinn/backends/compiler.cc
+++ b/cinn/backends/compiler.cc
@@ -22,6 +22,7 @@
 #include "cinn/backends/codegen_cuda_dev.h"
 #include "cinn/backends/codegen_cuda_host.h"
 #include "cinn/backends/codegen_cuda_util.h"
+#include "cinn/backends/nvrtc/nvcc_util.h"
 #include "cinn/backends/nvrtc/nvrtc_util.h"
 #include "cinn/runtime/cuda/cuda_module.h"
 #include "cinn/runtime/cuda/cuda_util.h"
@@ -123,13 +124,15 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code)
   SourceCodePrint::GetInstance()->write(source_code);
 
   using runtime::cuda::CUDAModule;
+  /*
   backends::nvrtc::Compiler compiler;
 
   auto ptx = compiler(source_code);
   CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code;
+  */
+  backends::nvrtc::NvccCompiler compiler;
 
-  cuda_module_.reset(
-      new CUDAModule(ptx, compiler.compile_to_cubin() ?
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); RuntimeSymbols symbols; diff --git a/cinn/backends/nvrtc/CMakeLists.txt b/cinn/backends/nvrtc/CMakeLists.txt index a344b65ca9..b67a2b6413 100644 --- a/cinn/backends/nvrtc/CMakeLists.txt +++ b/cinn/backends/nvrtc/CMakeLists.txt @@ -3,6 +3,7 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS header_generator.cc nvrtc_util.cc + nvcc_util.cc ) nv_test(test_nvrtc_util SRCS nvrtc_util_test.cc DEPS cinncore) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index b35f02c926..993c79dab2 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -14,16 +14,17 @@ #include "cinn/backends/nvrtc/nvcc_util.h" +#include "cinn/common/common.h" + +#ifdef CINN_WITH_CUDA + +#include #include #include #include #include -#include "cinn/common/common.h" - -#ifdef CINN_WITH_CUDA - namespace cinn { namespace backends { namespace nvrtc { @@ -36,7 +37,7 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { } // get unqiue prefix name - auto prefix_name = dir + common::UniqName("cuda_c"); + auto prefix_name = dir + "/" + common::UniqName("rtc_tmp"); auto cuda_c_file = prefix_name + ".cu"; std::ofstream ofs(cuda_c_file, std::ios::out); @@ -47,21 +48,11 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { CompileToPtx(prefix_name); CompileToCubin(prefix_name); - // open cubin file - auto cuda_cubin_file = prefix_name + ".cubin"; - std::ifstream ifs(cuda_cubin_file, std::ios::in | std::ios::binary); - CHECK(ifs.is_open()) << "Fail to open file " << cuda_cubin_file; - ifs.seekg(std::ios::end); - auto len = ifs.tellg(); - ifs.seekg(0); - - // read cubin file - std::string cubin(len,''); - ifs.read(&ubin.data(), len); - ifs.close(); - return cubin; + return ReadFile(prefix_name + ".cubin", std::ios::in | std::ios::binary); } +std::string GetPtx() { return ReadFile(prefix_name + ".ptx", std::ios::in); } + void NvccCompiler::CompileToPtx(const std::string& prefix_name) { auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; options += " -arch=" + GetDeviceArch; @@ -84,7 +75,7 @@ std::string NvccCompiler::GetDeviceArch() { int major = 0, minor = 0; if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { - return "sm" + std::to_string(major) + std::to_string(minor); + return "sm_" + std::to_string(major) + std::to_string(minor); } else { LOG(WARNING) << "cannot detect compute capability from your device, " << "fall back to compute_30."; @@ -92,6 +83,21 @@ std::string NvccCompiler::GetDeviceArch() { } } +std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { + // open cubin file + std::ifstream ifs(file_name, mode); + CHECK(ifs.is_open()) << "Fail to open file " << file_name; + ifs.seekg(std::ios::end); + auto len = ifs.tellg(); + ifs.seekg(0); + + // read cubin file + std::string file_data(len,''); + ifs.read(&file_data.data(), len); + ifs.close(); + return file_data; +} + } // namespace nvrtc } // namespace backends } // namespace cinn diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h index d3049b48a3..62b2b00919 100644 --- a/cinn/backends/nvrtc/nvcc_util.h +++ b/cinn/backends/nvrtc/nvcc_util.h @@ -24,11 +24,14 @@ class NvccCompiler { NvccCompiler(){}; ~NvccCompiler(){}; std::string 
operator()(const std::string&); + std::string GetPtx(); private: void CompileToPtx(const std::string&); void CompileToCubin(const std::string&); std::string GetDeviceArch(); + + std::string ReadFile(const std::string&, std::ios_base::openmode); }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index aa22dfce65..56cb449e9a 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -175,13 +175,11 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - backends::nvrtc::Compiler compiler; - auto ptx = compiler(cuda_c); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - graph->SavePTXCode(ptx); + backends::nvrtc::NvccCompiler compiler; + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); - // load cumodule - cumodule.reset(new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + // save ptx + graph->SavePTXCode(compiler.GetPtx()); // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { From ff9dc17e21db5a441a1be3acfea0ed079df4bf12 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Thu, 11 May 2023 13:47:35 +0000 Subject: [PATCH 03/11] compile pass --- cinn/backends/compiler.cc | 2 +- cinn/backends/nvrtc/nvcc_util.cc | 55 ++++++++++++------------ cinn/backends/nvrtc/nvcc_util.h | 9 +++- cinn/hlir/framework/parallel_compiler.cc | 3 +- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 1c23cab3cd..50feb6fd4a 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -130,7 +130,7 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) auto ptx = compiler(source_code); CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; */ - backends::nvrtc::NvccCompiler compiler; + nvrtc::NvccCompiler compiler; cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index 993c79dab2..25fd1f138b 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -19,11 +19,12 @@ #ifdef CINN_WITH_CUDA #include -#include +#include +#include +#include -#include +#include #include -#include namespace cinn { namespace backends { @@ -32,43 +33,43 @@ namespace nvrtc { std::string NvccCompiler::operator()(const std::string& cuda_c) { // read dir source std::string dir = "./source"; - if (_access(dir.c_str(), 0) == -1) { - CHECK(mkdir(dir.c_str()) != -1) << "Fail to mkdir " << dir; + if (access(dir.c_str(), 0) == -1) { + CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; } // get unqiue prefix name - auto prefix_name = dir + "/" + common::UniqName("rtc_tmp"); + prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); - auto cuda_c_file = prefix_name + ".cu"; + auto cuda_c_file = prefix_name_ + ".cu"; std::ofstream ofs(cuda_c_file, std::ios::out); CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; ofs << cuda_c; ofs.close(); - CompileToPtx(prefix_name); - CompileToCubin(prefix_name); + CompileToPtx(); + CompileToCubin(); - return ReadFile(prefix_name + ".cubin", std::ios::in | std::ios::binary); + return ReadFile(prefix_name_ + ".cubin", std::ios::in | std::ios::binary); } -std::string GetPtx() { return 
ReadFile(prefix_name + ".ptx", std::ios::in); } +std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } -void NvccCompiler::CompileToPtx(const std::string& prefix_name) { - auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; - options += " -arch=" + GetDeviceArch; - options += " -o " + prefix_name + ".ptx"; - options += " " + prefix_name + ".cu"; +void NvccCompiler::CompileToPtx() { + std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".ptx"; + options += " " + prefix_name_ + ".cu"; - CHECK(system(options) == 0) << options; + CHECK(system(options.c_str()) == 0) << options; } -void NvccCompiler::CompileToCubin(const std::string& prefix_name) { - auto options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; - options += " -arch=" + GetDeviceArch; - options += " -o " + prefix_name + ".cubin"; - options += " " + prefix_name + ".ptx"; +void NvccCompiler::CompileToCubin() { + std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".cubin"; + options += " " + prefix_name_ + ".ptx"; - CHECK(system(options) == 0) << options; + CHECK(system(options.c_str()) == 0) << options; } std::string NvccCompiler::GetDeviceArch() { @@ -79,7 +80,7 @@ std::string NvccCompiler::GetDeviceArch() { } else { LOG(WARNING) << "cannot detect compute capability from your device, " << "fall back to compute_30."; - return "sm_30" + return "sm_30"; } } @@ -92,10 +93,10 @@ std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base:: ifs.seekg(0); // read cubin file - std::string file_data(len,''); - ifs.read(&file_data.data(), len); + std::string file_data(len, ' '); + ifs.read(&file_data[0], len); ifs.close(); - return file_data; + return std::move(file_data); } } // namespace nvrtc diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h index 62b2b00919..c3db6d151a 100644 --- a/cinn/backends/nvrtc/nvcc_util.h +++ b/cinn/backends/nvrtc/nvcc_util.h @@ -13,6 +13,9 @@ // limitations under the License. 
#pragma once +#include +#include + #ifdef CINN_WITH_CUDA namespace cinn { @@ -27,11 +30,13 @@ class NvccCompiler { std::string GetPtx(); private: - void CompileToPtx(const std::string&); - void CompileToCubin(const std::string&); + void CompileToPtx(); + void CompileToCubin(); std::string GetDeviceArch(); std::string ReadFile(const std::string&, std::ios_base::openmode); + + std::string prefix_name_{""}; }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index 56cb449e9a..c4c221d439 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -24,6 +24,7 @@ #include "cinn/backends/compiler.h" #include "cinn/backends/llvm/codegen_x86.h" #include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" @@ -176,7 +177,7 @@ void ParallelCompiler::Task::CodegenAndJit() { using runtime::cuda::CUDAModule; backends::nvrtc::NvccCompiler compiler; - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); // save ptx graph->SavePTXCode(compiler.GetPtx()); From 1d20fc382858e4cda96f113564a87c60912dbf07 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 03:30:35 +0000 Subject: [PATCH 04/11] add include dir in nvcc option --- cinn/backends/nvrtc/nvcc_util.cc | 15 +++++++++++++-- cinn/runtime/cuda/cuda_module.cc | 7 ++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index 25fd1f138b..a968f76d1b 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -49,13 +49,24 @@ std::string NvccCompiler::operator()(const std::string& cuda_c) { CompileToPtx(); CompileToCubin(); - return ReadFile(prefix_name_ + ".cubin", std::ios::in | std::ios::binary); + return prefix_name_ + ".cubin"; } std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } void NvccCompiler::CompileToPtx() { - std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --ptx -O3"; + auto include_dir = common::Context::Global().runtime_include_dir(); + std::string include_dir_str = ""; + for (auto dir : include_dir) { + if (include_dir_str.empty()) { + include_dir_str = dir; + } else { + include_dir_str += ":" + dir; + } + } + + std::string options = + std::string("export PATH=/usr/local/cuda/bin:$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".ptx"; options += " " + prefix_name_ + ".cu"; diff --git a/cinn/runtime/cuda/cuda_module.cc b/cinn/runtime/cuda/cuda_module.cc index 0ec5aa0bfe..2c80998897 100644 --- a/cinn/runtime/cuda/cuda_module.cc +++ b/cinn/runtime/cuda/cuda_module.cc @@ -103,8 +103,9 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) jit_options[4] = CU_JIT_GENERATE_LINE_INFO; jit_opt_vals[4] = reinterpret_cast(value); - CUresult status = cuModuleLoadDataEx( - &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); + // CUresult status = cuModuleLoadDataEx( + // &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); + CUresult status = 
cuModuleLoad(&module_per_card_[device_id], data_.c_str()); if (CUDA_SUCCESS != status) { RAW_LOG(ERROR, "PTX JIT ERROR LOG: %s\n.", log_buffer.data()); @@ -124,7 +125,7 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) CUdeviceptr CUDAModule::GetGlobal(int device_id, const std::string& name, size_t nbytes) { if (!module_per_card_[device_id]) { std::lock_guard lock(mutex_); - CUDA_DRIVER_CALL(cuModuleLoadData(&module_per_card_[device_id], data_.c_str())); + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); } CUdeviceptr global; From 6f490743b6c7b0fa9633a6d610221e3d5e0bc01f Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:09:40 +0000 Subject: [PATCH 05/11] check using nvcc --- cinn/backends/compiler.cc | 20 ++++++++++--------- cinn/hlir/framework/parallel_compiler.cc | 19 ++++++++++++++---- cinn/runtime/cuda/cuda_module.cc | 25 ++++++++++++------------ cinn/runtime/flags.cc | 8 ++++++++ cinn/runtime/flags.h | 2 ++ 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 50feb6fd4a..08be15db33 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -26,6 +26,7 @@ #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/runtime/cuda/cuda_module.h" #include "cinn/runtime/cuda/cuda_util.h" +#include "cinn/runtime/flags.h" #endif DECLARE_string(cinn_source_code_save_path); @@ -124,15 +125,16 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) SourceCodePrint::GetInstance()->write(source_code); using runtime::cuda::CUDAModule; - /* - backends::nvrtc::Compiler compiler; - - auto ptx = compiler(source_code); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; - */ - nvrtc::NvccCompiler compiler; - - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + if (runtime::CanUseNvccCompiler()) { + nvrtc::NvccCompiler compiler; + cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); + } else { + nvrtc::Compiler compiler; + auto ptx = compiler(source_code); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; + cuda_module_.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? 
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + } RuntimeSymbols symbols; diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index c4c221d439..cc3ac7e140 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -29,6 +29,7 @@ #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" #include "cinn/ir/module.h" +#include "cinn/runtime/flags.h" DECLARE_int32(cinn_parallel_compile_size); DECLARE_int32(cinn_parallel_compile_thread); @@ -176,11 +177,21 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - backends::nvrtc::NvccCompiler compiler; - cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); + if (runtime::CanUseNvccCompiler()) { + backends::nvrtc::NvccCompiler compiler; + // load cumodule + cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); + graph->SavePTXCode(compiler.GetPtx()); + } else { + backends::nvrtc::Compiler compiler; + auto ptx = compiler(cuda_c); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; + // load cumodule + cumodule.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + graph->SavePTXCode(ptx); + } - // save ptx - graph->SavePTXCode(compiler.GetPtx()); // register kernel backends::RuntimeSymbols symbols; for (auto& fn : dmodule.functions()) { diff --git a/cinn/runtime/cuda/cuda_module.cc b/cinn/runtime/cuda/cuda_module.cc index 2c80998897..5a90d0cccb 100644 --- a/cinn/runtime/cuda/cuda_module.cc +++ b/cinn/runtime/cuda/cuda_module.cc @@ -25,6 +25,7 @@ #include "cinn/backends/cuda_util.h" #include "cinn/runtime/cuda/cuda_util.h" +#include "cinn/runtime/flags.h" namespace cinn { namespace runtime { @@ -103,17 +104,11 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) jit_options[4] = CU_JIT_GENERATE_LINE_INFO; jit_opt_vals[4] = reinterpret_cast(value); - // CUresult status = cuModuleLoadDataEx( - // &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data()); - CUresult status = cuModuleLoad(&module_per_card_[device_id], data_.c_str()); - - if (CUDA_SUCCESS != status) { - RAW_LOG(ERROR, "PTX JIT ERROR LOG: %s\n.", log_buffer.data()); - const char* name; - cuGetErrorName(status, &name); - const char* msg; - cuGetErrorString(status, &msg); - RAW_LOG(FATAL, "The error `%s` occurs while compiling the ptx! 
And its message is `%s`.", name, msg); + if (runtime::CanUseNvccCompiler()) { + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + } else { + CUDA_DRIVER_CALL(cuModuleLoadDataEx( + &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data())); } } @@ -125,11 +120,15 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) CUdeviceptr CUDAModule::GetGlobal(int device_id, const std::string& name, size_t nbytes) { if (!module_per_card_[device_id]) { std::lock_guard lock(mutex_); - CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + if (runtime::CanUseNvccCompiler()) { + CUDA_DRIVER_CALL(cuModuleLoad(&module_per_card_[device_id], data_.c_str())); + } else { + CUDA_DRIVER_CALL(cuModuleLoadData(&module_per_card_[device_id], data_.c_str())); + } } - CUdeviceptr global; size_t _nbytes; + CUdeviceptr global; CUDA_DRIVER_CALL(cuModuleGetGlobal(&global, &_nbytes, module_per_card_[device_id], name.c_str())); return global; } diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index a567a6a59e..af2433121c 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -16,6 +16,9 @@ #include #include +#include +#include +#include #include @@ -173,5 +176,10 @@ unsigned long long RandomSeed::Clear() { return old_seed; } +bool CanUseNvccCompiler() { + std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; + return access(nvcc_dir.c_str(), 0) == -1 ? false : true; +} + } // namespace runtime } // namespace cinn diff --git a/cinn/runtime/flags.h b/cinn/runtime/flags.h index 59ebdba4e1..4491097c42 100644 --- a/cinn/runtime/flags.h +++ b/cinn/runtime/flags.h @@ -25,6 +25,8 @@ bool CheckStringFlagFalse(const std::string &flag); void SetCinnCudnnDeterministic(bool state); bool GetCinnCudnnDeterministic(); +bool CanUseNvccCompiler(); + class RandomSeed { public: static unsigned long long GetOrSet(unsigned long long seed = 0); From e5aaf6e416fe1ae2996805d9e4951151b2fb2cbf Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:31:37 +0000 Subject: [PATCH 06/11] flag compile with nvrtc --- cinn/backends/compiler.cc | 1 - cinn/runtime/flags.cc | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index 08be15db33..d9d63cda39 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -137,7 +137,6 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) } RuntimeSymbols symbols; - for (auto& fn : device_module.functions()) { std::string kernel_fn_name = fn->name; auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name); diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index af2433121c..0a66bc991f 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -83,9 +83,13 @@ DEFINE_bool(cinn_use_dense_merge_pass, "Whether use dense merge pass."); DEFINE_bool(nvrtc_compile_to_cubin, - BoolFromEnv("FLAGS_nvrtc_compile_to_cubin", false), + BoolFromEnv("FLAGS_nvrtc_compile_to_cubin", true), "Whether nvrtc compile cuda source into cubin instead of ptx (only works after cuda-11.1)."); +DEFINE_bool(cinn_compile_with_nvrtc, + BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", false), + "Whether nvrtc compile cuda source with nvrtc(default nvcc)."); + // FLAGS for performance analysis and accuracy debug DEFINE_bool(cinn_sync_run, BoolFromEnv("FLAGS_cinn_sync_run", false), @@ -178,7 +182,7 @@ unsigned long long RandomSeed::Clear() { bool 
CanUseNvccCompiler() { std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; - return access(nvcc_dir.c_str(), 0) == -1 ? false : true; + return (access(nvcc_dir.c_str(), 0) == -1 ? false : true) && (!FLAGS_cinn_compile_with_nvrtc); } } // namespace runtime From c851b685319ef333c12056b09d63afff21272624 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Fri, 12 May 2023 07:53:09 +0000 Subject: [PATCH 07/11] add nvcc cmd path --- cinn/backends/nvrtc/nvcc_util.cc | 11 ++++++++--- cinn/runtime/flags.cc | 5 ++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc index a968f76d1b..4264444656 100644 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ b/cinn/backends/nvrtc/nvcc_util.cc @@ -26,6 +26,8 @@ #include #include +DECLARE_string(cinn_nvcc_cmd_path); + namespace cinn { namespace backends { namespace nvrtc { @@ -65,21 +67,24 @@ void NvccCompiler::CompileToPtx() { } } - std::string options = - std::string("export PATH=/usr/local/cuda/bin:$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; + std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + + std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".ptx"; options += " " + prefix_name_ + ".cu"; + VLOG(2) << "Nvcc Compile Options : " << options; CHECK(system(options.c_str()) == 0) << options; } void NvccCompiler::CompileToCubin() { - std::string options = "export PATH=/usr/local/cuda/bin:$PATH && nvcc --cubin -O3"; + std::string options = + std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); options += " -arch=" + GetDeviceArch(); options += " -o " + prefix_name_ + ".cubin"; options += " " + prefix_name_ + ".ptx"; + VLOG(2) << "Nvcc Compile Options : " << options; CHECK(system(options.c_str()) == 0) << options; } diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index 0a66bc991f..7238c9e59f 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -36,6 +36,9 @@ using ::GFLAGS_NAMESPACE::Int64FromEnv; using ::GFLAGS_NAMESPACE::StringFromEnv; DEFINE_string(cinn_x86_builtin_code_root, StringFromEnv("FLAGS_cinn_x86_builtin_code_root", ""), ""); +DEFINE_string(cinn_nvcc_cmd_path, + StringFromEnv("FLAGS_cinn_nvcc_cmd_path", "/usr/local/cuda/bin"), + "Setting nvcc default path!"); DEFINE_int32(cinn_parallel_compile_size, Int32FromEnv("FLAGS_cinn_parallel_compile_size", 16), @@ -181,7 +184,7 @@ unsigned long long RandomSeed::Clear() { } bool CanUseNvccCompiler() { - std::string nvcc_dir = "/usr/local/cuda/bin/nvcc"; + std::string nvcc_dir = FLAGS_cinn_nvcc_cmd_path + "/nvcc"; return (access(nvcc_dir.c_str(), 0) == -1 ? 
false : true) && (!FLAGS_cinn_compile_with_nvrtc); } From a0c35dea8550938b583bbc99045cab6fe845cc23 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Mon, 15 May 2023 13:26:35 +0000 Subject: [PATCH 08/11] using nvcc in nvrtc --- cinn/backends/compiler.cc | 16 +-- cinn/backends/nvrtc/CMakeLists.txt | 1 - cinn/backends/nvrtc/nvcc_util.cc | 122 ----------------------- cinn/backends/nvrtc/nvcc_util.h | 46 --------- cinn/backends/nvrtc/nvrtc_util.cc | 94 +++++++++++++++++ cinn/backends/nvrtc/nvrtc_util.h | 13 +++ cinn/hlir/framework/parallel_compiler.cc | 20 +--- 7 files changed, 117 insertions(+), 195 deletions(-) delete mode 100644 cinn/backends/nvrtc/nvcc_util.cc delete mode 100644 cinn/backends/nvrtc/nvcc_util.h diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc index d9d63cda39..798b0a96a2 100644 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -22,7 +22,6 @@ #include "cinn/backends/codegen_cuda_dev.h" #include "cinn/backends/codegen_cuda_host.h" #include "cinn/backends/codegen_cuda_util.h" -#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/runtime/cuda/cuda_module.h" #include "cinn/runtime/cuda/cuda_util.h" @@ -125,16 +124,11 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code) SourceCodePrint::GetInstance()->write(source_code); using runtime::cuda::CUDAModule; - if (runtime::CanUseNvccCompiler()) { - nvrtc::NvccCompiler compiler; - cuda_module_.reset(new CUDAModule(compiler(source_code), CUDAModule::Kind::CUBIN)); - } else { - nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; - cuda_module_.reset( - new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); - } + nvrtc::Compiler compiler; + auto ptx = compiler(source_code); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; + cuda_module_.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); RuntimeSymbols symbols; for (auto& fn : device_module.functions()) { diff --git a/cinn/backends/nvrtc/CMakeLists.txt b/cinn/backends/nvrtc/CMakeLists.txt index b67a2b6413..a344b65ca9 100644 --- a/cinn/backends/nvrtc/CMakeLists.txt +++ b/cinn/backends/nvrtc/CMakeLists.txt @@ -3,7 +3,6 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS header_generator.cc nvrtc_util.cc - nvcc_util.cc ) nv_test(test_nvrtc_util SRCS nvrtc_util_test.cc DEPS cinncore) diff --git a/cinn/backends/nvrtc/nvcc_util.cc b/cinn/backends/nvrtc/nvcc_util.cc deleted file mode 100644 index 4264444656..0000000000 --- a/cinn/backends/nvrtc/nvcc_util.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cinn/backends/nvrtc/nvcc_util.h" - -#include "cinn/common/common.h" - -#ifdef CINN_WITH_CUDA - -#include -#include -#include -#include - -#include -#include - -DECLARE_string(cinn_nvcc_cmd_path); - -namespace cinn { -namespace backends { -namespace nvrtc { - -std::string NvccCompiler::operator()(const std::string& cuda_c) { - // read dir source - std::string dir = "./source"; - if (access(dir.c_str(), 0) == -1) { - CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; - } - - // get unqiue prefix name - prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); - - auto cuda_c_file = prefix_name_ + ".cu"; - std::ofstream ofs(cuda_c_file, std::ios::out); - CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; - ofs << cuda_c; - ofs.close(); - - CompileToPtx(); - CompileToCubin(); - - return prefix_name_ + ".cubin"; -} - -std::string NvccCompiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } - -void NvccCompiler::CompileToPtx() { - auto include_dir = common::Context::Global().runtime_include_dir(); - std::string include_dir_str = ""; - for (auto dir : include_dir) { - if (include_dir_str.empty()) { - include_dir_str = dir; - } else { - include_dir_str += ":" + dir; - } - } - - std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + - std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; - options += " -arch=" + GetDeviceArch(); - options += " -o " + prefix_name_ + ".ptx"; - options += " " + prefix_name_ + ".cu"; - - VLOG(2) << "Nvcc Compile Options : " << options; - CHECK(system(options.c_str()) == 0) << options; -} - -void NvccCompiler::CompileToCubin() { - std::string options = - std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); - options += " -arch=" + GetDeviceArch(); - options += " -o " + prefix_name_ + ".cubin"; - options += " " + prefix_name_ + ".ptx"; - - VLOG(2) << "Nvcc Compile Options : " << options; - CHECK(system(options.c_str()) == 0) << options; -} - -std::string NvccCompiler::GetDeviceArch() { - int major = 0, minor = 0; - if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { - return "sm_" + std::to_string(major) + std::to_string(minor); - } else { - LOG(WARNING) << "cannot detect compute capability from your device, " - << "fall back to compute_30."; - return "sm_30"; - } -} - -std::string NvccCompiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { - // open cubin file - std::ifstream ifs(file_name, mode); - CHECK(ifs.is_open()) << "Fail to open file " << file_name; - ifs.seekg(std::ios::end); - auto len = ifs.tellg(); - ifs.seekg(0); - - // read cubin file - std::string file_data(len, ' '); - ifs.read(&file_data[0], len); - ifs.close(); - return std::move(file_data); -} - -} // namespace nvrtc -} // namespace backends -} // namespace cinn - -#endif // CINN_WITH_CUDA diff --git a/cinn/backends/nvrtc/nvcc_util.h b/cinn/backends/nvrtc/nvcc_util.h deleted file mode 100644 index c3db6d151a..0000000000 --- a/cinn/backends/nvrtc/nvcc_util.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include - -#ifdef CINN_WITH_CUDA - -namespace cinn { -namespace backends { -namespace nvrtc { - -class NvccCompiler { - public: - NvccCompiler(){}; - ~NvccCompiler(){}; - std::string operator()(const std::string&); - std::string GetPtx(); - - private: - void CompileToPtx(); - void CompileToCubin(); - std::string GetDeviceArch(); - - std::string ReadFile(const std::string&, std::ios_base::openmode); - - std::string prefix_name_{""}; -}; - -} // namespace nvrtc -} // namespace backends -} // namespace cinn - -#endif // CINN_WITH_CUDA diff --git a/cinn/backends/nvrtc/nvrtc_util.cc b/cinn/backends/nvrtc/nvrtc_util.cc index 101406984e..4598054701 100644 --- a/cinn/backends/nvrtc/nvrtc_util.cc +++ b/cinn/backends/nvrtc/nvrtc_util.cc @@ -17,12 +17,20 @@ #include #include #include +#include +#include +#include + +#include +#include #include "cinn/backends/cuda_util.h" #include "cinn/backends/nvrtc/header_generator.h" #include "cinn/common/common.h" +#include "cinn/runtime/flags.h" #include "cinn/utils/string.h" +DECLARE_string(cinn_nvcc_cmd_path); DECLARE_bool(nvrtc_compile_to_cubin); namespace cinn { @@ -30,6 +38,9 @@ namespace backends { namespace nvrtc { std::string Compiler::operator()(const std::string& code, bool include_headers) { + if (runtime::CanUseNvccCompiler()) { + return CompileWithNvcc(code); + } return CompileCudaSource(code, include_headers); } @@ -140,6 +151,89 @@ std::string Compiler::CompileCudaSource(const std::string& code, bool include_he return data; } +std::string Compiler::CompileWithNvcc(const std::string& cuda_c) { + // read dir source + std::string dir = "./source"; + if (access(dir.c_str(), 0) == -1) { + CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir; + } + + // get unqiue prefix name + prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); + + auto cuda_c_file = prefix_name_ + ".cu"; + std::ofstream ofs(cuda_c_file, std::ios::out); + CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file; + ofs << cuda_c; + ofs.close(); + + CompileToPtx(); + CompileToCubin(); + + return prefix_name_ + ".cubin"; +} + +// std::string Compiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); } + +void Compiler::CompileToPtx() { + auto include_dir = common::Context::Global().runtime_include_dir(); + std::string include_dir_str = ""; + for (auto dir : include_dir) { + if (include_dir_str.empty()) { + include_dir_str = dir; + } else { + include_dir_str += ":" + dir; + } + } + + std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + + std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str; + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".ptx"; + options += " " + prefix_name_ + ".cu"; + + VLOG(2) << "Nvcc Compile Options : " << options; + CHECK(system(options.c_str()) == 0) << options; +} + +void Compiler::CompileToCubin() { + std::string options = + std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3"); + options += " -arch=" + GetDeviceArch(); + options += " -o " + prefix_name_ + ".cubin"; + 
options += " " + prefix_name_ + ".ptx"; + + VLOG(2) << "Nvcc Compile Options : " << options; + CHECK(system(options.c_str()) == 0) << options; +} + +std::string Compiler::GetDeviceArch() { + int major = 0, minor = 0; + if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess && + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) { + return "sm_" + std::to_string(major) + std::to_string(minor); + } else { + LOG(WARNING) << "cannot detect compute capability from your device, " + << "fall back to compute_30."; + return "sm_30"; + } +} + +std::string Compiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) { + // open cubin file + std::ifstream ifs(file_name, mode); + CHECK(ifs.is_open()) << "Fail to open file " << file_name; + ifs.seekg(std::ios::end); + auto len = ifs.tellg(); + ifs.seekg(0); + + // read cubin file + std::string file_data(len, ' '); + ifs.read(&file_data[0], len); + ifs.close(); + return std::move(file_data); +} + } // namespace nvrtc } // namespace backends } // namespace cinn diff --git a/cinn/backends/nvrtc/nvrtc_util.h b/cinn/backends/nvrtc/nvrtc_util.h index a5f8424a31..b13c24c550 100644 --- a/cinn/backends/nvrtc/nvrtc_util.h +++ b/cinn/backends/nvrtc/nvrtc_util.h @@ -70,6 +70,19 @@ class Compiler { * whether to compile the source code into cubin, only works with cuda version > 11.1 */ bool compile_to_cubin_{false}; + + // compile with nvcc + std::string CompileWithNvcc(const std::string&); + + // compile to ptx + void CompileToPtx(); + // compile to cubin + void CompileToCubin(); + std::string GetDeviceArch(); + + std::string ReadFile(const std::string&, std::ios_base::openmode); + + std::string prefix_name_{""}; }; } // namespace nvrtc diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc index cc3ac7e140..ede13cab04 100644 --- a/cinn/hlir/framework/parallel_compiler.cc +++ b/cinn/hlir/framework/parallel_compiler.cc @@ -24,7 +24,6 @@ #include "cinn/backends/compiler.h" #include "cinn/backends/llvm/codegen_x86.h" #include "cinn/backends/llvm/runtime_symbol_registry.h" -#include "cinn/backends/nvrtc/nvcc_util.h" #include "cinn/backends/nvrtc/nvrtc_util.h" #include "cinn/common/context.h" #include "cinn/hlir/framework/pass.h" @@ -177,20 +176,11 @@ void ParallelCompiler::Task::CodegenAndJit() { graph->SaveSourceCode(cuda_c); using runtime::cuda::CUDAModule; - if (runtime::CanUseNvccCompiler()) { - backends::nvrtc::NvccCompiler compiler; - // load cumodule - cumodule.reset(new CUDAModule(compiler(cuda_c), CUDAModule::Kind::CUBIN)); - graph->SavePTXCode(compiler.GetPtx()); - } else { - backends::nvrtc::Compiler compiler; - auto ptx = compiler(cuda_c); - CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; - // load cumodule - cumodule.reset( - new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); - graph->SavePTXCode(ptx); - } + backends::nvrtc::Compiler compiler; + auto ptx = compiler(cuda_c); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c; + // load cumodule + cumodule.reset(new CUDAModule(ptx, compiler.compile_to_cubin() ? 
CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); // register kernel backends::RuntimeSymbols symbols; From c67842fe37061fde5302cce18627ecae0c303b72 Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Mon, 15 May 2023 13:27:57 +0000 Subject: [PATCH 09/11] default with nvrtc --- cinn/runtime/flags.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cinn/runtime/flags.cc b/cinn/runtime/flags.cc index 7238c9e59f..b6aff4213d 100644 --- a/cinn/runtime/flags.cc +++ b/cinn/runtime/flags.cc @@ -90,7 +90,7 @@ DEFINE_bool(nvrtc_compile_to_cubin, "Whether nvrtc compile cuda source into cubin instead of ptx (only works after cuda-11.1)."); DEFINE_bool(cinn_compile_with_nvrtc, - BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", false), + BoolFromEnv("FLAGS_cinn_compile_with_nvrtc", true), "Whether nvrtc compile cuda source with nvrtc(default nvcc)."); // FLAGS for performance analysis and accuracy debug From af08e4b86dde406c4b85a8495f1c810456f8d6cb Mon Sep 17 00:00:00 2001 From: sunli <466530738@qq.com> Date: Tue, 16 May 2023 01:39:33 +0000 Subject: [PATCH 10/11] remove used unit test --- cinn/hlir/pe/pe_transform_test.cc | 711 ------------------------------ 1 file changed, 711 deletions(-) diff --git a/cinn/hlir/pe/pe_transform_test.cc b/cinn/hlir/pe/pe_transform_test.cc index 58f0c109f2..ce65e95f95 100644 --- a/cinn/hlir/pe/pe_transform_test.cc +++ b/cinn/hlir/pe/pe_transform_test.cc @@ -224,717 +224,6 @@ TEST(Concat, ConcatCase0) { #endif } -TEST(Reduce, Reduce_Test_0) { - int m = 128; - int n = 128; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - Placeholder B("B", {M, N}); - - auto C = hlir::pe::Add(A.tensor(), B.tensor()); - auto D = hlir::pe::ReduceSum(C, {0}); - auto stages = CreateStages({C, D}); - stages[C]->SetBuffer("local"); - stages[C]->Reorder({1, 0}); - stages[D]->Bind(0, "threadIdx.x"); - stages[C]->SimpleComputeAt(stages[D], 1); - - auto func = Lower("fn", stages, {A, B, D}); - LOG(INFO) << "func:\n" << func; - -#ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -#endif -} - -#ifdef CINN_WITH_CUDA -void CudaReduceReorder(poly::StageMap stages, ir::Tensor input, const std::vector &axes) { - auto &shape = input->shape; - std::vector order; - for (int idx = 0; idx < shape.size(); ++idx) { - if (std::find(axes.begin(), axes.end(), idx) == axes.end()) { - order.push_back(idx); - } - } - for (auto axis : axes) { - order.push_back(axis); - } - stages[input]->Reorder(order); - - int last_dimension_num = shape.size() - axes.back() - 1; - int index = shape.size() - last_dimension_num - axes.size(); - for (auto idx = index; idx < index + last_dimension_num - 1; ++idx) { - stages[input]->Fuse(index, index + 1); - } - - if (stages[input]->GetDimRange(index) > 1024) { - stages[input]->Split(index, 1024); - } - - for (int idx = 0; idx < index - 1; ++idx) { - stages[input]->Fuse(0, 1); - } -} - -TEST(Reduce, Reduce_Test_1) { - int m = 128; - int n = 128; - Expr 
M(m), N(n); - - Placeholder A("A", {M, M, M, N, N}); - Placeholder B("B", {M, M, M, N, N}); - - auto C = hlir::pe::Add(A.tensor(), B.tensor()); - auto D = hlir::pe::ReduceSum(C, {0, 2}); - auto stages = CreateStages({C, D}); - hlir::pe::CudaReduceSchedule(stages, D, 2, common::DefaultNVGPUTarget()); - CudaReduceReorder(stages, C, {0, 2}); - stages[C]->SetBuffer("local"); - stages[C]->SimpleComputeAt(stages[D], stages[D]->n_out_dims() - 1); - // stages[C]->ComputeInline(); - - auto func = Lower("fn", stages, {A, B, D}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -} - -TEST(Reduce, Reduce_Test_2) { - int m = 10201; - int n = 50; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - - auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false); - CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3"; - auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]}); - - CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget()); - - auto func = Lower("fn", stages, {A, reduce_out[0]}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - backends::nvrtc::Compiler compiler; - auto ptx = compiler(source_code); - CHECK(!ptx.empty()); -} - -TEST(Reduce, Reduce_Test_2_1) { - int m = 10240; - int n = 64; - Expr M(m), N(n); - - Placeholder A("A", {M, N}); - - auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false); - CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3"; - auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]}); - - CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget()); - - auto func = Lower("fn", stages, {A, reduce_out[0]}); - LOG(INFO) << "func:\n" << func; - - auto target = common::DefaultNVGPUTarget(); - Module::Builder builder("Concat_Builder", target); - builder.AddFunction(func); - - auto module = builder.Build(); - auto host_module_device_module = backends::SplitCudaAndHostModule(module); - auto &host_module = std::get<0>(host_module_device_module); - auto &device_module = std::get<1>(host_module_device_module); - - backends::CodeGenCUDA_Dev codegen(target); - auto source_code = codegen.Compile(builder.Build()); - LOG(INFO) << "compiled code:\n\n\n" << source_code; - - // nv jit compile to ptx - 
backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_2_2) {
-  int m = 10240;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M, N});
-
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_2_3) {
-  int m = 10240;
-  int n = 16;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N, N});
-
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 3";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3) {
-  int m = 10201;
-  Expr M(m);
-
-  Placeholder<float> A("A", {M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3_1) {
-  int m = 10240;
-  Expr M(m);
-
-  Placeholder<float> A("A", {M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_3_2) {
-  int m = 10240;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_4) {
-  int m = 10201;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_5) {
-  int m = 32;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, M, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 2) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[1], reduce_out[0]});
-
-  CudaBlockReduceInternalSchedule(stages, reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_6) {
-  int m = 32;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, N, M, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[1], reduce_out[0]});
-
-  CudaBlockReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_7) {
-  int m = 10201;
-  int n = 64;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {N, N, M});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_8) {
-  int m = 128;
-  int n = 112;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_9) {
-  int m = 128;
-  int n = 56;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-
-  auto reduce_out = hlir::pe::TwoStepBlockReduceSum(A.tensor(), {0, 2, 3}, false);
-  CHECK_EQ(reduce_out.size(), 4) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaTwoStepReduceSchedule(
-      stages, reduce_out[3], reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_10) {
-  int m = 128;
-  int n = 128;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N});
-  Placeholder<float> B("B", {M, N});
-
-  auto c = hlir::pe::Add(A.tensor(), B.tensor());
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(c, {0}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, B, c, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  stages[c]->Split(0, 8);
-  stages[c]->Fuse(1, 2);
-  stages[c]->Reorder({1, 0});
-  stages[c]->SetBuffer("local");
-  stages[c]->SimpleComputeAt(stages[reduce_out[1]], 1);
-  stages[reduce_out[2]]->ComputeInline();
-  stages[reduce_out[1]]->Bind(0, "threadIdx.x");
-  stages[reduce_out[1]]->SetBuffer("shared");
-  stages[reduce_out[0]]->Bind(0, "threadIdx.x");
-
-  auto func = Lower("fn", stages, {A, B, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_11) {
-  int m = 10;
-  int n = 10;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_12) {
-  int m = 10;
-  int n = 10;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
-
-TEST(Reduce, Reduce_Test_13) {
-  int m = 16;
-  int n = 16;
-  Expr M(m), N(n);
-
-  Placeholder<float> A("A", {M, M, N, N});
-  auto reduce_out = hlir::pe::BlockShuffleReduceSum(A.tensor(), {0, 1, 2}, false);
-  CHECK_EQ(reduce_out.size(), 3) << "the output of reduce is not equal to 4!";
-  auto stages = CreateStages({A, reduce_out[2], reduce_out[1], reduce_out[0]});
-
-  CudaBlockShuffleReduceSchedule(stages, reduce_out[2], reduce_out[1], reduce_out[0], common::DefaultNVGPUTarget());
-  auto func = Lower("fn", stages, {A, reduce_out[0]});
-  LOG(INFO) << "func:\n" << func;
-
-  auto target = common::DefaultNVGPUTarget();
-  Module::Builder builder("Concat_Builder", target);
-  builder.AddFunction(func);
-
-  auto module = builder.Build();
-  auto host_module_device_module = backends::SplitCudaAndHostModule(module);
-  auto &host_module = std::get<0>(host_module_device_module);
-  auto &device_module = std::get<1>(host_module_device_module);
-
-  backends::CodeGenCUDA_Dev codegen(target);
-  auto source_code = codegen.Compile(builder.Build());
-  LOG(INFO) << "compiled code:\n\n\n" << source_code;
-
-  // nv jit compile to ptx
-  backends::nvrtc::Compiler compiler;
-  auto ptx = compiler(source_code);
-  CHECK(!ptx.empty());
-}
 #endif
 
 }  // namespace pe

From aa248521fbe62bb71e588509e14860817becdfb7 Mon Sep 17 00:00:00 2001
From: sunli <466530738@qq.com>
Date: Tue, 16 May 2023 04:55:42 +0000
Subject: [PATCH 11/11] fix pe transform test

---
 cinn/hlir/pe/pe_transform_test.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cinn/hlir/pe/pe_transform_test.cc b/cinn/hlir/pe/pe_transform_test.cc
index ce65e95f95..f5a76014e8 100644
--- a/cinn/hlir/pe/pe_transform_test.cc
+++ b/cinn/hlir/pe/pe_transform_test.cc
@@ -224,8 +224,6 @@ TEST(Concat, ConcatCase0) {
 #endif
 }
 
-#endif
-
 }  // namespace pe
 }  // namespace hlir
 }  // namespace cinn