diff --git a/DeepLearningShogi.sln b/DeepLearningShogi.sln index 3358924f..b55bab3d 100644 --- a/DeepLearningShogi.sln +++ b/DeepLearningShogi.sln @@ -13,8 +13,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppshogi", "cppshogi\cppsho EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test\test.vcxproj", "{A246A881-CCDA-4FC5-B8EA-DEFD779B2724}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "make_hcp_by_self_play", "make_hcp_by_self_play\make_hcp_by_self_play.vcxproj", "{CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "make_hcpe_by_self_play", "make_hcpe_by_self_play\make_hcpe_by_self_play.vcxproj", "{8C4DECB0-1BD4-4485-9E2D-19993F5D7C5D}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "usi_onnxruntime", "usi_onnxruntime\usi_onnxruntime.vcxproj", "{A14AB8E7-348F-45B0-BBA3-BA1721FFE242}" @@ -62,14 +60,6 @@ Global {A246A881-CCDA-4FC5-B8EA-DEFD779B2724}.MakeBook|x64.ActiveCfg = MakeBook|x64 {A246A881-CCDA-4FC5-B8EA-DEFD779B2724}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 {A246A881-CCDA-4FC5-B8EA-DEFD779B2724}.Release|x64.ActiveCfg = Release|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Debug|x64.ActiveCfg = Debug|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Debug|x64.Build.0 = Debug|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.MakeBook_NoOpt|x64.ActiveCfg = MakeBook_NoOpt|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.MakeBook|x64.ActiveCfg = MakeBook|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Release|x64.ActiveCfg = Release|x64 - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F}.Release|x64.Build.0 = Release|x64 {8C4DECB0-1BD4-4485-9E2D-19993F5D7C5D}.Debug|x64.ActiveCfg = Debug|x64 {8C4DECB0-1BD4-4485-9E2D-19993F5D7C5D}.Debug|x64.Build.0 = Debug|x64 {8C4DECB0-1BD4-4485-9E2D-19993F5D7C5D}.MakeBook_NoOpt|x64.ActiveCfg = MakeBook_NoOpt|x64 diff --git a/README.md b/README.md index c26d6b55..86af1c64 100644 --- a/README.md +++ b/README.md @@ -2,26 +2,24 @@ 将棋でディープラーニングを実験するためのプロジェクトです。 -基本的にAlphaGoの手法を参考に実装していく予定です。 +基本的にAlphaGo/AlphaZeroの手法を参考に実装していく方針です。 検討経緯、実験結果などは、随時こちらのブログに掲載していきます。 http://tadaoyamaoka.hatenablog.com/ -## 使用ライブラリ -* [Apery](https://github.com/HiraokaTakuya/apery) - -※モンテカルロ木探索の実装は囲碁プログラムの[Ray+Rn](https://github.com/zakki/Ray)の実装を参考にしています。 +## ダウンロード +[Releases](https://github.com/TadaoYamaoka/DeepLearningShogi/releases)からダウンロードできます。 ## ソース構成 |フォルダ|説明| |:---|:---| |cppshogi|Aperyを流用した将棋ライブラリ(盤面管理、指し手生成)、入力特徴量作成| |dlshogi|ニューラルネットワークの学習(Python)| -|make_hcp_by_self_play|Policyネットワークによる自己対局| |make_hcpe_by_self_play|MCTSによる自己対局| |test|テストコード| |usi|対局用USIエンジン| +|usi_onnxruntime|OnnxRuntime版ビルド用プロジェクト| |utils|ツール類| ## ビルド環境 @@ -43,9 +41,15 @@ http://tadaoyamaoka.hatenablog.com/ 上記USIエンジンのビルド環境に加えて以下が必要 * [Pytorch](https://pytorch.org/) 1.6以上 * Python 3.7以上 ([Anaconda](https://www.continuum.io/downloads)) -* [Boost](http://www.boost.org/) 1.69.0 +* [Boost](http://www.boost.org/) 1.67.0以上 * CUDA (PyTorchに対応したバージョン) * cuDNN (CUDAに対応したバージョン) +## 謝辞 +* 将棋の局面管理、合法手生成に、[Apery](https://github.com/HiraokaTakuya/apery)のソースコードを使用しています。 +* モンテカルロ木探索の実装は囲碁プログラムの[Ray+Rn](https://github.com/zakki/Ray)の実装を参考にしています。 +* 探索部の一部にLeela Chess Zeroのソースコードを流用しています。 +* 王手生成などに、[やねうら王](https://github.com/yaneurao/YaneuraOu)のソースコードを流用しています。 + ## ライセンス -ライセンスはGPLライセンスとします。 \ No newline at end of file +ライセンスはGPL3ライセンスとします。 \ No newline at end of file diff --git a/cppshogi/cppshogi.cpp b/cppshogi/cppshogi.cpp index 15ca6243..e71dea71 100644 --- a/cppshogi/cppshogi.cpp +++ b/cppshogi/cppshogi.cpp @@ -42,19 +42,19 @@ inline void make_input_features(const Position& position, features1_t* features1 for (PieceType pt = Pawn; pt < PieceTypeNum; ++pt) { // 駒の配置 if (bb[pt].isSet(sq)) { - (*features1)[c2][pt - 1][sq2] = _one; + (*features1)[c2][pt - 1][sq2] = 1; } // 駒の利き if (attacks[c][pt].isSet(sq)) { - (*features1)[c2][PIECETYPE_NUM + pt - 1][sq2] = _one; + (*features1)[c2][PIECETYPE_NUM + pt - 1][sq2] = 1; } } // 利き数 const int num = std::min(MAX_ATTACK_NUM, position.attackersTo(c, sq, occupied_bb).popCount()); for (int k = 0; k < num; k++) { - (*features1)[c2][PIECETYPE_NUM + PIECETYPE_NUM + k][sq2] = _one; + (*features1)[c2][PIECETYPE_NUM + PIECETYPE_NUM + k][sq2] = 1; } } @@ -66,14 +66,14 @@ inline void make_input_features(const Position& position, features1_t* features1 if (num >= MAX_PIECES_IN_HAND[hp]) { num = MAX_PIECES_IN_HAND[hp]; } - std::fill_n((*features2_hand)[c2][p], (int)SquareNum * num, _one); + std::fill_n((*features2_hand)[c2][p], (int)SquareNum * num, 1); p += MAX_PIECES_IN_HAND[hp]; } } // is check if (position.inCheck()) { - std::fill_n((*features2)[MAX_FEATURES2_HAND_NUM], SquareNum, _one); + std::fill_n((*features2)[MAX_FEATURES2_HAND_NUM], SquareNum, 1); } } diff --git a/cppshogi/cppshogi.h b/cppshogi/cppshogi.h index ff8716a7..1cf2bf6b 100644 --- a/cppshogi/cppshogi.h +++ b/cppshogi/cppshogi.h @@ -4,10 +4,11 @@ #include "position.hpp" #include "search.hpp" #include "generateMoves.hpp" -#include "../usi/cudnn_dtype.h" #define LEN(array) (sizeof(array) / sizeof(array[0])) +typedef float DType; + constexpr int MAX_HPAWN_NUM = 8; // 歩の持ち駒の上限 constexpr int MAX_HLANCE_NUM = 4; constexpr int MAX_HKNIGHT_NUM = 4; diff --git a/make_hcp_by_self_play/make_hcp_by_self_play.cpp b/make_hcp_by_self_play/make_hcp_by_self_play.cpp deleted file mode 100644 index 6ed360b1..00000000 --- a/make_hcp_by_self_play/make_hcp_by_self_play.cpp +++ /dev/null @@ -1,205 +0,0 @@ -#include - -#include "nn_wideresnet10.h" -#include "nn_wideresnet15.h" -#include "cppshogi.h" -#include - -void randomMove(Position& pos, std::mt19937& mt); - -const Move select_move(const Position pos, float *logits) { - // 合法手一覧 - std::vector legal_moves; - std::vector legal_move_probabilities; - for (MoveList ml(pos); !ml.end(); ++ml) { - const Move move = ml.move(); - - const int move_label = make_move_label((u16)move.proFromAndTo(), pos.turn()); - - legal_moves.emplace_back(move); - legal_move_probabilities.emplace_back(logits[move_label]); - } - - if (legal_moves.size() == 0) { - return Move::moveNone(); - } - - // Boltzmann distribution - softmax_temperature_with_normalize(legal_move_probabilities); - - // 確率に応じて手を選択 - std::discrete_distribution distribution(legal_move_probabilities.begin(), legal_move_probabilities.end()); - int move_idx = distribution(g_randomTimeSeed); - - return legal_moves[move_idx]; -} - -int main(int argc, char** argv) -{ - if (argc < 5) { - std::cout << "model_path outfile batch_size position_num" << std::endl; - return 1; - } - - const char* model_path = argv[1]; - const char* outfile = argv[2]; - int max_batch_size = std::atoi(argv[3]); - int position_num = std::atoi(argv[4]); - - std::unique_ptr nn; - if (std::string(model_path).find("wideresnet15") != std::string::npos) - nn.reset((NN*)new NNWideResnet15(model_path, max_batch_size)); - else - nn.reset((NN*)new NNWideResnet10(model_path, max_batch_size)); - - initTable(); - Position::initZobrist(); - - Searcher s; - s.init(); - - // ボルツマン温度設定 - set_softmax_temperature(1.25f); - - features1_t *features1; - features2_t *features2; - checkCudaErrors(cudaHostAlloc(&features1, sizeof(features1_t) * max_batch_size, cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&features2, sizeof(features2_t) * max_batch_size, cudaHostAllocPortable)); - - float* y1; - float* y2; - checkCudaErrors(cudaHostAlloc(&y1, MAX_MOVE_LABEL_NUM * (int)SquareNum * max_batch_size * sizeof(float), cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&y2, max_batch_size * sizeof(float), cudaHostAllocPortable)); - - std::mt19937 mt(std::chrono::system_clock::now().time_since_epoch().count()); - std::uniform_int_distribution dist(4, 250); - std::uniform_int_distribution doRandomDist(0, 30); - - std::vector hcpvec; - - std::ofstream ofs(outfile, std::ios::binary); - if (!ofs) { - std::cerr << "Error: cannot open " << outfile << std::endl; - exit(EXIT_FAILURE); - } - - // 進捗状況表示 - std::atomic index = 0; - Timer t = Timer::currentTime(); - auto progressFunc = [&position_num](std::atomic& index, Timer& t) { - while (true) { - std::this_thread::sleep_for(std::chrono::seconds(5)); // 指定秒だけ待機し、進捗を表示する。 - const s64 madeTeacherNodes = index; - const double progress = static_cast(madeTeacherNodes) / position_num; - auto elapsed_msec = t.elapsed(); - if (progress > 0.0) // 0 除算を回避する。 - std::cout << std::fixed << "Progress: " << std::setprecision(2) << std::min(100.0, progress * 100.0) - << "%, Elapsed: " << elapsed_msec / 1000 - << "[s], Remaining: " << std::max(0, elapsed_msec*(1.0 - progress) / (progress * 1000)) << "[s]" << std::endl; - if (index >= position_num) - break; - } - }; - std::thread progressThread([&index, &progressFunc, &t] { progressFunc(index, t); }); - - std::vector positions; - std::vector maxply; - std::vector tmpply; - std::vector tmpply2; - std::vector ply; - std::vector stateLists; - std::vector hcptmp(max_batch_size); - std::vector hcptmp2(max_batch_size); - - // 局面初期化 - for (int i = 0; i < max_batch_size; i++) { - positions.emplace_back(DefaultStartPositionSFEN, s.thisptr); - maxply.emplace_back(dist(mt)); - int maxply2 = std::uniform_int_distribution(8, maxply[i])(mt); - tmpply.emplace_back(maxply2); - tmpply2.emplace_back(std::uniform_int_distribution(8, maxply2)(mt)); - ply.emplace_back(1); - stateLists.emplace_back(new std::deque(1)); - } - - while (hcpvec.size() < position_num) { - - // set all zero - std::fill_n((float*)features1, positions.size() * (int)ColorNum * MAX_FEATURES1_NUM * (int)SquareNum, 0.0f); - std::fill_n((float*)features2, positions.size() * MAX_FEATURES2_NUM * (int)SquareNum, 0.0f); - - // make input_features - for (int idx = 0; idx < positions.size(); idx++) { - make_input_features(positions[idx], &features1[idx], &features2[idx]); - } - - // predict - nn->forward(max_batch_size, features1, features2, y1, y2); - float(*logits)[MAX_MOVE_LABEL_NUM * SquareNum] = reinterpret_cast(y1); - - // do move - for (int idx = 0; idx < positions.size(); idx++, logits++) { - Move move = select_move(positions[idx], (float*)logits); - - if (move != Move::moveNone()) { - - stateLists[idx]->push_back(StateInfo()); - positions[idx].doMove(move, stateLists[idx]->back()); - - ply[idx]++; - - // 出力判定 - if (ply[idx] == maxply[idx]) { - hcpvec.emplace_back(positions[idx].toHuffmanCodedPos()); - index++; - } - else if (ply[idx] == tmpply[idx]) { - hcptmp[idx] = positions[idx].toHuffmanCodedPos(); - } - else if (ply[idx] == tmpply2[idx]) { - hcptmp2[idx] = positions[idx].toHuffmanCodedPos(); - } - } - else { - // 終局の場合、暫定で保存した局面を出力 - if (ply[idx] > tmpply[idx]) { - hcpvec.emplace_back(hcptmp[idx]); - index++; - } - else if (ply[idx] > tmpply2[idx]) { - hcpvec.emplace_back(hcptmp2[idx]); - index++; - } - } - - // 次のゲーム - if (move == Move::moveNone() || ply[idx] >= maxply[idx]) { - positions[idx].set(DefaultStartPositionSFEN); - maxply[idx] = dist(mt); - int maxply2 = std::uniform_int_distribution(8, maxply[idx])(mt); - tmpply.emplace_back(maxply2); - tmpply2.emplace_back(std::uniform_int_distribution(8, maxply2)(mt)); - ply[idx] = 1; - stateLists[idx]->clear(); - } - else { - // 低い確率でランダムムーブを入れる - if (doRandomDist(mt) == 0 && !positions[idx].inCheck()) { - randomMove(positions[idx], mt); - } - } - } - } - - // 出力 - ofs.write(reinterpret_cast(hcpvec.data()), sizeof(HuffmanCodedPos) * hcpvec.size()); - - progressThread.join(); - - checkCudaErrors(cudaFreeHost(features1)); - checkCudaErrors(cudaFreeHost(features2)); - checkCudaErrors(cudaFreeHost(y1)); - checkCudaErrors(cudaFreeHost(y2)); - - return 0; -} \ No newline at end of file diff --git a/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj b/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj deleted file mode 100644 index 1c92bd6e..00000000 --- a/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj +++ /dev/null @@ -1,251 +0,0 @@ - - - - - MakeBook_NoOpt - x64 - - - MakeBook - x64 - - - Release_NoOpt - x64 - - - Debug - x64 - - - Release - x64 - - - - {CC7D9CC6-5D53-4E73-87E9-A20C24523A0F} - Win32Proj - make_hcp_by_self_play - 10.0 - - - - Application - true - v142 - Unicode - - - Application - false - v142 - true - Unicode - - - Application - false - v142 - true - Unicode - - - Application - false - v142 - true - Unicode - - - Application - false - v142 - true - Unicode - - - - - - - - - - - - - - - - - - - - - - - - true - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include;..\cppshogi;..\usi;$(VC_IncludePath);$(WindowsSDK_IncludePath); - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib\x64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64 - - - false - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include;..\cppshogi;..\usi;$(VC_IncludePath);$(WindowsSDK_IncludePath); - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib\x64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64 - - - false - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include;..\cppshogi;..\usi;$(VC_IncludePath);$(WindowsSDK_IncludePath); - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib\x64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64 - - - false - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include;..\cppshogi;..\usi;$(VC_IncludePath);$(WindowsSDK_IncludePath); - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib\x64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64 - - - false - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include;..\cppshogi;..\usi;$(VC_IncludePath);$(WindowsSDK_IncludePath); - C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib\x64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64 - - - - - - Level3 - Disabled - LEARN;_DEBUG;_CONSOLE;HAVE_SSE4;HAVE_BMI2;HAVE_AVX2;%(PreprocessorDefinitions) - AdvancedVectorExtensions2 - MultiThreadedDebug - stdcpp17 - - - Console - true - cuda.lib;cudnn.lib;cudart.lib;cublas.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - LEARN;NDEBUG;_CONSOLE;HAVE_SSE4;HAVE_BMI2;HAVE_AVX2;%(PreprocessorDefinitions) - AdvancedVectorExtensions2 - MultiThreaded - Speed - AnySuitable - stdcpp17 - - - Console - true - true - true - cuda.lib;cudnn.lib;cudart.lib;cublas.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - LEARN;NDEBUG;_CONSOLE;HAVE_SSE4;HAVE_BMI2;HAVE_AVX2;%(PreprocessorDefinitions) - AdvancedVectorExtensions2 - MultiThreaded - Speed - AnySuitable - stdcpp17 - - - Console - true - true - true - cuda.lib;cudnn.lib;cudart.lib;cublas.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - LEARN;NDEBUG;_CONSOLE;HAVE_SSE4;HAVE_BMI2;HAVE_AVX2;%(PreprocessorDefinitions) - AdvancedVectorExtensions2 - MultiThreaded - Speed - AnySuitable - stdcpp17 - - - Console - true - true - true - cuda.lib;cudnn.lib;cudart.lib;cublas.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - Level3 - - - Disabled - true - true - LEARN;NDEBUG;_CONSOLE;HAVE_SSE4;HAVE_BMI2;HAVE_AVX2;%(PreprocessorDefinitions) - AdvancedVectorExtensions2 - MultiThreaded - stdcpp17 - - - Console - true - true - true - cuda.lib;cudnn.lib;cudart.lib;cublas.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - このプロジェクトは、このコンピューター上にない NuGet パッケージを参照しています。それらのパッケージをダウンロードするには、[NuGet パッケージの復元] を使用します。詳細については、http://go.microsoft.com/fwlink/?LinkID=322105 を参照してください。見つからないファイルは {0} です。 - - - - \ No newline at end of file diff --git a/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj.filters b/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj.filters deleted file mode 100644 index 3dcf1a1a..00000000 --- a/make_hcp_by_self_play/make_hcp_by_self_play.vcxproj.filters +++ /dev/null @@ -1,76 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - {66386437-0ab6-4d2c-83c2-2d42c08934e7} - - - - - ソース ファイル - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル\cppshogi - - - ソース ファイル - - - ソース ファイル - - - ソース ファイル - - - - - - \ No newline at end of file diff --git a/make_hcp_by_self_play/packages.config b/make_hcp_by_self_play/packages.config deleted file mode 100644 index 25f9ef3d..00000000 --- a/make_hcp_by_self_play/packages.config +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/make_hcpe_by_self_play/Makefile b/make_hcpe_by_self_play/Makefile index 9c72d410..032c50ec 100644 --- a/make_hcpe_by_self_play/Makefile +++ b/make_hcpe_by_self_play/Makefile @@ -6,7 +6,7 @@ LIB = -L/usr/local/cuda-10.2/lib64 -L$(CONDA_PREFIX)/lib target = bin/make_hcpe_by_self_play sources = self_play.cpp ZobristHash.cpp USIEngine.cpp -usi_sources = dfpn.cpp nn_fused_wideresnet10.cpp nn_senet10.cpp nn_wideresnet10.cpp nn_wideresnet15.cpp npz.cpp nn_tensorrt.cpp cudnn_dtype.cpp +usi_sources = dfpn.cpp nn_tensorrt.cpp cppshogi_sources = bitboard.cpp book.cpp common.cpp cppshogi.cpp generateMoves.cpp hand.cpp init.cpp move.cpp mt64bit.cpp position.cpp search.cpp square.cpp usi.cpp objects = $(addprefix obj/, $(sources:.cpp=.o)) usi_objects = $(addprefix obj/, $(usi_sources:.cpp=.o)) diff --git a/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj b/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj index 161b94aa..d66194d4 100644 --- a/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj +++ b/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj @@ -228,13 +228,8 @@ - - - - - diff --git a/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj.filters b/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj.filters index c7915efb..a758df26 100644 --- a/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj.filters +++ b/make_hcpe_by_self_play/make_hcpe_by_self_play.vcxproj.filters @@ -60,27 +60,12 @@ ソース ファイル\cppshogi - - ソース ファイル - - - ソース ファイル - ソース ファイル - - ソース ファイル - - - ソース ファイル - ソース ファイル - - ソース ファイル - ソース ファイル diff --git a/make_hcpe_by_self_play/self_play.cpp b/make_hcpe_by_self_play/self_play.cpp index 55fa5dfe..4b98e392 100644 --- a/make_hcpe_by_self_play/self_play.cpp +++ b/make_hcpe_by_self_play/self_play.cpp @@ -19,9 +19,6 @@ #include "ZobristHash.h" #include "LruCache.h" #include "mate.h" -#include "nn_wideresnet10.h" -#include "nn_wideresnet15.h" -#include "nn_senet10.h" #include "nn_tensorrt.h" #include "dfpn.h" #include "USIEngine.h" @@ -342,17 +339,7 @@ class UCTSearcherGroupPair { void InitGPU() { mutex_all_gpu.lock(); if (nn == nullptr) { - if (model_path.find("onnx") != string::npos) - nn = (NN*)new NNTensorRT(model_path.c_str(), gpu_id, policy_value_batch_maxsize); - else if (model_path.find("wideresnet15") != string::npos) { - nn = (NN*)new NNWideResnet15(model_path.c_str(), policy_value_batch_maxsize); - } - else if (model_path.find("senet10") != string::npos) { - nn = (NN*)new NNSENet10(model_path.c_str(), policy_value_batch_maxsize); - } - else { - nn = (NN*)new NNWideResnet10(model_path.c_str(), policy_value_batch_maxsize); - } + nn = (NN*)new NNTensorRT(model_path.c_str(), gpu_id, policy_value_batch_maxsize); } mutex_all_gpu.unlock(); } @@ -407,8 +394,8 @@ UCTSearcherGroup::Initialize() } // キューを動的に確保する - checkCudaErrors(cudaHostAlloc(&features1, sizeof(features1_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&features2, sizeof(features2_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&features1, sizeof(features1_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&features2, sizeof(features2_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); policy_value_hash_index = new unsigned int[policy_value_batch_maxsize]; // UCTSearcher @@ -418,8 +405,8 @@ UCTSearcherGroup::Initialize() searchers.emplace_back(this, uct_hash, uct_node, nn_cache, i, entryNum); } - checkCudaErrors(cudaHostAlloc(&y1, MAX_MOVE_LABEL_NUM * (size_t)SquareNum * policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&y2, policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&y1, MAX_MOVE_LABEL_NUM * (size_t)SquareNum * policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&y2, policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); // 詰み探索 if (ROOT_MATE_SEARCH_DEPTH > 0) { @@ -926,8 +913,8 @@ void UCTSearcherGroup::QueuingNode(const Position *pos, unsigned int index) { // set all zero - std::fill_n((DType*)features1[current_policy_value_batch_index], sizeof(features1_t) / sizeof(DType), _zero); - std::fill_n((DType*)features2[current_policy_value_batch_index], sizeof(features2_t) / sizeof(DType), _zero); + std::fill_n((DType*)features1[current_policy_value_batch_index], sizeof(features1_t) / sizeof(DType), 0); + std::fill_n((DType*)features2[current_policy_value_batch_index], sizeof(features2_t) / sizeof(DType), 0); make_input_features(*pos, &features1[current_policy_value_batch_index], &features2[current_policy_value_batch_index]); policy_value_hash_index[current_policy_value_batch_index] = index; @@ -1000,11 +987,7 @@ void UCTSearcherGroup::EvalNode() { for (int j = 0; j < child_num; j++) { Move move = uct_child[j].move; const int move_label = make_move_label((u16)move.proFromAndTo(), color); -#ifdef FP16 - const float logit = __half2float((*logits)[move_label]); -#else const float logit = (*logits)[move_label]; -#endif legal_move_probabilities.emplace_back(logit); } @@ -1016,11 +999,7 @@ void UCTSearcherGroup::EvalNode() { req->nnrate[j] = legal_move_probabilities[j]; } -#ifdef FP16 - const float value_win = __half2float(*value); -#else const float value_win = *value; -#endif req->value_win = value_win; nn_cache.Insert(uct_node[index].key, std::move(req)); diff --git a/test/Makefile b/test/Makefile index cbbdd16e..f48f29ac 100644 --- a/test/Makefile +++ b/test/Makefile @@ -6,7 +6,7 @@ LIB = -L/usr/local/cuda-10.2/lib64 target = bin/test sources = gpubenchmark.cpp -usi_sources = dfpn.cpp Message.cpp UctSearch.cpp Node.cpp nn_fused_wideresnet10.cpp nn_senet10.cpp nn_wideresnet10.cpp nn_wideresnet15.cpp nn_tensorrt.cpp npz.cpp cudnn_dtype.cpp +usi_sources = dfpn.cpp Message.cpp UctSearch.cpp Node.cpp cppshogi_sources = bitboard.cpp book.cpp common.cpp cppshogi.cpp generateMoves.cpp hand.cpp init.cpp move.cpp mt64bit.cpp position.cpp search.cpp square.cpp usi.cpp objects = $(addprefix obj/, $(sources:.cpp=.o)) usi_objects = $(addprefix obj/, $(usi_sources:.cpp=.o)) diff --git a/test/test.vcxproj b/test/test.vcxproj index 24d21ba1..11b593e8 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -236,14 +236,8 @@ copy /y "$(SolutionDir)\packages\zlib-vc140-static-64.1.2.11\lib\native\libs\x64 - - - - - - diff --git a/test/test.vcxproj.filters b/test/test.vcxproj.filters index 8bff048c..39767b28 100644 --- a/test/test.vcxproj.filters +++ b/test/test.vcxproj.filters @@ -66,30 +66,12 @@ ソース ファイル\cppshogi - - ソース ファイル\usi - ソース ファイル - - ソース ファイル\usi - - - ソース ファイル\usi - - - ソース ファイル\usi - - - ソース ファイル\usi - ソース ファイル\usi - - ソース ファイル\usi - ソース ファイル\make_hcpe_by_self_play diff --git a/usi/Makefile b/usi/Makefile index d84925c9..934e36eb 100644 --- a/usi/Makefile +++ b/usi/Makefile @@ -5,7 +5,7 @@ INCLUDE = -I../usi -I../cppshogi -I../cppshogi -I/usr/local/cuda/include LIB = -L/usr/local/cuda-10.2/lib64 target = bin/usi -sources = main.cpp dfpn.cpp Message.cpp UctSearch.cpp Node.cpp nn_fused_wideresnet10.cpp nn_senet10.cpp nn_wideresnet10.cpp nn_wideresnet15.cpp nn_tensorrt.cpp npz.cpp cudnn_dtype.cpp +sources = main.cpp dfpn.cpp Message.cpp UctSearch.cpp Node.cpp nn_tensorrt.cpp cppshogi_sources = bitboard.cpp book.cpp common.cpp cppshogi.cpp generateMoves.cpp hand.cpp init.cpp move.cpp mt64bit.cpp position.cpp search.cpp square.cpp usi.cpp objects = $(addprefix obj/, $(sources:.cpp=.o)) cppshogi_objects = $(addprefix obj/, $(cppshogi_sources:.cpp=.o)) diff --git a/usi/UctSearch.cpp b/usi/UctSearch.cpp index 95bc5c56..d2a73022 100644 --- a/usi/UctSearch.cpp +++ b/usi/UctSearch.cpp @@ -25,10 +25,6 @@ #ifdef ONNXRUNTIME #include "nn_onnxruntime.h" #else -#include "nn_wideresnet10.h" -#include "nn_fused_wideresnet10.h" -#include "nn_wideresnet15.h" -#include "nn_senet10.h" #include "nn_tensorrt.h" #endif @@ -215,16 +211,7 @@ class UCTSearcherGroup { #ifdef ONNXRUNTIME nn = (NN*)new NNOnnxRuntime(model_path[gpu_id].c_str(), gpu_id, policy_value_batch_maxsize); #else - if (model_path[gpu_id].find("onnx") != string::npos) - nn = (NN*)new NNTensorRT(model_path[gpu_id].c_str(), gpu_id, policy_value_batch_maxsize); - else if (model_path[gpu_id].find("wideresnet15") != string::npos) - nn = (NN*)new NNWideResnet15(model_path[gpu_id].c_str(), policy_value_batch_maxsize); - else if (model_path[gpu_id].find("fused_wideresnet10") != string::npos) - nn = (NN*)new NNFusedWideResnet10(model_path[gpu_id].c_str(), policy_value_batch_maxsize); - else if (model_path[gpu_id].find("senet10") != string::npos) - nn = (NN*)new NNSENet10(model_path[gpu_id].c_str(), policy_value_batch_maxsize); - else - nn = (NN*)new NNWideResnet10(model_path[gpu_id].c_str(), policy_value_batch_maxsize); + nn = (NN*)new NNTensorRT(model_path[gpu_id].c_str(), gpu_id, policy_value_batch_maxsize); #endif } mutex_gpu.unlock(); @@ -277,10 +264,10 @@ class UCTSearcher { y1 = new DType[MAX_MOVE_LABEL_NUM * (size_t)SquareNum * policy_value_batch_maxsize]; y2 = new DType[policy_value_batch_maxsize]; #else - checkCudaErrors(cudaHostAlloc(&features1, sizeof(features1_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&features2, sizeof(features2_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&y1, MAX_MOVE_LABEL_NUM * (size_t)SquareNum * policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); - checkCudaErrors(cudaHostAlloc(&y2, policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&features1, sizeof(features1_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&features2, sizeof(features2_t) * policy_value_batch_maxsize, cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&y1, MAX_MOVE_LABEL_NUM * (size_t)SquareNum * policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); + checkCudaErrors(cudaHostAlloc((void**)&y2, policy_value_batch_maxsize * sizeof(DType), cudaHostAllocPortable)); #endif policy_value_batch = new batch_element_t[policy_value_batch_maxsize]; #ifdef MAKE_BOOK @@ -939,8 +926,8 @@ UCTSearcher::QueuingNode(const Position *pos, uct_node_t* node, float* value_win std::cout << "error" << std::endl; }*/ // set all zero - std::fill_n((DType*)features1[current_policy_value_batch_index], sizeof(features1_t) / sizeof(DType), _zero); - std::fill_n((DType*)features2[current_policy_value_batch_index], sizeof(features2_t) / sizeof(DType), _zero); + std::fill_n((DType*)features1[current_policy_value_batch_index], sizeof(features1_t) / sizeof(DType), 0); + std::fill_n((DType*)features2[current_policy_value_batch_index], sizeof(features2_t) / sizeof(DType), 0); make_input_features(*pos, &features1[current_policy_value_batch_index], &features2[current_policy_value_batch_index]); policy_value_batch[current_policy_value_batch_index] = { node, pos->turn(), value_win }; @@ -1469,11 +1456,7 @@ void UCTSearcher::EvalNode() { for (int j = 0; j < child_num; j++) { const Move move = uct_child[j].move; const int move_label = make_move_label((u16)move.proFromAndTo(), color); -#ifdef FP16 - const float logit = __half2float((*logits)[move_label]); -#else const float logit = (*logits)[move_label]; -#endif legal_move_probabilities.emplace_back(logit); } @@ -1484,11 +1467,7 @@ void UCTSearcher::EvalNode() { uct_child[j].nnrate = legal_move_probabilities[j]; } -#ifdef FP16 - *policy_value_batch[i].value_win = __half2float(*value); -#else *policy_value_batch[i].value_win = *value; -#endif #ifdef MAKE_BOOK // 定跡作成時は、事前確率に定跡の遷移確率も使用する diff --git a/usi/cudnn_dtype.cpp b/usi/cudnn_dtype.cpp deleted file mode 100644 index 16b128f4..00000000 --- a/usi/cudnn_dtype.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include "cudnn_dtype.h" - -#ifdef FP16 -const DType _zero = __float2half(0.0f); -const DType _one = __float2half(1.0f); -#endif \ No newline at end of file diff --git a/usi/cudnn_dtype.h b/usi/cudnn_dtype.h deleted file mode 100644 index 2faf09e2..00000000 --- a/usi/cudnn_dtype.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#ifdef FP16 -#include -typedef __half DType; -#define CUDNN_DATA_TYPE CUDNN_DATA_HALF -#define CUDA_DATA_TYPE CUDA_R_16F -extern const DType _zero; -extern const DType _one; -inline float to_float(const DType x) { - return __half2float(x); -} -#else -typedef float DType; -#define CUDNN_DATA_TYPE CUDNN_DATA_FLOAT -#define CUDA_DATA_TYPE CUDA_R_32F -constexpr const DType _zero = 0.0f; -constexpr const DType _one = 1.0f; -inline float to_float(DType x) { - return x; -} -#endif diff --git a/usi/cudnn_wrapper.h b/usi/cudnn_wrapper.h deleted file mode 100644 index bd834964..00000000 --- a/usi/cudnn_wrapper.h +++ /dev/null @@ -1,187 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "error_util.h" -#include "cudnn_dtype.h" - -class CudnnHandle -{ -private: - cudnnHandle_t cudnnHandle; - -public: - CudnnHandle() { - checkCUDNN(cudnnCreate(&cudnnHandle)); - } - ~CudnnHandle() { - checkCUDNN(cudnnDestroy(cudnnHandle)); - } - - cudnnHandle_t* operator &() { - return &cudnnHandle; - } - - operator cudnnHandle_t() { - return cudnnHandle; - } -}; - -class CudnnTensorDescriptor -{ -private: - cudnnTensorDescriptor_t cudnnTensorDescriptor; - -public: - CudnnTensorDescriptor() { - checkCUDNN(cudnnCreateTensorDescriptor(&cudnnTensorDescriptor)); - } - ~CudnnTensorDescriptor() { - checkCUDNN(cudnnDestroyTensorDescriptor(cudnnTensorDescriptor)); - } - - cudnnTensorDescriptor_t* operator &() { - return &cudnnTensorDescriptor; - } - - operator cudnnTensorDescriptor_t() { - return cudnnTensorDescriptor; - } -}; - -class CudnnFilterDescriptor -{ -private: - cudnnFilterDescriptor_t cudnnFilterDescriptor; - -public: - CudnnFilterDescriptor() { - checkCUDNN(cudnnCreateFilterDescriptor(&cudnnFilterDescriptor)); - } - ~CudnnFilterDescriptor() { - checkCUDNN(cudnnDestroyFilterDescriptor(cudnnFilterDescriptor)); - } - - cudnnFilterDescriptor_t* operator &() { - return &cudnnFilterDescriptor; - } - - operator cudnnFilterDescriptor_t() { - return cudnnFilterDescriptor; - } -}; - -class CudnnConvolutionDescriptor -{ -private: - cudnnConvolutionDescriptor_t cudnnConvolutionDescriptor; - -public: - CudnnConvolutionDescriptor() { - checkCUDNN(cudnnCreateConvolutionDescriptor(&cudnnConvolutionDescriptor)); - } - ~CudnnConvolutionDescriptor() { - checkCUDNN(cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor)); - } - - cudnnConvolutionDescriptor_t* operator &() { - return &cudnnConvolutionDescriptor; - } - - operator cudnnConvolutionDescriptor_t() { - return cudnnConvolutionDescriptor; - } -}; - -class CudnnActivationDescriptor -{ -private: - cudnnActivationDescriptor_t cudnnActivationDescriptor; - -public: - CudnnActivationDescriptor() { - checkCUDNN(cudnnCreateActivationDescriptor(&cudnnActivationDescriptor)); - } - ~CudnnActivationDescriptor() { - checkCUDNN(cudnnDestroyActivationDescriptor(cudnnActivationDescriptor)); - } - - cudnnActivationDescriptor_t* operator &() { - return &cudnnActivationDescriptor; - } - - operator cudnnActivationDescriptor_t() { - return cudnnActivationDescriptor; - } -}; - -class CudnnPoolingDescriptor -{ -private: - cudnnPoolingDescriptor_t cudnnPoolingDescriptor; - -public: - CudnnPoolingDescriptor() { - checkCUDNN(cudnnCreatePoolingDescriptor(&cudnnPoolingDescriptor)); - } - ~CudnnPoolingDescriptor() { - checkCUDNN(cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor)); - } - - cudnnPoolingDescriptor_t* operator &() { - return &cudnnPoolingDescriptor; - } - - operator cudnnPoolingDescriptor_t() { - return cudnnPoolingDescriptor; - } -}; - -class CudnnOpTensorDescriptor -{ -private: - cudnnOpTensorDescriptor_t cudnnOpTensorDescriptor; - -public: - CudnnOpTensorDescriptor() { - checkCUDNN(cudnnCreateOpTensorDescriptor(&cudnnOpTensorDescriptor)); - } - ~CudnnOpTensorDescriptor() { - checkCUDNN(cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor)); - } - - cudnnOpTensorDescriptor_t* operator &() { - return &cudnnOpTensorDescriptor; - } - - operator cudnnOpTensorDescriptor_t() { - return cudnnOpTensorDescriptor; - } -}; - -class CublasHandle -{ -private: - cublasHandle_t cublasHandle; - -public: - CublasHandle() { - checkCublasErrors(cublasCreate(&cublasHandle)); -#ifdef FP16 - checkCublasErrors(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH)); -#endif - } - ~CublasHandle() { - checkCublasErrors(cublasDestroy(cublasHandle)); - } - - cublasHandle_t* operator &() { - return &cublasHandle; - } - - operator cublasHandle_t() { - return cublasHandle; - } -}; diff --git a/usi/error_util.h b/usi/error_util.h index d77f5b2c..e09aafe4 100644 --- a/usi/error_util.h +++ b/usi/error_util.h @@ -2,6 +2,8 @@ #include #include +#include +#include inline void FatalError(const std::string& s) { std::cerr << s << "\nAborting...\n"; @@ -9,14 +11,6 @@ inline void FatalError(const std::string& s) { exit(EXIT_FAILURE); } -inline void checkCUDNN(cudnnStatus_t status) { - if (status != CUDNN_STATUS_SUCCESS) { - std::stringstream _error; - _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); - FatalError(_error.str()); - } -} - inline void checkCudaErrors(cudaError_t status) { if (status != 0) { std::stringstream _error; @@ -24,11 +18,3 @@ inline void checkCudaErrors(cudaError_t status) { FatalError(_error.str()); } } - -inline void checkCublasErrors(cublasStatus_t status) { - if (status != 0) { - std::stringstream _error; - _error << "Cublas failure\nError code " << status; - FatalError(_error.str()); - } -} diff --git a/usi/int8_calibrator.h b/usi/int8_calibrator.h index 56d9e6bb..9a18ccf3 100644 --- a/usi/int8_calibrator.h +++ b/usi/int8_calibrator.h @@ -1,7 +1,7 @@ #pragma once #include "cppshogi.h" -#include "cudnn_wrapper.h" +#include "error_util.h" #include "NvInfer.h" class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 diff --git a/usi/layers.h b/usi/layers.h deleted file mode 100644 index 14af8865..00000000 --- a/usi/layers.h +++ /dev/null @@ -1,321 +0,0 @@ -#pragma once - -#include "cudnn_wrapper.h" - -template -class ConvLayer { -public: - ConvLayer() : W(nullptr), workSpace(nullptr) { - const size_t size = c * k * fsize * fsize; - checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(DType))); - } - ~ConvLayer() { - checkCudaErrors(cudaFree(W)); - checkCudaErrors(cudaFree(workSpace)); - } - - void init(cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, cudnnTensorDescriptor_t yDesc) { - checkCUDNN(cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, k, c, fsize, fsize)); - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_TYPE)); - cudnnConvolutionFwdAlgoPerf_t algo_perf[4]; - int returnedAlgoCount; -#ifdef FP16 - checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); - checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(handle, xDesc, wDesc, convDesc, yDesc, 4, &returnedAlgoCount, algo_perf)); -#else - checkCUDNN(cudnnFindConvolutionForwardAlgorithm(handle, xDesc, wDesc, convDesc, yDesc, 4, &returnedAlgoCount, algo_perf)); -#endif - int algo_index = 0; - algo = algo_perf[algo_index].algo; - workSpaceSizeInBytes = algo_perf[algo_index].memory; - checkCudaErrors(cudaMalloc(&workSpace, workSpaceSizeInBytes)); - } - - int get_yh(const int h) { - return (h + 2 * pad - fsize) / stride + 1; - } - - int get_yw(const int w) { - return (w + 2 * pad - fsize) / stride + 1; - } - - void get_xdesc(cudnnTensorDescriptor_t xDesc, const int n, const int h, const int w) { - checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, n, c, h, w)); - } - - void get_ydesc(cudnnTensorDescriptor_t yDesc, const int n, const int h, const int w) { - checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, n, k, h, w)); - } - - int get_xsize(const int n, const int h, const int w) { - return n * c * h * w * sizeof(DType); - } - - int get_ysize(const int n, const int h, const int w) { - return n * k * get_yh(h) * get_yw(w) * sizeof(DType); - } - - void set_param(float* data) { - const size_t size = c * k * fsize * fsize; -#ifdef FP16 - __half* tmp = new __half[size]; - for (size_t i = 0; i < size; i++) - tmp[i] = __float2half(data[i]); - checkCudaErrors(cudaMemcpy(W, tmp, size * sizeof(__half), cudaMemcpyHostToDevice)); - delete[] tmp; -#else - checkCudaErrors(cudaMemcpy(W, data, size * sizeof(DType), cudaMemcpyHostToDevice)); -#endif - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x, cudnnTensorDescriptor_t yDesc, DType* y) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, W, convDesc, algo, workSpace, workSpaceSizeInBytes, &beta, yDesc, y)); - } - -private: - CudnnFilterDescriptor wDesc; - CudnnConvolutionDescriptor convDesc; - cudnnConvolutionFwdAlgo_t algo; - size_t workSpaceSizeInBytes; - DType* W; - void* workSpace; -}; - -template -class Bias { -public: - Bias() : b(nullptr) { - checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, 1, c, h, w)); - const size_t size = c * h * w; - checkCudaErrors(cudaMalloc((void**)&b, size * sizeof(DType))); - } - ~Bias() { - checkCudaErrors(cudaFree(b)); - } - - void set_bias(float* data) { - const size_t size = c * h * w; -#ifdef FP16 - __half* tmp = new __half[size]; - for (size_t i = 0; i < size; i++) - tmp[i] = __float2half(data[i]); - checkCudaErrors(cudaMemcpy(b, tmp, size * sizeof(__half), cudaMemcpyHostToDevice)); - delete[] tmp; -#else - checkCudaErrors(cudaMemcpy(b, data, size * sizeof(DType), cudaMemcpyHostToDevice)); -#endif - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x) { - const float alpha = 1.0f; - const float beta = 1.0f; - checkCUDNN(cudnnAddTensor(handle, &alpha, biasTensorDesc, b, &beta, xDesc, x)); - } - -private: - CudnnTensorDescriptor biasTensorDesc; - DType *b; -}; - -class ReLU { -public: - ReLU() { - checkCUDNN(cudnnSetActivationDescriptor(activDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0/*reluCeiling*/)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnActivationForward(handle, activDesc, &alpha, xDesc, x, &beta, xDesc, x)); - } - -private: - CudnnActivationDescriptor activDesc; -}; - -template -class Linear { -public: - Linear() : W(nullptr) { - const size_t size = k * n; - checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(DType))); - } - ~Linear() { - checkCudaErrors(cudaFree(W)); - } - - void get_xdesc(cudnnTensorDescriptor_t xDesc, const int m) { - checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, m, k, 1, 1)); - } - - void get_ydesc(cudnnTensorDescriptor_t yDesc, const int m) { - checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, m, n, 1, 1)); - } - - void set_param(float* data) { - const size_t size = k * n; -#ifdef FP16 - __half* tmp = new __half[size]; - for (size_t i = 0; i < size; i++) - tmp[i] = __float2half(data[i]); - checkCudaErrors(cudaMemcpy(W, tmp, size * sizeof(__half), cudaMemcpyHostToDevice)); - delete[] tmp; -#else - checkCudaErrors(cudaMemcpy(W, data, size * sizeof(DType), cudaMemcpyHostToDevice)); -#endif - } - - void operator() (cublasHandle_t handle, const int m, DType* x, DType* y) { - const DType alpha = _one; - const DType beta = _zero; - // C = α op ( A ) op ( B ) + β C - // op ( A ) m × k , op ( B ) k × n and C m × n -#ifdef FP16 - checkCublasErrors(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k, &alpha, W, CUDA_DATA_TYPE, k, x, CUDA_DATA_TYPE, k, &beta, y, CUDA_DATA_TYPE, n, CUDA_DATA_TYPE, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); -#else - checkCublasErrors(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k, &alpha, W, k, x, k, &beta, y, n)); -#endif - } - -private: - DType* W; -}; - -template -class MaxPooling2D { -public: - MaxPooling2D() { - checkCUDNN(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, window, window, pad, pad, stride, stride)); - } - - int get_yh(const int h) { - return (h + 2 * pad - window) / stride + 1; - } - - int get_yw(const int w) { - return (w + 2 * pad - window) / stride + 1; - } - - void get_desc(cudnnTensorDescriptor_t desc, const int n, const int c, const int h, const int w) { - checkCUDNN(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, n, c, h, w)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x, cudnnTensorDescriptor_t yDesc, DType* y) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y)); - } - -private: - CudnnPoolingDescriptor poolingDesc; -}; - -template -class AveragePooling2D { -public: - AveragePooling2D() { - checkCUDNN(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, CUDNN_PROPAGATE_NAN, window, window, pad, pad, stride, stride)); - } - - int get_yh(const int h) { - return (h + 2 * pad - window) / stride + 1; - } - - int get_yw(const int w) { - return (w + 2 * pad - window) / stride + 1; - } - - void get_desc(cudnnTensorDescriptor_t desc, const int n, const int c, const int h, const int w) { - checkCUDNN(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, n, c, h, w)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x, cudnnTensorDescriptor_t yDesc, DType* y) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y)); - } - -private: - CudnnPoolingDescriptor poolingDesc; -}; - -class Softmax { -public: - void get_desc(cudnnTensorDescriptor_t desc, const int n, const int c) { - checkCUDNN(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, n, c, 1, 1)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, xDesc, x, &beta, xDesc, x)); - } -}; - -class Sigmoid { -public: - Sigmoid() { - checkCUDNN(cudnnSetActivationDescriptor(activDesc, CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN, 0.0)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x) { - const float alpha = 1.0f; - const float beta = 0.0f; - checkCUDNN(cudnnActivationForward(handle, activDesc, &alpha, xDesc, x, &beta, xDesc, x)); - } - -private: - CudnnActivationDescriptor activDesc; -}; - -template -class BatchNormalization { -public: - BatchNormalization() : bnScale(nullptr), bnBias(nullptr), estimatedMean(nullptr), estimatedVariance(nullptr) { - const size_t size = k; - checkCudaErrors(cudaMalloc((void**)&bnScale, size * sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&bnBias, size * sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&estimatedMean, size * sizeof(float))); - checkCudaErrors(cudaMalloc((void**)&estimatedVariance, size * sizeof(float))); - } - ~BatchNormalization() { - checkCudaErrors(cudaFree(bnScale)); - checkCudaErrors(cudaFree(bnBias)); - checkCudaErrors(cudaFree(estimatedMean)); - checkCudaErrors(cudaFree(estimatedVariance)); - } - - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x, DType* y) { - const float alpha = 1.0f; - const float beta = 0.0f; - const double eps = 2e-5; - checkCUDNN(cudnnDeriveBNTensorDescriptor(bnScaleBiasMeanVarDesc, xDesc, CUDNN_BATCHNORM_SPATIAL)); - checkCUDNN(cudnnBatchNormalizationForwardInference(handle, CUDNN_BATCHNORM_SPATIAL, &alpha, &beta, xDesc, x, xDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, eps)); - } - - void set_param(float* bnScale, float *bnBias, float *estimatedMean, float *estimatedVariance) { - const size_t size = k; - checkCudaErrors(cudaMemcpy(this->bnScale, bnScale, size * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(this->bnBias, bnBias, size * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(this->estimatedMean, estimatedMean, size * sizeof(float), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(this->estimatedVariance, estimatedVariance, size * sizeof(float), cudaMemcpyHostToDevice)); - } - -private: - CudnnTensorDescriptor bnScaleBiasMeanVarDesc; - float *bnScale; - float *bnBias; - float *estimatedMean; - float *estimatedVariance; -}; - -class Add { -public: - void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, DType* x, DType* y) { - const float alpha = 1.0f; - const float beta = 1.0f; - checkCUDNN(cudnnAddTensor(handle, &alpha, xDesc, x, &beta, xDesc, y)); - } -}; \ No newline at end of file diff --git a/usi/nn_fused_wideresnet10.cpp b/usi/nn_fused_wideresnet10.cpp deleted file mode 100644 index 9c8a593e..00000000 --- a/usi/nn_fused_wideresnet10.cpp +++ /dev/null @@ -1,292 +0,0 @@ -#include "nn_fused_wideresnet10.h" -#include "npz.h" - -NNFusedWideResnet10::NNFusedWideResnet10(const char* filename, const int max_batch_size) : max_batch_size(max_batch_size) -{ - prepare_desc(max_batch_size); - - // init conv layers - conv1_1_1.init(cudnnHandle, x1Desc, h1Desc); - conv1_1_2.init(cudnnHandle, x1Desc, h1Desc); - conv1_2.init(cudnnHandle, x2Desc, h1Desc); - conv2.init(cudnnHandle, h1Desc, h1Desc); - conv3.init(cudnnHandle, h1Desc, h1Desc); - conv4.init(cudnnHandle, h1Desc, h1Desc); - conv5.init(cudnnHandle, h1Desc, h1Desc); - conv6.init(cudnnHandle, h1Desc, h1Desc); - conv7.init(cudnnHandle, h1Desc, h1Desc); - conv8.init(cudnnHandle, h1Desc, h1Desc); - conv9.init(cudnnHandle, h1Desc, h1Desc); - conv10.init(cudnnHandle, h1Desc, h1Desc); - conv11.init(cudnnHandle, h1Desc, h1Desc); - conv12.init(cudnnHandle, h1Desc, h1Desc); - conv13.init(cudnnHandle, h1Desc, h1Desc); - conv14.init(cudnnHandle, h1Desc, h1Desc); - conv15.init(cudnnHandle, h1Desc, h1Desc); - conv16.init(cudnnHandle, h1Desc, h1Desc); - conv17.init(cudnnHandle, h1Desc, h1Desc); - conv18.init(cudnnHandle, h1Desc, h1Desc); - conv19.init(cudnnHandle, h1Desc, h1Desc); - conv20.init(cudnnHandle, h1Desc, h1Desc); - conv21.init(cudnnHandle, h1Desc, h1Desc); - conv22.init(cudnnHandle, h1Desc, y1Desc); - conv22v.init(cudnnHandle, h1Desc, h22vDesc); - - // malloc - checkCudaErrors(cudaMalloc((void**)&x1_dev, conv1_1_1.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&x2_dev, conv1_2.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_1_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_2_dev, conv1_1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_2_dev, conv1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_bn_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h3_dev, conv3.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h5_dev, conv5.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h7_dev, conv7.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h9_dev, conv9.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h11_dev, conv11.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h13_dev, conv13.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h15_dev, conv15.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h17_dev, conv17.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h19_dev, conv19.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_bn_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&y1_dev, conv22.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_bn_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h23v_dev, max_batch_size * fcl * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&y2_dev, max_batch_size * sizeof(DType))); - - load_model(filename); -} - -NNFusedWideResnet10::~NNFusedWideResnet10() { - checkCudaErrors(cudaFree(x1_dev)); - checkCudaErrors(cudaFree(x2_dev)); - checkCudaErrors(cudaFree(h1_1_1_dev)); - checkCudaErrors(cudaFree(h1_1_2_dev)); - checkCudaErrors(cudaFree(h1_2_dev)); - checkCudaErrors(cudaFree(h1_bn_dev)); - checkCudaErrors(cudaFree(h2_dev)); - checkCudaErrors(cudaFree(h3_dev)); - checkCudaErrors(cudaFree(h5_dev)); - checkCudaErrors(cudaFree(h7_dev)); - checkCudaErrors(cudaFree(h9_dev)); - checkCudaErrors(cudaFree(h11_dev)); - checkCudaErrors(cudaFree(h13_dev)); - checkCudaErrors(cudaFree(h15_dev)); - checkCudaErrors(cudaFree(h17_dev)); - checkCudaErrors(cudaFree(h19_dev)); - checkCudaErrors(cudaFree(h21_dev)); - checkCudaErrors(cudaFree(h21_bn_dev)); - checkCudaErrors(cudaFree(y1_dev)); - checkCudaErrors(cudaFree(h22v_dev)); - checkCudaErrors(cudaFree(h22v_bn_dev)); - checkCudaErrors(cudaFree(h23v_dev)); - checkCudaErrors(cudaFree(y2_dev)); -} - -void NNFusedWideResnet10::prepare_desc(const int batch_size) -{ - conv1_1_1.get_xdesc(x1Desc, batch_size, 9, 9); - conv1_2.get_xdesc(x2Desc, batch_size, 9, 9); - conv1_1_1.get_ydesc(h1Desc, batch_size, 9, 9); - - conv22.get_ydesc(y1Desc, batch_size, 9, 9); - - conv22v.get_ydesc(h22vDesc, batch_size, 9, 9); - l23v.get_ydesc(h23vDesc, batch_size); - l24v.get_ydesc(y2Desc, batch_size); -} - -void NNFusedWideResnet10::load_model(const char* filepath) -{ - // load nn params - ParamMap params; - load_npz(filepath, params); - - conv1_1_1.set_param(params["l1_1_1/W.npy"].data); - conv1_1_2.set_param(params["l1_1_2/W.npy"].data); - conv1_2.set_param(params["l1_2/W.npy"].data); - bn1.set_param(params["norm1/gamma.npy"].data, params["norm1/beta.npy"].data, params["norm1/avg_mean.npy"].data, params["norm1/avg_var.npy"].data); - conv2.set_param(params["l2/W.npy"].data); - bias2.set_bias(params["l2/b.npy"].data); - conv3.set_param(params["l3/W.npy"].data); - bn3.set_param(params["norm3/gamma.npy"].data, params["norm3/beta.npy"].data, params["norm3/avg_mean.npy"].data, params["norm3/avg_var.npy"].data); - conv4.set_param(params["l4/W.npy"].data); - bias4.set_bias(params["l4/b.npy"].data); - conv5.set_param(params["l5/W.npy"].data); - bn5.set_param(params["norm5/gamma.npy"].data, params["norm5/beta.npy"].data, params["norm5/avg_mean.npy"].data, params["norm5/avg_var.npy"].data); - conv6.set_param(params["l6/W.npy"].data); - bias6.set_bias(params["l6/b.npy"].data); - conv7.set_param(params["l7/W.npy"].data); - bn7.set_param(params["norm7/gamma.npy"].data, params["norm7/beta.npy"].data, params["norm7/avg_mean.npy"].data, params["norm7/avg_var.npy"].data); - conv8.set_param(params["l8/W.npy"].data); - bias8.set_bias(params["l8/b.npy"].data); - conv9.set_param(params["l9/W.npy"].data); - bn9.set_param(params["norm9/gamma.npy"].data, params["norm9/beta.npy"].data, params["norm9/avg_mean.npy"].data, params["norm9/avg_var.npy"].data); - conv10.set_param(params["l10/W.npy"].data); - bias10.set_bias(params["l10/b.npy"].data); - conv11.set_param(params["l11/W.npy"].data); - bn11.set_param(params["norm11/gamma.npy"].data, params["norm11/beta.npy"].data, params["norm11/avg_mean.npy"].data, params["norm11/avg_var.npy"].data); - conv12.set_param(params["l12/W.npy"].data); - bias12.set_bias(params["l12/b.npy"].data); - conv13.set_param(params["l13/W.npy"].data); - bn13.set_param(params["norm13/gamma.npy"].data, params["norm13/beta.npy"].data, params["norm13/avg_mean.npy"].data, params["norm13/avg_var.npy"].data); - conv14.set_param(params["l14/W.npy"].data); - bias14.set_bias(params["l14/b.npy"].data); - conv15.set_param(params["l15/W.npy"].data); - bn15.set_param(params["norm15/gamma.npy"].data, params["norm15/beta.npy"].data, params["norm15/avg_mean.npy"].data, params["norm15/avg_var.npy"].data); - conv16.set_param(params["l16/W.npy"].data); - bias16.set_bias(params["l16/b.npy"].data); - conv17.set_param(params["l17/W.npy"].data); - bn17.set_param(params["norm17/gamma.npy"].data, params["norm17/beta.npy"].data, params["norm17/avg_mean.npy"].data, params["norm17/avg_var.npy"].data); - conv18.set_param(params["l18/W.npy"].data); - bias18.set_bias(params["l18/b.npy"].data); - conv19.set_param(params["l19/W.npy"].data); - bn19.set_param(params["norm19/gamma.npy"].data, params["norm19/beta.npy"].data, params["norm19/avg_mean.npy"].data, params["norm19/avg_var.npy"].data); - conv20.set_param(params["l20/W.npy"].data); - bias20.set_bias(params["l20/b.npy"].data); - conv21.set_param(params["l21/W.npy"].data); - bn21.set_param(params["norm21/gamma.npy"].data, params["norm21/beta.npy"].data, params["norm21/avg_mean.npy"].data, params["norm21/avg_var.npy"].data); - conv22.set_param(params["l22/W.npy"].data); - bias22.set_bias(params["l22_2/b.npy"].data); - conv22v.set_param(params["l22_v/W.npy"].data); - bias22v.set_bias(params["l22_v/b.npy"].data); - bn22v.set_param(params["norm22_v/gamma.npy"].data, params["norm22_v/beta.npy"].data, params["norm22_v/avg_mean.npy"].data, params["norm22_v/avg_var.npy"].data); - l23v.set_param(params["l23_v/W.npy"].data); - bias23v.set_bias(params["l23_v/b.npy"].data); - l24v.set_param(params["l24_v/W.npy"].data); - bias24v.set_bias(params["l24_v/b.npy"].data); -} - -void NNFusedWideResnet10::forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2) -{ - prepare_desc(batch_size); - - // input - checkCudaErrors(cudaMemcpy(x1_dev, x1, sizeof(features1_t) * batch_size, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(x2_dev, x2, sizeof(features2_t) * batch_size, cudaMemcpyHostToDevice)); - - // layer1 - conv1_1_1(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_1_dev); - conv1_1_2(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_2_dev); - conv1_2(cudnnHandle, x2Desc, x2_dev, h1Desc, h1_2_dev); - add(cudnnHandle, h1Desc, h1_1_2_dev, h1_1_1_dev); - add(cudnnHandle, h1Desc, h1_2_dev, h1_1_1_dev); - - // residual block1 - bn1(cudnnHandle, h1Desc, h1_1_1_dev, h1_bn_dev); - - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv2(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias2(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv3(cudnnHandle, h1Desc, h2_dev, h1Desc, h3_dev); - add(cudnnHandle, h1Desc, h1_1_1_dev, h3_dev); - - // residual block2 - bn3(cudnnHandle, h1Desc, h3_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv4(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias4(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv5(cudnnHandle, h1Desc, h2_dev, h1Desc, h5_dev); - add(cudnnHandle, h1Desc, h3_dev, h5_dev); - - // residual block3 - bn5(cudnnHandle, h1Desc, h5_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv6(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias6(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv7(cudnnHandle, h1Desc, h2_dev, h1Desc, h7_dev); - add(cudnnHandle, h1Desc, h5_dev, h7_dev); - - // residual block4 - bn7(cudnnHandle, h1Desc, h7_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv8(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias8(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv9(cudnnHandle, h1Desc, h2_dev, h1Desc, h9_dev); - add(cudnnHandle, h1Desc, h7_dev, h9_dev); - - // residual block5 - bn9(cudnnHandle, h1Desc, h9_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv10(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias10(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv11(cudnnHandle, h1Desc, h2_dev, h1Desc, h11_dev); - add(cudnnHandle, h1Desc, h9_dev, h11_dev); - - // residual block6 - bn11(cudnnHandle, h1Desc, h11_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv12(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias12(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv13(cudnnHandle, h1Desc, h2_dev, h1Desc, h13_dev); - add(cudnnHandle, h1Desc, h11_dev, h13_dev); - - // residual block7 - bn13(cudnnHandle, h1Desc, h13_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv14(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias14(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv15(cudnnHandle, h1Desc, h2_dev, h1Desc, h15_dev); - add(cudnnHandle, h1Desc, h13_dev, h15_dev); - - // residual block8 - bn15(cudnnHandle, h1Desc, h15_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv16(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias16(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv17(cudnnHandle, h1Desc, h2_dev, h1Desc, h17_dev); - add(cudnnHandle, h1Desc, h15_dev, h17_dev); - - // residual block9 - bn17(cudnnHandle, h1Desc, h17_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv18(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias18(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv19(cudnnHandle, h1Desc, h2_dev, h1Desc, h19_dev); - add(cudnnHandle, h1Desc, h17_dev, h19_dev); - - // residual block10 - bn19(cudnnHandle, h1Desc, h19_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv20(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bias20(cudnnHandle, h1Desc, h2_dev); - relu(cudnnHandle, h1Desc, h2_dev); - conv21(cudnnHandle, h1Desc, h2_dev, h1Desc, h21_dev); - add(cudnnHandle, h1Desc, h19_dev, h21_dev); - - // after residual blocks - bn21(cudnnHandle, h1Desc, h21_dev, h21_bn_dev); - relu(cudnnHandle, h1Desc, h21_bn_dev); - - // policy network - conv22(cudnnHandle, h1Desc, h21_bn_dev, y1Desc, y1_dev); - bias22(cudnnHandle, y1Desc, y1_dev); - - // value network - conv22v(cudnnHandle, h1Desc, h21_bn_dev, h22vDesc, h22v_dev); - bias22v(cudnnHandle, h22vDesc, h22v_dev); - bn22v(cudnnHandle, h22vDesc, h22v_dev, h22v_bn_dev); - relu(cudnnHandle, h22vDesc, h22v_bn_dev); - l23v(cublasHandle, batch_size, h22v_bn_dev, h23v_dev); - bias23v(cudnnHandle, h23vDesc, h23v_dev); - relu(cudnnHandle, h23vDesc, h23v_dev); - l24v(cublasHandle, batch_size, h23v_dev, y2_dev); - bias24v(cudnnHandle, y2Desc, y2_dev); - sigmoid(cudnnHandle, y2Desc, y2_dev); - - // output - checkCudaErrors(cudaMemcpy(y1, y1_dev, conv22.get_ysize(batch_size, 9, 9), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(y2, y2_dev, batch_size * sizeof(DType), cudaMemcpyDeviceToHost)); -} diff --git a/usi/nn_fused_wideresnet10.h b/usi/nn_fused_wideresnet10.h deleted file mode 100644 index 4b838b13..00000000 --- a/usi/nn_fused_wideresnet10.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#include "nn.h" -#include "layers.h" - -class NNFusedWideResnet10 : NN { -public: - NNFusedWideResnet10(const char* filename, const int max_batch_size); - ~NNFusedWideResnet10(); - - void forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2); - -private: - void load_model(const char* filename); - void prepare_desc(const int batch_size); - - CudnnHandle cudnnHandle; - CublasHandle cublasHandle; - static constexpr int k = 192; - static constexpr int fcl = 256; - - const int max_batch_size; - - // input layer - ConvLayer conv1_1_1; - ConvLayer conv1_1_2; - ConvLayer conv1_2; - // residual blocks - BatchNormalization bn1; - ConvLayer conv2; - Bias bias2; - ConvLayer conv3; - BatchNormalization bn3; - ConvLayer conv4; - Bias bias4; - ConvLayer conv5; - BatchNormalization bn5; - ConvLayer conv6; - Bias bias6; - ConvLayer conv7; - BatchNormalization bn7; - ConvLayer conv8; - Bias bias8; - ConvLayer conv9; - BatchNormalization bn9; - ConvLayer conv10; - Bias bias10; - ConvLayer conv11; - BatchNormalization bn11; - ConvLayer conv12; - Bias bias12; - ConvLayer conv13; - BatchNormalization bn13; - ConvLayer conv14; - Bias bias14; - ConvLayer conv15; - BatchNormalization bn15; - ConvLayer conv16; - Bias bias16; - ConvLayer conv17; - BatchNormalization bn17; - ConvLayer conv18; - Bias bias18; - ConvLayer conv19; - BatchNormalization bn19; - ConvLayer conv20; - Bias bias20; - ConvLayer conv21; - BatchNormalization bn21; - // policy network - ConvLayer conv22; - Bias bias22; - // value network - ConvLayer conv22v; - Bias bias22v; - BatchNormalization bn22v; - Linear<9 * 9 * MAX_MOVE_LABEL_NUM, fcl> l23v; - Bias bias23v; - Linear l24v; - Bias<1, 1, 1> bias24v; - - ReLU relu; - Add add; - Sigmoid sigmoid; - - CudnnTensorDescriptor x1Desc; - CudnnTensorDescriptor x2Desc; - CudnnTensorDescriptor h1Desc; - CudnnTensorDescriptor h22Desc; - CudnnTensorDescriptor h22vDesc; - CudnnTensorDescriptor h23vDesc; - CudnnTensorDescriptor h24vDesc; - CudnnTensorDescriptor y1Desc; - CudnnTensorDescriptor y2Desc; - - // input layer - DType* x1_dev; - DType* x2_dev; - DType* h1_1_1_dev; - DType* h1_1_2_dev; - DType* h1_2_dev; - // residual block - DType* h1_bn_dev; - DType* h2_dev; - DType* h3_dev; - DType* h5_dev; - DType* h7_dev; - DType* h9_dev; - DType* h11_dev; - DType* h13_dev; - DType* h15_dev; - DType* h17_dev; - DType* h19_dev; - DType* h21_dev; - // after residual blocks - DType* h21_bn_dev; - // policy network - DType* y1_dev; - // value network - DType* h22v_dev; - DType* h22v_bn_dev; - DType* h23v_dev; - DType* y2_dev; -}; \ No newline at end of file diff --git a/usi/nn_senet10.cpp b/usi/nn_senet10.cpp deleted file mode 100644 index 2f84a89e..00000000 --- a/usi/nn_senet10.cpp +++ /dev/null @@ -1,356 +0,0 @@ -#include "nn_senet10.h" -#include "npz.h" - -void debug_print_dev(DType* dev, const size_t size) { - std::vector host(size); - checkCudaErrors(cudaMemcpy(host.data(), dev, size * sizeof(DType), cudaMemcpyDeviceToHost)); - std::cout << to_float(host[0]); - for (size_t i = 1; i < host.size(); i++) { - std::cout << ", " << to_float(host[i]); - } - std::cout << std::endl; -} - -NNSENet10::NNSENet10(const char* filename, const int max_batch_size) : max_batch_size(max_batch_size) -{ - prepare_desc(max_batch_size); - - // init conv layers - conv1_1_1.init(cudnnHandle, x1Desc, h1Desc); - conv1_1_2.init(cudnnHandle, x1Desc, h1Desc); - conv1_2.init(cudnnHandle, x2Desc, h1Desc); - conv2.init(cudnnHandle, h1Desc, h1Desc); - conv3.init(cudnnHandle, h1Desc, h1Desc); - conv4.init(cudnnHandle, h1Desc, h1Desc); - conv5.init(cudnnHandle, h1Desc, h1Desc); - conv6.init(cudnnHandle, h1Desc, h1Desc); - conv7.init(cudnnHandle, h1Desc, h1Desc); - conv8.init(cudnnHandle, h1Desc, h1Desc); - conv9.init(cudnnHandle, h1Desc, h1Desc); - conv10.init(cudnnHandle, h1Desc, h1Desc); - conv11.init(cudnnHandle, h1Desc, h1Desc); - conv12.init(cudnnHandle, h1Desc, h1Desc); - conv13.init(cudnnHandle, h1Desc, h1Desc); - conv14.init(cudnnHandle, h1Desc, h1Desc); - conv15.init(cudnnHandle, h1Desc, h1Desc); - conv16.init(cudnnHandle, h1Desc, h1Desc); - conv17.init(cudnnHandle, h1Desc, h1Desc); - conv18.init(cudnnHandle, h1Desc, h1Desc); - conv19.init(cudnnHandle, h1Desc, h1Desc); - conv20.init(cudnnHandle, h1Desc, h1Desc); - conv21.init(cudnnHandle, h1Desc, h1Desc); - conv22.init(cudnnHandle, h1Desc, y1Desc); - conv22v.init(cudnnHandle, h1Desc, h22vDesc); - - // malloc - checkCudaErrors(cudaMalloc((void**)&x1_dev, conv1_1_1.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&x2_dev, conv1_2.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_1_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_2_dev, conv1_1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_2_dev, conv1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_bn_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_bn_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h3_dev, conv3.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h5_dev, conv5.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h7_dev, conv7.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h9_dev, conv9.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h11_dev, conv11.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h13_dev, conv13.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h15_dev, conv15.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h17_dev, conv17.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h19_dev, conv19.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&se1_dev, max_batch_size * k * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&se2_dev, max_batch_size * (k / reduction) * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&se3_dev, max_batch_size * k * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&h21_bn_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&y1_dev, conv22.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_bn_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h23v_dev, max_batch_size * fcl * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&y2_dev, max_batch_size * sizeof(DType))); - - load_model(filename); -} - -NNSENet10::~NNSENet10() { - checkCudaErrors(cudaFree(x1_dev)); - checkCudaErrors(cudaFree(x2_dev)); - checkCudaErrors(cudaFree(h1_1_1_dev)); - checkCudaErrors(cudaFree(h1_1_2_dev)); - checkCudaErrors(cudaFree(h1_2_dev)); - checkCudaErrors(cudaFree(h1_bn_dev)); - checkCudaErrors(cudaFree(h2_dev)); - checkCudaErrors(cudaFree(h2_bn_dev)); - checkCudaErrors(cudaFree(h3_dev)); - checkCudaErrors(cudaFree(h5_dev)); - checkCudaErrors(cudaFree(h7_dev)); - checkCudaErrors(cudaFree(h9_dev)); - checkCudaErrors(cudaFree(h11_dev)); - checkCudaErrors(cudaFree(h13_dev)); - checkCudaErrors(cudaFree(h15_dev)); - checkCudaErrors(cudaFree(h17_dev)); - checkCudaErrors(cudaFree(h19_dev)); - checkCudaErrors(cudaFree(h21_dev)); - checkCudaErrors(cudaFree(se1_dev)); - checkCudaErrors(cudaFree(se2_dev)); - checkCudaErrors(cudaFree(se3_dev)); - checkCudaErrors(cudaFree(h21_bn_dev)); - checkCudaErrors(cudaFree(y1_dev)); - checkCudaErrors(cudaFree(h22v_dev)); - checkCudaErrors(cudaFree(h22v_bn_dev)); - checkCudaErrors(cudaFree(h23v_dev)); - checkCudaErrors(cudaFree(y2_dev)); -} - -void NNSENet10::prepare_desc(const int batch_size) -{ - conv1_1_1.get_xdesc(x1Desc, batch_size, 9, 9); - conv1_2.get_xdesc(x2Desc, batch_size, 9, 9); - conv1_1_1.get_ydesc(h1Desc, batch_size, 9, 9); - - averagePooling2D.get_desc(se1Desc, batch_size, k, 1, 1); - se3_l1.get_ydesc(se2Desc, batch_size); - se3_l2.get_ydesc(se3Desc, batch_size); - cudnnSetOpTensorDescriptor(opMulDesc, CUDNN_OP_TENSOR_MUL, CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN); - - conv22.get_ydesc(y1Desc, batch_size, 9, 9); - - conv22v.get_ydesc(h22vDesc, batch_size, 9, 9); - l23v.get_ydesc(h23vDesc, batch_size); - l24v.get_ydesc(y2Desc, batch_size); -} - -void NNSENet10::load_model(const char* filepath) -{ - // load nn params - ParamMap params; - load_npz(filepath, params); - - conv1_1_1.set_param(params["l1_1_1/W.npy"].data); - conv1_1_2.set_param(params["l1_1_2/W.npy"].data); - conv1_2.set_param(params["l1_2/W.npy"].data); - bn1.set_param(params["norm1/gamma.npy"].data, params["norm1/beta.npy"].data, params["norm1/avg_mean.npy"].data, params["norm1/avg_var.npy"].data); - conv2.set_param(params["l2/W.npy"].data); - bn2.set_param(params["norm2/gamma.npy"].data, params["norm2/beta.npy"].data, params["norm2/avg_mean.npy"].data, params["norm2/avg_var.npy"].data); - conv3.set_param(params["l3/W.npy"].data); - bn3.set_param(params["norm3/gamma.npy"].data, params["norm3/beta.npy"].data, params["norm3/avg_mean.npy"].data, params["norm3/avg_var.npy"].data); - se3_l1.set_param(params["se3/l1/W.npy"].data); - se3_l2.set_param(params["se3/l2/W.npy"].data); - conv4.set_param(params["l4/W.npy"].data); - bn4.set_param(params["norm4/gamma.npy"].data, params["norm4/beta.npy"].data, params["norm4/avg_mean.npy"].data, params["norm4/avg_var.npy"].data); - conv5.set_param(params["l5/W.npy"].data); - bn5.set_param(params["norm5/gamma.npy"].data, params["norm5/beta.npy"].data, params["norm5/avg_mean.npy"].data, params["norm5/avg_var.npy"].data); - se5_l1.set_param(params["se5/l1/W.npy"].data); - se5_l2.set_param(params["se5/l2/W.npy"].data); - conv6.set_param(params["l6/W.npy"].data); - bn6.set_param(params["norm6/gamma.npy"].data, params["norm6/beta.npy"].data, params["norm6/avg_mean.npy"].data, params["norm6/avg_var.npy"].data); - conv7.set_param(params["l7/W.npy"].data); - bn7.set_param(params["norm7/gamma.npy"].data, params["norm7/beta.npy"].data, params["norm7/avg_mean.npy"].data, params["norm7/avg_var.npy"].data); - se7_l1.set_param(params["se7/l1/W.npy"].data); - se7_l2.set_param(params["se7/l2/W.npy"].data); - conv8.set_param(params["l8/W.npy"].data); - bn8.set_param(params["norm8/gamma.npy"].data, params["norm8/beta.npy"].data, params["norm8/avg_mean.npy"].data, params["norm8/avg_var.npy"].data); - conv9.set_param(params["l9/W.npy"].data); - bn9.set_param(params["norm9/gamma.npy"].data, params["norm9/beta.npy"].data, params["norm9/avg_mean.npy"].data, params["norm9/avg_var.npy"].data); - se9_l1.set_param(params["se9/l1/W.npy"].data); - se9_l2.set_param(params["se9/l2/W.npy"].data); - conv10.set_param(params["l10/W.npy"].data); - bn10.set_param(params["norm10/gamma.npy"].data, params["norm10/beta.npy"].data, params["norm10/avg_mean.npy"].data, params["norm10/avg_var.npy"].data); - conv11.set_param(params["l11/W.npy"].data); - bn11.set_param(params["norm11/gamma.npy"].data, params["norm11/beta.npy"].data, params["norm11/avg_mean.npy"].data, params["norm11/avg_var.npy"].data); - se11_l1.set_param(params["se11/l1/W.npy"].data); - se11_l2.set_param(params["se11/l2/W.npy"].data); - conv12.set_param(params["l12/W.npy"].data); - bn12.set_param(params["norm12/gamma.npy"].data, params["norm12/beta.npy"].data, params["norm12/avg_mean.npy"].data, params["norm12/avg_var.npy"].data); - conv13.set_param(params["l13/W.npy"].data); - bn13.set_param(params["norm13/gamma.npy"].data, params["norm13/beta.npy"].data, params["norm13/avg_mean.npy"].data, params["norm13/avg_var.npy"].data); - se13_l1.set_param(params["se13/l1/W.npy"].data); - se13_l2.set_param(params["se13/l2/W.npy"].data); - conv14.set_param(params["l14/W.npy"].data); - bn14.set_param(params["norm14/gamma.npy"].data, params["norm14/beta.npy"].data, params["norm14/avg_mean.npy"].data, params["norm14/avg_var.npy"].data); - conv15.set_param(params["l15/W.npy"].data); - bn15.set_param(params["norm15/gamma.npy"].data, params["norm15/beta.npy"].data, params["norm15/avg_mean.npy"].data, params["norm15/avg_var.npy"].data); - se15_l1.set_param(params["se15/l1/W.npy"].data); - se15_l2.set_param(params["se15/l2/W.npy"].data); - conv16.set_param(params["l16/W.npy"].data); - bn16.set_param(params["norm16/gamma.npy"].data, params["norm16/beta.npy"].data, params["norm16/avg_mean.npy"].data, params["norm16/avg_var.npy"].data); - conv17.set_param(params["l17/W.npy"].data); - bn17.set_param(params["norm17/gamma.npy"].data, params["norm17/beta.npy"].data, params["norm17/avg_mean.npy"].data, params["norm17/avg_var.npy"].data); - se17_l1.set_param(params["se17/l1/W.npy"].data); - se17_l2.set_param(params["se17/l2/W.npy"].data); - conv18.set_param(params["l18/W.npy"].data); - bn18.set_param(params["norm18/gamma.npy"].data, params["norm18/beta.npy"].data, params["norm18/avg_mean.npy"].data, params["norm18/avg_var.npy"].data); - conv19.set_param(params["l19/W.npy"].data); - bn19.set_param(params["norm19/gamma.npy"].data, params["norm19/beta.npy"].data, params["norm19/avg_mean.npy"].data, params["norm19/avg_var.npy"].data); - se19_l1.set_param(params["se19/l1/W.npy"].data); - se19_l2.set_param(params["se19/l2/W.npy"].data); - conv20.set_param(params["l20/W.npy"].data); - bn20.set_param(params["norm20/gamma.npy"].data, params["norm20/beta.npy"].data, params["norm20/avg_mean.npy"].data, params["norm20/avg_var.npy"].data); - conv21.set_param(params["l21/W.npy"].data); - bn21.set_param(params["norm21/gamma.npy"].data, params["norm21/beta.npy"].data, params["norm21/avg_mean.npy"].data, params["norm21/avg_var.npy"].data); - se21_l1.set_param(params["se21/l1/W.npy"].data); - se21_l2.set_param(params["se21/l2/W.npy"].data); - conv22.set_param(params["l22/W.npy"].data); - bias22.set_bias(params["l22_2/b.npy"].data); - conv22v.set_param(params["l22_v/W.npy"].data); - bias22v.set_bias(params["l22_v/b.npy"].data); - bn22v.set_param(params["norm22_v/gamma.npy"].data, params["norm22_v/beta.npy"].data, params["norm22_v/avg_mean.npy"].data, params["norm22_v/avg_var.npy"].data); - l23v.set_param(params["l23_v/W.npy"].data); - bias23v.set_bias(params["l23_v/b.npy"].data); - l24v.set_param(params["l24_v/W.npy"].data); - bias24v.set_bias(params["l24_v/b.npy"].data); -} - -void NNSENet10::forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2) -{ - prepare_desc(batch_size); - - // input - checkCudaErrors(cudaMemcpy(x1_dev, x1, sizeof(features1_t) * batch_size, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(x2_dev, x2, sizeof(features2_t) * batch_size, cudaMemcpyHostToDevice)); - - // layer1 - conv1_1_1(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_1_dev); - conv1_1_2(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_2_dev); - conv1_2(cudnnHandle, x2Desc, x2_dev, h1Desc, h1_2_dev); - add(cudnnHandle, h1Desc, h1_1_2_dev, h1_1_1_dev); - add(cudnnHandle, h1Desc, h1_2_dev, h1_1_1_dev); - - // residual block1 - bn1(cudnnHandle, h1Desc, h1_1_1_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv2(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn2(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv3(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h3_dev); - se(se3_l1, se3_l2, batch_size, h3_dev); - add(cudnnHandle, h1Desc, h1_1_1_dev, h3_dev); - - // residual block2 - bn3(cudnnHandle, h1Desc, h3_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv4(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn4(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv5(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h5_dev); - se(se5_l1, se5_l2, batch_size, h5_dev); - add(cudnnHandle, h1Desc, h3_dev, h5_dev); - - // residual block3 - bn5(cudnnHandle, h1Desc, h5_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv6(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn6(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv7(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h7_dev); - se(se7_l1, se7_l2, batch_size, h7_dev); - add(cudnnHandle, h1Desc, h5_dev, h7_dev); - - // residual block4 - bn7(cudnnHandle, h1Desc, h7_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv8(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn8(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv9(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h9_dev); - se(se9_l1, se9_l2, batch_size, h9_dev); - add(cudnnHandle, h1Desc, h7_dev, h9_dev); - - // residual block5 - bn9(cudnnHandle, h1Desc, h9_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv10(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn10(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv11(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h11_dev); - se(se11_l1, se11_l2, batch_size, h11_dev); - add(cudnnHandle, h1Desc, h9_dev, h11_dev); - - // residual block6 - bn11(cudnnHandle, h1Desc, h11_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv12(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn12(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv13(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h13_dev); - se(se13_l1, se13_l2, batch_size, h13_dev); - add(cudnnHandle, h1Desc, h11_dev, h13_dev); - - // residual block7 - bn13(cudnnHandle, h1Desc, h13_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv14(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn14(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv15(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h15_dev); - se(se15_l1, se15_l2, batch_size, h15_dev); - add(cudnnHandle, h1Desc, h13_dev, h15_dev); - - // residual block8 - bn15(cudnnHandle, h1Desc, h15_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv16(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn16(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv17(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h17_dev); - se(se17_l1, se17_l2, batch_size, h17_dev); - add(cudnnHandle, h1Desc, h15_dev, h17_dev); - - // residual block9 - bn17(cudnnHandle, h1Desc, h17_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv18(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn18(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv19(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h19_dev); - se(se19_l1, se19_l2, batch_size, h19_dev); - add(cudnnHandle, h1Desc, h17_dev, h19_dev); - - // residual block10 - bn19(cudnnHandle, h1Desc, h19_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv20(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn20(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv21(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h21_dev); - se(se21_l1, se21_l2, batch_size, h21_dev); - add(cudnnHandle, h1Desc, h19_dev, h21_dev); - - // after residual blocks - bn21(cudnnHandle, h1Desc, h21_dev, h21_bn_dev); - relu(cudnnHandle, h1Desc, h21_bn_dev); - - // policy network - conv22(cudnnHandle, h1Desc, h21_bn_dev, y1Desc, y1_dev); - bias22(cudnnHandle, y1Desc, y1_dev); - - // value network - conv22v(cudnnHandle, h1Desc, h21_bn_dev, h22vDesc, h22v_dev); - bias22v(cudnnHandle, h22vDesc, h22v_dev); - bn22v(cudnnHandle, h22vDesc, h22v_dev, h22v_bn_dev); - relu(cudnnHandle, h22vDesc, h22v_bn_dev); - l23v(cublasHandle, batch_size, h22v_bn_dev, h23v_dev); - bias23v(cudnnHandle, h23vDesc, h23v_dev); - relu(cudnnHandle, h23vDesc, h23v_dev); - l24v(cublasHandle, batch_size, h23v_dev, y2_dev); - bias24v(cudnnHandle, y2Desc, y2_dev); - sigmoid(cudnnHandle, y2Desc, y2_dev); - - // output - checkCudaErrors(cudaMemcpy(y1, y1_dev, conv22.get_ysize(batch_size, 9, 9), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(y2, y2_dev, batch_size * sizeof(DType), cudaMemcpyDeviceToHost)); -} - -void NNSENet10::se(Linear& se_l1, Linear& se_l2, const int &batch_size, DType* x_dev) -{ - averagePooling2D(cudnnHandle, h1Desc, x_dev, se1Desc, se1_dev); - se_l1(cublasHandle, batch_size, se1_dev, se2_dev); - relu(cudnnHandle, se2Desc, se2_dev); - se_l2(cublasHandle, batch_size, se2_dev, se3_dev); - sigmoid(cudnnHandle, se3Desc, se3_dev); - const float alpha = 1.0f; - const float beta = 0.0f; - cudnnOpTensor(cudnnHandle, opMulDesc, &alpha, h1Desc, x_dev, &alpha, se1Desc, se3_dev, &beta, h1Desc, x_dev); -} diff --git a/usi/nn_senet10.h b/usi/nn_senet10.h deleted file mode 100644 index 685fe4f3..00000000 --- a/usi/nn_senet10.h +++ /dev/null @@ -1,158 +0,0 @@ -#pragma once - -#include "nn.h" -#include "layers.h" - -class NNSENet10 : NN { -public: - NNSENet10(const char* filename, const int max_batch_size); - ~NNSENet10(); - - void forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2); - -private: - static constexpr int k = 192; - static constexpr int fcl = 256; - static constexpr int reduction = 16; - - void load_model(const char* filename); - void prepare_desc(const int batch_size); - void se(Linear& se_l1, Linear& se_l2, const int &batch_size, DType* x_dev); - - CudnnHandle cudnnHandle; - CublasHandle cublasHandle; - - const int max_batch_size; - - // input layer - ConvLayer conv1_1_1; - ConvLayer conv1_1_2; - ConvLayer conv1_2; - // residual blocks - BatchNormalization bn1; - ConvLayer conv2; - BatchNormalization bn2; - ConvLayer conv3; - BatchNormalization bn3; - Linear se3_l1; - Linear se3_l2; - ConvLayer conv4; - BatchNormalization bn4; - ConvLayer conv5; - BatchNormalization bn5; - Linear se5_l1; - Linear se5_l2; - ConvLayer conv6; - BatchNormalization bn6; - ConvLayer conv7; - BatchNormalization bn7; - Linear se7_l1; - Linear se7_l2; - ConvLayer conv8; - BatchNormalization bn8; - ConvLayer conv9; - BatchNormalization bn9; - Linear se9_l1; - Linear se9_l2; - ConvLayer conv10; - BatchNormalization bn10; - ConvLayer conv11; - BatchNormalization bn11; - Linear se11_l1; - Linear se11_l2; - ConvLayer conv12; - BatchNormalization bn12; - ConvLayer conv13; - BatchNormalization bn13; - Linear se13_l1; - Linear se13_l2; - ConvLayer conv14; - BatchNormalization bn14; - ConvLayer conv15; - BatchNormalization bn15; - Linear se15_l1; - Linear se15_l2; - ConvLayer conv16; - BatchNormalization bn16; - ConvLayer conv17; - BatchNormalization bn17; - Linear se17_l1; - Linear se17_l2; - ConvLayer conv18; - BatchNormalization bn18; - ConvLayer conv19; - BatchNormalization bn19; - Linear se19_l1; - Linear se19_l2; - ConvLayer conv20; - BatchNormalization bn20; - ConvLayer conv21; - BatchNormalization bn21; - Linear se21_l1; - Linear se21_l2; - // policy network - ConvLayer conv22; - Bias bias22; - // value network - ConvLayer conv22v; - Bias bias22v; - BatchNormalization bn22v; - Linear<9 * 9 * MAX_MOVE_LABEL_NUM, fcl> l23v; - Bias bias23v; - Linear l24v; - Bias<1, 1, 1> bias24v; - - ReLU relu; - Add add; - Sigmoid sigmoid; - AveragePooling2D<9> averagePooling2D; - - CudnnTensorDescriptor x1Desc; - CudnnTensorDescriptor x2Desc; - CudnnTensorDescriptor h1Desc; - CudnnTensorDescriptor se1Desc; - CudnnTensorDescriptor se2Desc; - CudnnTensorDescriptor se3Desc; - CudnnTensorDescriptor h22Desc; - CudnnTensorDescriptor h22vDesc; - CudnnTensorDescriptor h23vDesc; - CudnnTensorDescriptor h24vDesc; - CudnnTensorDescriptor y1Desc; - CudnnTensorDescriptor y2Desc; - - CudnnOpTensorDescriptor opMulDesc; - - // input layer - DType* x1_dev; - DType* x2_dev; - DType* h1_1_1_dev; - DType* h1_1_2_dev; - DType* h1_2_dev; - // residual block - DType* h1_bn_dev; - DType* h2_dev; - DType* h2_bn_dev; - DType* h3_dev; - DType* h5_dev; - DType* h7_dev; - DType* h9_dev; - DType* h11_dev; - DType* h13_dev; - DType* h15_dev; - DType* h17_dev; - DType* h19_dev; - DType* h21_dev; - // se layer - DType* se1_dev; - DType* se2_dev; - DType* se3_dev; - // after residual blocks - DType* h21_bn_dev; - // policy network - DType* y1_dev; - // value network - DType* h22v_dev; - DType* h22v_bn_dev; - DType* h23v_dev; - DType* y2_dev; -}; diff --git a/usi/nn_wideresnet10.cpp b/usi/nn_wideresnet10.cpp deleted file mode 100644 index 14d8cd8f..00000000 --- a/usi/nn_wideresnet10.cpp +++ /dev/null @@ -1,294 +0,0 @@ -#include "nn_wideresnet10.h" -#include "npz.h" - -NNWideResnet10::NNWideResnet10(const char* filename, const int max_batch_size) : max_batch_size(max_batch_size) -{ - prepare_desc(max_batch_size); - - // init conv layers - conv1_1_1.init(cudnnHandle, x1Desc, h1Desc); - conv1_1_2.init(cudnnHandle, x1Desc, h1Desc); - conv1_2.init(cudnnHandle, x2Desc, h1Desc); - conv2.init(cudnnHandle, h1Desc, h1Desc); - conv3.init(cudnnHandle, h1Desc, h1Desc); - conv4.init(cudnnHandle, h1Desc, h1Desc); - conv5.init(cudnnHandle, h1Desc, h1Desc); - conv6.init(cudnnHandle, h1Desc, h1Desc); - conv7.init(cudnnHandle, h1Desc, h1Desc); - conv8.init(cudnnHandle, h1Desc, h1Desc); - conv9.init(cudnnHandle, h1Desc, h1Desc); - conv10.init(cudnnHandle, h1Desc, h1Desc); - conv11.init(cudnnHandle, h1Desc, h1Desc); - conv12.init(cudnnHandle, h1Desc, h1Desc); - conv13.init(cudnnHandle, h1Desc, h1Desc); - conv14.init(cudnnHandle, h1Desc, h1Desc); - conv15.init(cudnnHandle, h1Desc, h1Desc); - conv16.init(cudnnHandle, h1Desc, h1Desc); - conv17.init(cudnnHandle, h1Desc, h1Desc); - conv18.init(cudnnHandle, h1Desc, h1Desc); - conv19.init(cudnnHandle, h1Desc, h1Desc); - conv20.init(cudnnHandle, h1Desc, h1Desc); - conv21.init(cudnnHandle, h1Desc, h1Desc); - conv22.init(cudnnHandle, h1Desc, y1Desc); - conv22v.init(cudnnHandle, h1Desc, h22vDesc); - - // malloc - checkCudaErrors(cudaMalloc((void**)&x1_dev, conv1_1_1.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&x2_dev, conv1_2.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_1_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_2_dev, conv1_1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_2_dev, conv1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_bn_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_bn_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h3_dev, conv3.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h5_dev, conv5.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h7_dev, conv7.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h9_dev, conv9.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h11_dev, conv11.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h13_dev, conv13.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h15_dev, conv15.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h17_dev, conv17.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h19_dev, conv19.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_bn_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&y1_dev, conv22.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h22v_bn_dev, conv22v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h23v_dev, max_batch_size * fcl * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&y2_dev, max_batch_size * sizeof(DType))); - - load_model(filename); -} - -NNWideResnet10::~NNWideResnet10() { - checkCudaErrors(cudaFree(x1_dev)); - checkCudaErrors(cudaFree(x2_dev)); - checkCudaErrors(cudaFree(h1_1_1_dev)); - checkCudaErrors(cudaFree(h1_1_2_dev)); - checkCudaErrors(cudaFree(h1_2_dev)); - checkCudaErrors(cudaFree(h1_bn_dev)); - checkCudaErrors(cudaFree(h2_dev)); - checkCudaErrors(cudaFree(h2_bn_dev)); - checkCudaErrors(cudaFree(h3_dev)); - checkCudaErrors(cudaFree(h5_dev)); - checkCudaErrors(cudaFree(h7_dev)); - checkCudaErrors(cudaFree(h9_dev)); - checkCudaErrors(cudaFree(h11_dev)); - checkCudaErrors(cudaFree(h13_dev)); - checkCudaErrors(cudaFree(h15_dev)); - checkCudaErrors(cudaFree(h17_dev)); - checkCudaErrors(cudaFree(h19_dev)); - checkCudaErrors(cudaFree(h21_dev)); - checkCudaErrors(cudaFree(h21_bn_dev)); - checkCudaErrors(cudaFree(y1_dev)); - checkCudaErrors(cudaFree(h22v_dev)); - checkCudaErrors(cudaFree(h22v_bn_dev)); - checkCudaErrors(cudaFree(h23v_dev)); - checkCudaErrors(cudaFree(y2_dev)); -} - -void NNWideResnet10::prepare_desc(const int batch_size) -{ - conv1_1_1.get_xdesc(x1Desc, batch_size, 9, 9); - conv1_2.get_xdesc(x2Desc, batch_size, 9, 9); - conv1_1_1.get_ydesc(h1Desc, batch_size, 9, 9); - - conv22.get_ydesc(y1Desc, batch_size, 9, 9); - - conv22v.get_ydesc(h22vDesc, batch_size, 9, 9); - l23v.get_ydesc(h23vDesc, batch_size); - l24v.get_ydesc(y2Desc, batch_size); -} - -void NNWideResnet10::load_model(const char* filepath) -{ - // load nn params - ParamMap params; - load_npz(filepath, params); - - conv1_1_1.set_param(params["l1_1_1/W.npy"].data); - conv1_1_2.set_param(params["l1_1_2/W.npy"].data); - conv1_2.set_param(params["l1_2/W.npy"].data); - bn1.set_param(params["norm1/gamma.npy"].data, params["norm1/beta.npy"].data, params["norm1/avg_mean.npy"].data, params["norm1/avg_var.npy"].data); - conv2.set_param(params["l2/W.npy"].data); - bn2.set_param(params["norm2/gamma.npy"].data, params["norm2/beta.npy"].data, params["norm2/avg_mean.npy"].data, params["norm2/avg_var.npy"].data); - conv3.set_param(params["l3/W.npy"].data); - bn3.set_param(params["norm3/gamma.npy"].data, params["norm3/beta.npy"].data, params["norm3/avg_mean.npy"].data, params["norm3/avg_var.npy"].data); - conv4.set_param(params["l4/W.npy"].data); - bn4.set_param(params["norm4/gamma.npy"].data, params["norm4/beta.npy"].data, params["norm4/avg_mean.npy"].data, params["norm4/avg_var.npy"].data); - conv5.set_param(params["l5/W.npy"].data); - bn5.set_param(params["norm5/gamma.npy"].data, params["norm5/beta.npy"].data, params["norm5/avg_mean.npy"].data, params["norm5/avg_var.npy"].data); - conv6.set_param(params["l6/W.npy"].data); - bn6.set_param(params["norm6/gamma.npy"].data, params["norm6/beta.npy"].data, params["norm6/avg_mean.npy"].data, params["norm6/avg_var.npy"].data); - conv7.set_param(params["l7/W.npy"].data); - bn7.set_param(params["norm7/gamma.npy"].data, params["norm7/beta.npy"].data, params["norm7/avg_mean.npy"].data, params["norm7/avg_var.npy"].data); - conv8.set_param(params["l8/W.npy"].data); - bn8.set_param(params["norm8/gamma.npy"].data, params["norm8/beta.npy"].data, params["norm8/avg_mean.npy"].data, params["norm8/avg_var.npy"].data); - conv9.set_param(params["l9/W.npy"].data); - bn9.set_param(params["norm9/gamma.npy"].data, params["norm9/beta.npy"].data, params["norm9/avg_mean.npy"].data, params["norm9/avg_var.npy"].data); - conv10.set_param(params["l10/W.npy"].data); - bn10.set_param(params["norm10/gamma.npy"].data, params["norm10/beta.npy"].data, params["norm10/avg_mean.npy"].data, params["norm10/avg_var.npy"].data); - conv11.set_param(params["l11/W.npy"].data); - bn11.set_param(params["norm11/gamma.npy"].data, params["norm11/beta.npy"].data, params["norm11/avg_mean.npy"].data, params["norm11/avg_var.npy"].data); - conv12.set_param(params["l12/W.npy"].data); - bn12.set_param(params["norm12/gamma.npy"].data, params["norm12/beta.npy"].data, params["norm12/avg_mean.npy"].data, params["norm12/avg_var.npy"].data); - conv13.set_param(params["l13/W.npy"].data); - bn13.set_param(params["norm13/gamma.npy"].data, params["norm13/beta.npy"].data, params["norm13/avg_mean.npy"].data, params["norm13/avg_var.npy"].data); - conv14.set_param(params["l14/W.npy"].data); - bn14.set_param(params["norm14/gamma.npy"].data, params["norm14/beta.npy"].data, params["norm14/avg_mean.npy"].data, params["norm14/avg_var.npy"].data); - conv15.set_param(params["l15/W.npy"].data); - bn15.set_param(params["norm15/gamma.npy"].data, params["norm15/beta.npy"].data, params["norm15/avg_mean.npy"].data, params["norm15/avg_var.npy"].data); - conv16.set_param(params["l16/W.npy"].data); - bn16.set_param(params["norm16/gamma.npy"].data, params["norm16/beta.npy"].data, params["norm16/avg_mean.npy"].data, params["norm16/avg_var.npy"].data); - conv17.set_param(params["l17/W.npy"].data); - bn17.set_param(params["norm17/gamma.npy"].data, params["norm17/beta.npy"].data, params["norm17/avg_mean.npy"].data, params["norm17/avg_var.npy"].data); - conv18.set_param(params["l18/W.npy"].data); - bn18.set_param(params["norm18/gamma.npy"].data, params["norm18/beta.npy"].data, params["norm18/avg_mean.npy"].data, params["norm18/avg_var.npy"].data); - conv19.set_param(params["l19/W.npy"].data); - bn19.set_param(params["norm19/gamma.npy"].data, params["norm19/beta.npy"].data, params["norm19/avg_mean.npy"].data, params["norm19/avg_var.npy"].data); - conv20.set_param(params["l20/W.npy"].data); - bn20.set_param(params["norm20/gamma.npy"].data, params["norm20/beta.npy"].data, params["norm20/avg_mean.npy"].data, params["norm20/avg_var.npy"].data); - conv21.set_param(params["l21/W.npy"].data); - bn21.set_param(params["norm21/gamma.npy"].data, params["norm21/beta.npy"].data, params["norm21/avg_mean.npy"].data, params["norm21/avg_var.npy"].data); - conv22.set_param(params["l22/W.npy"].data); - bias22.set_bias(params["l22_2/b.npy"].data); - conv22v.set_param(params["l22_v/W.npy"].data); - bias22v.set_bias(params["l22_v/b.npy"].data); - bn22v.set_param(params["norm22_v/gamma.npy"].data, params["norm22_v/beta.npy"].data, params["norm22_v/avg_mean.npy"].data, params["norm22_v/avg_var.npy"].data); - l23v.set_param(params["l23_v/W.npy"].data); - bias23v.set_bias(params["l23_v/b.npy"].data); - l24v.set_param(params["l24_v/W.npy"].data); - bias24v.set_bias(params["l24_v/b.npy"].data); -} - -void NNWideResnet10::forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2) -{ - prepare_desc(batch_size); - - // input - checkCudaErrors(cudaMemcpy(x1_dev, x1, sizeof(features1_t) * batch_size, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(x2_dev, x2, sizeof(features2_t) * batch_size, cudaMemcpyHostToDevice)); - - // layer1 - conv1_1_1(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_1_dev); - conv1_1_2(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_2_dev); - conv1_2(cudnnHandle, x2Desc, x2_dev, h1Desc, h1_2_dev); - add(cudnnHandle, h1Desc, h1_1_2_dev, h1_1_1_dev); - add(cudnnHandle, h1Desc, h1_2_dev, h1_1_1_dev); - - // residual block1 - bn1(cudnnHandle, h1Desc, h1_1_1_dev, h1_bn_dev); - - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv2(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn2(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv3(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h3_dev); - add(cudnnHandle, h1Desc, h1_1_1_dev, h3_dev); - - // residual block2 - bn3(cudnnHandle, h1Desc, h3_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv4(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn4(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv5(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h5_dev); - add(cudnnHandle, h1Desc, h3_dev, h5_dev); - - // residual block3 - bn5(cudnnHandle, h1Desc, h5_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv6(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn6(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv7(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h7_dev); - add(cudnnHandle, h1Desc, h5_dev, h7_dev); - - // residual block4 - bn7(cudnnHandle, h1Desc, h7_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv8(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn8(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv9(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h9_dev); - add(cudnnHandle, h1Desc, h7_dev, h9_dev); - - // residual block5 - bn9(cudnnHandle, h1Desc, h9_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv10(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn10(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv11(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h11_dev); - add(cudnnHandle, h1Desc, h9_dev, h11_dev); - - // residual block6 - bn11(cudnnHandle, h1Desc, h11_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv12(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn12(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv13(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h13_dev); - add(cudnnHandle, h1Desc, h11_dev, h13_dev); - - // residual block7 - bn13(cudnnHandle, h1Desc, h13_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv14(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn14(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv15(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h15_dev); - add(cudnnHandle, h1Desc, h13_dev, h15_dev); - - // residual block8 - bn15(cudnnHandle, h1Desc, h15_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv16(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn16(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv17(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h17_dev); - add(cudnnHandle, h1Desc, h15_dev, h17_dev); - - // residual block9 - bn17(cudnnHandle, h1Desc, h17_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv18(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn18(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv19(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h19_dev); - add(cudnnHandle, h1Desc, h17_dev, h19_dev); - - // residual block10 - bn19(cudnnHandle, h1Desc, h19_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv20(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn20(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv21(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h21_dev); - add(cudnnHandle, h1Desc, h19_dev, h21_dev); - - // after residual blocks - bn21(cudnnHandle, h1Desc, h21_dev, h21_bn_dev); - relu(cudnnHandle, h1Desc, h21_bn_dev); - - // policy network - conv22(cudnnHandle, h1Desc, h21_bn_dev, y1Desc, y1_dev); - bias22(cudnnHandle, y1Desc, y1_dev); - - // value network - conv22v(cudnnHandle, h1Desc, h21_bn_dev, h22vDesc, h22v_dev); - bias22v(cudnnHandle, h22vDesc, h22v_dev); - bn22v(cudnnHandle, h22vDesc, h22v_dev, h22v_bn_dev); - relu(cudnnHandle, h22vDesc, h22v_bn_dev); - l23v(cublasHandle, batch_size, h22v_bn_dev, h23v_dev); - bias23v(cudnnHandle, h23vDesc, h23v_dev); - relu(cudnnHandle, h23vDesc, h23v_dev); - l24v(cublasHandle, batch_size, h23v_dev, y2_dev); - bias24v(cudnnHandle, y2Desc, y2_dev); - sigmoid(cudnnHandle, y2Desc, y2_dev); - - // output - checkCudaErrors(cudaMemcpy(y1, y1_dev, conv22.get_ysize(batch_size, 9, 9), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(y2, y2_dev, batch_size * sizeof(DType), cudaMemcpyDeviceToHost)); -} diff --git a/usi/nn_wideresnet10.h b/usi/nn_wideresnet10.h deleted file mode 100644 index b1deec4a..00000000 --- a/usi/nn_wideresnet10.h +++ /dev/null @@ -1,125 +0,0 @@ -#pragma once - -#include "nn.h" -#include "layers.h" - -class NNWideResnet10 : NN { -public: - NNWideResnet10(const char* filename, const int max_batch_size); - ~NNWideResnet10(); - - void forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2); - -private: - void load_model(const char* filename); - void prepare_desc(const int batch_size); - - CudnnHandle cudnnHandle; - CublasHandle cublasHandle; - static constexpr int k = 192; - static constexpr int fcl = 256; - - const int max_batch_size; - - // input layer - ConvLayer conv1_1_1; - ConvLayer conv1_1_2; - ConvLayer conv1_2; - // residual blocks - BatchNormalization bn1; - ConvLayer conv2; - BatchNormalization bn2; - ConvLayer conv3; - BatchNormalization bn3; - ConvLayer conv4; - BatchNormalization bn4; - ConvLayer conv5; - BatchNormalization bn5; - ConvLayer conv6; - BatchNormalization bn6; - ConvLayer conv7; - BatchNormalization bn7; - ConvLayer conv8; - BatchNormalization bn8; - ConvLayer conv9; - BatchNormalization bn9; - ConvLayer conv10; - BatchNormalization bn10; - ConvLayer conv11; - BatchNormalization bn11; - ConvLayer conv12; - BatchNormalization bn12; - ConvLayer conv13; - BatchNormalization bn13; - ConvLayer conv14; - BatchNormalization bn14; - ConvLayer conv15; - BatchNormalization bn15; - ConvLayer conv16; - BatchNormalization bn16; - ConvLayer conv17; - BatchNormalization bn17; - ConvLayer conv18; - BatchNormalization bn18; - ConvLayer conv19; - BatchNormalization bn19; - ConvLayer conv20; - BatchNormalization bn20; - ConvLayer conv21; - BatchNormalization bn21; - // policy network - ConvLayer conv22; - Bias bias22; - // value network - ConvLayer conv22v; - Bias bias22v; - BatchNormalization bn22v; - Linear<9 * 9 * MAX_MOVE_LABEL_NUM, fcl> l23v; - Bias bias23v; - Linear l24v; - Bias<1, 1, 1> bias24v; - - ReLU relu; - Add add; - Sigmoid sigmoid; - - CudnnTensorDescriptor x1Desc; - CudnnTensorDescriptor x2Desc; - CudnnTensorDescriptor h1Desc; - CudnnTensorDescriptor h22Desc; - CudnnTensorDescriptor h22vDesc; - CudnnTensorDescriptor h23vDesc; - CudnnTensorDescriptor h24vDesc; - CudnnTensorDescriptor y1Desc; - CudnnTensorDescriptor y2Desc; - - // input layer - DType* x1_dev; - DType* x2_dev; - DType* h1_1_1_dev; - DType* h1_1_2_dev; - DType* h1_2_dev; - // residual block - DType* h1_bn_dev; - DType* h2_dev; - DType* h2_bn_dev; - DType* h3_dev; - DType* h5_dev; - DType* h7_dev; - DType* h9_dev; - DType* h11_dev; - DType* h13_dev; - DType* h15_dev; - DType* h17_dev; - DType* h19_dev; - DType* h21_dev; - // after residual blocks - DType* h21_bn_dev; - // policy network - DType* y1_dev; - // value network - DType* h22v_dev; - DType* h22v_bn_dev; - DType* h23v_dev; - DType* y2_dev; -}; \ No newline at end of file diff --git a/usi/nn_wideresnet15.cpp b/usi/nn_wideresnet15.cpp deleted file mode 100644 index 63671888..00000000 --- a/usi/nn_wideresnet15.cpp +++ /dev/null @@ -1,380 +0,0 @@ -#include "nn_wideresnet15.h" -#include "npz.h" - -NNWideResnet15::NNWideResnet15(const char* filename, const int max_batch_size) : max_batch_size(max_batch_size) -{ - prepare_desc(max_batch_size); - - // init conv layers - conv1_1_1.init(cudnnHandle, x1Desc, h1Desc); - conv1_1_2.init(cudnnHandle, x1Desc, h1Desc); - conv1_2.init(cudnnHandle, x2Desc, h1Desc); - conv2.init(cudnnHandle, h1Desc, h1Desc); - conv3.init(cudnnHandle, h1Desc, h1Desc); - conv4.init(cudnnHandle, h1Desc, h1Desc); - conv5.init(cudnnHandle, h1Desc, h1Desc); - conv6.init(cudnnHandle, h1Desc, h1Desc); - conv7.init(cudnnHandle, h1Desc, h1Desc); - conv8.init(cudnnHandle, h1Desc, h1Desc); - conv9.init(cudnnHandle, h1Desc, h1Desc); - conv10.init(cudnnHandle, h1Desc, h1Desc); - conv11.init(cudnnHandle, h1Desc, h1Desc); - conv12.init(cudnnHandle, h1Desc, h1Desc); - conv13.init(cudnnHandle, h1Desc, h1Desc); - conv14.init(cudnnHandle, h1Desc, h1Desc); - conv15.init(cudnnHandle, h1Desc, h1Desc); - conv16.init(cudnnHandle, h1Desc, h1Desc); - conv17.init(cudnnHandle, h1Desc, h1Desc); - conv18.init(cudnnHandle, h1Desc, h1Desc); - conv19.init(cudnnHandle, h1Desc, h1Desc); - conv20.init(cudnnHandle, h1Desc, h1Desc); - conv21.init(cudnnHandle, h1Desc, h1Desc); - conv22.init(cudnnHandle, h1Desc, h1Desc); - conv23.init(cudnnHandle, h1Desc, h1Desc); - conv24.init(cudnnHandle, h1Desc, h1Desc); - conv25.init(cudnnHandle, h1Desc, h1Desc); - conv26.init(cudnnHandle, h1Desc, h1Desc); - conv27.init(cudnnHandle, h1Desc, h1Desc); - conv28.init(cudnnHandle, h1Desc, h1Desc); - conv29.init(cudnnHandle, h1Desc, h1Desc); - conv30.init(cudnnHandle, h1Desc, h1Desc); - conv31.init(cudnnHandle, h1Desc, h1Desc); - conv32.init(cudnnHandle, h1Desc, y1Desc); - conv32v.init(cudnnHandle, h1Desc, h32vDesc); - - // malloc - checkCudaErrors(cudaMalloc((void**)&x1_dev, conv1_1_1.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&x2_dev, conv1_2.get_xsize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_1_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_1_2_dev, conv1_1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_2_dev, conv1_2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h1_bn_dev, conv1_1_1.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h2_bn_dev, conv2.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h3_dev, conv3.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h5_dev, conv5.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h7_dev, conv7.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h9_dev, conv9.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h11_dev, conv11.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h13_dev, conv13.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h15_dev, conv15.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h17_dev, conv17.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h19_dev, conv19.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h21_dev, conv21.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h23_dev, conv23.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h25_dev, conv25.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h27_dev, conv27.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h29_dev, conv29.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h31_dev, conv31.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h31_bn_dev, conv31.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&y1_dev, conv32.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h32v_dev, conv32v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h32v_bn_dev, conv32v.get_ysize(max_batch_size, 9, 9))); - checkCudaErrors(cudaMalloc((void**)&h33v_dev, max_batch_size * fcl * sizeof(DType))); - checkCudaErrors(cudaMalloc((void**)&y2_dev, max_batch_size * sizeof(DType))); - - load_model(filename); -} - -NNWideResnet15::~NNWideResnet15() { - checkCudaErrors(cudaFree(x1_dev)); - checkCudaErrors(cudaFree(x2_dev)); - checkCudaErrors(cudaFree(h1_1_1_dev)); - checkCudaErrors(cudaFree(h1_1_2_dev)); - checkCudaErrors(cudaFree(h1_2_dev)); - checkCudaErrors(cudaFree(h1_bn_dev)); - checkCudaErrors(cudaFree(h2_dev)); - checkCudaErrors(cudaFree(h2_bn_dev)); - checkCudaErrors(cudaFree(h3_dev)); - checkCudaErrors(cudaFree(h5_dev)); - checkCudaErrors(cudaFree(h7_dev)); - checkCudaErrors(cudaFree(h9_dev)); - checkCudaErrors(cudaFree(h11_dev)); - checkCudaErrors(cudaFree(h13_dev)); - checkCudaErrors(cudaFree(h15_dev)); - checkCudaErrors(cudaFree(h17_dev)); - checkCudaErrors(cudaFree(h19_dev)); - checkCudaErrors(cudaFree(h21_dev)); - checkCudaErrors(cudaFree(h23_dev)); - checkCudaErrors(cudaFree(h25_dev)); - checkCudaErrors(cudaFree(h27_dev)); - checkCudaErrors(cudaFree(h29_dev)); - checkCudaErrors(cudaFree(h31_dev)); - checkCudaErrors(cudaFree(h31_bn_dev)); - checkCudaErrors(cudaFree(y1_dev)); - checkCudaErrors(cudaFree(h32v_dev)); - checkCudaErrors(cudaFree(h32v_bn_dev)); - checkCudaErrors(cudaFree(h33v_dev)); - checkCudaErrors(cudaFree(y2_dev)); -} - -void NNWideResnet15::prepare_desc(const int batch_size) -{ - conv1_1_1.get_xdesc(x1Desc, batch_size, 9, 9); - conv1_2.get_xdesc(x2Desc, batch_size, 9, 9); - conv1_1_1.get_ydesc(h1Desc, batch_size, 9, 9); - - conv32.get_ydesc(y1Desc, batch_size, 9, 9); - - conv32v.get_ydesc(h32vDesc, batch_size, 9, 9); - l33v.get_ydesc(h33vDesc, batch_size); - l34v.get_ydesc(y2Desc, batch_size); -} - -void NNWideResnet15::load_model(const char* filepath) -{ - // load nn params - ParamMap params; - load_npz(filepath, params); - - conv1_1_1.set_param(params["l1_1_1/W.npy"].data); - conv1_1_2.set_param(params["l1_1_2/W.npy"].data); - conv1_2.set_param(params["l1_2/W.npy"].data); - bn1.set_param(params["norm1/gamma.npy"].data, params["norm1/beta.npy"].data, params["norm1/avg_mean.npy"].data, params["norm1/avg_var.npy"].data); - conv2.set_param(params["l2/W.npy"].data); - bn2.set_param(params["norm2/gamma.npy"].data, params["norm2/beta.npy"].data, params["norm2/avg_mean.npy"].data, params["norm2/avg_var.npy"].data); - conv3.set_param(params["l3/W.npy"].data); - bn3.set_param(params["norm3/gamma.npy"].data, params["norm3/beta.npy"].data, params["norm3/avg_mean.npy"].data, params["norm3/avg_var.npy"].data); - conv4.set_param(params["l4/W.npy"].data); - bn4.set_param(params["norm4/gamma.npy"].data, params["norm4/beta.npy"].data, params["norm4/avg_mean.npy"].data, params["norm4/avg_var.npy"].data); - conv5.set_param(params["l5/W.npy"].data); - bn5.set_param(params["norm5/gamma.npy"].data, params["norm5/beta.npy"].data, params["norm5/avg_mean.npy"].data, params["norm5/avg_var.npy"].data); - conv6.set_param(params["l6/W.npy"].data); - bn6.set_param(params["norm6/gamma.npy"].data, params["norm6/beta.npy"].data, params["norm6/avg_mean.npy"].data, params["norm6/avg_var.npy"].data); - conv7.set_param(params["l7/W.npy"].data); - bn7.set_param(params["norm7/gamma.npy"].data, params["norm7/beta.npy"].data, params["norm7/avg_mean.npy"].data, params["norm7/avg_var.npy"].data); - conv8.set_param(params["l8/W.npy"].data); - bn8.set_param(params["norm8/gamma.npy"].data, params["norm8/beta.npy"].data, params["norm8/avg_mean.npy"].data, params["norm8/avg_var.npy"].data); - conv9.set_param(params["l9/W.npy"].data); - bn9.set_param(params["norm9/gamma.npy"].data, params["norm9/beta.npy"].data, params["norm9/avg_mean.npy"].data, params["norm9/avg_var.npy"].data); - conv10.set_param(params["l10/W.npy"].data); - bn10.set_param(params["norm10/gamma.npy"].data, params["norm10/beta.npy"].data, params["norm10/avg_mean.npy"].data, params["norm10/avg_var.npy"].data); - conv11.set_param(params["l11/W.npy"].data); - bn11.set_param(params["norm11/gamma.npy"].data, params["norm11/beta.npy"].data, params["norm11/avg_mean.npy"].data, params["norm11/avg_var.npy"].data); - conv12.set_param(params["l12/W.npy"].data); - bn12.set_param(params["norm12/gamma.npy"].data, params["norm12/beta.npy"].data, params["norm12/avg_mean.npy"].data, params["norm12/avg_var.npy"].data); - conv13.set_param(params["l13/W.npy"].data); - bn13.set_param(params["norm13/gamma.npy"].data, params["norm13/beta.npy"].data, params["norm13/avg_mean.npy"].data, params["norm13/avg_var.npy"].data); - conv14.set_param(params["l14/W.npy"].data); - bn14.set_param(params["norm14/gamma.npy"].data, params["norm14/beta.npy"].data, params["norm14/avg_mean.npy"].data, params["norm14/avg_var.npy"].data); - conv15.set_param(params["l15/W.npy"].data); - bn15.set_param(params["norm15/gamma.npy"].data, params["norm15/beta.npy"].data, params["norm15/avg_mean.npy"].data, params["norm15/avg_var.npy"].data); - conv16.set_param(params["l16/W.npy"].data); - bn16.set_param(params["norm16/gamma.npy"].data, params["norm16/beta.npy"].data, params["norm16/avg_mean.npy"].data, params["norm16/avg_var.npy"].data); - conv17.set_param(params["l17/W.npy"].data); - bn17.set_param(params["norm17/gamma.npy"].data, params["norm17/beta.npy"].data, params["norm17/avg_mean.npy"].data, params["norm17/avg_var.npy"].data); - conv18.set_param(params["l18/W.npy"].data); - bn18.set_param(params["norm18/gamma.npy"].data, params["norm18/beta.npy"].data, params["norm18/avg_mean.npy"].data, params["norm18/avg_var.npy"].data); - conv19.set_param(params["l19/W.npy"].data); - bn19.set_param(params["norm19/gamma.npy"].data, params["norm19/beta.npy"].data, params["norm19/avg_mean.npy"].data, params["norm19/avg_var.npy"].data); - conv20.set_param(params["l20/W.npy"].data); - bn20.set_param(params["norm20/gamma.npy"].data, params["norm20/beta.npy"].data, params["norm20/avg_mean.npy"].data, params["norm20/avg_var.npy"].data); - conv21.set_param(params["l21/W.npy"].data); - bn21.set_param(params["norm21/gamma.npy"].data, params["norm21/beta.npy"].data, params["norm21/avg_mean.npy"].data, params["norm21/avg_var.npy"].data); - conv22.set_param(params["l22/W.npy"].data); - bn22.set_param(params["norm22/gamma.npy"].data, params["norm22/beta.npy"].data, params["norm22/avg_mean.npy"].data, params["norm22/avg_var.npy"].data); - conv23.set_param(params["l23/W.npy"].data); - bn23.set_param(params["norm23/gamma.npy"].data, params["norm23/beta.npy"].data, params["norm23/avg_mean.npy"].data, params["norm23/avg_var.npy"].data); - conv24.set_param(params["l24/W.npy"].data); - bn24.set_param(params["norm24/gamma.npy"].data, params["norm24/beta.npy"].data, params["norm24/avg_mean.npy"].data, params["norm24/avg_var.npy"].data); - conv25.set_param(params["l25/W.npy"].data); - bn25.set_param(params["norm25/gamma.npy"].data, params["norm25/beta.npy"].data, params["norm25/avg_mean.npy"].data, params["norm25/avg_var.npy"].data); - conv26.set_param(params["l26/W.npy"].data); - bn26.set_param(params["norm26/gamma.npy"].data, params["norm26/beta.npy"].data, params["norm26/avg_mean.npy"].data, params["norm26/avg_var.npy"].data); - conv27.set_param(params["l27/W.npy"].data); - bn27.set_param(params["norm27/gamma.npy"].data, params["norm27/beta.npy"].data, params["norm27/avg_mean.npy"].data, params["norm27/avg_var.npy"].data); - conv28.set_param(params["l28/W.npy"].data); - bn28.set_param(params["norm28/gamma.npy"].data, params["norm28/beta.npy"].data, params["norm28/avg_mean.npy"].data, params["norm28/avg_var.npy"].data); - conv29.set_param(params["l29/W.npy"].data); - bn29.set_param(params["norm29/gamma.npy"].data, params["norm29/beta.npy"].data, params["norm29/avg_mean.npy"].data, params["norm29/avg_var.npy"].data); - conv30.set_param(params["l30/W.npy"].data); - bn30.set_param(params["norm30/gamma.npy"].data, params["norm30/beta.npy"].data, params["norm30/avg_mean.npy"].data, params["norm30/avg_var.npy"].data); - conv31.set_param(params["l31/W.npy"].data); - bn31.set_param(params["norm31/gamma.npy"].data, params["norm31/beta.npy"].data, params["norm31/avg_mean.npy"].data, params["norm31/avg_var.npy"].data); - conv32.set_param(params["l32/W.npy"].data); - bias32.set_bias(params["l32_2/b.npy"].data); - conv32v.set_param(params["l32_v/W.npy"].data); - bias32v.set_bias(params["l32_v/b.npy"].data); - bn32v.set_param(params["norm32_v/gamma.npy"].data, params["norm32_v/beta.npy"].data, params["norm32_v/avg_mean.npy"].data, params["norm32_v/avg_var.npy"].data); - l33v.set_param(params["l33_v/W.npy"].data); - bias33v.set_bias(params["l33_v/b.npy"].data); - l34v.set_param(params["l34_v/W.npy"].data); - bias34v.set_bias(params["l34_v/b.npy"].data); -} - -void NNWideResnet15::forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2) -{ - prepare_desc(batch_size); - - // input - checkCudaErrors(cudaMemcpy(x1_dev, x1, sizeof(features1_t) * batch_size, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(x2_dev, x2, sizeof(features2_t) * batch_size, cudaMemcpyHostToDevice)); - - // layer1 - conv1_1_1(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_1_dev); - conv1_1_2(cudnnHandle, x1Desc, x1_dev, h1Desc, h1_1_2_dev); - conv1_2(cudnnHandle, x2Desc, x2_dev, h1Desc, h1_2_dev); - add(cudnnHandle, h1Desc, h1_1_2_dev, h1_1_1_dev); - add(cudnnHandle, h1Desc, h1_2_dev, h1_1_1_dev); - - // residual block1 - bn1(cudnnHandle, h1Desc, h1_1_1_dev, h1_bn_dev); - - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv2(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn2(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv3(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h3_dev); - add(cudnnHandle, h1Desc, h1_1_1_dev, h3_dev); - - // residual block2 - bn3(cudnnHandle, h1Desc, h3_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv4(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn4(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv5(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h5_dev); - add(cudnnHandle, h1Desc, h3_dev, h5_dev); - - // residual block3 - bn5(cudnnHandle, h1Desc, h5_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv6(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn6(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv7(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h7_dev); - add(cudnnHandle, h1Desc, h5_dev, h7_dev); - - // residual block4 - bn7(cudnnHandle, h1Desc, h7_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv8(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn8(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv9(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h9_dev); - add(cudnnHandle, h1Desc, h7_dev, h9_dev); - - // residual block5 - bn9(cudnnHandle, h1Desc, h9_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv10(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn10(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv11(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h11_dev); - add(cudnnHandle, h1Desc, h9_dev, h11_dev); - - // residual block6 - bn11(cudnnHandle, h1Desc, h11_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv12(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn12(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv13(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h13_dev); - add(cudnnHandle, h1Desc, h11_dev, h13_dev); - - // residual block7 - bn13(cudnnHandle, h1Desc, h13_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv14(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn14(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv15(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h15_dev); - add(cudnnHandle, h1Desc, h13_dev, h15_dev); - - // residual block8 - bn15(cudnnHandle, h1Desc, h15_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv16(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn16(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv17(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h17_dev); - add(cudnnHandle, h1Desc, h15_dev, h17_dev); - - // residual block9 - bn17(cudnnHandle, h1Desc, h17_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv18(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn18(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv19(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h19_dev); - add(cudnnHandle, h1Desc, h17_dev, h19_dev); - - // residual block10 - bn19(cudnnHandle, h1Desc, h19_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv20(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn20(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv21(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h21_dev); - add(cudnnHandle, h1Desc, h19_dev, h21_dev); - - // residual block11 - bn21(cudnnHandle, h1Desc, h21_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv22(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn22(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv23(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h23_dev); - add(cudnnHandle, h1Desc, h21_dev, h23_dev); - - // residual block12 - bn23(cudnnHandle, h1Desc, h23_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv24(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn24(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv25(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h25_dev); - add(cudnnHandle, h1Desc, h23_dev, h25_dev); - - // residual block13 - bn25(cudnnHandle, h1Desc, h25_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv26(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn26(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv27(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h27_dev); - add(cudnnHandle, h1Desc, h25_dev, h27_dev); - - // residual block14 - bn27(cudnnHandle, h1Desc, h27_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv28(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn28(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv29(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h29_dev); - add(cudnnHandle, h1Desc, h27_dev, h29_dev); - - // residual block15 - bn29(cudnnHandle, h1Desc, h29_dev, h1_bn_dev); - relu(cudnnHandle, h1Desc, h1_bn_dev); - conv30(cudnnHandle, h1Desc, h1_bn_dev, h1Desc, h2_dev); - bn30(cudnnHandle, h1Desc, h2_dev, h2_bn_dev); - relu(cudnnHandle, h1Desc, h2_bn_dev); - conv31(cudnnHandle, h1Desc, h2_bn_dev, h1Desc, h31_dev); - add(cudnnHandle, h1Desc, h29_dev, h31_dev); - - // after residual blocks - bn31(cudnnHandle, h1Desc, h31_dev, h31_bn_dev); - relu(cudnnHandle, h1Desc, h31_bn_dev); - - // policy network - conv32(cudnnHandle, h1Desc, h31_bn_dev, y1Desc, y1_dev); - bias32(cudnnHandle, y1Desc, y1_dev); - - // value network - conv32v(cudnnHandle, h1Desc, h31_bn_dev, h32vDesc, h32v_dev); - bias32v(cudnnHandle, h32vDesc, h32v_dev); - bn32v(cudnnHandle, h32vDesc, h32v_dev, h32v_bn_dev); - relu(cudnnHandle, h32vDesc, h32v_bn_dev); - - l33v(cublasHandle, batch_size, h32v_bn_dev, h33v_dev); - bias33v(cudnnHandle, h33vDesc, h33v_dev); - relu(cudnnHandle, h33vDesc, h33v_dev); - l34v(cublasHandle, batch_size, h33v_dev, y2_dev); - bias34v(cudnnHandle, y2Desc, y2_dev); - sigmoid(cudnnHandle, y2Desc, y2_dev); - - // output - checkCudaErrors(cudaMemcpy(y1, y1_dev, conv32.get_ysize(batch_size, 9, 9), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(y2, y2_dev, batch_size * sizeof(DType), cudaMemcpyDeviceToHost)); -} diff --git a/usi/nn_wideresnet15.h b/usi/nn_wideresnet15.h deleted file mode 100644 index 23ce6527..00000000 --- a/usi/nn_wideresnet15.h +++ /dev/null @@ -1,150 +0,0 @@ -#pragma once - -#include "nn.h" -#include "layers.h" - -class NNWideResnet15 : NN { -public: - NNWideResnet15(const char* filename, const int max_batch_size); - ~NNWideResnet15(); - - void forward(const int batch_size, features1_t* x1, features2_t* x2, DType* y1, DType* y2); - -private: - void load_model(const char* filename); - void prepare_desc(const int batch_size); - - CudnnHandle cudnnHandle; - CublasHandle cublasHandle; - static constexpr int k = 192; - static constexpr int fcl = 256; - - const int max_batch_size; - - // input layer - ConvLayer conv1_1_1; - ConvLayer conv1_1_2; - ConvLayer conv1_2; - // residual blocks - BatchNormalization bn1; - ConvLayer conv2; - BatchNormalization bn2; - ConvLayer conv3; - BatchNormalization bn3; - ConvLayer conv4; - BatchNormalization bn4; - ConvLayer conv5; - BatchNormalization bn5; - ConvLayer conv6; - BatchNormalization bn6; - ConvLayer conv7; - BatchNormalization bn7; - ConvLayer conv8; - BatchNormalization bn8; - ConvLayer conv9; - BatchNormalization bn9; - ConvLayer conv10; - BatchNormalization bn10; - ConvLayer conv11; - BatchNormalization bn11; - ConvLayer conv12; - BatchNormalization bn12; - ConvLayer conv13; - BatchNormalization bn13; - ConvLayer conv14; - BatchNormalization bn14; - ConvLayer conv15; - BatchNormalization bn15; - ConvLayer conv16; - BatchNormalization bn16; - ConvLayer conv17; - BatchNormalization bn17; - ConvLayer conv18; - BatchNormalization bn18; - ConvLayer conv19; - BatchNormalization bn19; - ConvLayer conv20; - BatchNormalization bn20; - ConvLayer conv21; - BatchNormalization bn21; - ConvLayer conv22; - BatchNormalization bn22; - ConvLayer conv23; - BatchNormalization bn23; - ConvLayer conv24; - BatchNormalization bn24; - ConvLayer conv25; - BatchNormalization bn25; - ConvLayer conv26; - BatchNormalization bn26; - ConvLayer conv27; - BatchNormalization bn27; - ConvLayer conv28; - BatchNormalization bn28; - ConvLayer conv29; - BatchNormalization bn29; - ConvLayer conv30; - BatchNormalization bn30; - ConvLayer conv31; - BatchNormalization bn31; - // policy network - ConvLayer conv32; - Bias bias32; - // value network - ConvLayer conv32v; - Bias bias32v; - BatchNormalization bn32v; - Linear<9 * 9 * MAX_MOVE_LABEL_NUM, fcl> l33v; - Bias bias33v; - Linear l34v; - Bias<1, 1, 1> bias34v; - - ReLU relu; - Add add; - Sigmoid sigmoid; - - CudnnTensorDescriptor x1Desc; - CudnnTensorDescriptor x2Desc; - CudnnTensorDescriptor h1Desc; - CudnnTensorDescriptor h32Desc; - CudnnTensorDescriptor h32vDesc; - CudnnTensorDescriptor h33vDesc; - CudnnTensorDescriptor h34vDesc; - CudnnTensorDescriptor y1Desc; - CudnnTensorDescriptor y2Desc; - - // input layer - DType* x1_dev; - DType* x2_dev; - DType* h1_1_1_dev; - DType* h1_1_2_dev; - DType* h1_2_dev; - // residual block - DType* h1_bn_dev; - DType* h2_dev; - DType* h2_bn_dev; - DType* h3_dev; - DType* h5_dev; - DType* h7_dev; - DType* h9_dev; - DType* h11_dev; - DType* h13_dev; - DType* h15_dev; - DType* h17_dev; - DType* h19_dev; - DType* h21_dev; - DType* h23_dev; - DType* h25_dev; - DType* h27_dev; - DType* h29_dev; - DType* h31_dev; - // after residual blocks - DType* h31_bn_dev; - // policy network - DType* y1_dev; - // value network - DType* h32v_dev; - DType* h32v_bn_dev; - DType* h33v_dev; - DType* y2_dev; -}; \ No newline at end of file diff --git a/usi/npz.cpp b/usi/npz.cpp deleted file mode 100644 index 7ae32570..00000000 --- a/usi/npz.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include "npz.h" - -#include -#include - -using namespace std; - -// https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT -struct LocalFileHeader -{ - uint32_t local_file_header_signature; // 4_bytes (0x04034b50) - uint16_t version_needed_to_extract; // 2_bytes - uint16_t general_purpose_bit_flag; // 2_bytes - uint16_t compression_method; // 2_bytes - uint16_t last_mod_file_time; // 2_bytes - uint16_t last_mod_file_date; // 2_bytes - uint32_t crc_32; // 4_bytes - uint32_t compressed_size; // 4_bytes - uint32_t uncompressed_size; // 4_bytes - uint16_t file_name_length; // 2_bytes - uint16_t extra_field_length; // 2_bytes - // ここまで30bytes - - //char* file_name; // (variable_size) - //char* extra_field; // (variable_size) -}; - -ifstream& operator >> (ifstream& ifs, LocalFileHeader& lfh) { - ifs.read((char*)&lfh.local_file_header_signature, sizeof(lfh.local_file_header_signature)); - ifs.read((char*)&lfh.version_needed_to_extract, sizeof(lfh.version_needed_to_extract)); - ifs.read((char*)&lfh.general_purpose_bit_flag, sizeof(lfh.general_purpose_bit_flag)); - ifs.read((char*)&lfh.compression_method, sizeof(lfh.compression_method)); - ifs.read((char*)&lfh.last_mod_file_time, sizeof(lfh.last_mod_file_time)); - ifs.read((char*)&lfh.last_mod_file_date, sizeof(lfh.last_mod_file_date)); - ifs.read((char*)&lfh.crc_32, sizeof(lfh.crc_32)); - ifs.read((char*)&lfh.compressed_size, sizeof(lfh.compressed_size)); - ifs.read((char*)&lfh.uncompressed_size, sizeof(lfh.uncompressed_size)); - ifs.read((char*)&lfh.file_name_length, sizeof(lfh.file_name_length)); - ifs.read((char*)&lfh.extra_field_length, sizeof(lfh.extra_field_length)); - return ifs; -} - -void load_npz(const char* file, ParamMap& params) -{ - ifstream infile(file, ios_base::in | ios_base::binary); - if (!infile) - return; - - while (true) - { - // Local file header - LocalFileHeader lfh; - infile >> lfh; - - if (lfh.local_file_header_signature != 0x04034b50) - { - break; - } - - char* file_name = new char[lfh.file_name_length + 1]; - - infile.read(file_name, lfh.file_name_length); - file_name[lfh.file_name_length] = '\0'; - - infile.seekg(lfh.extra_field_length, ios_base::cur); - - // File data - unsigned char* file_data = new unsigned char[lfh.compressed_size]; - infile.read((char*)file_data, lfh.compressed_size); - - NPY npy; - npy.uncompressed_data = new unsigned char[lfh.uncompressed_size]; - - z_stream strm = {}; - inflateInit2(&strm, -MAX_WBITS); - - strm.next_in = file_data; - strm.avail_in = lfh.compressed_size; - strm.next_out = npy.uncompressed_data; - strm.avail_out = lfh.uncompressed_size; - inflate(&strm, Z_NO_FLUSH); - inflateEnd(&strm); - - // NPY - const uint16_t header_len = *(uint16_t*)(npy.uncompressed_data + 8); - npy.data = (float*)(npy.uncompressed_data + 10 + header_len); - - params.emplace(file_name, std::move(npy)); - - delete[] file_data; - delete[] file_name; - } -} \ No newline at end of file diff --git a/usi/npz.h b/usi/npz.h deleted file mode 100644 index 2992ba9d..00000000 --- a/usi/npz.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include - -struct NPY -{ - //char magic_string[6]; // 6 bytes (0x93NUMPY) - //unsigned char major_version; // 1 byte - //unsigned char minor_version; // 1 byte - //unsigned short header_len; // 2 bytes - // ここまで10bytes - unsigned char* uncompressed_data; - float* data; - - NPY() : uncompressed_data(nullptr), data(nullptr) {} - NPY(NPY&& o) : uncompressed_data(o.uncompressed_data), data(o.data) { - o.uncompressed_data = nullptr; - o.data = nullptr; - } - ~NPY() { - delete[] uncompressed_data; - } -}; - -typedef std::map ParamMap; - -void load_npz(const char* file, ParamMap& params); diff --git a/usi/usi.vcxproj b/usi/usi.vcxproj index 7b02a1a6..571b3694 100644 --- a/usi/usi.vcxproj +++ b/usi/usi.vcxproj @@ -247,37 +247,23 @@ copy /y "$(SolutionDir)\packages\zlib-vc140-static-64.1.2.11\lib\native\libs\x64 - - - - - - - - - - - - - - diff --git a/usi/usi.vcxproj.filters b/usi/usi.vcxproj.filters index 61812498..1de0e13f 100644 --- a/usi/usi.vcxproj.filters +++ b/usi/usi.vcxproj.filters @@ -66,27 +66,9 @@ ソース ファイル\cppshogi - - ソース ファイル - - - ソース ファイル - ソース ファイル - - ソース ファイル - - - ソース ファイル - - - ソース ファイル - - - ソース ファイル - ソース ファイル @@ -104,39 +86,15 @@ ヘッダー ファイル - - ヘッダー ファイル - - - ヘッダー ファイル - ヘッダー ファイル - - ヘッダー ファイル - - - ヘッダー ファイル - ヘッダー ファイル - - ヘッダー ファイル - - - ヘッダー ファイル - ヘッダー ファイル - - ヘッダー ファイル - - - ヘッダー ファイル - ヘッダー ファイル