diff --git a/demos/PytorchComparison/CMakeLists.txt b/demos/PytorchComparison/CMakeLists.txt
new file mode 100644
index 000000000..b765873cf
--- /dev/null
+++ b/demos/PytorchComparison/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(torch-basic-nn)
+
+find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+
+add_executable(torch-basic-nn torch-basic-nn.cpp)
+target_link_libraries(torch-basic-nn "${TORCH_LIBRARIES}")
+set_property(TARGET torch-basic-nn PROPERTY CXX_STANDARD 17)
+
diff --git a/demos/PytorchComparison/clad-basic-nn.cpp b/demos/PytorchComparison/clad-basic-nn.cpp
new file mode 100644
index 000000000..06d7c0a1f
--- /dev/null
+++ b/demos/PytorchComparison/clad-basic-nn.cpp
@@ -0,0 +1,153 @@
+#include <chrono>
+#include <iostream>
+#include <random>
+#include "clad/Differentiator/Differentiator.h"
+
+/*
+  The code below implements a simple neural network with 3 hidden nodes and 1
+  output node. The neural network is trained using gradient descent to minimize
+  the mean squared error loss. The code uses Clad to automatically generate the
+  gradient computation code for the loss function.
+
+  To compile the code, use the following command:
+  clang++ -std=c++14 -I <clad include path> -fplugin=<clad plugin path>
+  clad-basic-nn.cpp -DBENCHMARK_MATRIX_MULTIPLY
+*/
+
+#define M 100       // num of data points
+#define N 100       // num of features
+#define N_ITER 5000 // num of iterations
+
+void relu(double* arr, unsigned int n) {
+  for (int i = 0; i < n; ++i)
+    arr[i] = (arr[i] > 0) ? arr[i] : 0;
+}
+
+// Create a model of a neural network with 3 hidden nodes and 1 output node.
+// It takes in input data x and weight vectors for each hidden node and the
+// output node.
+double model_fn(double x[N],  // input data
+                double w1[N], // weight vector for hidden node 1
+                double w2[N], // weight vector for hidden node 2
+                double w3[N], // weight vector for hidden node 3
+                double w4[3]  // weight vector for output node
+) {
+  // Layer 1 - matrix multiplication
+  double h1 = 0, h2 = 0, h3 = 0;
+  for (int i = 0; i < N; ++i) {
+    h1 += x[i] * w1[i];
+    h2 += x[i] * w2[i];
+    h3 += x[i] * w3[i];
+  }
+  // Computing non-linear activation function
+  double h[3] = {h1, h2, h3};
+  relu(h, 3);
+
+  // Layer 2 - matrix multiplication
+  double o = 0;
+  for (int i = 0; i < 3; ++i)
+    o += h[i] * w4[i];
+  return o;
+}
+
+double loss_fn(double x[M * N], double y[M], double w1[N], double w2[N],
+               double w3[N], double w4[3]) {
+  double loss = 0;
+  for (int i = 0; i < M; ++i) {
+    double y_hat = model_fn(x + i * N, w1, w2, w3, w4);
+    loss += (y_hat - y[i]) * (y_hat - y[i]);
+  }
+  return loss / M;
+}
+
+// Perform a single gradient descent step.
+// w1..w4 are the weights to update, (x, y) is the generated dataset and
+// clad_grad is the gradient function generated by Clad.
+template <typename T>
+void performStep(double x[M * N], double y[M], double w1[N], double w2[N],
+                 double w3[N], double w4[3], T clad_grad,
+                 double learning_rate = 1e-2) {
+  // Gradient computation (buffers start at zero; Clad accumulates into them)
+  double grad_w1[N] = {0}, grad_w2[N] = {0}, grad_w3[N] = {0}, grad_w4[3] = {0};
+  clad_grad.execute(x, y, w1, w2, w3, w4, grad_w1, grad_w2, grad_w3, grad_w4);
+
+  // Update weights for hidden nodes
+  for (int i = 0; i < N; ++i) {
+    w1[i] -= learning_rate * grad_w1[i];
+    w2[i] -= learning_rate * grad_w2[i];
+    w3[i] -= learning_rate * grad_w3[i];
+  }
+
+  // Update weights for output node
+  for (int i = 0; i < 3; ++i)
+    w4[i] -= learning_rate * grad_w4[i];
+}
+
+// Perform gradient descent to optimize the weights.
+void optimize(double x[M * N], double y[M], double w1[N], double w2[N],
+              double w3[N], double w4[3], double lr, int num_iters) {
+  auto grad = clad::gradient(loss_fn, "w1,w2,w3,w4");
+  for (int i = 0; i < num_iters; ++i)
+    performStep(x, y, w1, w2, w3, w4, grad, lr);
+}
+
+// Benchmark the time for 100 matrix multiplications.
+void benchmark_matrix_multiply(double x[M * N], double y[M]) {
+  double z[M * N];
+  auto start_bench = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < M; ++j) {
+      for (int k = 0; k < N; ++k) {
+        z[j * N + k] = 0;
+        for (int l = 0; l < N; ++l)
+          z[j * N + k] += x[j * N + l] * x[l * N + k];
+      }
+    }
+  }
+  auto end_bench = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
+  std::cout << "Time taken for 100 matrix multiplications: "
+            << elapsed_bench.count() << " seconds" << std::endl;
+}
+
+int main() {
+  // Initialize random number generator.
+  std::mt19937 gen(42);
+
+  // Generate random input data and labels.
+  double x[M * N], y[M];
+  for (int i = 0; i < M * N; ++i)
+    x[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+  for (int i = 0; i < M; ++i)
+    y[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+
+#ifdef BENCHMARK_MATRIX_MULTIPLY
+  benchmark_matrix_multiply(x, y);
+#endif
+
+  // Initialize weights from a random distribution.
+  double w1[N], w2[N], w3[N], w4[3];
+  for (int i = 0; i < N; ++i) {
+    w1[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+    w2[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+    w3[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+  }
+  for (int i = 0; i < 3; ++i)
+    w4[i] = std::uniform_real_distribution<double>(0, 1)(gen);
+
+  // Calculate the loss before optimization.
+  double loss = loss_fn(x, y, w1, w2, w3, w4);
+  std::cout << "Initial loss before optimization: " << loss << std::endl;
+
+  // Optimize the weights and compute the time taken.
+  auto start = std::chrono::high_resolution_clock::now();
+  optimize(x, y, w1, w2, w3, w4, 1e-2, N_ITER);
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed = end - start;
+
+  // Calculate the loss after optimization.
+  loss = loss_fn(x, y, w1, w2, w3, w4);
+  std::cout << "Final loss after optimization: " << loss << std::endl;
+  std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
+  return 0;
+}
diff --git a/demos/PytorchComparison/torch-basic-nn.cpp b/demos/PytorchComparison/torch-basic-nn.cpp
new file mode 100644
index 000000000..31c2c892e
--- /dev/null
+++ b/demos/PytorchComparison/torch-basic-nn.cpp
@@ -0,0 +1,85 @@
+// Do the same thing as clad-basic-nn.cpp, but using libtorch instead of
+// Clad.
+
+/*
+  To build and run:
+  1. Create a build directory: mkdir build
+  2. Change to the build directory: cd build
+  3. Run cmake: cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
+  4. Run make: make
+  5. Run the executable: ./torch-basic-nn */
+
+#include <torch/torch.h>
+#include <chrono>
+#include <iostream>
+#include <random>
+
+#define M 100       // num of data points
+#define N 100       // num of features
+#define N_ITER 5000 // num of iterations
+
+// C++ neural network model defined with value semantics
+struct TwoLayerNet : torch::nn::Module {
+  // constructor with submodules registered in the initializer list
+  TwoLayerNet(int64_t D_in, int64_t D_out, int64_t H)
+      : linear1(register_module("linear1", torch::nn::Linear(D_in, H))),
+        linear2(register_module("linear2", torch::nn::Linear(H, D_out))) {}
+
+  torch::Tensor forward(torch::Tensor x) {
+    x = torch::relu(linear1->forward(x));
+    x = linear2->forward(x);
+    return x;
+  }
+  torch::nn::Linear linear1;
+  torch::nn::Linear linear2;
+};
+
+int main() {
+  // Generate random data
+  // std::default_random_engine generator;
+  // std::normal_distribution<double> distribution(0.0, 1.0);
+  torch::Tensor x = torch::randn({M, N});
+  torch::Tensor y = torch::randn({M, 1});
+
+  // Benchmark the time for 100 matrix multiplications
+  torch::Tensor z;
+  auto start_bench = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < 100; ++i)
+    z = torch::matmul(x, x);
+  auto end_bench = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
+  std::cout << "Time taken for 100 matrix multiplications: "
+            << elapsed_bench.count() << " seconds" << std::endl;
+
+  // Initialize model weights
+  TwoLayerNet model(N, 1, 3);
+
+  // Create the optimizer and loss function
+  torch::optim::SGD optimizer(model.parameters(), 0.01);
+  torch::nn::MSELoss loss_fn;
+
+  // Calculate the loss before optimization
+  torch::Tensor output = model.forward(x);
+  std::cout << "Initial loss before optimization: "
+            << loss_fn(output, y).item<float>() << std::endl;
+
+  // Optimize
+  auto start = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < N_ITER; ++i) {
+    optimizer.zero_grad();
+    torch::Tensor output = model.forward(x);
+    torch::Tensor loss = loss_fn(output, y);
+    loss.backward();
+    optimizer.step();
+  }
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed = end - start;
+
+  // Calculate the loss after optimization
+  output = model.forward(x);
+  std::cout << "Final loss after optimization: "
+            << loss_fn(output, y).item<float>() << std::endl;
+  std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
+  return 0;
+}
diff --git a/demos/PytorchComparison/torch-basic-nn.py b/demos/PytorchComparison/torch-basic-nn.py
new file mode 100644
index 000000000..b6d59770f
--- /dev/null
+++ b/demos/PytorchComparison/torch-basic-nn.py
@@ -0,0 +1,68 @@
+# Do the same as torch-basic-nn.cpp but in Python using PyTorch
+# To run: python torch-basic-nn.py
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import time
+
+M = 100  # Number of samples
+N = 100  # Number of features
+N_ITER = 5000  # Number of iterations
+
+# Create a simple neural network with 3 hidden nodes and 1 output node
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.fc1 = nn.Linear(N, 3)
+        self.fc2 = nn.Linear(3, 1)
+    def forward(self, x):
+        x = torch.relu(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+def main():
+    # Create a simple dataset
+    x = np.random.randn(M, N).astype(np.float32)
+    y = np.random.randn(M, 1).astype(np.float32)
+
+    # Convert the dataset to PyTorch tensors
+    x = torch.from_numpy(x)
+    y = torch.from_numpy(y)
+
+    # Create the neural network
+    net = Net()
+
+    # Create the loss function and the optimizer
+    criterion = nn.MSELoss()
+    optimizer = optim.SGD(net.parameters(), lr=0.01)
+
+    # Print the initial loss
+    output = net(x)
+    loss = criterion(output, y)
+    print("Initial loss: ", loss.item())
+
+    # Start the timer
+    start = time.time()
+
+    # Train the neural network
+    for i in range(N_ITER):
+        optimizer.zero_grad()
+        output = net(x)
+        loss = criterion(output, y)
+        loss.backward()
+        optimizer.step()
+
+    # Stop the timer
+    end = time.time()
+
+    # Print the final loss
+    output = net(x)
+    loss = criterion(output, y)
+    print("Final loss: ", loss.item())
+    print("Time: ", end - start, " seconds")
+
+
+if __name__ == "__main__":
+    main()
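
Note (not part of the diff above): a quick way to sanity-check the Clad-generated gradients that drive performStep in clad-basic-nn.cpp is to compare them against central finite differences on a small scalar function. The sketch below is a hypothetical standalone helper, not taken from this PR; the file name finite-diff-check.cpp and the function f are illustrative, and it assumes Clad is available and is compiled with the same clang++ -fplugin invocation shown in the header comment of clad-basic-nn.cpp.

// finite-diff-check.cpp - hypothetical helper, not part of this PR.
// Compares a Clad-generated gradient against central finite differences.
#include <cmath>
#include <iostream>
#include "clad/Differentiator/Differentiator.h"

// Small scalar function standing in for loss_fn.
double f(double u, double v) { return u * u * v + std::sin(v); }

int main() {
  // Clad-generated gradient of f with respect to both parameters.
  auto f_grad = clad::gradient(f);

  double u = 0.7, v = 1.3;
  // Adjoint buffers start at zero, as in performStep; Clad accumulates into them.
  double du = 0, dv = 0;
  f_grad.execute(u, v, &du, &dv);

  // Central finite differences for comparison.
  const double h = 1e-6;
  double du_fd = (f(u + h, v) - f(u - h, v)) / (2 * h);
  double dv_fd = (f(u, v + h) - f(u, v - h)) / (2 * h);

  std::cout << "clad: df/du = " << du << ", df/dv = " << dv << "\n"
            << "fd:   df/du = " << du_fd << ", df/dv = " << dv_fd << std::endl;
  return 0;
}

If the two rows agree to several significant digits, the generated gradient (and hence the weight updates performed in performStep) can be trusted.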