wangkuiyi · shendiaomo · Oct 20, 2020 · Oct 26, 2020 · Oct 26, 2020 · Nov 4, 2020
diff --git a/cgotorch/cgotorch.h b/cgotorch/cgotorch.h
@@ -6,6 +6,7 @@
 #include "cgotorch/init.h"
 #include "cgotorch/memory.h"
 #include "cgotorch/optim.h"
+#include "cgotorch/parallel.h"
 #include "cgotorch/pickle.h"
 #include "cgotorch/tensor.h"
 #include "cgotorch/torch.h"
diff --git a/cgotorch/parallel.cc b/cgotorch/parallel.cc
@@ -0,0 +1,43 @@
+// Copyright 2020, GoTorch Authors
+#ifdef WITH_CUDA
+#include <torch/nn/parallel/data_parallel.h>
+#endif
+
+#include <memory>
+
+#include "cgotorch/parallel.h"
+
+typedef Tensor (*ForwardMethod)(void *, Tensor);  // Go callback signature
+
+// goModule exposes the Go callback `goModuleForward` as a torch::nn::Module.
+struct goModule : torch::nn::Module {
+  char *m_;          // opaque handle to the Go module; only handed back to f_
+  ForwardMethod f_;  // the callback, invoked as f_(m_, &input)
+  goModule(char *m, void *f) : m_(m), f_(reinterpret_cast<ForwardMethod>(f)) {}
+  at::Tensor forward(at::Tensor input) {  // NOLINT: include_what_you_use
+    Tensor out = f_(m_, &input);  // may be null, so check before dereferencing
+    TORCH_CHECK(out != nullptr, "goModuleForward returned a nil tensor");
+    return *out;
+  }
+};
+
+const char *DataParallel(char *go_module, void *f, Tensor input,
+                         Device *devices, int64_t size, Device *output,
+                         int64_t dim) {  // devices/size/output/dim are unused
+#ifndef WITH_CUDA
+  return exception_str(
+      "Parallel API needs -DWITH_CUDA on building libcgotorch.so");
+#else
+  try {
+    if (input == nullptr) {
+      throw std::runtime_error(
+          "invalid memory address or nil pointer dereference of input tensor");
+    }
+    torch::nn::parallel::data_parallel(std::make_shared<goModule>(go_module, f),
+                                       *input);  // result is dropped
+    return nullptr;  // no error to report
+  } catch (const std::exception &e) {
+    return exception_str(e.what());  // hand the failure text to the caller
+  }
+#endif  // WITH_CUDA
+}
diff --git a/cgotorch/parallel.h b/cgotorch/parallel.h
@@ -0,0 +1,18 @@
+/* Copyright 2020, GoTorch Authors */
+#pragma once
+
+#include "cgotorch/torchdef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Parallel
+////////////////////////////////////////////////////////////////////////////////
+// Returns NULL on success and a non-NULL error string otherwise.
+const char *DataParallel(char *go_module, void *f, Tensor input, Device *device,
+                         int64_t size, Device *output, int64_t dim);
+#ifdef __cplusplus
+}
+#endif
diff --git a/nn/parallel/parallel.go b/nn/parallel/parallel.go
@@ -0,0 +1,41 @@
+package parallel
+
+// #cgo CFLAGS: -I ${SRCDIR}/../../ -I ${SRCDIR}/../../cgotorch/libtorch/include
+// #cgo LDFLAGS: -L ${SRCDIR}/../../cgotorch -Wl,-rpath ${SRCDIR}/../../cgotorch -lcgotorch
+// #cgo LDFLAGS: -L ${SRCDIR}/../../cgotorch/libtorch/lib -Wl,-rpath ${SRCDIR}/../../cgotorch/libtorch/lib -lc10 -ltorch -ltorch_cpu
+// #include "cgotorch/cgotorch.h"
+// Tensor goModuleForward(char *m, Tensor input);
+import "C"
+import (
+	"reflect"
+	"runtime"
+	"unsafe"
+
+	torch "github.com/wangkuiyi/gotorch"
+	"github.com/wangkuiyi/gotorch/nn"
+)
+
+//export goModuleForward
+func goModuleForward(m *C.char, input C.Tensor) C.Tensor { // invoked from C++ as the forward callback
+	mod := *(*nn.IModule)(unsafe.Pointer(m)) // m carries the address of an nn.IModule (see DataParallel)
+	in := reflect.ValueOf(torch.Tensor{T: (*unsafe.Pointer)(&input)})
+	out := reflect.ValueOf(mod).MethodByName("Forward").Call([]reflect.Value{in})[0]
+	return *(*C.Tensor)(out.Interface().(torch.Tensor).T) // hand the result's C tensor back
+}
+
+// DataParallel is meant to evaluate module(input) in parallel across CUDA devices,
+// following torch::nn::parallel::data_parallel: scatter the input, replicate the
+// module, run each replica, and gather the outputs on the output device.
+//
+// NOTE: this is not fully wired up yet. `devices`, `outputDevice` and `dim` are
+// ignored (nil/0 are passed to C.DataParallel) and an empty torch.Tensor is always
+// returned. The result of C.DataParallel is passed to torch.MustNil, so a non-nil
+// error string presumably panics, e.g. when libcgotorch.so lacks -DWITH_CUDA.
+func DataParallel(m nn.IModule, input torch.Tensor, devices []torch.Device, outputDevice torch.Device, dim int64) torch.Tensor {
+	// Pass `&m` as a `*C.char` to work around the "cgo argument has Go pointer to Go
+	// pointer" check; goModuleForward casts it back to *nn.IModule.
+	torch.MustNil(unsafe.Pointer(C.DataParallel((*C.char)(unsafe.Pointer(&m)), C.goModuleForward, *(*C.Tensor)(input.T), nil, 0, nil, 0)))
+	runtime.KeepAlive(&m)
+	runtime.KeepAlive(&input)
+	return torch.Tensor{}
+}
diff --git a/nn/parallel/parallel_test.go b/nn/parallel/parallel_test.go
@@ -0,0 +1,35 @@
+package parallel
+
+import (
+	"fmt"
+	"github.com/stretchr/testify/assert"
+	torch "github.com/wangkuiyi/gotorch"
+	"github.com/wangkuiyi/gotorch/nn"
+	"testing"
+)
+
+type myModelModule struct {
+	nn.Module // Every model must embed ("derive from") nn.Module
+}
+
+// Forward prints "Forward", ignores x, and returns a new 1x1 torch.RandN tensor.
+func (m *myModelModule) Forward(x torch.Tensor) torch.Tensor {
+	fmt.Println("Forward")
+	return torch.RandN([]int64{1, 1}, false)
+}
+
+func myModel() *myModelModule { // returns an initialized test module
+	m := &myModelModule{}
+	m.Init(m) // Init comes from the embedded nn.Module; presumably registers m as the outer module (confirm)
+	return m
+}
+
+func TestDataParallel(t *testing.T) {
+	model := myModel()
+	// Must panic: the tensor handle is nil, and builds without -DWITH_CUDA report an error anyway.
+	assert.Panics(t, func() {
+		DataParallel(model, torch.Tensor{T: nil}, []torch.Device{}, torch.Device{}, 0)
+	})
+	// Only for CUDA
+	// DataParallel(model, torch.RandN([]int64{1, 1}, false), []torch.Device{}, torch.Device{}, 0)
+}