Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add distributed backend (XCCL) #1105

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
90a52d3
Happy Init
Chao1Han Nov 20, 2024
0d8bb51
oneccl private for xccl
Chao1Han Nov 20, 2024
f01b173
update cmake
Chao1Han Nov 20, 2024
405013c
update
Chao1Han Nov 20, 2024
7714885
update cmake
Chao1Han Nov 26, 2024
58a64a6
Merge branch 'main' into chao/xccl
Chao1Han Nov 26, 2024
b770640
update commit and add register
Chao1Han Nov 27, 2024
30f6cd2
update
Chao1Han Nov 27, 2024
8fff100
Merge branch 'main' into chao/xccl
Chao1Han Dec 4, 2024
fb851b1
imple allreduce and strcture
Chao1Han Dec 5, 2024
b1aee26
add non-reduction datatype
Chao1Han Dec 13, 2024
c55b16e
add comment
Chao1Han Dec 13, 2024
d139548
Simply cmake logit
Chao1Han Dec 13, 2024
b8e9f30
update
Chao1Han Dec 13, 2024
0fe320b
Merge branch 'main' into chao/xccl
Chao1Han Dec 16, 2024
86f09cb
update findxccl logit like mkl
Chao1Han Dec 16, 2024
d8c1e97
add oneccl path to cmake include
Chao1Han Dec 16, 2024
4b0eba0
add deault oneapi path
Chao1Han Dec 16, 2024
72b2687
rm default find path due to user source oneapi mandatory
Chao1Han Dec 17, 2024
5a40bd4
add simple xccl test
Chao1Han Dec 18, 2024
1989262
update find ccl
Chao1Han Dec 18, 2024
76d48bd
Merge branch 'main' into chao/xccl
Chao1Han Dec 19, 2024
a71447e
rm ut
Chao1Han Dec 23, 2024
8166ade
Merge branch 'main' into chao/xccl
Chao1Han Dec 23, 2024
0bbcb75
Merge branch 'main' into chao/xccl
Chao1Han Dec 24, 2024
0b53b6c
Merge branch 'main' into chao/xccl
Chao1Han Dec 27, 2024
95a133c
Merge branch 'main' into chao/xccl
Chao1Han Dec 27, 2024
e1ce466
Merge branch 'main' into chao/xccl
PenghuiCheng Dec 28, 2024
bbdc34f
Merge branch 'main' into chao/xccl
Chao1Han Dec 31, 2024
fadac62
Merge branch 'main' into chao/xccl
Chao1Han Jan 2, 2025
863dfa9
Merge branch 'main' into chao/xccl
Chao1Han Jan 3, 2025
ad817a3
Merge branch 'main' into chao/xccl
xytintel Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@ list(APPEND CMAKE_MODULE_PATH ${TORCH_XPU_OPS_ROOT}/cmake/Modules)
include(${TORCH_XPU_OPS_ROOT}/cmake/SYCL.cmake)
include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)

option(USE_XCCL "Build with XCCL support" ON)
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
if (DEFINED ENV{USE_XCCL})
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
# Lower-case the environment value so the comparison below is case-insensitive.
string(TOLOWER "$ENV{USE_XCCL}" USE_XCCL_LOWER)

# Accept every spelling CMake itself treats as a true constant
# (1 / ON / YES / TRUE / Y); any other value disables XCCL.
# FORCE overwrites whatever value is already stored in the cache.
if(USE_XCCL_LOWER MATCHES "^(1|on|yes|true|y)$")
  set(USE_XCCL ON CACHE BOOL "Build with XCCL support" FORCE)
else()
  set(USE_XCCL OFF CACHE BOOL "Build with XCCL support" FORCE)
endif()
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

# XCCL is set up only when USE_XCCL is on and the platform is not Windows.
if(USE_XCCL AND NOT WIN32)
  # Run the XCCL setup script.
  include(${TORCH_XPU_OPS_ROOT}/cmake/XCCL.cmake)
  # Turn the c10d XCCL flag on in this scope and in the parent scope.
  set(USE_C10D_XCCL ON)
  set(USE_C10D_XCCL ON PARENT_SCOPE)
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

# When BUILD_TEST is enabled, add the SYCL tests from test/sycl and build them
# in the test_sycl directory of the top-level binary tree.
if(BUILD_TEST)
add_subdirectory(${TORCH_XPU_OPS_ROOT}/test/sycl ${CMAKE_BINARY_DIR}/test_sycl)
endif()
Expand Down
70 changes: 70 additions & 0 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# This will define the following variables:
# XCCL_FOUND : True if the system has the XCCL library.
# XCCL_INCLUDE_DIR : Include directories needed to use XCCL.
# XCCL_LIBRARY_DIR : The path to the XCCL library directory.
# XCCL_LIBRARY : XCCL library fullname.

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)

set(XCCL_ROOT "/opt/intel/oneapi/ccl/latest")
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
# If the path currently stored in XCCL_ROOT does not exist, take the oneCCL
# root from the CCL_ROOT environment variable instead. When CCL_ROOT is not
# set either, the unquoted expansion is empty and XCCL_ROOT ends up unset.
if (NOT EXISTS "${XCCL_ROOT}")
message(STATUS "Default OneCCL not found, using current environment OneCCL")
set(XCCL_ROOT $ENV{CCL_ROOT})
endif()

string(COMPARE EQUAL "${XCCL_ROOT}" "" nocclfound)
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
if(nocclfound)
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "OneCCL library not found!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
return()
endif()

# Every lookup below passes NO_DEFAULT_PATH, so only the given HINTS are
# searched and nothing outside the selected oneCCL installation is picked up.

# <XCCL_ROOT>/include
find_file(XCCL_INCLUDE_DIR NAMES include HINTS ${XCCL_ROOT} NO_DEFAULT_PATH)

# <XCCL_ROOT>/include/oneapi
find_file(XCCL_INCLUDE_ONEAPI_DIR NAMES oneapi HINTS ${XCCL_ROOT}/include/ NO_DEFAULT_PATH)

# Expose both include directories through XCCL_INCLUDE_DIR.
list(APPEND XCCL_INCLUDE_DIR ${XCCL_INCLUDE_ONEAPI_DIR})

# <XCCL_ROOT>/lib
find_file(XCCL_LIBRARY_DIR NAMES lib HINTS ${XCCL_ROOT} NO_DEFAULT_PATH)

# The ccl library itself, searched only inside XCCL_LIBRARY_DIR.
find_library(XCCL_LIBRARY NAMES ccl HINTS ${XCCL_LIBRARY_DIR} NO_DEFAULT_PATH)

if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
set(XCCL_FOUND False)
set(XCCL_REASON_FAILURE "OneCCL library not found!!")
set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
return()
endif()

# Hand the result to the standard helper: it sets XCCL_FOUND based on the three
# required variables and reports the outcome of the search.
find_package_handle_standard_args(XCCL
  FOUND_VAR XCCL_FOUND
  REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
  REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}")

20 changes: 20 additions & 0 deletions cmake/XCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Defines the torch::xccl interface target from the results of FindXCCL.cmake.
# The __XCCL_INCLUDED guard keeps the body from running twice in one scope.
if(NOT __XCCL_INCLUDED)
  set(__XCCL_INCLUDED TRUE)

  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
  # REQUIRED makes find_package() stop with an error when XCCL is not found,
  # so the branch below is only a defensive fallback.
  find_package(XCCL REQUIRED)
  if(NOT XCCL_FOUND)
    message("${XCCL_NOT_FOUND_MESSAGE}")
    return()
  endif()

  # XCCL was found: wrap its include directories and library in an imported
  # interface target so consumers only need to link torch::xccl.
  add_library(torch::xccl INTERFACE IMPORTED)
  set_property(
    TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    ${XCCL_INCLUDE_DIR})
  set_property(
    TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
    ${XCCL_LIBRARY})
endif()
gujinghui marked this conversation as resolved.
Show resolved Hide resolved

8 changes: 7 additions & 1 deletion src/BuildOnLinux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ add_library(
STATIC
${ATen_XPU_CPP_SRCS}
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
${ATen_XPU_GEN_SRCS}
${ATen_XPU_XCCL_SRCS})

# When the c10d XCCL backend is enabled, define USE_C10D_XCCL while compiling
# torch_xpu_ops and link the torch::xccl target. Because the link is PUBLIC,
# torch::xccl's usage requirements also propagate to targets that link
# torch_xpu_ops.
if(USE_C10D_XCCL)
target_compile_definitions(torch_xpu_ops PRIVATE USE_C10D_XCCL)
target_link_libraries(torch_xpu_ops PUBLIC torch::xccl)
endif()

if(BUILD_SEPARATE_OPS)
foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
Expand Down
5 changes: 4 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/Codegen.cmake)
# Clear the source-list variables before any subdirectory is processed.
unset(ATen_XPU_CPP_SRCS)
unset(ATen_XPU_NATIVE_CPP_SRCS)
unset(ATen_XPU_SYCL_SRCS)
unset(ATen_XPU_XCCL_SRCS)

# Cache the include root (<TORCH_XPU_OPS_ROOT>/src).
set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src
    CACHE STRING "ATen XPU Include directory")

add_subdirectory(ATen)

# The xccl subdirectory is added only when the c10d XCCL backend is enabled.
if(USE_C10D_XCCL)
  add_subdirectory(xccl)
endif()
# As the binary size grows, we have to split libtorch_xpu.so into
# multiple libraries. Because of strict linkage requirements on Windows,
# we add extra logic to resolve 1) cyclic dependencies and 2) symbol visibility.
Expand Down
13 changes: 13 additions & 0 deletions src/xccl/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# XCCL sources

# Collect the XCCL headers and sources of this directory. CONFIGURE_DEPENDS
# makes the build re-check the globs, so files added or removed later trigger
# a re-configure instead of leaving the lists stale.
file(GLOB xccl_h CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp")
file(GLOB xccl_cpp CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")

# Append the sources and publish the updated list to the parent directory scope.
list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})
set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)

# Copy the header file to the build directory so that the PyTorch registration file can locate it.
foreach(HEADER ${xccl_h})
gujinghui marked this conversation as resolved.
Show resolved Hide resolved
file(COPY ${HEADER} DESTINATION "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d")
endforeach()
Loading
Loading