Skip to content

Commit

Permalink
Provide more CUDA diagnostic information
Browse files Browse the repository at this point in the history
* CUDA_VISIBLE_DEVICES can cause cuInit to fail, print it
* Mention verbose logging
* Clarify the error in the DSO search - not finding it is not necessarily bad
* Add a basic test to make sure the diagnostic log at least doesn't crash

PiperOrigin-RevId: 701961794
  • Loading branch information
nputikhin authored and Google-ML-Automation committed Dec 2, 2024
1 parent ebd7920 commit 28fe451
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 45 deletions.
20 changes: 20 additions & 0 deletions xla/stream_executor/cuda/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,31 @@ cc_library(
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:platform_port",
],
)

# Test target for the CUDA diagnostics library (":cuda_diagnostics"), built from
# cuda_diagnostics_test.cc. It is limited to the "gpu" backend and tagged
# "cuda-only".
xla_test(
name = "cuda_diagnostics_test",
srcs = ["cuda_diagnostics_test.cc"],
backends = ["gpu"],
tags = ["cuda-only"],
deps = [
":cuda_diagnostics",
":cuda_platform",
"//xla/stream_executor:platform",
"//xla/stream_executor:platform_manager",
"@com_google_absl//absl/debugging:leak_check",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/log:globals",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:statusor",
"@tsl//tsl/platform:test",
],
)

# Buildozer can not remove dependencies inside select guards, so we have to use
# an intermediate target.
cc_library(
Expand Down
122 changes: 77 additions & 45 deletions xla/stream_executor/cuda/cuda_diagnostics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ limitations under the License.

#include "xla/stream_executor/cuda/cuda_diagnostics.h"

#include <cstdlib>
#include <set>

#if !defined(PLATFORM_WINDOWS)
#include <dirent.h>
#endif
Expand Down Expand Up @@ -42,8 +45,10 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "xla/stream_executor/gpu/gpu_diagnostics.h"
#include "tsl/platform/env.h"
#include "tsl/platform/host_info.h"
#include "tsl/platform/logging.h"

Expand Down Expand Up @@ -100,6 +105,29 @@ absl::StatusOr<DriverVersion> StringToDriverVersion(const std::string &value) {
return result;
}

void PrintLdLibraryPathIntoVlog() {
const char *value = std::getenv("LD_LIBRARY_PATH");
std::string library_path = value == nullptr ? "" : value;
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";

std::vector<std::string> pieces = absl::StrSplit(library_path, ':');
for (const auto &piece : pieces) {
if (piece.empty()) {
continue;
}
std::vector<std::string> dir_children;
absl::Status status =
tsl::Env::Default()->GetChildren(piece, &dir_children);
if (!status.ok()) {
VLOG(1) << "could not open \"" << piece << "\": " << status;
continue;
}
for (const std::string &filename : dir_children) {
VLOG(1) << piece << " :: " << filename;
}
}
}

} // namespace cuda
} // namespace stream_executor

Expand Down Expand Up @@ -134,6 +162,24 @@ void Diagnostician::LogDiagnosticInformation() {
}
#endif

const char *visible_devices_env = std::getenv("CUDA_VISIBLE_DEVICES");
if (visible_devices_env != nullptr) {
LOG(INFO) << "env: CUDA_VISIBLE_DEVICES=\"" << visible_devices_env << "\"";
std::set<std::string> common_disable_gpu_values = {"", "-1", "none"};
if (common_disable_gpu_values.count(visible_devices_env)) {
LOG(INFO) << "CUDA_VISIBLE_DEVICES is set to "
<< (std::string{} == visible_devices_env ? "an empty string"
: visible_devices_env)
<< " - this hides all GPUs from CUDA";
}
}

if (!VLOG_IS_ON(1)) {
LOG(INFO) << "verbose logging is disabled. Rerun with verbose logging "
"(usually --v=1 or --vmodule=cuda_diagnostics=1) to get more "
"diagnostic output from this module";
}

LOG(INFO) << "retrieving CUDA diagnostic information for host: "
<< tsl::port::Hostname();

Expand All @@ -144,36 +190,17 @@ void Diagnostician::LogDiagnosticInformation() {
LOG(INFO) << "hostname: " << tsl::port::Hostname();
#ifndef PLATFORM_WINDOWS
if (VLOG_IS_ON(1)) {
const char *value = getenv("LD_LIBRARY_PATH");
std::string library_path = value == nullptr ? "" : value;
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";

std::vector<std::string> pieces = absl::StrSplit(library_path, ':');
for (const auto &piece : pieces) {
if (piece.empty()) {
continue;
}
DIR *dir = opendir(piece.c_str());
if (dir == nullptr) {
VLOG(1) << "could not open \"" << piece << "\"";
continue;
}
while (dirent *entity = readdir(dir)) {
VLOG(1) << piece << " :: " << entity->d_name;
}
closedir(dir);
}
cuda::PrintLdLibraryPathIntoVlog();
}

absl::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "libcuda reported version is: "
<< cuda::DriverVersionStatusToString(dso_version);

absl::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: "
<< cuda::DriverVersionStatusToString(kernel_version);
#endif

#if !defined(PLATFORM_WINDOWS)
if (kernel_version.ok() && dso_version.ok()) {
WarnOnDsoKernelMismatch(dso_version, kernel_version);
}
Expand All @@ -184,37 +211,42 @@ void Diagnostician::LogDiagnosticInformation() {
// driver-interfacing DSO version number. Returns it as a string.
absl::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
absl::StatusOr<DriverVersion> result(absl::NotFoundError(
"was unable to find libcuda.so DSO loaded into this program"));
"was unable to find libcuda.so DSO loaded into this program. The library "
"may be missing or provided via another object."));

#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
// Callback used when iterating through DSOs. Looks for the driver-interfacing
// DSO and yields its version number into the callback data, when found.
auto iterate_phdr = [](struct dl_phdr_info *info, size_t size,
void *data) -> int {
if (strstr(info->dlpi_name, "libcuda.so.1")) {
VLOG(1) << "found DLL info with name: " << info->dlpi_name;
char resolved_path[PATH_MAX] = {0};
if (realpath(info->dlpi_name, resolved_path) == nullptr) {
return 0;
}
VLOG(1) << "found DLL info with resolved path: " << resolved_path;
const char *slash = rindex(resolved_path, '/');
if (slash == nullptr) {
return 0;
}
const char *so_suffix = ".so.";
const char *dot = strstr(slash, so_suffix);
if (dot == nullptr) {
return 0;
}
std::string dso_version = dot + strlen(so_suffix);
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
auto result = static_cast<absl::StatusOr<DriverVersion> *>(data);
*result = cuda::StringToDriverVersion(std::string(stripped_dso_version));
return 1;
if (!strstr(info->dlpi_name, "libcuda.so.1")) {
return 0;
}
return 0;

VLOG(1) << "found CUDA DLL info with name: " << info->dlpi_name;
char resolved_path_buf[PATH_MAX] = {0};
if (realpath(info->dlpi_name, resolved_path_buf) == nullptr) {
return 0;
}
absl::string_view resolved_path(resolved_path_buf);
VLOG(1) << "found DLL info with resolved path: " << resolved_path;
size_t slash = resolved_path.rfind('/');
if (slash == absl::string_view::npos) {
return 0;
}
absl::string_view so_suffix = ".so.";
size_t dot = resolved_path.find(so_suffix, slash);
if (dot == absl::string_view::npos) {
return 0;
}

absl::string_view dso_version =
resolved_path.substr(dot + so_suffix.size());
absl::string_view stripped_dso_version =
absl::StripSuffix(dso_version, ".ld64");
auto result = static_cast<absl::StatusOr<DriverVersion> *>(data);
*result = cuda::StringToDriverVersion(std::string(stripped_dso_version));
return 1;
};

dl_iterate_phdr(iterate_phdr, &result);
Expand Down
49 changes: 49 additions & 0 deletions xla/stream_executor/cuda/cuda_diagnostics_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "xla/stream_executor/cuda/cuda_diagnostics.h"

#include <gtest/gtest.h>
#include "absl/debugging/leak_check.h"
#include "absl/log/check.h"
#include "absl/log/globals.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "tsl/platform/statusor.h"
#include "tsl/platform/test.h"

namespace stream_executor::gpu {
namespace {

// Looks up the platform registered under the name "CUDA" and CHECKs that it
// reports at least one visible device.
void EnsureCudaIsInitialized() {
  // The Platform object is leaked on purpose (see the comment in
  // platform_manager.h), so leak checking is disabled while this guard is in
  // scope.
  absl::LeakCheckDisabler leak_check_guard;

  TF_ASSERT_OK_AND_ASSIGN(stream_executor::Platform * platform,
                          PlatformManager::PlatformWithName("CUDA"));
  CHECK_GT(platform->VisibleDeviceCount(), 0);
}

// Smoke test: calls cuda::Diagnostician::LogDiagnosticInformation() once after
// confirming that a CUDA platform with at least one visible device exists.
// Nothing is asserted about the logged output.
TEST(CudaDiagnosticsTest, DiagnosticRuns) {
// Initialize the platform - this is not code under test, it only ensures that
// we have a working CUDA setup.
EnsureCudaIsInitialized();

// The call under test.
cuda::Diagnostician::LogDiagnosticInformation();
}

} // namespace
} // namespace stream_executor::gpu

0 comments on commit 28fe451

Please sign in to comment.