From f938315360eed2a9dab68eace5172bfda9f67c73 Mon Sep 17 00:00:00 2001 From: Matt Wozniski Date: Sun, 23 Jun 2024 21:37:06 -0400 Subject: [PATCH 1/2] Reintroduce debuginfod deadlock workaround While switching from referencing the system debuginfod client library to vendoring our own copy of it, I removed a hack that turns out to still be required for some libc versions, because I incorrectly thought it only applied to the case when debuginfod was being loaded with `dlopen`. Signed-off-by: Matt Wozniski --- news/634.bugfix.rst | 1 + src/vendor/libbacktrace/elf.c | 25 +++++++++++- ...d868badcbceb3eeecc2e_debuginfod_patch.diff | 39 +++++++++++++++++-- 3 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 news/634.bugfix.rst diff --git a/news/634.bugfix.rst b/news/634.bugfix.rst new file mode 100644 index 0000000000..fd96f076d7 --- /dev/null +++ b/news/634.bugfix.rst @@ -0,0 +1 @@ +Fix a deadlock that could occur on some Linux systems when resolving debug information using debuginfod. diff --git a/src/vendor/libbacktrace/elf.c b/src/vendor/libbacktrace/elf.c index 0fcce38f5d..2eb2546e06 100644 --- a/src/vendor/libbacktrace/elf.c +++ b/src/vendor/libbacktrace/elf.c @@ -853,6 +853,8 @@ elf_readlink (struct backtrace_state *state, const char *filename, #define SYSTEM_BUILD_ID_DIR "/usr/lib/debug/.build-id/" +static int debuginfod_guard = 0; + static int elf_open_debugfile_by_debuginfod (const char *buildid_data, size_t buildid_size, @@ -4455,7 +4457,7 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size, error_callback, data); - if (d < 0) { + if (d < 0 && !debuginfod_guard) { char* env = getenv(DEBUGINFOD_PROGRESS_ENV_VAR); if (env) { fprintf(stderr, "Trying to download debuginfo for %s\n", filename); @@ -4930,7 +4932,28 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, pd.exe_filename = filename; pd.exe_descriptor = ret < 0 ? descriptor : -1; + /* Here, There Be Dragons: we are about to call dl_iterate_phdr, + which is a glibc-internal function that holds a libc internal + lock. As this function needs to iterate over all the loaded + modules, this lock is shared by dlopen so no new modules can be + loaded while is iterating. This is a problem for us, as the + debuginfod client will use libcurl to spawn threads to download + debuginfo files, and libcurl uses dlopen to load a bunch of stuff + for its backend in some versions. This can cause a deadlock because + the debuginfod client will wait until the libcurl threads finish but + they will never finish because they are waiting for the dlopen lock + to be released, which will not happen until the call to dl_iterate_phdr + finishes. + + To avoid this, we use a global variable to detect if we are already + iterating over the modules, and if so, we skip the query to debuginfod + and just try with the other default methods. + + Note this ONLY affects the symbol resolution when retrieving a backtrace, + and it doesn't affect offline symbolication. */ + debuginfod_guard++; dl_iterate_phdr (phdr_callback, (void *) &pd); + debuginfod_guard--; if (!state->threaded) { diff --git a/src/vendor/libbacktrace_2446c66076480ce07a6bd868badcbceb3eeecc2e_debuginfod_patch.diff b/src/vendor/libbacktrace_2446c66076480ce07a6bd868badcbceb3eeecc2e_debuginfod_patch.diff index 6d2f39b646..a17124917a 100644 --- a/src/vendor/libbacktrace_2446c66076480ce07a6bd868badcbceb3eeecc2e_debuginfod_patch.diff +++ b/src/vendor/libbacktrace_2446c66076480ce07a6bd868badcbceb3eeecc2e_debuginfod_patch.diff @@ -180,7 +180,7 @@ index 0000000..78f4d8d + +#endif /* _DEBUGINFOD_CLIENT_H */ diff --git a/elf.c b/elf.c -index fb54165..0fcce38 100644 +index fb54165..2eb2546 100644 --- a/elf.c +++ b/elf.c @@ -38,6 +38,7 @@ POSSIBILITY OF SUCH DAMAGE. */ @@ -199,10 +199,12 @@ index fb54165..0fcce38 100644 #ifndef S_ISLNK #ifndef S_IFLNK -@@ -851,6 +853,37 @@ elf_readlink (struct backtrace_state *state, const char *filename, +@@ -851,6 +853,39 @@ elf_readlink (struct backtrace_state *state, const char *filename, #define SYSTEM_BUILD_ID_DIR "/usr/lib/debug/.build-id/" ++static int debuginfod_guard = 0; ++ +static int +elf_open_debugfile_by_debuginfod (const char *buildid_data, + size_t buildid_size, @@ -237,11 +239,11 @@ index fb54165..0fcce38 100644 /* Open a separate debug info file, using the build ID to find it. Returns an open file descriptor, or -1. -@@ -4422,6 +4455,14 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, +@@ -4422,6 +4457,14 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size, error_callback, data); -+ if (d < 0) { ++ if (d < 0 && !debuginfod_guard) { + char* env = getenv(DEBUGINFOD_PROGRESS_ENV_VAR); + if (env) { + fprintf(stderr, "Trying to download debuginfo for %s\n", filename); @@ -252,3 +254,32 @@ index fb54165..0fcce38 100644 if (d >= 0) { int ret; +@@ -4889,7 +4932,28 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, + pd.exe_filename = filename; + pd.exe_descriptor = ret < 0 ? descriptor : -1; + ++ /* Here, There Be Dragons: we are about to call dl_iterate_phdr, ++ which is a glibc-internal function that holds a libc internal ++ lock. As this function needs to iterate over all the loaded ++ modules, this lock is shared by dlopen so no new modules can be ++ loaded while is iterating. This is a problem for us, as the ++ debuginfod client will use libcurl to spawn threads to download ++ debuginfo files, and libcurl uses dlopen to load a bunch of stuff ++ for its backend in some versions. This can cause a deadlock because ++ the debuginfod client will wait until the libcurl threads finish but ++ they will never finish because they are waiting for the dlopen lock ++ to be released, which will not happen until the call to dl_iterate_phdr ++ finishes. ++ ++ To avoid this, we use a global variable to detect if we are already ++ iterating over the modules, and if so, we skip the query to debuginfod ++ and just try with the other default methods. ++ ++ Note this ONLY affects the symbol resolution when retrieving a backtrace, ++ and it doesn't affect offline symbolication. */ ++ debuginfod_guard++; + dl_iterate_phdr (phdr_callback, (void *) &pd); ++ debuginfod_guard--; + + if (!state->threaded) + { From af85883a62145afae8fc651d0532944edd0e2e35 Mon Sep 17 00:00:00 2001 From: Matt Wozniski Date: Sun, 23 Jun 2024 21:40:04 -0400 Subject: [PATCH 2/2] Prepare for 1.13.1 release See changelog for more details. Signed-off-by: Matt Wozniski --- .bumpversion.cfg | 2 +- NEWS.rst | 9 +++++++++ news/634.bugfix.rst | 1 - src/memray/_version.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) delete mode 100644 news/634.bugfix.rst diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 54594fd718..6351b0de47 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.13.0 +current_version = 1.13.1 commit = True message = Prepare for {new_version} release diff --git a/NEWS.rst b/NEWS.rst index 8843db7b95..abfab12c1e 100644 --- a/NEWS.rst +++ b/NEWS.rst @@ -8,6 +8,15 @@ Changelog .. towncrier release notes start +memray 1.13.1 (2024-06-23) +-------------------------- + +Bug Fixes +~~~~~~~~~ + +- Fix a deadlock that could occur on some Linux systems when resolving debug information using debuginfod. (#634) + + memray 1.13.0 (2024-06-18) -------------------------- diff --git a/news/634.bugfix.rst b/news/634.bugfix.rst deleted file mode 100644 index fd96f076d7..0000000000 --- a/news/634.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix a deadlock that could occur on some Linux systems when resolving debug information using debuginfod. diff --git a/src/memray/_version.py b/src/memray/_version.py index 9a34ccc9fa..4b67c39d7e 100644 --- a/src/memray/_version.py +++ b/src/memray/_version.py @@ -1 +1 @@ -__version__ = "1.13.0" +__version__ = "1.13.1"