Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reintroduce debuginfod deadlock workaround #635

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.13.0
current_version = 1.13.1
commit = True
message =
Prepare for {new_version} release
Expand Down
9 changes: 9 additions & 0 deletions NEWS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ Changelog

.. towncrier release notes start

memray 1.13.1 (2024-06-23)
--------------------------

Bug Fixes
~~~~~~~~~

- Fix a deadlock that could occur on some Linux systems when resolving debug information using debuginfod. (#634)


memray 1.13.0 (2024-06-18)
--------------------------

Expand Down
2 changes: 1 addition & 1 deletion src/memray/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.13.0"
__version__ = "1.13.1"
25 changes: 24 additions & 1 deletion src/vendor/libbacktrace/elf.c
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,8 @@ elf_readlink (struct backtrace_state *state, const char *filename,

#define SYSTEM_BUILD_ID_DIR "/usr/lib/debug/.build-id/"

static int debuginfod_guard = 0;

static int
elf_open_debugfile_by_debuginfod (const char *buildid_data,
size_t buildid_size,
Expand Down Expand Up @@ -4455,7 +4457,7 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor,

d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size,
error_callback, data);
if (d < 0) {
if (d < 0 && !debuginfod_guard) {
char* env = getenv(DEBUGINFOD_PROGRESS_ENV_VAR);
if (env) {
fprintf(stderr, "Trying to download debuginfo for %s\n", filename);
Expand Down Expand Up @@ -4930,7 +4932,28 @@ backtrace_initialize (struct backtrace_state *state, const char *filename,
pd.exe_filename = filename;
pd.exe_descriptor = ret < 0 ? descriptor : -1;

/* Here, There Be Dragons: we are about to call dl_iterate_phdr,
which is a glibc-internal function that holds a libc internal
lock. As this function needs to iterate over all the loaded
modules, this lock is shared by dlopen so no new modules can be
loaded while is iterating. This is a problem for us, as the
debuginfod client will use libcurl to spawn threads to download
debuginfo files, and libcurl uses dlopen to load a bunch of stuff
for its backend in some versions. This can cause a deadlock because
the debuginfod client will wait until the libcurl threads finish but
they will never finish because they are waiting for the dlopen lock
to be released, which will not happen until the call to dl_iterate_phdr
finishes.

To avoid this, we use a global variable to detect if we are already
iterating over the modules, and if so, we skip the query to debuginfod
and just try with the other default methods.

Note this ONLY affects the symbol resolution when retrieving a backtrace,
and it doesn't affect offline symbolication. */
debuginfod_guard++;
dl_iterate_phdr (phdr_callback, (void *) &pd);
debuginfod_guard--;

if (!state->threaded)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ index 0000000..78f4d8d
+
+#endif /* _DEBUGINFOD_CLIENT_H */
diff --git a/elf.c b/elf.c
index fb54165..0fcce38 100644
index fb54165..2eb2546 100644
--- a/elf.c
+++ b/elf.c
@@ -38,6 +38,7 @@ POSSIBILITY OF SUCH DAMAGE. */
Expand All @@ -199,10 +199,12 @@ index fb54165..0fcce38 100644

#ifndef S_ISLNK
#ifndef S_IFLNK
@@ -851,6 +853,37 @@ elf_readlink (struct backtrace_state *state, const char *filename,
@@ -851,6 +853,39 @@ elf_readlink (struct backtrace_state *state, const char *filename,

#define SYSTEM_BUILD_ID_DIR "/usr/lib/debug/.build-id/"

+static int debuginfod_guard = 0;
+
+static int
+elf_open_debugfile_by_debuginfod (const char *buildid_data,
+ size_t buildid_size,
Expand Down Expand Up @@ -237,11 +239,11 @@ index fb54165..0fcce38 100644
/* Open a separate debug info file, using the build ID to find it.
Returns an open file descriptor, or -1.

@@ -4422,6 +4455,14 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor,
@@ -4422,6 +4457,14 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor,

d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size,
error_callback, data);
+ if (d < 0) {
+ if (d < 0 && !debuginfod_guard) {
+ char* env = getenv(DEBUGINFOD_PROGRESS_ENV_VAR);
+ if (env) {
+ fprintf(stderr, "Trying to download debuginfo for %s\n", filename);
Expand All @@ -252,3 +254,32 @@ index fb54165..0fcce38 100644
if (d >= 0)
{
int ret;
@@ -4889,7 +4932,28 @@ backtrace_initialize (struct backtrace_state *state, const char *filename,
pd.exe_filename = filename;
pd.exe_descriptor = ret < 0 ? descriptor : -1;

+ /* Here, There Be Dragons: we are about to call dl_iterate_phdr,
+ which is a glibc-internal function that holds a libc internal
+ lock. As this function needs to iterate over all the loaded
+ modules, this lock is shared by dlopen so no new modules can be
+ loaded while is iterating. This is a problem for us, as the
+ debuginfod client will use libcurl to spawn threads to download
+ debuginfo files, and libcurl uses dlopen to load a bunch of stuff
+ for its backend in some versions. This can cause a deadlock because
+ the debuginfod client will wait until the libcurl threads finish but
+ they will never finish because they are waiting for the dlopen lock
+ to be released, which will not happen until the call to dl_iterate_phdr
+ finishes.
+
+ To avoid this, we use a global variable to detect if we are already
+ iterating over the modules, and if so, we skip the query to debuginfod
+ and just try with the other default methods.
+
+ Note this ONLY affects the symbol resolution when retrieving a backtrace,
+ and it doesn't affect offline symbolication. */
+ debuginfod_guard++;
dl_iterate_phdr (phdr_callback, (void *) &pd);
+ debuginfod_guard--;

if (!state->threaded)
{
Loading