diff --git a/src/DiversionSession.cc b/src/DiversionSession.cc index 3933eaf0aa3..d52ab85fb96 100644 --- a/src/DiversionSession.cc +++ b/src/DiversionSession.cc @@ -72,9 +72,10 @@ static void process_syscall_arch(Task* t, int syscallno) { if (syscallno == t->session().syscall_number_for_rrcall_rdtsc()) { uint64_t rdtsc_value = static_cast<DiversionSession*>(&t->session())->next_rdtsc_value(); LOG(debug) << "Faking rrcall_rdtsc syscall with value " << rdtsc_value; - remote_ptr<uint64_t> out_param(t->regs().arg1()); - t->write_mem(out_param, rdtsc_value); - finish_emulated_syscall_with_ret(t, 0); + Registers r = t->regs(); + r.set_dx(rdtsc_value >> 32); + t->set_regs(r); + finish_emulated_syscall_with_ret(t, (uint32_t)rdtsc_value); return; } diff --git a/src/Monkeypatcher.cc b/src/Monkeypatcher.cc index ad630ce5f10..dfdf0713814 100644 --- a/src/Monkeypatcher.cc +++ b/src/Monkeypatcher.cc @@ -159,51 +159,56 @@ template <typename ExtendedJumpPatch> static void substitute_extended_jump(uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, uint64_t target_addr, - uint32_t fake_syscall_number); + uint32_t fake_syscall_number, + uint8_t stub[20]); template <> void substitute_extended_jump<X86SyscallStubExtendedJump>( uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, - uint64_t target_addr, uint32_t) { + uint64_t target_addr, uint32_t, uint8_t stub[STUB_PATCH_LENGTH]) { int64_t offset = target_addr - (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); + int64_t ret_offset = + return_addr - + (patch_addr + X86SyscallStubExtendedJump::return_addr_relative_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around. - X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)offset); + X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)offset, (char*)stub, + (uint32_t)ret_offset); } template <> void substitute_extended_jump<X64SyscallStubExtendedJump>( uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, - uint32_t) { - X64SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)(return_addr >> 32), - target_addr); + uint32_t, uint8_t stub[STUB_PATCH_LENGTH]) { + X64SyscallStubExtendedJump::substitute(buffer, (char*)stub, + target_addr, return_addr); } template <> void substitute_extended_jump<X86TrapInstructionStubExtendedJump>( uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, - uint64_t target_addr, uint32_t fake_syscall_number) { + uint64_t target_addr, uint32_t fake_syscall_number, uint8_t stub[STUB_PATCH_LENGTH]) { int64_t offset = target_addr - (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); + int64_t ret_offset = + return_addr - + (patch_addr + X86SyscallStubExtendedJump::return_addr_relative_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around.
- X86TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - fake_syscall_number, (uint32_t)offset); + X86TrapInstructionStubExtendedJump::substitute(buffer, + fake_syscall_number, (uint32_t)offset, + (char*)stub, (uint32_t)ret_offset); } template <> void substitute_extended_jump<X64TrapInstructionStubExtendedJump>( uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, - uint32_t fake_syscall_number) { - X64TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)(return_addr >> 32), - fake_syscall_number, - target_addr); + uint32_t fake_syscall_number, uint8_t stub[STUB_PATCH_LENGTH]) { + X64TrapInstructionStubExtendedJump::substitute(buffer, fake_syscall_number, (char*)stub, + target_addr, return_addr); } /** @@ -449,20 +454,34 @@ static remote_ptr<uint8_t> allocate_extended_jump_aarch64( return jump_addr; } -bool Monkeypatcher::is_jump_stub_instruction(remote_code_ptr ip, bool include_safearea) { +Monkeypatcher::patched_syscall *Monkeypatcher::find_jump_stub(remote_code_ptr ip, bool include_safearea) { remote_ptr<uint8_t> pp = ip.to_data_ptr<uint8_t>(); - auto it = syscallbuf_stubs.upper_bound(pp); - if (it == syscallbuf_stubs.begin()) { - return false; + auto it = syscallbuf_stubs_by_extended_patch.upper_bound(pp); + if (it == syscallbuf_stubs_by_extended_patch.begin()) { + return nullptr; } --it; auto begin = it->first; - auto end = begin + it->second.size; + patched_syscall *ps = &syscall_stub_list[it->second]; + auto end = begin + ps->size; if (!include_safearea) { - begin += it->second.safe_prefix; - end -= it->second.safe_suffix; + begin += ps->safe_prefix; + end -= ps->safe_suffix; + } + return begin <= pp && pp < end ? ps : nullptr; +} + +Monkeypatcher::patched_syscall *Monkeypatcher::find_syscall_patch(remote_code_ptr ip) { + remote_ptr<uint8_t> pp = ip.to_data_ptr<uint8_t>(); + auto it = syscallbuf_stubs_by_patch_addr.upper_bound(pp); + if (it == syscallbuf_stubs_by_patch_addr.begin()) { + return nullptr; } - return begin <= pp && pp < end; + --it; + auto begin = it->first; + patched_syscall *ps = &syscall_stub_list[it->second]; + auto end = begin + ps->hook->patch_region_length; + return begin <= pp && pp < end ?
ps : nullptr; } remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip, @@ -471,18 +490,61 @@ remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip, return nullptr; } remote_ptr<uint8_t> pp = ip.to_data_ptr<uint8_t>(); - auto it = syscallbuf_stubs.upper_bound(pp); - if (it == syscallbuf_stubs.begin()) { + auto it = syscallbuf_stubs_by_extended_patch.upper_bound(pp); + if (it == syscallbuf_stubs_by_extended_patch.begin()) { return nullptr; } --it; - auto bp = it->first + it->second.size - it->second.safe_suffix; + patched_syscall *ps = &syscall_stub_list[it->second]; + auto bp = it->first + ps->size - ps->safe_suffix; if (pp == bp || pp == bp - 4) { return remote_code_ptr(bp.as_int()); } return nullptr; } +template <typename ExtendedJumpPatch> +uint64_t get_safe_suffix_length(); + +/* These need to match the size of the post-stack-restore region in assembly_templates.py */ +template <> +uint64_t get_safe_suffix_length<X64SyscallStubExtendedJump>() { + return 8 + 8 + 6 + 20 + 2; +} + +template <> +uint64_t get_safe_suffix_length<X86SyscallStubExtendedJump>() { + return 2 + 20 + 1 + 4; +} + + +static void fill_with_x86_nops(uint8_t *buf, size_t len) { + for (size_t i = 0; i < len;) { + switch (len - i) { + case 1: buf[i] = 0x90; return; + case 2: buf[i] = 0x66; buf[i+1] = 0x90; return; + case 3: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x00; return; + case 4: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x40; buf[i+3] = 0x00; return; + case 5: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x44; + buf[i+3] = 0x00; buf[i+4] = 0x00; return; + case 6: buf[i] = 0x66; buf[i+1] = 0x0f; buf[i+2] = 0x1f; + buf[i+3] = 0x44; buf[i+4] = 0x00; buf[i+5] = 0x00; return; + case 7: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x80; + buf[i+3] = 0x00; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; return; + case 8: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x84; + buf[i+3] = 0x00; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; buf[i+7] = 0x00; return; + default: + case 9: + buf[i] = 0x66; buf[i+1] = 0x0f; buf[i+2] = 0x1f; + buf[i+3] = 0x84; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; buf[i+7] = 0x00; buf[i+8] = 0x00; + i += 9; continue; + } + } +} + /** * Some functions make system calls while storing local variables in memory * below the stack pointer.
We need to decrement the stack pointer by @@ -539,26 +601,40 @@ static bool patch_syscall_with_hook_x86ish(Monkeypatcher& patcher, return false; } + uint8_t stub[20]; + memset(stub, 0x90, sizeof(stub)); + if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { + memcpy(stub, hook.patch_region_bytes, hook.patch_region_length); + fill_with_x86_nops(stub + hook.patch_region_length, sizeof(stub) - hook.patch_region_length); + } + + uint16_t safe_suffix = get_safe_suffix_length<ExtendedJumpPatch>(); // Everything starting from the syscall instruction if (fake_syscall_number) { uint8_t stub_patch[FakeSyscallExtendedJumpPatch::size]; substitute_extended_jump<FakeSyscallExtendedJumpPatch>(stub_patch, extended_jump_start.as_int(), return_addr, hook.hook_address, - fake_syscall_number); + fake_syscall_number, + stub); write_and_record_bytes(t, extended_jump_start, stub_patch); - patcher.syscallbuf_stubs[extended_jump_start] = { &hook, FakeSyscallExtendedJumpPatch::size }; + patcher.syscall_stub_list.push_back({ &hook, jump_patch_start, extended_jump_start, FakeSyscallExtendedJumpPatch::size, 0, safe_suffix }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[jump_patch_start] = patcher.syscall_stub_list.size() - 1; } else { uint8_t stub_patch[ExtendedJumpPatch::size]; substitute_extended_jump<ExtendedJumpPatch>(stub_patch, extended_jump_start.as_int(), return_addr, hook.hook_address, - 0); + 0, + stub); write_and_record_bytes(t, extended_jump_start, stub_patch); - patcher.syscallbuf_stubs[extended_jump_start] = { &hook, ExtendedJumpPatch::size }; + patcher.syscall_stub_list.push_back({ &hook, jump_patch_start, extended_jump_start, ExtendedJumpPatch::size, 0, safe_suffix }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[jump_patch_start] = patcher.syscall_stub_list.size() - 1; } intptr_t jump_offset = extended_jump_start - jump_patch_end; @@ -627,8 +703,8 @@ bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, auto total_patch_size = inst_buff.size() * 4; write_and_record_bytes(t, extended_jump_start, total_patch_size, &inst_buff[0]); - patcher.syscallbuf_stubs[extended_jump_start] = { - &hook, total_patch_size, + patcher.syscall_stub_list.push_back({ + &hook, svc_ip, extended_jump_start, total_patch_size, /** * safe_prefix: * We have not modified any registers yet in the first two instructions. @@ -647,7 +723,9 @@ bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, RecordTask* t, * The caller needs to have special handling for that instruction.
*/ 3 * 4 + 8 - }; + }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[svc_ip] = patcher.syscall_stub_list.size() - 1; intptr_t jump_offset = extended_jump_start - svc_ip; ASSERT(t, jump_offset <= aarch64_b_max_offset && jump_offset >= aarch64_b_min_offset) @@ -672,33 +750,6 @@ static bool patch_syscall_with_hook(Monkeypatcher& patcher, RecordTask* t, instruction_length, fake_syscall_number); } -template <typename ExtendedJumpPatch> -static bool match_extended_jump_patch(uint8_t patch[], - uint64_t *return_addr); - -template <> -bool match_extended_jump_patch<X64SyscallStubExtendedJump>( - uint8_t patch[], uint64_t *return_addr) { - uint32_t return_addr_lo, return_addr_hi; - uint64_t jmp_target; - if (!X64SyscallStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, &jmp_target)) { - return false; - } - *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); - return true; -} - -template <> -bool match_extended_jump_patch<X86SyscallStubExtendedJump>( - uint8_t patch[], uint64_t *return_addr) { - uint32_t return_addr_32, jmp_target_relative; - if (!X86SyscallStubExtendedJump::match(patch, &return_addr_32, &jmp_target_relative)) { - return false; - } - *return_addr = return_addr_32; - return true; -} - template <typename ReplacementPatch> static void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, uint64_t jmp_target); @@ -724,17 +775,11 @@ void substitute_replacement_patch(uint8_t *buffer, uint64 template <typename ExtendedJumpPatch, typename ReplacementPatch> static void unpatch_extended_jumps(Monkeypatcher& patcher, Task* t) { - for (auto patch : patcher.syscallbuf_stubs) { - const syscall_patch_hook &hook = *patch.second.hook; - ASSERT(t, patch.second.size == ExtendedJumpPatch::size); + for (auto &patch : patcher.syscall_stub_list) { + const syscall_patch_hook &hook = *patch.hook; + ASSERT(t, patch.size == ExtendedJumpPatch::size); uint8_t bytes[ExtendedJumpPatch::size]; - t->read_bytes_helper(patch.first, sizeof(bytes), bytes); - uint64_t return_addr; - if (!match_extended_jump_patch<ExtendedJumpPatch>(bytes, &return_addr)) { - ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; - return; - } - + uint64_t return_addr = patch.patch_addr.as_int() + hook.patch_region_length; std::vector<uint8_t> syscall = rr::syscall_instruction(t->arch()); // Replace with @@ -748,19 +793,20 @@ static void unpatch_extended_jumps(Monkeypatcher& patcher, ASSERT(t, hook.patch_region_length + ReplacementPatch::size + syscall.size() < ExtendedJumpPatch::size); uint8_t *ptr = bytes; - if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { - memcpy(ptr, syscall.data(), syscall.size()); - ptr += syscall.size(); - } - memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); - ptr += hook.patch_region_length; if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { memcpy(ptr, syscall.data(), syscall.size()); ptr += syscall.size(); + memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); + substitute_replacement_patch<ReplacementPatch>(ptr, + patch.stub_addr.as_int()+(ptr-bytes), return_addr); + t->write_bytes_helper(patch.stub_addr, sizeof(bytes), bytes); + } else { + // We already have a copy of the replaced bytes in place - all we need + // to do is to nop out the preceding instructions + uint64_t nop_area_size = ExtendedJumpPatch::size - get_safe_suffix_length<ExtendedJumpPatch>(); + memset(ptr, 0x90, nop_area_size); + t->write_bytes_helper(patch.stub_addr, nop_area_size, bytes); } - substitute_replacement_patch<ReplacementPatch>(ptr, - patch.first.as_int()+(ptr-bytes), return_addr); - t->write_bytes_helper(patch.first, sizeof(bytes), bytes); } } @@ -781,19 +827,19 @@ void
unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { template <> void unpatch_syscalls_arch<ARM64Arch>(Monkeypatcher &patcher, Task *t) { - for (auto patch : patcher.syscallbuf_stubs) { - const syscall_patch_hook &hook = *patch.second.hook; + for (auto patch : patcher.syscall_stub_list) { + const syscall_patch_hook &hook = *patch.hook; std::vector<uint32_t> hook_prefix; uint32_t prefix_ninst; encode_extended_jump_aarch64(hook_prefix, hook.hook_address, 0, &prefix_ninst); uint32_t prefix_size = prefix_ninst * 4; DEBUG_ASSERT(prefix_size <= 13 * 4); - ASSERT(t, patch.second.size >= prefix_size + 8); + ASSERT(t, patch.size >= prefix_size + 8); uint8_t bytes[15 * 4]; - t->read_bytes_helper(patch.first, prefix_size + 8, bytes); + t->read_bytes_helper(patch.stub_addr, prefix_size + 8, bytes); // 3rd last instruction is the one jumping back and it won't match if (memcmp(&hook_prefix[0], bytes, prefix_size - 3 * 4) != 0) { - ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; + ASSERT(t, false) << "Failed to match extended jump patch at " << patch.stub_addr; return; } @@ -803,7 +849,7 @@ void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { uint32_t svc_inst = 0xd4000001; memcpy(bytes, &svc_inst, 4); - uint64_t reverse_jump_addr = patch.first.as_int() + 4; + uint64_t reverse_jump_addr = patch.stub_addr.as_int() + 4; int64_t reverse_offset = int64_t(return_addr - reverse_jump_addr); ASSERT(t, reverse_offset <= aarch64_b_max_offset && reverse_offset >= aarch64_b_min_offset) @@ -812,7 +858,7 @@ void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { uint32_t binst = 0x14000000 | offset_imm26; memcpy(&bytes[4], &binst, 4); - t->write_bytes_helper(patch.first, 4 * 2, bytes); + t->write_bytes_helper(patch.stub_addr, 4 * 2, bytes); } } diff --git a/src/Monkeypatcher.h b/src/Monkeypatcher.h index 9daf95b3c28..97ac7007d9a 100644 --- a/src/Monkeypatcher.h +++ b/src/Monkeypatcher.h @@ -125,24 +125,34 @@ class Monkeypatcher { }; std::vector<ExtendedJumpPage> extended_jump_pages; - bool is_jump_stub_instruction(remote_code_ptr p, bool include_safearea); - // Return the breakpoint instruction (i.e. the last branch back to caller) - // if we are on the exit path in the jump stub - remote_code_ptr get_jump_stub_exit_breakpoint(remote_code_ptr ip, RecordTask *t); struct patched_syscall { // Pointer to hook inside the syscall_hooks array, which gets initialized // once and is fixed afterwards. const syscall_patch_hook *hook; + remote_ptr<uint8_t> patch_addr; + remote_ptr<uint8_t> stub_addr; size_t size; uint16_t safe_prefix = 0; uint16_t safe_suffix = 0; }; + patched_syscall *find_jump_stub(remote_code_ptr ip, bool include_safearea); + bool is_jump_stub_instruction(remote_code_ptr p, bool include_safearea) { + return (bool)find_jump_stub(p, include_safearea); + } + + patched_syscall *find_syscall_patch(remote_code_ptr patch_location); + + // Return the breakpoint instruction (i.e. the last branch back to caller) + // if we are on the exit path in the jump stub + remote_code_ptr get_jump_stub_exit_breakpoint(remote_code_ptr ip, RecordTask *t); /** * Addresses/lengths of syscallbuf stubs.
*/ - std::map<remote_ptr<uint8_t>, patched_syscall> syscallbuf_stubs; + std::vector<patched_syscall> syscall_stub_list; + std::map<remote_ptr<uint8_t>, int> syscallbuf_stubs_by_extended_patch; + std::map<remote_ptr<uint8_t>, int> syscallbuf_stubs_by_patch_addr; private: /** diff --git a/src/RecordSession.cc b/src/RecordSession.cc index a3d63fb0672..461bed960fe 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -471,7 +471,6 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, SupportedArch syscall_arch = t->detect_syscall_arch(); t->canonicalize_regs(syscall_arch); if (!process_syscall_entry(t, step_state, result, syscall_arch)) { - step_state->continue_type = RecordSession::DONT_CONTINUE; return; } *did_enter_syscall = true; @@ -508,6 +507,8 @@ static void seccomp_trap_done(RecordTask* t) { (uint8_t)1); } +extern void disarm_desched_event(RecordTask *t); +extern void leave_syscallbuf(RecordTask *t); static void handle_seccomp_trap(RecordTask* t, RecordSession::StepState* step_state, uint16_t seccomp_data) { @@ -542,27 +543,21 @@ static void handle_seccomp_trap(RecordTask* t, } } - if (t->is_in_untraced_syscall()) { - ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap); - // Don't reset the syscallbuf immediately after delivering the trap. We have - // to wait until this buffered syscall aborts completely before resetting - // the buffer. - t->delay_syscallbuf_reset_for_seccomp_trap = true; - - t->push_event(Event::seccomp_trap()); - + bool is_untraced_syscall = t->is_in_untraced_syscall(); + if (is_untraced_syscall) { // desched may be armed but we're not going to execute the syscall, let - // alone block. If it fires, ignore it. - t->write_mem( - REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant), - (uint8_t)0); + // alone block. Disarm the event and if it fires, ignore it. + disarm_desched_event(t); + leave_syscallbuf(t); + r = t->regs(); } + t->canonicalize_regs(t->detect_syscall_arch()); t->push_syscall_event(syscallno); t->ev().Syscall().failed_during_preparation = true; note_entering_syscall(t); - if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) { + if (is_untraced_syscall && !syscall_entry_already_recorded) { t->record_current_event(); } @@ -578,10 +573,21 @@ static void handle_seccomp_trap(RecordTask* t, si.native_api.si_code = SYS_SECCOMP; si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch()); si.native_api._sifields._sigsys._syscall = syscallno; + // Documentation says that si_call_addr is the address of the syscall // instruction, but in tests it's immediately after the syscall // instruction. - si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr(); + remote_code_ptr seccomp_ip = t->ip(); + + /* If we actually deliver this signal, we will fudge the ip value to instead + point into the patched-out syscall. The callee may rely on these values + matching, so do the same adjustment here.
*/ + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(seccomp_ip, true); + if (ps) { + seccomp_ip = (ps->patch_addr + (seccomp_ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix)).as_int(); + } + + si.native_api._sifields._sigsys._call_addr = seccomp_ip.to_data_ptr(); LOG(debug) << "Synthesizing " << si.linux_api; t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG); @@ -591,16 +597,24 @@ static void handle_seccomp_trap(RecordTask* t, t->set_regs(r); t->maybe_restore_original_syscall_registers(); - if (t->is_in_untraced_syscall()) { + if (is_untraced_syscall) { + Registers r = t->regs(); + // Cause kernel processing to skip the syscall + r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); + t->set_regs(r); + // For buffered syscalls, go ahead and record the exit state immediately. t->ev().Syscall().state = EXITING_SYSCALL; t->record_current_event(); t->pop_syscall(); - // The tracee is currently in the seccomp ptrace-stop. Advance it to the - // syscall-exit stop so that when we try to deliver the SIGSYS via + // The tracee is currently in the seccomp ptrace-stop or syscall-entry stop. + // Advance it to the syscall-exit stop so that when we try to deliver the SIGSYS via // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (t->status().ptrace_event() == PTRACE_EVENT_SECCOMP) { + t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + } } // Don't continue yet. At the next iteration of record_step, if we @@ -815,12 +829,6 @@ void RecordSession::task_continue(const StepState& step_state) { // A task in an emulated ptrace-stop must really stay stopped ASSERT(t, !t->emulated_stop_pending); - bool may_restart = t->at_may_restart_syscall(); - - if (may_restart && t->seccomp_bpf_enabled) { - LOG(debug) << " PTRACE_SYSCALL to possibly-restarted " << t->ev(); - } - if (!t->vm()->first_run_event()) { t->vm()->set_first_run_event(trace_writer().time()); } @@ -892,7 +900,7 @@ void RecordSession::task_continue(const StepState& step_state) { makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE traps. Detect and handle this. */ - if (!t->seccomp_bpf_enabled || may_restart || + if (!t->seccomp_bpf_enabled || syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { resume = RESUME_SYSCALL; } else { @@ -1232,6 +1240,17 @@ void RecordSession::syscall_state_changed(RecordTask* t, ASSERT(t, t->regs().original_syscallno() == -1); } rec_did_sigreturn(t); + + /* The inverse of the processing we do during signal delivery - if the IP + points into a region that we patched out, move us to the extended jump + patch instead. */ + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_syscall_patch(t->ip()); + if (ps) { + Registers r = t->regs(); + r.set_ip((ps->stub_addr + (r.ip() - ps->patch_addr.as_int()).register_value() + (ps->size - ps->safe_suffix)).as_int()); + t->set_regs(r); + } + t->record_current_event(); t->pop_syscall(); @@ -1500,6 +1519,33 @@ static bool inject_handled_signal(RecordTask* t) { t->stashed_signal_processed(); int sig = t->ev().Signal().siginfo.si_signo; + + /* + * If we're delivering a signal while in the extended jump patch, pretend we're in the + * unpatched code instead. That way, any unwinder that makes use of CFI for unwinding + * will see the correct unwind info of the patch site rather than that of the extended + * jump patch. 
The instruction sequence in the original code was of course altered by + * the patch, so if the signal handler inspects that, it might get confused. However, + * that is already a general problem with our patching strategy, in that the application + * is not allowed to read its own code. + * Naturally, we need to perform the inverse transformation in sigreturn. + */ + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(t->ip(), true); + if (ps) { + Registers r = t->regs(); + uint64_t translated_patch_offset = (r.ip() - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix); + // We patch out the jump stub with nop, but of course, if we happen to find ourselves + // in the middle of the nop sled, we just want to end up at the end of the patch + // region. + size_t total_patch_region_size = ps->hook->patch_region_length + + rr::syscall_instruction_length(t->arch()); + if (translated_patch_offset > total_patch_region_size) { + translated_patch_offset = total_patch_region_size; + } + r.set_ip(ps->patch_addr.as_int() + translated_patch_offset); + t->set_regs(r); + } + do { // We are ready to inject our signal. // XXX we assume the kernel won't respond by notifying us of a different @@ -1909,32 +1955,22 @@ static bool is_ptrace_any_sysemu(SupportedArch arch, int command) bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state, RecordResult* step_result, SupportedArch syscall_arch) { - if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) { - // The only four cases where we allow a stashed signal to be pending on - // syscall entry are: - // -- the signal is a ptrace-related signal, in which case if it's generated - // during a blocking syscall, it does not interrupt the syscall - // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and - // lets us dispatch signals afterward - // -- when we're entering a blocking untraced syscall. If it really blocks, - // we'll get the desched-signal notification and dispatch our stashed - // signal. - // -- when we're doing a privileged syscall that's internal to the preload - // logic - // We do not generally want to have stashed signals pending when we enter - // a syscall, because that will execute with a hacked signal mask - // (see RecordTask::will_resume_execution) which could make things go wrong. - ASSERT(t, - t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall( - t->regs().original_syscallno(), t->arch()) || - t->ip() == - t->vm() - ->privileged_traced_syscall_ip() - .increment_by_syscall_insn_length(t->arch())) - << "Stashed signal pending on syscall entry when it shouldn't be: " - << sig->siginfo << "; regs=" << t->regs() - << "; last_execution_resume=" << t->last_execution_resume() - << "; sig ip=" << sig->ip; + if (!t->is_in_syscallbuf() && t->stashed_sig_not_synthetic_SIGCHLD()) { + // If we have a pending signal, deliver it as if it had happened just before + // execution of the syscall instruction. To this end, kick us out of the + // current syscall again and set up the registers for a restart. Regular + // signal injection will do the rest. 
+ LOG(debug) << "Entered syscall, but signal pending - setting up pre-syscall signal delivery"; + Registers entry_regs = t->regs(); + Registers r = entry_regs; + // Cause kernel processing to skip the syscall + r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); + t->set_regs(r); + t->exit_syscall(); + entry_regs.set_ip(entry_regs.ip().decrement_by_syscall_insn_length(syscall_arch)); + entry_regs.set_syscallno(entry_regs.original_syscallno()); + t->set_regs(entry_regs); + return false; } // We just entered a syscall. diff --git a/src/RecordTask.cc b/src/RecordTask.cc index 6e4b3a2e819..0b6e5ff71a8 100644 --- a/src/RecordTask.cc +++ b/src/RecordTask.cc @@ -634,6 +634,9 @@ bool RecordTask::will_resume_execution(ResumeRequest, WaitRequest, if (!set_sigmask(sigset)) { return false; } + LOG(debug) << "Set signal mask to block all signals (bar " + << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " + << " have a stashed signal"; } // RESUME_NO_TICKS means that tracee code is not going to run so there's no @@ -710,7 +713,9 @@ void RecordTask::did_wait() { // state, because we do not allow stashed_signals_blocking_more_signals // to hold across syscalls (traced or untraced) that change the signal mask. ASSERT(this, !blocked_sigs_dirty); - xptrace(PTRACE_SETSIGMASK, remote_ptr(8), &blocked_sigs); + if (set_sigmask(blocked_sigs)) { + LOG(debug) << "Blocked signals restored"; + } } else if (syscallbuf_child) { // The syscallbuf struct is only 32 bytes currently so read the whole thing // at once to avoid multiple calls to read_mem. Even though this shouldn't @@ -1294,10 +1299,6 @@ bool RecordTask::set_sigmask(sig_set_t mask) { return false; } ASSERT(this, errno == EINVAL); - } else { - LOG(debug) << "Set signal mask to block all signals (bar " - << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " - << " have a stashed signal"; } return true; } diff --git a/src/assembly_templates.py b/src/assembly_templates.py index 866eba71055..3ada7ed1552 100644 --- a/src/assembly_templates.py +++ b/src/assembly_templates.py @@ -14,13 +14,16 @@ class Field(object): def __init__(self, name, byte_length): self.name = name self.byte_length = byte_length + self.types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } def __len__(self): return self.byte_length def c_type(self): - types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } - return types[self.byte_length] + return self.types[self.byte_length] if (self.byte_length in self.types) else 'char' + + def c_arr(self): + return '' if (self.byte_length in self.types) else '[' + str(self.byte_length) + ']' class ShiftField(object): """A field embedded at some bit shift offset in another object.""" @@ -37,6 +40,9 @@ def c_type(self): types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } return types[self.byte_length] + def c_arr(self): + return '' + def patch_c_type(self): types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } return types[len(self.parent)] @@ -96,10 +102,15 @@ def bytes(self): RawBytes(0x8b, 0x25, 0x00, 0x10, 0x00, 0x70), # movl (syscallbuf_stub_alt_stack),%esp # dont_switch: RawBytes(0xff, 0x35, 0x08, 0x10, 0x00, 0x70), # pushl (stub_scratch_1) - RawBytes(0x68), # pushl $return_addr - Field('return_addr', 4), - RawBytes(0xe9), # jmp $trampoline_relative_addr - Field('trampoline_relative_addr', 4) + RawBytes(0xe8), # call $trampoline_relative_addr + Field('trampoline_relative_addr', 4), + # Restore the stack pointer + RawBytes(0x5c), # popl %esp + RawBytes(0xcd, 
0x80), # int $0x80 + Field('stub', 20), + RawBytes(0xe9), # jmp $return_addr_relative + Field('return_addr_relative', 4) + ), 'X86TrapInstructionStubExtendedJump': AssemblyTemplate( # This code must match the stubs in syscall_hook.S. @@ -113,12 +124,16 @@ def bytes(self): RawBytes(0x8b, 0x25, 0x00, 0x10, 0x00, 0x70), # movl (syscallbuf_stub_alt_stack),%esp # dont_switch: RawBytes(0xff, 0x35, 0x08, 0x10, 0x00, 0x70), # pushl (stub_scratch_1) - RawBytes(0x68), # pushl $return_addr - Field('return_addr', 4), RawBytes(0xb8), # movl $fake_syscall_no,%eax Field('fake_syscall_no', 4), - RawBytes(0xe9), # jmp $trampoline_relative_addr - Field('trampoline_relative_addr', 4) + RawBytes(0xe8), # call $trampoline_relative_addr + Field('trampoline_relative_addr', 4), + # Restore the stack pointer + RawBytes(0x5c), # popl %esp + RawBytes(0xcd, 0x80), # int $0x80 + Field('stub', 20), + RawBytes(0xe9), # jmp $return_addr_relative + Field('return_addr_relative', 4) ), 'X86SyscallStubRestore': AssemblyTemplate( RawBytes(0xe9), # jmp $trampoline_relative_addr @@ -146,14 +161,16 @@ def bytes(self): # dont_switch: RawBytes(0x48, 0x81, 0xec, 0x00, 0x01, 0x00, 0x00), # subq $256, %rsp # after adjust + # Push the stack pointer we saved above onto our new stack RawBytes(0xff, 0x34, 0x25, 0x10, 0x10, 0x00, 0x70), # pushq (stub_scratch_1) - RawBytes(0x50), # pushq rax (just to make space for the next 2 instructions) - RawBytes(0xc7, 0x04, 0x24), # movl $return_addr_lo,(%rsp) - Field('return_addr_lo', 4), - RawBytes(0xc7, 0x44, 0x24, 0x04), # movl $return_addr_hi,(%rsp+4) - Field('return_addr_hi', 4), - RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) + RawBytes(0xff, 0x15, 0x1d, 0x00, 0x00, 0x00), # callq *jump_target(%rip) + # Restore the stack pointer + RawBytes(0x5c), # popq %rsp + RawBytes(0x0f, 0x05), # syscall + Field('stub', 20), + RawBytes(0xff, 0x25, 0x08, 0x00, 0x00, 0x00), # jmp *8(%rip) Field('jump_target', 8), + Field('return_addr', 8) ), 'X64TrapInstructionStubExtendedJump': AssemblyTemplate( # This code must match the stubs in syscall_hook.S. 
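For context on what the generator now emits for these templates: with the c_arr() changes in this file, a fixed-size byte field such as 'stub' becomes an array parameter of the generated substitute() rather than an integer. A minimal sketch of the generated shape, with illustrative placeholder offsets (the real ones are computed from the RawBytes layout):

    // Sketch of generator output for a template with a 20-byte 'stub' field.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    struct X64SyscallStubExtendedJumpSketch {
      // Offsets are placeholders for this sketch only.
      static const size_t OFF_STUB = 27, OFF_JUMP_TARGET = 53, OFF_RETURN_ADDR = 61;
      static void substitute(uint8_t* buffer, char stub[20],
                             uint64_t jump_target, uint64_t return_addr) {
        memcpy(&buffer[OFF_STUB], &*stub, 20);  // array field: '&*name' form, per the generator
        memcpy(&buffer[OFF_JUMP_TARGET], &jump_target, 8);
        memcpy(&buffer[OFF_RETURN_ADDR], &return_addr, 8);
      }
    };

This is the shape that lets Monkeypatcher.cc above call X64SyscallStubExtendedJump::substitute(buffer, (char*)stub, target_addr, return_addr).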
@@ -170,15 +187,16 @@ def bytes(self): RawBytes(0x48, 0x81, 0xec, 0x00, 0x01, 0x00, 0x00), # subq $256, %rsp # after adjust RawBytes(0xff, 0x34, 0x25, 0x10, 0x10, 0x00, 0x70), # pushq (stub_scratch_1) - RawBytes(0x50), # pushq rax (just to make space for the next 2 instructions) - RawBytes(0xc7, 0x04, 0x24), # movl $return_addr_lo,(%rsp) - Field('return_addr_lo', 4), - RawBytes(0xc7, 0x44, 0x24, 0x04), # movl $return_addr_hi,(%rsp+4) - Field('return_addr_hi', 4), RawBytes(0xb8), # movl $fake_syscall_no,%eax Field('fake_syscall_no', 4), - RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) + RawBytes(0xff, 0x15, 0x1d, 0x00, 0x00, 0x00), # callq *jump_target(%rip) + # Restore the stack pointer + RawBytes(0x5c), # popq %rsp + RawBytes(0x0f, 0x05), # syscall + Field('stub', 20), + RawBytes(0xff, 0x25, 0x08, 0x00, 0x00, 0x00), # jmp *8(%rip) Field('jump_target', 8), + Field('return_addr', 8) ), 'X64SyscallStubRestore': AssemblyTemplate( RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) @@ -232,7 +250,8 @@ def generate_match_method(byte_array, template): fields = template.fields() field_types = [f.c_type() for f in fields] field_names = [f.name for f in fields] - args = ', ' + ', '.join("%s* %s" % (t, n) for t, n in zip(field_types, field_names)) \ + field_arrs = [f.c_arr() for f in fields] + args = ', ' + ', '.join("%s (*%s)%s" % (t, n, a) for t, n, a in zip(field_types, field_names, field_arrs)) \ if fields else '' s.write(' static bool match(const uint8_t* buffer %s) {\n' % (args,)) @@ -240,8 +259,8 @@ def generate_match_method(byte_array, template): for chunk in template.chunks: if isinstance(chunk, Field): field_name = chunk.name - s.write(' memcpy(%s, &buffer[%d], sizeof(*%s));\n' - % (field_name, offset, field_name)) + s.write(' memcpy(%s, &buffer[%d], %d);\n' + % (field_name, offset, len(chunk))) elif isinstance(chunk, ShiftField): s.write(' (void)%s;' % chunk.name) s.write(' assert(0 && "Matching not implemented for ShiftField");') @@ -256,8 +275,8 @@ def generate_match_method(byte_array, template): def generate_substitute_chunk(s, chunk, byte_array, offset): if isinstance(chunk, Field): field_name = chunk.name - s.write(' memcpy(&buffer[%d], &%s, sizeof(%s));\n' - % (offset, field_name, field_name)) + s.write(' memcpy(&buffer[%d], &%s, %d);\n' + % (offset, field_name if chunk.c_arr() == '' else '*'+field_name, len(chunk))) elif isinstance(chunk, ShiftField): generate_substitute_chunk(s, chunk.parent, byte_array, offset); typ = chunk.patch_c_type() @@ -275,7 +294,8 @@ def generate_substitute_method(byte_array, template): fields = template.fields() field_types = [f.c_type() for f in fields] field_names = [f.name for f in fields] - args = ', ' + ', '.join("%s %s" % (t, n) for t, n in zip(field_types, field_names)) \ + field_arrs = [f.c_arr() for f in fields] + args = ', ' + ', '.join("%s %s%s" % (t, n, a) for t, n, a in zip(field_types, field_names, field_arrs)) \ if fields else '' s.write(' static void substitute(uint8_t* buffer %s) {\n' % (args,)) diff --git a/src/preload/preload_interface.h b/src/preload/preload_interface.h index 423cd7fc72e..7198ebc3a06 100644 --- a/src/preload/preload_interface.h +++ b/src/preload/preload_interface.h @@ -178,6 +178,7 @@ static inline const char* extract_file_name(const char* s) { */ #define PATCH_SYSCALL_INSTRUCTION_IS_LAST (1 << 1) +#define STUB_PATCH_LENGTH 20 /** * To support syscall buffering, we replace syscall instructions with a "call" * instruction that calls a hook in the preload library to handle the syscall. 
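Since STUB_PATCH_LENGTH is introduced just above: the 20-byte stub area is what patch_syscall_with_hook_x86ish earlier in this diff fills before substitution. A minimal free-standing sketch of that preparation step (hypothetical helper name; the real logic lives inline in Monkeypatcher.cc):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    // When the syscall instruction is NOT the last instruction of the patch
    // region, the displaced instructions are copied into the stub so they can
    // execute inside the extended jump patch after the syscall returns;
    // otherwise the stub stays all-nop.
    void prepare_stub(uint8_t stub[20], const uint8_t* patch_region_bytes,
                      size_t patch_region_length, bool syscall_insn_is_last) {
      memset(stub, 0x90, 20);  // 0x90 = one-byte x86 nop
      if (!syscall_insn_is_last) {
        memcpy(stub, patch_region_bytes, patch_region_length);
        // The real code then runs fill_with_x86_nops() over the tail so the
        // padding executes as a few multi-byte nops instead of many 0x90s.
      }
    }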
@@ -493,7 +494,7 @@ struct syscallbuf_record { /* Does this record require an assist during replay ? */ uint8_t replay_assist : 1; uint8_t _flags_padding : 6; - uint8_t _padding; + uint8_t aborted; /* Size of entire record in bytes: this struct plus extra * recorded data stored inline after the last field, not * including padding. diff --git a/src/preload/syscall_hook.S b/src/preload/syscall_hook.S index fc963d2c9e2..c9efc87f1d9 100644 --- a/src/preload/syscall_hook.S +++ b/src/preload/syscall_hook.S @@ -31,6 +31,16 @@ ((val) & (0xFF << 0x30)) >> 0x30, \ ((val) & (0xFF << 0x38)) >> 0x38 +#define DW_OP_deref 0x06 +#define DW_OP_dup 0x12 +#define DW_OP_minus 0x1c +#define DW_OP_lit(val) 0x30+val +#define DW_OP_plus 0x22 +#define DW_OP_plus_uconst(const) 0x23, const + +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_val_expression 0x16 + #define REG_AT_ADDR32(reg, addr) \ .cfi_escape 0x10, /* DW_CFA_expression */ \ reg, \ @@ -42,6 +52,20 @@ 0x09, /* 9 byte expression follows */ \ DW_OP_CONST8U(addr) +#define REG_AT_ADDR32_PLUS_OFFSET(reg, addr, offset) \ + .cfi_escape DW_CFA_val_expression, \ + reg, \ + 0x08, /* 8 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref, \ + DW_OP_plus_uconst(offset); + +#define CFA_AT_ADDR32(addr) \ + .cfi_escape DW_CFA_def_cfa_expression, \ + 0x06, /* 6 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref; + // 10 bytes LEB128 is enough to encode 64bit integer and we shouldn't // really need anything longer than that. #define COUNT_LEB128(lebs...) \ @@ -53,7 +77,17 @@ reg, \ (COUNT_LEB128(lebs) + 1), /* 1 byte + LEB128 bytes */ \ (0x70 + base), /* DW_OP_breg0 + base */ \ - lebs + lebs; + +#define REG_AT_REG_OFFSET_DEREF_OFFSET(reg, base, offset1, offset2) \ + .cfi_escape 0x10, /* DW_CFA_expression */ \ + reg, \ + (2 + 1 + 2), \ + (0x70 + base), /* DW_OP_breg0 + base */ \ + offset1, \ + DW_OP_deref, \ + DW_OP_plus_uconst(offset2); + #if defined(__i386__) .text @@ -62,6 +96,77 @@ .set alt_stack_nesting_level, preload_thread_locals + 12 .set saved_flags, preload_thread_locals + 16 + +// Needs to match assembly_templates.py. Measured from the end of the call +// instruction to before the jmp instruction. +#define EXTENDED_JUMP_STUB_REGION_SIZE 23 + +/* + * The syscallbuf extended jump patch has the form: + * + * call <trampoline> + * pop %esp + * int $0x80 + * <20 byte stub: copy of the patched-out instructions> + * jmpl <return address> + * + * The macros help read the relative jump address from memory and convert + * it to an absolute address. The idea is that during execution in the + * syscallbuf, our backtrace will look like: + * + * < syscallbuf C code > + * _syscall_hook_trampoline + * < function that contains the patched syscall > + * + * except while executing in the actual syscallhook stubs, where the backtrace + * will look like: + * + * _syscallbuf_hook_**** + * < function that contains the patched syscall > + * + * There are two things to note here: + * 1. We always omit the extended jump patch from the backtrace, because + * we (currently) have no way to give GDB any unwind info for it. + * 2. While in the syscallbuf C code, we omit the syscallbuf_hook_* functions + * from the backtrace. This is because we may switch out that frame for + * the bail path, which would confuse GDB when attempting to leave a frame. + * This setup is a bit weird, but it's not terrible, because we are essentially + * modeling a set of leaf frames that tail call to each other, which is a + * supported mode of operation and should thus not confuse GDB too much.
+ */ +#define REL_JMP_FROM_JMP_INSTR(offset2) \ + /* Move us to after the jmp instruction */ \ + DW_OP_plus_uconst(offset2), \ + DW_OP_dup, \ + DW_OP_lit(4), \ + DW_OP_minus, \ + DW_OP_deref, \ + DW_OP_plus +#define RIP_IS_AT_REL_JMP_OFFSET(base, offset1, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xa, /* 10 byte expression follows */ \ + /* Compute the return address that's on the stack */ \ + (0x70 + base), /* DW_OP_breg0 + base */ \ + offset1, \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); +#define RIP_IS_AT_ADDR_REL_JMP_OFFSET(addr, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xd, /* 13 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); +#define RIP_IS_AT_CFA_REL_JMP_OFFSET(offset1, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xa, /* 10 byte expression follows */ \ + DW_OP_plus_uconst(offset1), \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); + .p2align 4 _syscallbuf_code_start: @@ -76,6 +181,8 @@ _syscallbuf_final_exit_instruction: _syscall_hook_trampoline: .cfi_startproc + .cfi_offset %esp, 4; + RIP_IS_AT_CFA_REL_JMP_OFFSET(0x0, 0x5) /* Build a |struct syscall_info| by pushing all the syscall * args and the number onto the stack. */ /* struct syscall_info info; */ @@ -129,7 +236,7 @@ _syscall_hook_trampoline: pushl %ebp call syscall_hook - /* $eax = vsyscall_hook(&info); */ + /* $eax = syscall_hook(&info); */ movdqa 0x10(%esp),%xmm0 movdqa 0x20(%esp),%xmm1 @@ -140,6 +247,13 @@ _syscall_hook_trampoline: movdqa 0x70(%esp),%xmm6 movdqa 0x80(%esp),%xmm7 + test %eax,%eax + jnz 2f + + // Switch the syscallbuf hook frame to the bail path + movl $_syscall_hook_bail+1, 0x1c(%ebp) + +2: mov $saved_flags, %esp popfw /* From here on, non-application flag changes are not allowed */ @@ -148,9 +262,7 @@ _syscall_hook_trampoline: mov %ebp, %esp .cfi_def_cfa_register %esp - /* $eax is now the syscall return value. Erase |info.no| from the - * stack so that we can restore the other registers we saved. */ - lea 4(%esp),%esp + pop %eax .cfi_adjust_cfa_offset -4 /* Contract of __kernel_vsyscall() and real syscalls is that even @@ -189,18 +301,20 @@ name: \ .cfi_startproc; \ .cfi_def_cfa_offset 0; \ - .cfi_offset %eip, 0; \ - .cfi_offset %esp, 4 + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, EXTENDED_JUMP_STUB_REGION_SIZE + 5) \ + .cfi_offset %esp, 4; \ + addl $EXTENDED_JUMP_STUB_REGION_SIZE, (%esp); \ + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, 5) #define SYSCALLHOOK_END(name) \ pop (stub_scratch_1); \ .cfi_adjust_cfa_offset -4; \ + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, 0x05) \ pop %esp; \ .cfi_same_value %esp; \ - REG_AT_ADDR32(0x08 /* %eip */, stub_scratch_1); \ jmp _syscallbuf_final_exit_instruction; \ .cfi_endproc; \ - .size name, .-name + .size name, .-name; SYSCALLHOOK_START(_syscall_hook_trampoline_3d_01_f0_ff_ff) call _syscall_hook_trampoline @@ -211,6 +325,32 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90) call _syscall_hook_trampoline SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90) +.global _syscall_hook_bail +.hidden _syscall_hook_bail +.type _syscall_hook_bail, @function +_syscall_hook_bail: +.cfi_startproc + .cfi_def_cfa_offset 0; + .cfi_offset %esp, 4 + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, 0x05) + nop + // We target our return here rather than the first instruction in the function, + // because gdb likes to back up a byte for function identification and gets confused.
+ pop (stub_scratch_1); + .cfi_def_cfa_offset -4; + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, 0x05) + // GDB doesn't like stack adjustments in increments of 2, so use pushfl, rather than pushfw + pushfl + .cfi_def_cfa_offset 0; + subl $EXTENDED_JUMP_STUB_REGION_SIZE, (stub_scratch_1) + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, EXTENDED_JUMP_STUB_REGION_SIZE + 0x05) + popfl + .cfi_def_cfa_offset -4; + jmp *(stub_scratch_1) +nop +ret +.cfi_endproc + /* Declare gcc get_pc thunks here so they're in a known region of code */ .global _get_pc_thunks_start @@ -320,6 +460,23 @@ _syscall_hook_trampoline: movdqa 0x60(%rsp),%xmm6 movdqa 0x70(%rsp),%xmm7 + test %rax,%rax + jnz 2f + + // Switch the syscallbuf hook frame to the bail path + lea (_syscall_hook_bail+1)(%rip), %rdi + movq %rdi, 0x48(%rbx) + + // Canonicalize registers that are affected by syscall entry. + // We sometimes don't record a deferred event until we've already + // hit the bail path syscall instruction, but want to pretend it + // happened just before. By setting the registers here, replay will + // see that event point with the same register set we had during + // record. + movq $0x246, %r11 + movq $-1, %rcx + +2: mov $saved_flags, %rsp popfw /* From here on, non-application flag changes are not allowed */ @@ -327,9 +484,9 @@ _syscall_hook_trampoline: mov %rbx,%rsp .cfi_def_cfa_register %rsp - /* On entrance, we pushed the %rax, the syscall number. But we don't - want to |pop %rax|, as that will overwrite our return value. Skip over it. */ - pop %rdi + /* This restores either the original value of rax (if we're going out + via the bail path) or the syscall result (in the regular path). */ + pop %rax .cfi_adjust_cfa_offset -8 /* We don't really *need* to restore these, since the kernel could have @@ -406,10 +563,14 @@ _syscallbuf_final_exit_instruction: 0x77, offset; /* DW_OP_breg7, offset */ #define RIP_IS_DEREF_RSP(offset) REG_AT_REG_OFFSET(0x10 /* %rip */, 7, offset) +#define RIP_IS_DEREF_DEREF_RSP(offset1, offset2) \ + REG_AT_REG_OFFSET_DEREF_OFFSET(0x10 /* %rip */, 7, offset1, offset2) /** * On syscallhook entry, the stack has been switched to the end of per-task * scratch space, then the old RSP and the return address have been pushed. + * The CFA of our syscallbuf frame is the value that rsp had at the syscall + * instruction we're patching (i.e. on the unswitched stack). */ #define SYSCALLHOOK_START(name) \ .global name; \ @@ -419,17 +580,28 @@ name: \ .cfi_startproc; \ CFA_AT_RSP_OFFSET(8) \ RSP_IS_CFA \ - RIP_IS_DEREF_RSP(0) + RIP_IS_DEREF_DEREF_RSP(0, 0x25) +/* We skip returning into the extended jump patch, because we + don't have a CFI frame for it and this makes GDB slightly + happier.
*/ #define SYSCALLHOOK_END(name) \ - pop (stub_scratch_1); \ + popq (stub_scratch_1); \ CFA_AT_RSP_OFFSET(0) \ - REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \ - pop %rsp; \ + REG_AT_ADDR32_PLUS_OFFSET(0x10 /* %rip */, stub_scratch_1, 0x25) \ + popq %rsp; \ + .cfi_def_cfa %rsp, 0; \ + xchgq %rsp, (stub_scratch_1); \ + CFA_AT_ADDR32(stub_scratch_1) \ + RIP_IS_DEREF_RSP(0x25); \ + mov 0x25(%rsp), %rsp; \ + RIP_IS_DEREF_RSP(0); \ .cfi_def_cfa %rsp, 0; \ + xchgq %rsp, (stub_scratch_1); \ + REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \ jmp _syscallbuf_final_exit_instruction; \ .cfi_endproc; \ - .size name, .-name + .size name, .-name; /* See note above on what __morestack is for */ .global __morestack @@ -539,7 +711,6 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_c3_nop) pop (stub_scratch_1) .cfi_adjust_cfa_offset -8 jmp _syscallbuf_final_exit_instruction - .cfi_endproc .size _syscall_hook_trampoline_c3_nop, .-_syscall_hook_trampoline_c3_nop @@ -609,6 +780,22 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_b8_ca_00_00_00) callq __morestack SYSCALLHOOK_END(_syscall_hook_trampoline_b8_ca_00_00_00) +.global _syscall_hook_bail +.hidden _syscall_hook_bail +.type _syscall_hook_bail, @function +_syscall_hook_bail: +.cfi_startproc +CFA_AT_RSP_OFFSET(8) +RSP_IS_CFA +RIP_IS_DEREF_DEREF_RSP(0, 0x25) +nop +// We target our return here rather than the first instruction in the function, +// because gdb likes to back up a byte for function identification and gets confused. +nop +retq +.cfi_endproc + + #elif defined(__aarch64__) .text diff --git a/src/preload/syscallbuf.c b/src/preload/syscallbuf.c index f631332904b..aa35ee62811 100644 --- a/src/preload/syscallbuf.c +++ b/src/preload/syscallbuf.c @@ -320,26 +320,6 @@ static int privileged_traced_syscall(int syscallno, long a0, long a1, long a2, #define privileged_traced_syscall1(no, a0) privileged_traced_syscall2(no, a0, 0) #define privileged_traced_syscall0(no) privileged_traced_syscall1(no, 0) -/** - * Make a raw traced syscall using the params in |call|. - */ -static long traced_raw_syscall(struct syscall_info* call) { - if (call->no == SYS_rrcall_rdtsc) { - // Handle this specially because the rrcall writes to a memory out-param - // and we need to actually modify the outgoing AX/DX registers instead. - uint32_t tsc[2]; - privileged_traced_syscall1(SYS_rrcall_rdtsc, tsc); - // Overwrite RDX (syscall arg 3) with our TSC value. - call->args[2] = tsc[1]; - return tsc[0]; - } - /* FIXME: pass |call| to avoid pushing these on the stack - * again. */ - return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2], - call->args[3], call->args[4], call->args[5], - RR_PAGE_SYSCALL_TRACED, 0, 0); -} - /** * Make a raw traced syscall using the params in |call|, privileged. 
*/ @@ -732,7 +712,7 @@ static void __attribute__((constructor)) init_process(void) { 5, { 0x3d, 0x01, 0xf0, 0xff, 0xff }, (uintptr_t)_syscall_hook_trampoline_3d_01_f0_ff_ff }, - /* Our vdso syscall patch has 'int 80' followed by onp; nop; nop */ + /* Our vdso syscall patch has 'int 80' followed by nop; nop; nop */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x90, 0x90, 0x90 }, @@ -741,25 +721,28 @@ static void __attribute__((constructor)) init_process(void) { extern char _get_pc_thunks_start; extern char _get_pc_thunks_end; #elif defined(__x86_64__) - extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_01_f0_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_00_f0_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_8b_3c_24(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_45_f8(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c3(void); - extern RR_HIDDEN void _syscall_hook_trampoline_5a_5e_c3(void); - extern RR_HIDDEN void _syscall_hook_trampoline_89_c2_f7_da(void); - extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void); - extern RR_HIDDEN void _syscall_hook_trampoline_ba_01_00_00_00(void); - extern RR_HIDDEN void _syscall_hook_trampoline_89_c1_31_d2(void); - extern RR_HIDDEN void _syscall_hook_trampoline_c3_nop(void); - extern RR_HIDDEN void _syscall_hook_trampoline_40_80_f6_81(void); - extern RR_HIDDEN void _syscall_hook_trampoline_49_89_ca(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c1(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_c1_e2_20(void); - extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_f7(void); - extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_b8_ca_00_00_00(void); +#define DECLARE_SYSCALLHOOK(name) \ + extern RR_HIDDEN void _syscall_hook_trampoline_ ## name(void); + + DECLARE_SYSCALLHOOK(48_3d_01_f0_ff_ff); + DECLARE_SYSCALLHOOK(48_3d_00_f0_ff_ff); + DECLARE_SYSCALLHOOK(48_8b_3c_24); + DECLARE_SYSCALLHOOK(48_89_45_f8); + DECLARE_SYSCALLHOOK(48_89_c3); + DECLARE_SYSCALLHOOK(5a_5e_c3); + DECLARE_SYSCALLHOOK(89_c2_f7_da); + DECLARE_SYSCALLHOOK(90_90_90); + DECLARE_SYSCALLHOOK(ba_01_00_00_00); + DECLARE_SYSCALLHOOK(89_c1_31_d2); + DECLARE_SYSCALLHOOK(c3_nop); + DECLARE_SYSCALLHOOK(40_80_f6_81); + DECLARE_SYSCALLHOOK(49_89_ca); + DECLARE_SYSCALLHOOK(48_89_c1); + DECLARE_SYSCALLHOOK(48_c1_e2_20); + DECLARE_SYSCALLHOOK(4c_89_f7); + DECLARE_SYSCALLHOOK(4c_89_ff); + DECLARE_SYSCALLHOOK(49_c7_c1_ff_ff_ff_ff); + DECLARE_SYSCALLHOOK(b8_ca_00_00_00); #define MOV_RDX_VARIANTS \ MOV_RDX_TO_REG(48, d0) \ @@ -780,9 +763,12 @@ static void __attribute__((constructor)) init_process(void) { MOV_RDX_TO_REG(49, d7) #define MOV_RDX_TO_REG(rex, op) \ - extern RR_HIDDEN void _syscall_hook_trampoline_##rex##_89_##op(void); + DECLARE_SYSCALLHOOK(rex##_89_##op) MOV_RDX_VARIANTS +#define HOOK_REFERENCE(name) \ + (uintptr_t)_syscall_hook_trampoline_##name + struct syscall_patch_hook syscall_patch_hooks[] = { /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed * by @@ -790,58 +776,58 @@ static void __attribute__((constructor)) init_process(void) { { 0, 6, { 0x48, 0x3d, 0x01, 0xf0, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_48_3d_01_f0_ff_ff }, + HOOK_REFERENCE(48_3d_01_f0_ff_ff) }, /* Many glibc syscall wrappers (e.g. 
__libc_recv) have 'syscall' * followed by * cmp $-4096,%rax (in glibc-2.18-16.fc20.x86_64) */ { 0, 6, { 0x48, 0x3d, 0x00, 0xf0, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_48_3d_00_f0_ff_ff }, + HOOK_REFERENCE(48_3d_00_f0_ff_ff) }, /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed * by * mov (%rsp),%rdi (in glibc-2.18-16.fc20.x86_64) */ { 0, 4, { 0x48, 0x8b, 0x3c, 0x24 }, - (uintptr_t)_syscall_hook_trampoline_48_8b_3c_24 }, + HOOK_REFERENCE(48_8b_3c_24) }, /* Some syscall wrappers have 'syscall' followed * by * mov %rax,-8(%rbp) */ { 0, 4, { 0x48, 0x89, 0x45, 0xf8 }, - (uintptr_t)_syscall_hook_trampoline_48_89_45_f8 }, + HOOK_REFERENCE(48_89_45_f8) }, /* Some syscall wrappers (e.g. read) have 'syscall' followed * by * mov %rax,%rbx */ { 0, 3, { 0x48, 0x89, 0xc3 }, - (uintptr_t)_syscall_hook_trampoline_48_89_c3 }, + HOOK_REFERENCE(48_89_c3) }, /* Some RDTSC instructions are followed by 'mov %rax,%rcx'. */ { 0, 3, { 0x48, 0x89, 0xc1 }, - (uintptr_t)_syscall_hook_trampoline_48_89_c1 }, + HOOK_REFERENCE(48_89_c1) }, /* __lll_unlock_wake has 'syscall' followed by * pop %rdx; pop %rsi; ret */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x5a, 0x5e, 0xc3 }, - (uintptr_t)_syscall_hook_trampoline_5a_5e_c3 }, + HOOK_REFERENCE(5a_5e_c3) }, /* posix_fadvise64 has 'syscall' followed by * mov %eax,%edx; neg %edx (in glibc-2.22-11.fc23.x86_64) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0x89, 0xc2, 0xf7, 0xda }, - (uintptr_t)_syscall_hook_trampoline_89_c2_f7_da }, + HOOK_REFERENCE(89_c2_f7_da) }, /* Our VDSO vsyscall patches have 'syscall' followed by "nop; nop; nop" */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x90, 0x90, 0x90 }, - (uintptr_t)_syscall_hook_trampoline_90_90_90 }, + HOOK_REFERENCE(90_90_90) }, /* glibc-2.22-17.fc23.x86_64 has 'syscall' followed by 'mov $1,%rdx' * in * pthread_barrier_wait. 
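For reference while reading the rest of the table rewrite, the two macros expand back to exactly the spelled-out forms they replace; a self-contained illustration (RR_HIDDEN stubbed here so the snippet compiles on its own):

    #include <cstdint>
    #define RR_HIDDEN __attribute__((visibility("hidden")))
    #define DECLARE_SYSCALLHOOK(name) \
      extern RR_HIDDEN void _syscall_hook_trampoline_##name(void);
    #define HOOK_REFERENCE(name) (uintptr_t)_syscall_hook_trampoline_##name

    DECLARE_SYSCALLHOOK(48_89_c3)
    // == extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c3(void);
    uintptr_t hook_48_89_c3 = HOOK_REFERENCE(48_89_c3);
    // == (uintptr_t)_syscall_hook_trampoline_48_89_c3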
@@ -849,54 +835,54 @@ static void __attribute__((constructor)) init_process(void) { { 0, 5, { 0xba, 0x01, 0x00, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_ba_01_00_00_00 }, + HOOK_REFERENCE(ba_01_00_00_00) }, /* pthread_sigmask has 'syscall' followed by 'mov %eax,%ecx; xor %edx,%edx' */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0x89, 0xc1, 0x31, 0xd2 }, - (uintptr_t)_syscall_hook_trampoline_89_c1_31_d2 }, + HOOK_REFERENCE(89_c1_31_d2) }, /* getpid has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 9, { 0xc3, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_close has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 6, { 0xc3, 0x0f, 0x1f, 0x44, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* glibc-2.29-15.fc30.x86_64 getpid has 'syscall' followed by 'retq; nopl 0x0(%rax) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 5, { 0xc3, 0x0f, 0x1f, 0x40, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_open has 'syscall' followed by 'retq; nopl (%rax) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0xc3, 0x0f, 0x1f, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_dup2 has 'syscall' followed by 'retq; xchg %ax,%ax */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0xc3, 0x66, 0x90 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* Go runtime has 'syscall' followed by 'retq; int3; int3 */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0xc3, 0xcc, 0xcc }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* glibc-2.31 on Ubuntu 20.04 has 'xor $0x81, %sil' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 4, { 0x40, 0x80, 0xf6, 0x81 }, - (uintptr_t)_syscall_hook_trampoline_40_80_f6_81 }, + HOOK_REFERENCE(40_80_f6_81) }, /* DynamoRIO has 'mov r10, rcx' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x49, 0x89, 0xca }, - (uintptr_t)_syscall_hook_trampoline_49_89_ca }, + HOOK_REFERENCE(49_89_ca) }, /* Some applications have RDTSC followed by 'mov %rdx,any-reg' */ #undef MOV_RDX_TO_REG #define MOV_RDX_TO_REG(rex, op) \ @@ -904,34 +890,34 @@ static void __attribute__((constructor)) init_process(void) { 0, \ 3, \ { 0x##rex, 0x89, 0x##op }, \ - (uintptr_t)_syscall_hook_trampoline_##rex##_89_##op }, + HOOK_REFERENCE(rex##_89_##op) }, MOV_RDX_VARIANTS /* Some application has RDTSC followed by 'shl $32,%rdx' */ { 0, 4, { 0x48, 0xc1, 0xe2, 0x20 }, - (uintptr_t)_syscall_hook_trampoline_48_c1_e2_20 }, + HOOK_REFERENCE(48_c1_e2_20) }, /* Some application has 'mov %r14,%rdi' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x4c, 0x89, 0xf7 }, - (uintptr_t)_syscall_hook_trampoline_4c_89_f7 }, + HOOK_REFERENCE(4c_89_f7) }, /* Some application has 'mov %r15,%rdi' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x4c, 0x89, 0xff }, - (uintptr_t)_syscall_hook_trampoline_4c_89_ff }, + HOOK_REFERENCE(4c_89_ff) }, /* Some application has 'mov $0xffffffff,%r9' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 7, { 0x49, 0xc7, 0xc1, 0xff, 0xff, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff }, + HOOK_REFERENCE(49_c7_c1_ff_ff_ff_ff) }, /* Some application has 'mov $0xca,%eax' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 5, { 0xb8, 0xca, 0x00, 0x00, 0x00 
}, - (uintptr_t)_syscall_hook_trampoline_b8_ca_00_00_00 }, + HOOK_REFERENCE(b8_ca_00_00_00) }, }; #elif defined(__aarch64__) extern RR_HIDDEN void _syscall_hook_trampoline_raw(void); @@ -1308,7 +1294,6 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { void* record_start = buffer_last(); struct syscallbuf_record* rec = record_start; struct syscallbuf_hdr* hdr = buffer_hdr(); - int call_breakpoint = 0; assert(record_end >= record_start); rec->size = record_end - record_start; @@ -1339,21 +1324,20 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { fatal("Record syscall number mismatch"); } - if (hdr->abort_commit) { + rec->ret = ret; + if (rec->aborted) { /* We were descheduled in the middle of a may-block * syscall, and it was recorded as a normal entry/exit * pair. So don't record the syscall in the buffer or * replay will go haywire. */ hdr->abort_commit = 0; hdr->failed_during_preparation = 0; - /* Clear the return value that rr puts there during replay */ - rec->ret = 0; + rec->size = sizeof(struct syscallbuf_record); + hdr->num_rec_bytes += sizeof(struct syscallbuf_record); } else { - rec->ret = ret; // Finish 'rec' first before updating num_rec_bytes, since // rr might read the record anytime after this update. hdr->num_rec_bytes += stored_record_size(rec->size); - call_breakpoint = 1; } if (rec->desched) { @@ -1367,23 +1351,21 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE; - if (call_breakpoint) { - /* Call the breakpoint function corresponding to the record we just - * committed. This function just returns, but during replay it gives rr - * a chance to set a breakpoint for when a specific syscallbuf record - * has been processed. - */ - do_breakpoint(hdr->num_rec_bytes/8); - /* Force a tick now. - * During replay, if an async event (SIGKILL) happens between committing the syscall - * above and before this forced tick, we can detect that because the number of ticks - * recorded for the SIGKILL will be less than or equal to the number of ticks reported - * when the replay hits do_breakpoint. - */ - force_tick(); - } - - return ret; + /* Call the breakpoint function corresponding to the record we just + * committed. This function just returns, but during replay it gives rr + * a chance to set a breakpoint for when a specific syscallbuf record + * has been processed. + */ + do_breakpoint(hdr->num_rec_bytes/8); + /* Force a tick now. + * During replay, if an async event (SIGKILL) happens between committing the syscall + * above and before this forced tick, we can detect that because the number of ticks + * recorded for the SIGKILL will be less than or equal to the number of ticks reported + * when the replay hits do_breakpoint. 
+   */
+  force_tick();
+
+  return !rec->aborted;
 }

 /**
@@ -1565,11 +1547,15 @@ static long sys_generic_nonblocking(struct syscall_info* call) {
   long ret;

   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall6(call->no, call->args[0], call->args[1],
                           call->args[2], call->args[3], call->args[4],
                           call->args[5]);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 /**
@@ -1582,11 +1568,15 @@ static long sys_generic_nonblocking_fd(struct syscall_info* call) {
   long ret;

   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall6(call->no, fd, call->args[1], call->args[2],
                           call->args[3], call->args[4], call->args[5]);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 /**
@@ -1603,7 +1593,10 @@ static long privileged_sys_generic_nonblocking_fd(const struct syscall_info* cal
   }
   ret = privileged_untraced_syscall6(call->no, fd, call->args[1], call->args[2],
                                      call->args[3], call->args[4], call->args[5]);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  return ret;
 }

 static long sys_clock_gettime(struct syscall_info* call) {
@@ -1622,7 +1615,7 @@ static long sys_clock_gettime(struct syscall_info* call) {
     ptr += sizeof(*tp2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall2(syscallno, clk_id, tp2);
   if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
@@ -1630,7 +1623,11 @@
       our library. */
     *tp = *tp2;
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #ifdef SYS_clock_gettime64
@@ -1651,7 +1648,7 @@ static long sys_clock_gettime64(struct syscall_info* call) {
     ptr += sizeof(*tp2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall2(syscallno, clk_id, tp2);
   if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
@@ -1659,7 +1656,11 @@
       our library. */
     *tp = *tp2;
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #endif
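[Note: the hunks above and below all apply the same calling-convention change: syscallbuf wrappers no longer issue the traced fallback themselves. A minimal sketch of the new protocol follows; sys_example is a hypothetical wrapper invented for illustration, while the helpers are the real ones used throughout this file. Returning 0 now means "not buffered, the caller must perform the traced syscall"; returning 1 means "handled", with the kernel's result passed back in call->no; and commit_raw_syscall() now reports, via !rec->aborted, whether the record actually committed.]

/* Illustrative sketch only: sys_example is hypothetical. */
static long sys_example(struct syscall_info* call) {
  void* ptr = prep_syscall();
  long ret;
  if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
    return 0; /* not buffered: caller performs the traced fallback */
  }
  ret = untraced_syscall1(call->no, call->args[0]);
  if (!commit_raw_syscall(call->no, ptr, ret)) {
    return 0; /* record aborted by a desched: fall back to a traced call */
  }
  call->no = ret; /* handled: the syscall result travels back in call->no */
  return 1;
}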
@@ -1674,7 +1675,11 @@
    * O_CREAT|O_WRONLY|O_TRUNC.
    */
   struct syscall_info open_call = { SYS_open,
     { (long)pathname, O_CREAT | O_TRUNC | O_WRONLY, mode } };
-  return sys_open(&open_call);
+  long ret = sys_open(&open_call);
+  if (!ret)
+    return ret;
+  call->no = open_call.no;
+  return ret;
 }
 #endif

@@ -1692,10 +1697,14 @@ static int sys_fcntl64_no_outparams(struct syscall_info* call) {
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall3(syscallno, fd, cmd, arg);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static int sys_fcntl64_own_ex(struct syscall_info* call) {
@@ -1716,7 +1725,7 @@ static int sys_fcntl64_own_ex(struct syscall_info* call) {
     ptr += sizeof(*owner2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (owner2) {
     memcpy_input_parameter(owner2, owner, sizeof(*owner2));
@@ -1725,13 +1734,17 @@
   if (owner2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(owner, owner2, sizeof(*owner));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static int sys_fcntl64_setlk64(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Releasing a lock could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = RR_FCNTL_SYSCALL;
@@ -1750,7 +1763,7 @@
     ptr += sizeof(*lock2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (lock2) {
     memcpy_input_parameter(lock2, lock, sizeof(*lock2));
@@ -1759,13 +1772,17 @@
   if (lock2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(lock, lock2, sizeof(*lock));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static int sys_fcntl64_setlkw64(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Releasing a lock could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = RR_FCNTL_SYSCALL;
@@ -1779,10 +1796,14 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall3(syscallno, fd, cmd, lock);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #if defined(SYS_fcntl64)
@@ -1798,7 +1819,7 @@ static long sys_fcntl(struct syscall_info* call)
       if (call->args[2] == O_DIRECT) {
         /* This needs to go to rr so we can disable syscall buffering
            on this fd. */
-        return traced_raw_syscall(call);
+        return 0;
       }
       /* Falls through.
        */
     case F_DUPFD:
@@ -1835,7 +1856,7 @@ static long sys_fcntl(struct syscall_info* call)
       return sys_fcntl64_setlkw64(call);

     default:
-      return traced_raw_syscall(call);
+      return 0;
   }
 }

@@ -1866,12 +1887,16 @@ static long sys_flistxattr(struct syscall_info* call) {
     ptr += size;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, fd, buf2, size);
   ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_safe_nonblocking_ioctl(struct syscall_info* call) {
@@ -1882,10 +1907,14 @@ static long sys_safe_nonblocking_ioctl(struct syscall_info* call) {
   long ret;

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall3(syscallno, fd, call->args[1], call->args[2]);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_ioctl_fionread(struct syscall_info* call) {
@@ -1902,13 +1931,17 @@
     ptr += sizeof(*value);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall3(syscallno, fd, FIONREAD, buf);
   if (buf && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(value, buf, sizeof(*value));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_ioctl(struct syscall_info* call) {
@@ -1920,7 +1953,7 @@ static long sys_ioctl(struct syscall_info* call) {
     case FIONREAD:
       return sys_ioctl_fionread(call);
     default:
-      return traced_raw_syscall(call);
+      return 0;
   }
 }

@@ -1934,7 +1967,7 @@ static long sys_futex(struct syscall_info* call) {
      in which case we're at most doubling the overhead of the combined
      wait + wakeup. */
   if (globals.in_chaos) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   int op = call->args[1];
@@ -1962,7 +1995,7 @@ static long sys_futex(struct syscall_info* call) {
      * special processing in the tracer process (in addition to
      * not being worth doing for perf reasons). */
     default:
-      return traced_raw_syscall(call);
+      return 0;
   }

   const int syscallno = SYS_futex;
@@ -1993,7 +2026,7 @@ static long sys_futex(struct syscall_info* call) {
   /* See above; it's not worth buffering may-block futex
    * calls. */
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall6(syscallno, uaddr, op, val, timeout, uaddr2, val3);
@@ -2007,7 +2040,10 @@
   if (saved_uaddr2) {
     copy_futex_int(saved_uaddr2, uaddr2);
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  int ok = commit_raw_syscall(syscallno, ptr, ret);
+  assert(ok);
+  call->no = ret;
+  return 1;
 }

 static long sys_getrandom(struct syscall_info* call) {
@@ -2027,12 +2063,16 @@
     ptr += buf_len;
   }
   if (!start_commit_buffered_syscall(call->no, ptr,
                                      (flags & GRND_NONBLOCK) ? WONT_BLOCK : MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(call->no, buf2, buf_len, flags);
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_generic_getdents(struct syscall_info* call) {
@@ -2049,12 +2089,16 @@ static long sys_generic_getdents(struct syscall_info* call) {
     ptr += count;
   }
   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(call->no, fd, buf2, count);
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #if defined(SYS_getdents)
@@ -2091,7 +2135,7 @@ static long sys_gettimeofday(struct syscall_info* call) {
     ptr += sizeof(*tzp2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall2(syscallno, tp2, tzp2);
   if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
@@ -2106,7 +2150,11 @@
       *tzp = *tzp2;
     }
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_generic_getxattr(struct syscall_info* call) {
@@ -2124,12 +2172,16 @@
     ptr += size;
   }
   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall4(call->no, path, name, value2, size);
   ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_getxattr(struct syscall_info* call) {
@@ -2155,12 +2207,16 @@ static long sys_fgetxattr(struct syscall_info* call) {
     ptr += size;
   }
   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall4(call->no, fd, name, value2, size);
   ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_generic_listxattr(struct syscall_info* call) {
@@ -2177,12 +2233,16 @@
     ptr += size;
   }
   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(call->no, path, buf2, size);
   ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2);
-  return commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_listxattr(struct syscall_info* call) {
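[Note: the getxattr/listxattr/getdents wrappers above all share one shape, sketched here for reference. sys_out_param_example and its argument layout are hypothetical, and the sketch assumes copy_output_buffer() copies the kernel's output from the shadow buffer back to the caller's buffer and trims the record to the bytes actually produced.]

static long sys_out_param_example(struct syscall_info* call) {
  void* buf = (void*)call->args[1]; /* caller's output buffer */
  size_t size = call->args[2];
  void* ptr = prep_syscall();
  void* buf2 = NULL;
  long ret;
  if (buf && size > 0) {
    buf2 = ptr; /* shadow buffer carved out of the syscallbuf record */
    ptr += size;
  }
  if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
    return 0;
  }
  ret = untraced_syscall3(call->no, call->args[0], buf2, size);
  ptr = copy_output_buffer(ret, ptr, buf, buf2);
  if (!commit_raw_syscall(call->no, ptr, ret)) {
    return 0;
  }
  call->no = ret;
  return 1;
}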
@@ -2213,7 +2273,7 @@ static long sys__llseek(struct syscall_info* call) {
     ptr += sizeof(*result2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   if (result2) {
@@ -2224,7 +2284,11 @@
   if (result2) {
     *result = *result2;
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #endif

@@ -2258,7 +2322,7 @@ static long sys_madvise(struct syscall_info* call) {
       advice = -1;
       break;
     default:
-      return traced_raw_syscall(call);
+      return 0;
   }

   ptr = prep_syscall();
@@ -2266,14 +2330,18 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   /* Ensure this syscall happens during replay. In particular MADV_DONTNEED
    * must be executed.
    */
   ret = untraced_replayed_syscall3(syscallno, addr, length, advice);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_mprotect(struct syscall_info* call) {
@@ -2288,7 +2356,7 @@ static long sys_mprotect(struct syscall_info* call) {
   if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) || !buffer_hdr() ||
       buffer_hdr()->mprotect_record_count >= MPROTECT_RECORD_COUNT) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ptr = prep_syscall();
@@ -2296,7 +2364,7 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   mrec = &globals.mprotect_records[buffer_hdr()->mprotect_record_count++];
@@ -2310,7 +2378,11 @@
   }
   buffer_hdr()->mprotect_record_count_completed++;

-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static int supported_open(const char* file_name, int flags) {
@@ -2345,7 +2417,7 @@ static int check_file_open_ok(struct syscall_info* call, int ret, struct check_o
      from doing anything, so there is nothing for us to do here and we
      shouldn't try to interpret the "syscall result". */
   if (state.did_fail_during_preparation || ret < 0) {
-    return ret;
+    return 0;
   }
   char buf[100];
   sprintf(buf, "/proc/self/fd/%d", ret);
@@ -2362,7 +2434,8 @@
   if (link_ret >= 0 && link_ret < (ssize_t)sizeof(link)) {
     link[link_ret] = 0;
     if (allow_buffered_open(link)) {
-      return ret;
+      call->no = ret;
+      return 1;
     }
   }
   /* Clean up by closing the file descriptor we should not have opened and
@@ -2374,7 +2447,7 @@
      syscall, but that's a bit more complicated and we're already on the
      slow (and hopefully rare) path.
   */
   privileged_traced_syscall1(SYS_close, ret);
-  return traced_raw_syscall(call);
+  return 0;
 }

 static struct check_open_state capture_check_open_state(void) {
@@ -2388,7 +2461,7 @@
 static long sys_open(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Opening a FIFO could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_open;
@@ -2401,25 +2474,27 @@
   assert(syscallno == call->no);

   if (!supported_open(pathname, flags)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ptr = prep_syscall();
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, pathname, flags, mode);
   struct check_open_state state = capture_check_open_state();
-  ret = commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
   return check_file_open_ok(call, ret, state);
 }
 #endif

 static long sys_openat(struct syscall_info* call) {
+  (void)call;
   if (force_traced_syscall_for_chaos_mode()) {
     /* Opening a FIFO could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_openat;
@@ -2433,17 +2508,18 @@
   assert(syscallno == call->no);

   if (!supported_open(pathname, flags)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ptr = prep_syscall();
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall4(syscallno, dirfd, pathname, flags, mode);
   struct check_open_state state = capture_check_open_state();
-  ret = commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
   return check_file_open_ok(call, ret, state);
 }

@@ -2476,7 +2552,7 @@ static long sys_poll(struct syscall_info* call) {
     ptr += nfds * sizeof(*fds2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (fds2) {
     memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2));
@@ -2502,14 +2578,15 @@
      * incorrectly trashing 'fds'. */
     local_memcpy(fds, fds2, nfds * sizeof(*fds));
   }
-  commit_raw_syscall(syscallno, ptr, ret);
+  long ok = commit_raw_syscall(syscallno, ptr, ret);

   if (ret != 0 || timeout == 0) {
-    return ret;
+    call->no = ret;
+    return ok;
   }
   /* The syscall didn't return anything, and we should have blocked.
      Just perform a raw syscall now since we're almost certain to block. */
-  return traced_raw_syscall(call);
+  return 0;
 }
 #endif
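[Note: check_file_open_ok() above decides whether a buffered open may stand by resolving what the new fd actually refers to. A self-contained sketch of that /proc/self/fd trick follows; fd_path is a hypothetical helper written for illustration, not part of this patch.]

#include <stdio.h>
#include <unistd.h>

static int fd_path(int fd, char* out, size_t out_size) {
  char link[64];
  ssize_t n;
  snprintf(link, sizeof(link), "/proc/self/fd/%d", fd);
  n = readlink(link, out, out_size - 1); /* resolves the fd's target */
  if (n < 0) {
    return -1;
  }
  out[n] = '\0'; /* readlink does not NUL-terminate */
  return 0;
}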
@@ -2533,7 +2610,7 @@ static long sys_ppoll(struct syscall_info* call) {
     ptr += nfds * sizeof(*fds2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (fds2) {
     memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2));
@@ -2563,11 +2640,12 @@
   commit_raw_syscall(syscallno, ptr, ret);

   if (ret != 0 || (tmo_p && tmo_p->tv_sec == 0 && tmo_p->tv_nsec == 0)) {
-    return ret;
+    call->no = ret;
+    return 1;
   }
   /* The syscall didn't return anything, and we should have blocked.
      Just perform a raw syscall now since we're almost certain to block.
   */
-  return traced_raw_syscall(call);
+  return 0;
 }
 #endif

@@ -2594,7 +2672,7 @@ static long sys_epoll_wait(struct syscall_info* call) {
     ptr += max_events * sizeof(*events2);
   }
   if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   /* Try a no-timeout version of the syscall first. If this doesn't return
@@ -2608,7 +2686,9 @@
                           call->args[4], call->args[5]);
   ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2);
-  ret = commit_raw_syscall(call->no, ptr, ret);
+  if (!commit_raw_syscall(call->no, ptr, ret)) {
+    return 0;
+  }
   if (timeout == 0 || (ret != EINTR && ret != 0)) {
     /* If we got some real results, or a non-EINTR error, we can just
        return it directly.
@@ -2620,7 +2700,8 @@
        returned had it run traced. (We didn't enable the desched signal
        so no extra signals could have affected our untraced syscall that
        could not have been delivered to a traced syscall.) */
-    return ret;
+    call->no = ret;
+    return 1;
   }
   /* Some timeout was requested and either we got no results or we got
      EINTR.
@@ -2635,7 +2716,7 @@
      itself interrupt the syscall and cause it to return EINTR just as
      would happen without rr.
   */
-  return traced_raw_syscall(call);
+  return 0;
 }

 #define CLONE_SIZE_THRESHOLD 0x10000

@@ -2643,7 +2724,7 @@ static long sys_epoll_wait(struct syscall_info* call) {
 static long sys_read(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Reading from a pipe could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_read;
@@ -2706,12 +2787,10 @@
         ioctl_ret = privileged_untraced_syscall3(SYS_ioctl,
             thread_locals->cloned_file_data_fd, BTRFS_IOC_CLONE_RANGE,
             &ioctl_args);
-        ioctl_ret = commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret);
+        commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret);
       }

       if (ioctl_ret >= 0) {
-        struct syscall_info read_call = { SYS_read,
-                                          { fd, (long)buf, count, 0, 0, 0 } };
         thread_locals->cloned_file_data_offset += count;

         replay_only_syscall3(SYS_dup3, thread_locals->cloned_file_data_fd, fd, 0);
@@ -2719,12 +2798,12 @@
         ptr = prep_syscall();
         if (count > thread_locals->usable_scratch_size) {
           if (!start_commit_buffered_syscall(SYS_read, ptr, WONT_BLOCK)) {
-            return traced_raw_syscall(&read_call);
+            return 0;
           }
           ret = untraced_replayed_syscall3(SYS_read, fd, buf, count);
         } else {
           if (!start_commit_buffered_syscall(SYS_read, ptr, MAY_BLOCK)) {
-            return traced_raw_syscall(&read_call);
+            return 0;
           }
           ret = untraced_replayed_syscall3(SYS_read, fd,
                                            thread_locals->scratch_buf, count);
@@ -2735,8 +2814,10 @@
         // ReplaySession::flush_syscallbuf instead of
         // ReplaySession::enter_syscall or something similar.
        replay_only_syscall1(SYS_close, fd);
-        ret = commit_raw_syscall(SYS_read, ptr, ret);
-        return ret;
+        if (!commit_raw_syscall(SYS_read, ptr, ret))
+          return 0;
+        call->no = ret;
+        return 1;
       }
     }
   }

@@ -2750,12 +2831,15 @@
     ptr += count;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, fd, buf2, count);
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
+  call->no = ret;
+  return 1;
 }

 /* On x86-32, pread/pwrite take the offset in two registers. We don't bother
@@ -2782,12 +2866,14 @@ static long sys_pread64(struct syscall_info* call) {
     ptr += count;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall4(syscallno, fd, buf2, count, offset);
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  commit_raw_syscall(syscallno, ptr, ret);
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -2809,12 +2895,14 @@ static long sys_readlink(struct syscall_info* call) {
     ptr += bufsiz;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, path, buf2, bufsiz);
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  commit_raw_syscall(syscallno, ptr, ret);
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -2839,7 +2927,7 @@ static long sys_readlinkat(struct syscall_info* call, int privileged) {
     if (privileged) {
       return privileged_traced_raw_syscall(call);
     }
-    return traced_raw_syscall(call);
+    return 0;
   }

   if (privileged) {
@@ -2848,14 +2936,22 @@
     ret = untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz);
   }
   ptr = copy_output_buffer(ret, ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  if (privileged) {
+    return ret;
+  } else {
+    return 1;
+  }
 }

 #if defined(SYS_socketcall)
 static long sys_socketcall_recv(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Reading from a socket could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_socketcall;
@@ -2877,7 +2973,7 @@
     ptr += len;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   new_args[0] = sockfd;
@@ -2887,7 +2983,11 @@
   ret = untraced_syscall2(SYS_socketcall, SYS_RECV, new_args);
   /* Account for MSG_TRUNC */
   ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_socketcall(struct syscall_info* call) {
@@ -2895,7 +2995,7 @@ static long sys_socketcall(struct syscall_info* call) {
     case SYS_RECV:
       return sys_socketcall_recv(call);
     default:
-      return traced_raw_syscall(call);
+      return 0;
   }
 }
 #endif
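[Note: for reference, sys_read's cloned-data path above relies on the btrfs clone ioctl to reflink the bytes it just read into the trace's cloned-data file. A standalone sketch of that call follows; clone_range is a hypothetical wrapper written for illustration, while the ioctl and struct come from <linux/btrfs.h>.]

#include <linux/btrfs.h>
#include <sys/ioctl.h>

static int clone_range(int src_fd, long src_offset, long length,
                       int dest_fd, long dest_offset) {
  struct btrfs_ioctl_clone_range_args args;
  args.src_fd = src_fd;
  args.src_offset = src_offset;
  args.src_length = length;
  args.dest_offset = dest_offset;
  /* Reflinks the byte range without copying it through userspace. */
  return ioctl(dest_fd, BTRFS_IOC_CLONE_RANGE, &args);
}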
@@ -2904,7 +3004,7 @@ static long sys_socketcall(struct syscall_info* call) {
 static long sys_recvfrom(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Reading from a socket could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_recvfrom;
@@ -2942,7 +3042,7 @@
     ptr += len;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (addrlen) {
     memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen2));
@@ -2963,7 +3063,11 @@
     }
   }
   ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3012,7 +3116,7 @@ static int msg_received_file_descriptors(struct msghdr* msg) {
 static long sys_recvmsg(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Reading from a socket could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_recvmsg;
@@ -3047,7 +3151,7 @@
     ptr += msg->msg_iov[i].iov_len;
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   /**
@@ -3116,7 +3220,11 @@
      */
     ptr_end = ptr_overwritten_end;
   }
-  return commit_raw_syscall(syscallno, ptr_end, ret);
+  if (!commit_raw_syscall(syscallno, ptr_end, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3124,7 +3232,7 @@
 static long sys_sendmsg(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Sending to a socket could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_sendmsg;
@@ -3138,12 +3246,15 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, sockfd, msg, flags);
-
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3151,7 +3262,7 @@
 static long sys_sendto(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Sending to a socket could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_sendto;
@@ -3168,13 +3279,16 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall6(syscallno, sockfd, buf, len, flags, dest_addr, addrlen);
-
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3190,12 +3304,12 @@
 static long sys_setsockopt(struct syscall_info* call) {
   if (level == SOL_PACKET &&
       (optname == PACKET_RX_RING || optname == PACKET_TX_RING)) {
     // Let rr intercept this (and probably disable it)
-    return traced_raw_syscall(call);
+    return 0;
   }
   if (level == SOL_NETLINK &&
       (optname == NETLINK_RX_RING || optname == NETLINK_TX_RING)) {
     // Let rr intercept this (and probably disable it)
-    return traced_raw_syscall(call);
+    return 0;
   }

   void* ptr = prep_syscall_for_fd(sockfd);
@@ -3204,12 +3318,15 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall5(syscallno, sockfd, level, optname, optval, optlen);
-
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3225,7 +3342,7 @@ static long sys_getsockopt(struct syscall_info* call) {
   void* optval2;

   if (!optlen || !optval) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   void* ptr = prep_syscall_for_fd(sockfd);
@@ -3244,7 +3361,7 @@
   memcpy_input_parameter(optval2, optval, *optlen);

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   // We may need to manually restart this syscall due to kernel bug
@@ -3261,7 +3378,11 @@
     local_memcpy(optlen, optlen2, sizeof(*optlen));
   }

-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3291,7 +3412,7 @@ static long sys_getsockname(struct syscall_info* call) {
   }

   if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, sockfd, addr2, addrlen2);
@@ -3304,7 +3425,11 @@
     local_memcpy(addrlen, addrlen2, sizeof(*addrlen));
   }

-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3326,13 +3451,17 @@ static long sys_socketpair(struct syscall_info* call) {
   sv2 = ptr;
   ptr += sizeof(*sv2);
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall4(syscallno, domain, type, protocol, sv2);
   if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(sv, sv2, sizeof(*sv));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3347,14 +3476,18 @@ static long sys_time(struct syscall_info* call) {
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall1(syscallno, NULL);
   if (tp) {
     /* No error is possible here.
      */
     *tp = ret;
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3382,13 +3515,17 @@ static long sys_xstat64(struct syscall_info* call) {
     ptr += sizeof(*buf2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall2(syscallno, what, buf2);
   if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(buf, buf2, sizeof(*buf));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 #ifdef SYS_statx
@@ -3406,7 +3543,7 @@ static long sys_statx(struct syscall_info* call) {
     ptr += sizeof(*buf2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall5(syscallno, call->args[0], call->args[1],
                           call->args[2], call->args[3],
@@ -3414,7 +3551,11 @@
   if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(buf, buf2, sizeof(*buf));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }
 #endif

@@ -3426,7 +3567,7 @@ static long sys_quotactl(struct syscall_info* call) {
   void* addr = (void*)call->args[3];

   if ((cmd >> SUBCMDSHIFT) != Q_GETQUOTA) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   void* ptr = prep_syscall();
@@ -3438,13 +3579,17 @@
     ptr += sizeof(*buf2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall4(syscallno, cmd, special, id, buf2);
   if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(addr, buf2, sizeof(*buf2));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_statfs(struct syscall_info* call) {
@@ -3466,19 +3611,22 @@ static long sys_statfs(struct syscall_info* call) {
     ptr += sizeof(*buf2);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   ret = untraced_syscall2(syscallno, what, buf2);
   if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(buf, buf2, sizeof(*buf));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
+  call->no = ret;
+  return 1;
 }

 static long sys_write(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Writing to a pipe or FIFO could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   const int syscallno = SYS_write;
@@ -3492,12 +3640,14 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, fd, buf, count);
-
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
+  call->no = ret;
+  return 1;
 }
 /* On x86-32, pread/pwrite take the offset in two registers. We don't bother
@@ -3513,13 +3663,13 @@ static long sys_pwrite64(struct syscall_info* call) {
   enum syscallbuf_fd_classes cls = fd_class(fd);
   if (cls == FD_CLASS_TRACED) {
-    return traced_raw_syscall(call);
+    return 0;
   }
   void* ptr = prep_syscall();
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   long ret;
@@ -3529,14 +3679,17 @@
     ret = untraced_syscall4(syscallno, fd, buf, count, offset);
   }

-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
+  call->no = ret;
+  return 1;
 }
 #endif

 static long sys_writev(struct syscall_info* call) {
   if (force_traced_syscall_for_chaos_mode()) {
     /* Writing to a pipe or FIFO could unblock a higher priority task */
-    return traced_raw_syscall(call);
+    return 0;
   }

   int syscallno = SYS_writev;
@@ -3550,12 +3703,14 @@
   assert(syscallno == call->no);

   if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall3(syscallno, fd, iov, iovcnt);
-
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret))
+    return 0;
+  call->no = ret;
+  return 1;
 }

 static long sys_ptrace(struct syscall_info* call) {
@@ -3566,7 +3721,7 @@ static long sys_ptrace(struct syscall_info* call) {
   void* data = (void*)call->args[3];

   if (request != PTRACE_PEEKDATA || !data) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   /* We try to emulate PTRACE_PEEKDATA using process_vm_readv. That might not
@@ -3589,7 +3744,7 @@
   ptr += sizeof(long);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   struct iovec local_iov = { data2, sizeof(long) };
@@ -3598,12 +3753,15 @@
   if (ret > 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(data, data2, ret);
   }
-  commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }

   if (ret != sizeof(long)) {
-    return traced_raw_syscall(call);
+    return 0;
   }
-  return ret;
+  call->no = ret;
+  return 1;
 }

 static long sys_getrusage(struct syscall_info* call) {
@@ -3621,14 +3779,18 @@ static long sys_getrusage(struct syscall_info* call) {
     ptr += sizeof(struct rusage);
   }
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   ret = untraced_syscall2(syscallno, who, buf2);
   if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
     local_memcpy(buf, buf2, sizeof(*buf));
   }
-  return commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }
+  call->no = ret;
+  return 1;
 }

 static long sys_rt_sigprocmask(struct syscall_info* call) {
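[Note: sys_ptrace() above emulates PTRACE_PEEKDATA with process_vm_readv rather than issuing a real ptrace call. A standalone sketch of reading one word that way follows; peek_word is a hypothetical helper written for illustration.]

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

static long peek_word(pid_t pid, void* remote_addr, long* out) {
  struct iovec local_iov = { out, sizeof(long) };
  struct iovec remote_iov = { remote_addr, sizeof(long) };
  /* Returns the number of bytes read; sizeof(long) on success. */
  return process_vm_readv(pid, &local_iov, 1, &remote_iov, 1, 0);
}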
@@ -3640,7 +3802,7 @@ static long sys_rt_sigprocmask(struct syscall_info* call) {
   if (call->args[3] != sizeof(kernel_sigset_t)) {
     // Unusual sigset size. Bail.
-    return traced_raw_syscall(call);
+    return 0;
   }

   void* ptr = prep_syscall();
@@ -3653,7 +3815,7 @@
   ptr += sizeof(kernel_sigset_t);

   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   if (set && (how == SIG_BLOCK || how == SIG_SETMASK)) {
@@ -3697,14 +3859,17 @@
   }
   hdr->in_sigprocmask_critical_section = 0;

-  commit_raw_syscall(syscallno, ptr, ret);
+  if (!commit_raw_syscall(syscallno, ptr, ret)) {
+    return 0;
+  }

   if (ret == -EAGAIN) {
     // The rr supervisor emulated EAGAIN because there was a pending signal.
     // Retry using a traced syscall so the pending signal(s) can be delivered.
-    return traced_raw_syscall(call);
+    return 0;
   }
-  return ret;
+  call->no = ret;
+  return 1;
 }

 static long sys_rrcall_rdtsc(struct syscall_info* call) {
@@ -3715,7 +3880,7 @@ static long sys_rrcall_rdtsc(struct syscall_info* call) {
   void* buf = ptr;
   ptr += 8;
   if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
-    return traced_raw_syscall(call);
+    return 0;
   }

   // Do an RDTSC without context-switching to rr. This is still a lot slower
@@ -3727,7 +3892,9 @@
   local_memcpy(tsc, buf, sizeof(tsc));
   // Overwrite RDX (syscall arg 3) with our TSC value.
   call->args[2] = tsc[1];
-  return commit_raw_syscall(syscallno, ptr, tsc[0]);
+  commit_raw_syscall(syscallno, ptr, tsc[0]);
+  call->no = tsc[0];
+  return 1;
 #else
   (void)call;
   fatal("RDTSC not supported in this architecture");
@@ -3905,7 +4072,7 @@ case SYS_epoll_pwait:
 #undef CASE_GENERIC_NONBLOCKING
 #undef CASE_GENERIC_NONBLOCKING_FD
     default:
-      return traced_raw_syscall(call);
+      return 0;
   }
 }

@@ -3930,7 +4097,7 @@ RR_HIDDEN long syscall_hook(struct syscall_info* call) {
   if (!thread_locals->buffer || buffer_hdr()->locked) {
     /* We may be reentering via a signal handler. Bail. */
-    return traced_raw_syscall(call);
+    return 0;
   }

   thread_locals->original_syscall_parameters = call;
@@ -3939,6 +4106,7 @@
     do_delay();
   }

+  long callno = call->no;
   long result = syscall_hook_internal(call);
   if (buffer_hdr() && buffer_hdr()->notify_on_syscall_hook_exit) {
     // Sometimes a signal is delivered to interrupt an untraced syscall in
@@ -3979,10 +4147,10 @@
     // syscall_hook_internal generates either a traced syscall or a syscallbuf
     // record that would be flushed by SYSCALLBUF_FLUSH, so that can't
    // happen.
-    result = _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0],
+    call->no = _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0],
                           call->args[1], call->args[2], call->args[3],
                           call->args[4], call->args[5],
-                          RR_PAGE_SYSCALL_PRIVILEGED_TRACED, result, call->no);
+                          RR_PAGE_SYSCALL_PRIVILEGED_TRACED, call->no, callno);
   }
   // Do work that can only be safely done after syscallbuf can be flushed
   if (thread_locals->notify_control_msg) {
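[Note: the rdtsc changes in this patch (sys_rrcall_rdtsc above, plus the DiversionSession and record_syscall hunks) switch from writing the TSC to an out-parameter to returning it the way the real instruction does: low 32 bits in eax (the syscall result), high 32 bits in edx. A minimal x86-64 sketch of that register split, for illustration only:]

#include <stdint.h>

static uint64_t read_tsc(void) {
  uint32_t lo, hi;
  /* RDTSC leaves the low half in eax and the high half in edx, which is
   * exactly the convention the emulated call now follows. */
  __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
  return ((uint64_t)hi << 32) | lo;
}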
diff --git a/src/record_signal.cc b/src/record_signal.cc
index 24c0837e637..5e522893148 100644
--- a/src/record_signal.cc
+++ b/src/record_signal.cc
@@ -366,6 +366,74 @@ bool handle_syscallbuf_breakpoint(RecordTask* t) {
   return true;
 }

+/**
+ * Pre-condition: We're at a syscall-entry or seccomp-trap event inside the
+ * syscallbuf.
+ *
+ * This function aborts the current syscall and moves us to the
+ * syscall-entry trap of the bail syscall.
+ */
+void leave_syscallbuf(RecordTask *t) {
+  remote_ptr<struct syscallbuf_record> desched_rec = t->desched_rec();
+  if (!desched_rec) {
+    LOG(debug) << "Desched initiated";
+
+    /* The tracee is (re-)entering the buffered syscall. Stash
+     * away this breadcrumb so that we can figure out what syscall
+     * the tracee was in, and how much "scratch" space it carved
+     * off the syscallbuf, if needed. */
+    desched_rec = t->next_syscallbuf_record();
+    //t->push_event(DeschedEvent(desched_rec));
+    //int call = t->read_mem(REMOTE_PTR_FIELD(desched_rec, syscallno));
+
+    /* The descheduled syscall was interrupted by a signal, like
+     * all other may-restart syscalls, with the exception that
+     * this one has already been restarted (which we'll detect
+     * back in the main loop). */
+    //t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
+    //ev.desched_rec = desched_rec;
+  }
+
+  int call = t->read_mem(REMOTE_PTR_FIELD(desched_rec, syscallno));
+
+  t->exit_syscall();
+  t->write_mem(REMOTE_PTR_FIELD(desched_rec, aborted), (uint8_t)1);
+
+  Registers regs = t->regs();
+  regs.set_syscall_result((uintptr_t)-EINTR);
+  t->set_regs(regs);
+
+  LOG(debug) << "  resuming (and probably switching out) blocked `"
+             << syscall_name(call, t->arch()) << "'";
+
+  // Advance until we hit the syscall entry event outside the syscallbuf,
+  // since that's the state we expect to be in.
+  while (true) {
+    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
+    if (t->status().is_syscall()) {
+      if (t->is_in_syscallbuf()) {
+        continue;
+      }
+      break;
+    }
+    if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
+      LOG(debug) << "  (got exit, bailing out)";
+      t->push_event(Event::noop());
+      return;
+    }
+    int sig = t->stop_sig();
+    if (t->session().syscallbuf_desched_sig() == sig ||
+        PerfCounters::TIME_SLICE_SIGNAL == sig || t->is_sig_ignored(sig)) {
+      LOG(debug) << "  dropping ignored " << signal_name(sig);
+      continue;
+    }
+
+    LOG(debug) << "  stashing " << signal_name(sig);
+    t->stash_sig();
+  }
+}
+
 /**
  * Return the event needing to be processed after this desched of |t|.
  * The tracee's execution may be advanced, and if so |regs| is updated
@@ -572,53 +640,12 @@ static void handle_desched_event(RecordTask* t) {
     return;
   }

-  if (t->desched_rec()) {
-    // We're already processing a desched. We probably reexecuted the
-    // system call (e.g. because a signal was processed) and the syscall
-    // blocked again. Carry on with the current desched.
-  } else {
-    /* This prevents the syscallbuf record counter from being
-     * reset until we've finished guiding the tracee through this
-     * interrupted call. We use the record counter for
-     * assertions. */
-    ASSERT(t, !t->delay_syscallbuf_reset_for_desched);
-    t->delay_syscallbuf_reset_for_desched = true;
-    LOG(debug) << "Desched initiated";
+  // Get us out of this syscall so we can unwind the buffer and resume.
+  Registers regs = t->regs();
+  regs.set_original_syscallno((uintptr_t)-1);
+  t->set_regs(regs);

-    /* The tracee is (re-)entering the buffered syscall. Stash
-     * away this breadcrumb so that we can figure out what syscall
-     * the tracee was in, and how much "scratch" space it carved
-     * off the syscallbuf, if needed.
-     */
-    remote_ptr<struct syscallbuf_record> desched_rec =
-        t->next_syscallbuf_record();
-    t->push_event(DeschedEvent(desched_rec));
-    int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));
-
-    /* The descheduled syscall was interrupted by a signal, like
-     * all other may-restart syscalls, with the exception that
-     * this one has already been restarted (which we'll detect
-     * back in the main loop). */
-    t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
-    SyscallEvent& ev = t->ev().Syscall();
-    ev.desched_rec = desched_rec;
-  }
-
-  SyscallEvent& ev = t->ev().Syscall();
-  ev.regs = t->regs();
-  /* For some syscalls (at least poll) but not all (at least not read),
-   * repeated cont_syscall()s above of the same interrupted syscall
-   * can set $orig_eax to 0 ... for unclear reasons. Fix that up here
-   * otherwise we'll get a divergence during replay, which will not
-   * encounter this problem.
-   */
-  int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));
-  ev.regs.set_original_syscallno(call);
-  t->set_regs(ev.regs);
-  // runnable_state_changed will observe us entering this syscall and change
-  // state to ENTERING_SYSCALL
-
-  LOG(debug) << "  resuming (and probably switching out) blocked `"
-             << syscall_name(call, ev.arch()) << "'";
+  leave_syscallbuf(t);
 }

 static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
@@ -632,71 +659,7 @@ static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
                << " because not in syscallbuf";
     return true;
   }
-
-  // Note that this will never fire on aarch64 in a signal stop
-  // since the ip has been moved to the syscall entry.
-  // We will catch it in the traced_syscall_entry case below.
-  // We will miss the exit for rrcall_notify_syscall_hook_exit
-  // but that should not be a big problem.
-  if (t->is_in_traced_syscall()) {
-    LOG(debug) << "Safe to deliver signal at " << t->ip()
-               << " because in traced syscall";
-    return true;
-  }
-
-  // Don't deliver signals just before entering rrcall_notify_syscall_hook_exit.
-  // At that point, notify_on_syscall_hook_exit will be set, but we have
-  // passed the point at which syscallbuf code has checked that flag.
-  // Replay will set notify_on_syscall_hook_exit when we replay towards the
-  // rrcall_notify_syscall_hook_exit *after* handling this signal, but
-  // that will be too late for syscallbuf to notice.
-  // It's OK to delay signal delivery until after rrcall_notify_syscall_hook_exit
-  // anyway.
-  if (t->is_at_traced_syscall_entry() &&
-      !is_rrcall_notify_syscall_hook_exit_syscall(t->regs().syscallno(), t->arch())) {
-    LOG(debug) << "Safe to deliver signal at " << t->ip()
-               << " because at entry to traced syscall";
-    return true;
-  }
-
-  // On aarch64, the untraced syscall here include both entry and exit
-  // if we are at a signal stop.
-  if (t->is_in_untraced_syscall() && t->desched_rec()) {
-    // Untraced syscalls always use the architecture of the process
-    LOG(debug) << "Safe to deliver signal at " << t->ip()
-               << " because tracee interrupted by desched of "
-               << syscall_name(t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(),
-                                                            syscallno)),
-                               t->arch());
-    return true;
-  }
-
-  if (t->is_in_untraced_syscall() && si->si_signo == SIGSYS &&
-      si->si_code == SYS_SECCOMP) {
-    LOG(debug) << "Safe to deliver signal at " << t->ip()
-               << " because signal is seccomp trap.";
-    return true;
-  }
-
-  // If the syscallbuf buffer hasn't been created yet, just delay the signal
-  // with no need to set notify_on_syscall_hook_exit; the signal will be
-  // delivered when rrcall_init_buffers is called.
-  if (t->syscallbuf_child) {
-    if (t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, locked)) & 2) {
-      LOG(debug) << "Safe to deliver signal at " << t->ip()
-                 << " because the syscallbuf is locked";
-      return true;
-    }
-
-    // A signal (e.g. seccomp SIGSYS) interrupted a untraced syscall in a
-    // non-restartable way. Defer it until SYS_rrcall_notify_syscall_hook_exit.
-    if (t->is_in_untraced_syscall()) {
-      // Our emulation of SYS_rrcall_notify_syscall_hook_exit clears this flag.
-      t->write_mem(
-          REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit),
-          (uint8_t)1);
-    }
-  }
+  (void)si;

   LOG(debug) << "Not safe to deliver signal at " << t->ip();
   return false;
diff --git a/src/record_syscall.cc b/src/record_syscall.cc
index 13b79cbf136..98bb21e0070 100644
--- a/src/record_syscall.cc
+++ b/src/record_syscall.cc
@@ -5039,11 +5039,11 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
     // the syscallbuf as a fake rrcall_rdtsc, but we then can't buffer it
     // because the buffer is full or disabled.
     case SYS_rrcall_rdtsc: {
-      syscall_state.emulate_result(0);
       uint64_t tsc = rdtsc();
-      remote_ptr<uint64_t> addr(t->regs().arg1());
-      t->write_mem(addr, tsc);
-      t->record_local(addr, &tsc);
+      syscall_state.emulate_result((uint32_t)tsc);
+      Registers regs = t->regs();
+      regs.set_dx(tsc >> 32);
+      t->set_regs(regs);
       return PREVENT_SWITCH;
     }
diff --git a/src/test/execve_loop.c b/src/test/execve_loop.c
index 171ada58724..d91d2543fb6 100644
--- a/src/test/execve_loop.c
+++ b/src/test/execve_loop.c
@@ -7,6 +7,11 @@
    what we want to test here. */
 int main(__attribute__((unused)) int argc, char* argv[], char* envp[]) {
+  if (argc < 2) {
+    atomic_printf("Usage: %s <count>\n", argv[0]);
+    exit(1);
+  }
+
   int count = atoi(argv[1]);
   if (count > 0) {
diff --git a/src/test/expect_in_atomic_printf.py b/src/test/expect_in_atomic_printf.py
index 10338698653..b6fb6235aae 100644
--- a/src/test/expect_in_atomic_printf.py
+++ b/src/test/expect_in_atomic_printf.py
@@ -1,6 +1,12 @@
 from util import *
 import re

+# Advance a bit, we may be in the jump stub.
+# TODO: It would be nice to just teach gdb about this
+for i in range(0,7):
+    send_gdb('stepi')
+    expect_gdb('(rr)')
+
 send_gdb('bt')
 expect_gdb('atomic_printf')
diff --git a/src/test/expect_in_exit.py b/src/test/expect_in_exit.py
index 921fde5dbc0..22cb08ceb55 100644
--- a/src/test/expect_in_exit.py
+++ b/src/test/expect_in_exit.py
@@ -1,7 +1,11 @@
 from util import *
 import re

-send_gdb('reverse-stepi')
+# Step out of the extended syscall jump patch.
+for i in range(0,3):
+    send_gdb('reverse-stepi')
+    expect_gdb('(rr)')
+
 send_gdb('bt')
 expect_gdb('_exit')
diff --git a/src/test/get_thread_list.py b/src/test/get_thread_list.py
index ccae5cc4139..072e4e14df9 100644
--- a/src/test/get_thread_list.py
+++ b/src/test/get_thread_list.py
@@ -26,6 +26,10 @@
                '(0x[0-9a-f]+ in )?pthread_barrier_wait',
                '(0x[0-9a-f]+ in )?futex_wait',
                '0x0*70000002 in \?\?',
+               # This is the extended jump page. We hide it from the application,
+               # but not from GDB. Eventually we may want to supply some additional
+               # debug info to GDB to teach it about this, but for now we just let it be.
+               '0x[0-9a-f]+ in \?\?',
                '(0x[0-9a-f]+ in )?syscall_traced',
                '(0x[0-9a-f]+ in )?rr_page_start'],
     'aarch64': ['(0x[0-9a-f]+ in )?syscall_traced',
diff --git a/src/test/step_thread.py b/src/test/step_thread.py
index 672c351d751..91d6084800d 100644
--- a/src/test/step_thread.py
+++ b/src/test/step_thread.py
@@ -68,6 +68,10 @@
              '(0x[0-9a-f]+ in )?pthread_barrier_wait',
              '(0x[0-9a-f]+ in )?futex_wait',
              '0x0*70000002 in \?\?',
+             # This is the extended jump page. We hide it from the application,
+             # but not from GDB. Eventually we may want to supply some additional
+             # debug info to GDB to teach it about this, but for now we just let it be.
+             '0x[0-9a-f]+ in \?\?',
              '(0x[0-9a-f]+ in )?syscall_traced',
              '(0x[0-9a-f]+ in )?rr_page_start' ],
diff --git a/src/test/vdso_stack.py b/src/test/vdso_stack.py
index a6c060f2e47..95a3a331fba 100644
--- a/src/test/vdso_stack.py
+++ b/src/test/vdso_stack.py
@@ -5,7 +5,10 @@
 send_gdb('c')
 expect_gdb('Breakpoint 1')

-send_gdb('break traced_raw_syscall')
+# This was supposed to check the unwinding in the bail path of the vdso,
+# but we now unwind the syscallbuf before performing the bail syscall,
+# so just check the stack at the main syscall_hook entry.
+send_gdb('break syscall_hook')
 expect_gdb('Breakpoint 2')
 send_gdb('c')
 expect_gdb('Breakpoint 2')