diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
index 57681b79a7..4e843bee9e 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
@@ -118,3 +118,4 @@ __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how
 __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags)
 __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig)
 __NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id)
+__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int flags)
diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h
index 9a540694b2..4a2e675597 100644
--- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h
+++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h
@@ -177,6 +177,24 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe)
 #define USER32_CS 0x23
 
 /* clang-format off */
+/*
+ * rst_sigreturn in restorer is a noninline call which adds an entry to the
+ * shadow stack above the sigframe token;
+ * if shadow stack is enabled, increment the shadow stack pointer to remove
+ * that entry
+ */
+#define ARCH_SHSTK_POP() \
+	asm volatile( \
+		"xor %%rax, %%rax\n" \
+		"rdsspq %%rax\n" \
+		"cmpq $0, %%rax\n" \
+		"jz 1f\n" \
+		"movq $1, %%rax\n" \
+		"incsspq %%rax\n" \
+		"1:\n" \
+		: : \
+		: "rax")
+
 #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \
 	asm volatile( \
 		"movq %0, %%rax \n" \
@@ -205,9 +223,10 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe)
 
 #define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \
 	do { \
-		if ((rt_sigframe)->is_native) \
+		if ((rt_sigframe)->is_native) { \
+			ARCH_SHSTK_POP(); \
 			ARCH_RT_SIGRETURN_NATIVE(new_sp); \
-		else \
+		} else \
 			ARCH_RT_SIGRETURN_COMPAT(new_sp); \
 	} while (0)
 
diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h
index 7849dd7a60..7814c351d1 100644
--- a/criu/arch/x86/include/asm/shstk.h
+++ b/criu/arch/x86/include/asm/shstk.h
@@ -10,11 +10,11 @@
 #endif
 
 /* arch/x86/include/uapi/asm/prctl.h */
-#define ARCH_SHSTK_ENABLE 0x5001
+#define ARCH_SHSTK_ENABLE 0x5001
 #define ARCH_SHSTK_DISABLE 0x5002
 #define ARCH_SHSTK_LOCK 0x5003
-#define ARCH_SHSTK_UNLOCK 0x5004
-#define ARCH_SHSTK_STATUS 0x5005
+#define ARCH_SHSTK_UNLOCK 0x5004
+#define ARCH_SHSTK_STATUS 0x5005
 
 #define ARCH_SHSTK_SHSTK (1ULL << 0)
 #define ARCH_SHSTK_WRSS (1ULL << 1)
@@ -66,13 +66,207 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core,
 		       struct task_restore_args *ta);
 #define arch_shstk_prepare arch_shstk_prepare
 
-#if 0
 int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid);
 #define arch_shstk_unlock arch_shstk_unlock
 
 int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core,
 			  int (*func)(void *arg), void *arg);
 #define arch_shstk_trampoline arch_shstk_trampoline
-#endif
+
+#ifdef CR_NOGLIBC
+
+#include
+#include
+#include "vma.h"
+
+#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */
+
+static inline int shstk_map(unsigned long addr, unsigned long size)
+{
+	long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN);
+
+	if (shstk < 0) {
+		pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk);
+		return -1;
+	}
+
+	if (shstk != addr) {
+		pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk);
+		return -1;
+	}
+
+	pr_info("Created shadow stack at %lx\n", shstk);
+
+	return 0;
+}
+
+/* clang-format off */
+static inline unsigned long get_ssp(void)
+{
+	unsigned long ssp;
+
+	asm volatile("rdsspq %0" : "=r"(ssp) :: );
+
+	return ssp;
+}
+
+static inline void wrssq(unsigned long addr, unsigned long val)
+{
+	asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory");
+}
+/* clang-format on */
+
+static always_inline void shstk_switch_ssp(unsigned long new_ssp)
+{
+	unsigned long old_ssp = get_ssp();
+
+	asm volatile("rstorssp (%0)\n" :: "r"(new_ssp));
+	asm volatile("saveprevssp");
+
+	pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp);
+}
+
+/*
+ * Disable writes to the shadow stack and lock its disable/enable control
+ */
+static inline int shstk_finalize(void)
+{
+	int ret = 0;
+
+	ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS);
+	if (ret) {
+		pr_err("Failed to disable writes to shadow stack\n");
+		return ret;
+	}
+
+	ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK);
+	if (ret)
+		pr_err("Failed to lock shadow stack controls\n");
+
+	return ret;
+}
+
+/*
+ * Restore contents of the shadow stack and set shadow stack pointer
+ */
+static always_inline int shstk_restore(struct rst_shstk_info *cet)
+{
+	unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr;
+	unsigned long ssp = cet->vma_start + cet->vma_size - 8;
+	unsigned long shstk_top = cet->vma_size / 8 - 1;
+	unsigned long val;
+	long ret;
+
+	if (!(cet->cet & ARCH_SHSTK_SHSTK))
+		return 0;
+
+	if (shstk_map(cet->vma_start, cet->vma_size))
+		return -1;
+
+	/*
+	 * Switch shadow stack from temporary location to the actual task's
+	 * shadow stack VMA
+	 */
+	shstk_switch_ssp(ssp);
+
+	/* restore shadow stack contents */
+	for (; ssp >= cet->ssp; ssp -= 8, shstk_top--)
+		wrssq(ssp, shstk_data[shstk_top]);
+
+	/*
+	 * Add tokens for sigreturn frame and for switch of the shadow stack.
+	 * The sigreturn token will be checked by the kernel during
+	 * processing of sigreturn
+	 * The token for stack switch is required by rstorssp and
+	 * saveprevssp semantics
+	 */
+
+	/* token for sigreturn frame */
+	val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT;
+	wrssq(ssp, val);
+
+	/* shadow stack switch token */
+	val = ssp | SHSTK_BUSY_BIT;
+	ssp -= 8;
+	wrssq(ssp, val);
+
+	/* reset shadow stack pointer to the proper location */
+	shstk_switch_ssp(ssp);
+
+	ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE);
+	if (ret < 0) {
+		pr_err("Failed to unmap premmaped shadow stack\n");
+		return ret;
+	}
+
+	return shstk_finalize();
+}
+#define arch_shstk_restore shstk_restore
+
+/*
+ * Disable shadow stack and lock its controls. NOTE(review): a failed lock is only logged; 0 is still returned
+ */
+static inline int shstk_disable(void)
+{
+	int ret;
+
+	ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS);
+	if (ret) {
+		pr_err("Failed to disable writes to shadow stack\n");
+		return ret;
+	}
+
+	ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+	if (ret) {
+		pr_err("Failed to disable shadow stack\n");
+		return ret;
+	}
+
+	ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK);
+	if (ret)
+		pr_err("Failed to lock shadow stack controls\n");
+
+	return 0;
+}
+
+/*
+ * Switch to temporary shadow stack
+ */
+static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet)
+{
+	unsigned long ssp;
+	long ret;
+
+	if (!(cet->cet & ARCH_SHSTK_SHSTK))
+		return 0;
+
+	ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE);
+	if (ret < 0) {
+		pr_err("Failed to unmap area for temporary shadow stack\n");
+		return -1;
+	}
+
+	ret = shstk_map(cet->tmp_shstk, PAGE_SIZE);
+	if (ret < 0)
+		return -1;
+
+	/*
+	 * Switch shadow stack from the default created by the kernel to a
+	 * temporary shadow stack allocated in the premapped area
+	 */
+	ssp = cet->tmp_shstk + PAGE_SIZE - 8;
+	shstk_switch_ssp(ssp);
+
+	ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS);
+	if (ret) {
+		pr_err("Failed to enable writes to shadow stack\n");
+		return ret;
+	}
+
+	return 0;
+}
+#define arch_shstk_switch_to_restorer shstk_switch_to_restorer
+
+#endif /* CR_NOGLIBC */
 
 #endif /* __CR_ASM_SHSTK_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 73565d1de4..3fb5322a4b 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -339,4 +339,20 @@ enum {
 #define __r_sym(name) restorer_sym##name
 #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name))
 
+#ifndef arch_shstk_switch_to_restorer
+static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk)
+{
+	return 0;
+}
+#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer
+#endif
+
+#ifndef arch_shstk_restore
+static inline int arch_shstk_restore(struct rst_shstk_info *shstk)
+{
+	return 0;
+}
+#define arch_shstk_restore arch_shstk_restore
+#endif
+
 #endif /* __CR_RESTORER_H__ */
diff --git a/criu/pie/Makefile b/criu/pie/Makefile
index 265dcf82bd..912fab24ba 100644
--- a/criu/pie/Makefile
+++ b/criu/pie/Makefile
@@ -18,6 +18,11 @@ ifeq ($(ARCH),mips)
 	ccflags-y += -mno-abicalls -fno-pic
 endif
 
+# -mshstk required for CET instructions
+ifeq ($(ARCH),x86)
+	ccflags-y += -mshstk
+endif
+
 LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S
 
 restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 20c6801c5e..7c34c06d47 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -752,6 +752,10 @@ __visible long __export_restore_thread(struct thread_restore_args *args)
 		goto core_restore_end;
 	}
 
+	/* restore original shadow stack */
+	if (arch_shstk_restore(&args->shstk))
+		goto core_restore_end;
+
 	/* All signals must be handled by thread leader */
 	ksigfillset(&to_block);
 	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
@@ -1672,6 +1676,9 @@ __visible long __export_restore_task(struct task_restore_args *args)
 		pr_debug("lazy-pages: uffd %d\n", args->uffd);
 	}
 
+	if (arch_shstk_switch_to_restorer(&args->shstk))
+		goto core_restore_end;
+
 	/*
 	 * Park vdso/vvar in a safe place if architecture doesn't support
 	 * mapping them with arch_prctl().
@@ -1723,6 +1730,13 @@ __visible long __export_restore_task(struct task_restore_args *args)
 		if (vma_entry->start > vma_entry->shmid)
 			break;
 
+		/*
+		 * shadow stack VMAs cannot be remapped, they must be
+		 * recreated with map_shadow_stack system call
+		 */
+		if (vma_entry_is(vma_entry, VMA_AREA_SHSTK))
+			continue;
+
 		if (vma_remap(vma_entry, args->uffd))
 			goto core_restore_end;
 	}
@@ -1740,6 +1754,13 @@ __visible long __export_restore_task(struct task_restore_args *args)
 		if (vma_entry->start < vma_entry->shmid)
 			break;
 
+		/*
+		 * shadow stack VMAs cannot be remapped, they must be
+		 * recreated with map_shadow_stack system call
+		 */
+		if (vma_entry_is(vma_entry, VMA_AREA_SHSTK))
+			continue;
+
 		if (vma_remap(vma_entry, args->uffd))
 			goto core_restore_end;
 	}
@@ -2166,6 +2187,14 @@ __visible long __export_restore_task(struct task_restore_args *args)
 
 	futex_set_and_wake(&thread_inprogress, args->nr_threads);
 
+	/*
+	 * Shadow stack of the leader can be locked only after all other
+	 * threads were cloned, otherwise they may start with read-only
+	 * shadow stack.
+	 */
+	if (arch_shstk_restore(&args->shstk))
+		goto core_restore_end;
+
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
 
 	if (ret)