From 76c5be0e72934c879e69c81a03bb959434201a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20T=C3=B3th?= Date: Mon, 13 Apr 2026 17:03:38 +0200 Subject: [PATCH] Fix misinformation in seccomp comments --- src/seccomp.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/seccomp.rs b/src/seccomp.rs index c91eacc..b97c7fc 100644 --- a/src/seccomp.rs +++ b/src/seccomp.rs @@ -6,14 +6,15 @@ //! Simplifications vs upstream (intentional): //! - No capability-conditional rules (we never grant capabilities, so all //! of Podman's `caps` blocks collapse to "deny" — we just omit them). -//! - No explicit-EPERM list — anything outside the allowlist returns ENOSYS -//! via the default action. +//! - No explicit-EPERM list — anything outside the allowlist on the host +//! ABI returns ENOSYS via the default action; cross-ABI calls are killed +//! by seccompiler's arch prologue before the rule lookup runs. //! - x86_64 and aarch64 only. //! //! Argument filters (modelled after Flatpak and Docker/Podman seccomp policies): //! - `ioctl`: blocks TIOCSTI and TIOCLINUX to prevent terminal input //! injection (CVE-2017-5226). Uses 32-bit arg comparison to avoid the -//! snapd bypass (CVE-2019-10063). +//! Flatpak bypass (CVE-2019-10063). //! - `clone`: blocks CLONE_NEWUSER to prevent user-namespace privilege //! escalation. `clone3` remains fully denied (not in the allowlist) //! because BPF cannot inspect its struct argument. @@ -24,7 +25,10 @@ //! Syscall name -> number resolution uses the `syscalls` crate. Names that //! don't exist on the host architecture (e.g. legacy 32-bit aliases like //! `_llseek`, or aarch64-only `set_tls` when building on x86_64) are silently -//! skipped — they would just return ENOSYS anyway under the default action. +//! skipped. This is safe: such names are unreachable via the host ABI, and +//! any cross-ABI attempt to call them (e.g. int 0x80 from a 64-bit process) +//! 
is killed by seccompiler's arch-validation prologue (SECCOMP_RET_KILL_PROCESS) +//! before the syscall table is even consulted. use std::collections::BTreeMap; use std::io::{Seek, SeekFrom, Write}; @@ -53,7 +57,8 @@ pub fn write_program_to_memfd() -> Result { // Safety: memfd_create is a normal Linux syscall. We pass a valid C string // and flags=0, so the fd is created without MFD_CLOEXEC and survives exec - // into bwrap. The name is only a debugging label (shows up in /proc/<pid>/maps). + // into bwrap. The name is only a debugging label (shows up as the symlink + // target in /proc/<pid>/fd/<fd>). let raw_fd = unsafe { libc::memfd_create(c"agent-sandbox-seccomp".as_ptr(), 0) }; if raw_fd < 0 { return Err(SandboxError::Io(std::io::Error::last_os_error())); @@ -133,9 +138,11 @@ fn current_target_arch() -> Result { } fn serialize(program: &[sock_filter]) -> Vec { - // Flatten the in-memory BpfProgram into the raw byte stream the kernel's - // seccomp(2) interface expects. A BpfProgram is &[sock_filter], where - // sock_filter is the classic-BPF instruction format from <linux/filter.h>: + // Flatten the in-memory BpfProgram into the byte stream bwrap expects to + // read from the fd passed via --seccomp. bwrap then constructs a + // `struct sock_fprog` over that buffer and hands it to the kernel via + // seccomp(2). A BpfProgram is &[sock_filter], where sock_filter is the + // classic-BPF instruction format from <linux/filter.h>: // // struct sock_filter { // __u16 code; // opcode