diff --git a/src/seccomp.rs b/src/seccomp.rs index d3d7ab2..c91eacc 100644 --- a/src/seccomp.rs +++ b/src/seccomp.rs @@ -10,10 +10,13 @@ //! via the default action. //! - x86_64 and aarch64 only. //! -//! Argument filters (modelled after Flatpak's seccomp policy): +//! Argument filters (modelled after Flatpak and Docker/Podman seccomp policies): //! - `ioctl`: blocks TIOCSTI and TIOCLINUX to prevent terminal input //! injection (CVE-2017-5226). Uses 32-bit arg comparison to avoid the //! snapd bypass (CVE-2019-10063). +//! - `clone`: blocks CLONE_NEWUSER to prevent user-namespace privilege +//! escalation. `clone3` remains fully denied (not in the allowlist) +//! because BPF cannot inspect its struct argument. //! //! Default action is ENOSYS (errno 38), not EPERM. This matches Podman's //! stance and is critical for the glibc clone3 -> clone fallback path. @@ -71,9 +74,11 @@ fn build_program_bytes() -> Result, SandboxError> { if let Ok(sysno) = Sysno::from_str(name) { let nr = i64::from(sysno.id()); let filtered = match *name { - "ioctl" => ioctl_rules().map_err(|e| SandboxError::Seccomp(e.to_string()))?, - _ => vec![], - }; + "ioctl" => ioctl_rules(), + "clone" => clone_rules(), + _ => Ok(vec![]), + } + .map_err(|e| SandboxError::Seccomp(e.to_string()))?; rules.insert(nr, filtered); } } @@ -90,19 +95,35 @@ fn build_program_bytes() -> Result, SandboxError> { Ok(serialize(&program)) } -const TIOCSTI: u64 = 0x5412; -const TIOCLINUX: u64 = 0x541C; - /// Allow ioctl except for TIOCSTI and TIOCLINUX terminal injection attacks. /// Dword (32-bit) comparison prevents the CVE-2019-10063 bypass where the /// kernel ignores the high 32 bits of the ioctl command argument. fn ioctl_rules() -> Result, BackendError> { Ok(vec![SeccompRule::new(vec![ - SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, TIOCSTI)?, - SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, TIOCLINUX)?, + SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, libc::TIOCSTI)?, + SeccompCondition::new( + 1, + SeccompCmpArgLen::Dword, + SeccompCmpOp::Ne, + libc::TIOCLINUX, + )?, ])?]) } +/// Allow clone only when CLONE_NEWUSER is not set. Creating a user namespace +/// gives the sandboxed process fake root, which is the standard entry point for +/// kernel privilege-escalation exploits. `clone3` is fully denied (not in the +/// allowlist) because BPF cannot inspect its struct clone_args — glibc falls +/// back to clone on ENOSYS. +fn clone_rules() -> Result, BackendError> { + Ok(vec![SeccompRule::new(vec![SeccompCondition::new( + 0, + SeccompCmpArgLen::Qword, + SeccompCmpOp::MaskedEq(libc::CLONE_NEWUSER as u64), + 0, + )?])?]) +} + fn current_target_arch() -> Result { match std::env::consts::ARCH { "x86_64" => Ok(TargetArch::x86_64),