Block CLONE_NEWUSER in seccomp argument filter on clone

Prevents sandboxed processes from creating user namespaces to gain fake
root, the standard entry point for kernel privilege-escalation exploits.
Matches Docker and Podman's default seccomp policy.

clone3 stays out of the allowlist, so it fails with the default ENOSYS
action and glibc falls back to clone, where this argument filter applies.

Also switches hand-written ioctl constants to libc::{TIOCSTI,TIOCLINUX}.
This commit is contained in:
2026-04-13 16:46:15 +02:00
parent 0d0682b04e
commit bd1f938f54
+30 -9
View File
@@ -10,10 +10,13 @@
//! via the default action.
//! - x86_64 and aarch64 only.
//!
//! Argument filters (modelled after Flatpak's seccomp policy):
//! Argument filters (modelled after Flatpak and Docker/Podman seccomp policies):
//! - `ioctl`: blocks TIOCSTI and TIOCLINUX to prevent terminal input
//! injection (CVE-2017-5226). Uses 32-bit arg comparison to avoid the
//! snapd bypass (CVE-2019-10063).
//! - `clone`: blocks CLONE_NEWUSER to prevent user-namespace privilege
//! escalation. `clone3` remains fully denied (not in the allowlist)
//! because BPF cannot inspect its struct argument.
//!
//! Default action is ENOSYS (errno 38), not EPERM. This matches Podman's
//! stance and is critical for the glibc clone3 -> clone fallback path.
@@ -71,9 +74,11 @@ fn build_program_bytes() -> Result<Vec<u8>, SandboxError> {
if let Ok(sysno) = Sysno::from_str(name) {
let nr = i64::from(sysno.id());
let filtered = match *name {
"ioctl" => ioctl_rules().map_err(|e| SandboxError::Seccomp(e.to_string()))?,
_ => vec![],
};
"ioctl" => ioctl_rules(),
"clone" => clone_rules(),
_ => Ok(vec![]),
}
.map_err(|e| SandboxError::Seccomp(e.to_string()))?;
rules.insert(nr, filtered);
}
}
@@ -90,19 +95,35 @@ fn build_program_bytes() -> Result<Vec<u8>, SandboxError> {
Ok(serialize(&program))
}
/// ioctl request value for TIOCSTI, one of the two requests that
/// `ioctl_rules` refuses in order to stop terminal input injection.
// NOTE(review): `ioctl_rules` also references `libc::TIOCSTI`; confirm whether
// this hand-written copy is still needed.
const TIOCSTI: u64 = 0x5412;
/// ioctl request value for TIOCLINUX, the other request that `ioctl_rules`
/// refuses in order to stop terminal input injection.
// NOTE(review): `ioctl_rules` also references `libc::TIOCLINUX`; confirm
// whether this hand-written copy is still needed.
const TIOCLINUX: u64 = 0x541C;
/// Build the argument filter for `ioctl`: allow the call except when the
/// request is TIOCSTI or TIOCLINUX (terminal injection attacks).
///
/// Every condition inspects argument index 1 (the ioctl request) with a
/// Dword (32-bit) comparison. This prevents the CVE-2019-10063 bypass where
/// the kernel ignores the high 32 bits of the ioctl command argument.
///
/// # Errors
///
/// Returns the `BackendError` produced by `SeccompCondition::new` or
/// `SeccompRule::new` if either constructor rejects its input.
fn ioctl_rules() -> Result<Vec<SeccompRule>, BackendError> {
// A single rule; it is meant to match only when the request differs from
// every value listed below.
Ok(vec![SeccompRule::new(vec![
// NOTE(review): these two conditions use the file-local constants and
// repeat the `libc::` pair that follows — confirm whether both pairs are
// intended to coexist.
SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, TIOCSTI)?,
SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, TIOCLINUX)?,
// NOTE(review): libc's ioctl request constants are presumably `c_ulong` on
// glibc targets but `c_int` on musl — confirm this still type-checks
// against the expected argument type on musl builds.
SeccompCondition::new(1, SeccompCmpArgLen::Dword, SeccompCmpOp::Ne, libc::TIOCSTI)?,
SeccompCondition::new(
1,
SeccompCmpArgLen::Dword,
SeccompCmpOp::Ne,
libc::TIOCLINUX,
)?,
])?])
}
/// Build the argument filter for `clone`: the call is allowed only while the
/// CLONE_NEWUSER bit of argument 0 is clear.
///
/// A new user namespace hands the sandboxed process fake root, which is the
/// standard entry point for kernel privilege-escalation exploits. `clone3` is
/// kept out of the allowlist entirely because BPF cannot inspect its
/// `struct clone_args`; glibc falls back to `clone` on ENOSYS, where this
/// filter applies.
///
/// # Errors
///
/// Returns the `BackendError` produced by `SeccompCondition::new` or
/// `SeccompRule::new` if either constructor rejects its input.
// NOTE(review): other syscalls (e.g. `unshare`) can presumably create user
// namespaces as well — confirm they are absent from the allowlist.
fn clone_rules() -> Result<Vec<SeccompRule>, BackendError> {
    // Mask that isolates the CLONE_NEWUSER bit of the inspected argument.
    let newuser_mask = libc::CLONE_NEWUSER as u64;

    // Masked comparison against 0: holds only when the masked bit is unset.
    let newuser_clear = SeccompCondition::new(
        0,
        SeccompCmpArgLen::Qword,
        SeccompCmpOp::MaskedEq(newuser_mask),
        0,
    )?;

    let allow_without_newuser = SeccompRule::new(vec![newuser_clear])?;
    Ok(vec![allow_without_newuser])
}
fn current_target_arch() -> Result<TargetArch, SandboxError> {
match std::env::consts::ARCH {
"x86_64" => Ok(TargetArch::x86_64),