Seccomp BPF与容器安全
本文为看雪论坛优秀文章
看雪论坛作者ID:ZxyNull
一
简介
Seccomp 的发展历史
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
null@ubuntu:~/seccomp$ cat /proc/1/status | grep Seccomp
Seccomp: 0
Seccomp + BPF
简单指令集
小型指令集
所有的指令大小相同
实现过程简单、快速
只有分支向前指令
程序是有向无环图(DAGs),没有循环
易于验证程序的有效性/安全性
简单的指令集⇒可以验证操作码和参数
可以检测死代码
程序必须以 Return 结束
BPF过滤器程序仅限于4096条指令
struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
unsigned short len; /* Number of BPF instructions */
struct sock_filter __user *filter; /* Pointer to the array of BPF instructions */
};
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
__u8 jf; /* Jump false */
__u32 k; /* Generic multiuse field */
};
加载指令
存储指令
跳转指令
算术逻辑指令
包括:ADD、SUB、 MUL、 DIV、 MOD、 NEG、OR、 AND、XOR、 LSH、 RSH
Return 指令
条件跳转指令
有两个跳转目标,jt为真,jf为假
jmp 目标是指令偏移量,最大 255
/*
* Macros for filter block array initializers.
*
* Both expand to a brace-enclosed initializer with four values in the
* order code, jt, jf, k. Note that BPF_JUMP takes k as its second
* argument but emits it last.
*/
#ifndef BPF_STMT
/* Non-branching instruction: both jump offsets are emitted as 0. */
#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
#endif
#ifndef BPF_JUMP
/* Branching instruction with explicit jump-true (jt) and jump-false (jf) offsets. */
#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
#endif
BPF_STMT
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,(offsetof(struct seccomp_data, arch)))
BPF_JUMP
BPF_JUMP 中有四个参数:操作码、值(k)、为真跳转(jt)和为假跳转(jf),举个例子:
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K ,AUDIT_ARCH_X86_64 , 1, 0)
struct sock_filter filter[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_ABS,0), // load 4 bytes from offset 0 of the data, i.e. the syscall number, into the accumulator
BPF_JUMP(BPF_JMP+BPF_JEQ,59,0,1), // if A == 59 fall through to the next rule, otherwise skip it; 59 is the x64 execve syscall number
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_KILL), // return KILL
BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ALLOW), // return ALLOW
};
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI__LINUX_BPF_COMMON_H__
#define _UAPI__LINUX_BPF_COMMON_H__
/* Instruction classes */
#define BPF_CLASS(code) ((code) & 0x07) // selects the instruction class
#define BPF_LD 0x00 // copy a value into the accumulator
#define BPF_LDX 0x01 // load a value into the index register
#define BPF_ST 0x02 // store the accumulator into scratch memory
#define BPF_STX 0x03 // store the index register into scratch memory
#define BPF_ALU 0x04 // arithmetic or logic on the accumulator, with the index register or a constant as operand
#define BPF_JMP 0x05 // jump
#define BPF_RET 0x06 // return
#define BPF_MISC 0x07 // miscellaneous
/* ld/ldx fields */
#define BPF_SIZE(code) ((code) & 0x18)
#define BPF_W 0x00 /* 32-bit */ // word
#define BPF_H 0x08 /* 16-bit */ // half word
#define BPF_B 0x10 /* 8-bit */ // byte
/* eBPF BPF_DW 0x18 64-bit */ // double word
#define BPF_MODE(code) ((code) & 0xe0)
#define BPF_IMM 0x00 // constant
#define BPF_ABS 0x20 // packet data at a fixed offset (absolute offset)
#define BPF_IND 0x40 // packet data at a variable offset (relative offset)
#define BPF_MEM 0x60 // a word in scratch memory
#define BPF_LEN 0x80 // packet length
#define BPF_MSH 0xa0
/* alu/jmp fields */
#define BPF_OP(code) ((code) & 0xf0) // for ALU-class opcodes, selects the specific operator
#define BPF_ADD 0x00
#define BPF_SUB 0x10
#define BPF_MUL 0x20
#define BPF_DIV 0x30
#define BPF_OR 0x40
#define BPF_AND 0x50
#define BPF_LSH 0x60
#define BPF_RSH 0x70
#define BPF_NEG 0x80
#define BPF_MOD 0x90
#define BPF_XOR 0xa0
// for JMP-class opcodes, selects the jump type
#define BPF_JA 0x00
#define BPF_JEQ 0x10
#define BPF_JGT 0x20
#define BPF_JGE 0x30
#define BPF_JSET 0x40
#define BPF_SRC(code) ((code) & 0x08)
#define BPF_K 0x00 // constant
#define BPF_X 0x08 // index register
#ifndef BPF_MAXINSNS
#define BPF_MAXINSNS 4096
#endif
#endif /* _UAPI__LINUX_BPF_COMMON_H__ */
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
* The upper 16-bits are ordered from least permissive values to most,
* as a signed value (so 0x8000000 is negative).
*
* The ordering ensures that a min_t() over composed return values always
* selects the least permissive choice.
*/
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
/* Masks for the return value sections. */
#define SECCOMP_RET_ACTION_FULL 0xffff0000U
#define SECCOMP_RET_ACTION 0x7fff0000U
#define SECCOMP_RET_DATA 0x0000ffffU
SECCOMP_RET_ALLOW:允许执行
SECCOMP_RET_KILL:立即终止执行
SECCOMP_RET_ERRNO:从系统调用中返回一个错误(系统调用不执行)
SECCOMP_RET_TRACE:尝试通知ptrace(), 使之有机会获得控制权
SECCOMP_RET_TRAP:通知内核发送SIGSYS信号(系统调用不执行)
struct seccomp_data {
int nr ; /* System call number (architecture dependent) */
__u32 arch ; /* Architecture (e.g. AUDIT_ARCH_X86_64) */
__u64 instruction_pointer ; /* CPU instruction pointer */
__u64 args [6]; /* System call arguments, at most 6 */
};
二
实现
Prctl()
#include <sys/prctl.h>
int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);
prctl(PR_SET_NO_NEW_PRIVS,1,0,0,0);
prctl(PR_SET_SECCOMP,SECCOMP_MODE_FILTER,&prog);
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
/*
 * Enable seccomp strict mode through prctl().
 *
 * The prctl() result is checked: if the request is refused the program
 * would otherwise carry on without any sandbox, so it reports the error
 * and stops instead.
 */
void configure_seccomp() {
    printf("Configuring seccomp\n");
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) != 0) {
        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT)");
        /* _exit() does not flush stdio, so push out the banner first. */
        fflush(stdout);
        _exit(1);
    }
}
/*
 * Copy the file named by argv[1] to argv[2] after enabling seccomp.
 *
 * Returns -1 on bad usage, 1 when a file cannot be opened or the copy
 * fails, and 0 otherwise. Descriptors are only closed if they were
 * actually opened.
 */
int main(int argc, char* argv[]) {
    int infd = -1;
    int outfd = -1;
    int status = 1;
    ssize_t read_bytes;
    char buffer[1024];

    if (argc < 3) {
        printf("Usage:\n\tdup_file <input path> <output_path>\n");
        return -1;
    }

    configure_seccomp(); /* configure seccomp */

    printf("Opening '%s' for reading\n", argv[1]);
    /* open() is forbidden by the policy; the process is terminated here */
    infd = open(argv[1], O_RDONLY);
    if (infd < 0) {
        printf("Cannot open '%s' for reading\n", argv[1]);
        goto out;
    }

    printf("Opening '%s' for writing\n", argv[2]);
    outfd = open(argv[2], O_WRONLY | O_CREAT, 0644);
    if (outfd < 0) {
        printf("Cannot open '%s' for writing\n", argv[2]);
        goto out;
    }

    status = 0;
    while ((read_bytes = read(infd, buffer, sizeof buffer)) > 0) {
        ssize_t done = 0;

        /* write() may accept fewer bytes than requested; keep going. */
        while (done < read_bytes) {
            ssize_t n = write(outfd, buffer + done, (size_t)(read_bytes - done));
            if (n <= 0) {
                status = 1;
                goto out;
            }
            done += n;
        }
    }
    if (read_bytes < 0) {
        status = 1;
    }

out:
    if (outfd >= 0) {
        close(outfd);
    }
    if (infd >= 0) {
        close(infd);
    }
    return status;
}
null@ubuntu:~/seccomp$ gcc -o seccomp_strict seccomp_strict.c
null@ubuntu:~/seccomp$ ./seccomp_strict /etc/passwd output
Configuring seccomp
Opening '/etc/passwd' for reading
Killed
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Install a seccomp filter that returns KILL for syscall 59 (x64 execve)
 * and ALLOW for everything else, then try to start a shell so the filter
 * can be seen firing.
 *
 * The filter only inspects the syscall number; seccomp_data.arch is not
 * checked.
 */
int main()
{
    struct sock_filter filter[] = {
        /* load 4 bytes at offset 0 (the syscall number) into the accumulator */
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS,0),
        /* syscall number == 59: fall through to KILL, otherwise skip one instruction */
        BPF_JUMP(BPF_JMP+BPF_JEQ,59,0,1),
        BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_KILL),  /* return KILL */
        BPF_STMT(BPF_RET+BPF_K,SECCOMP_RET_ALLOW), /* return ALLOW */
    };
    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), /* number of instructions */
        .filter = filter,                                          /* instruction array */
    };

    /* Stop if the sandbox cannot be installed, rather than running without it. */
    if (prctl(PR_SET_NO_NEW_PRIVS,1,0,0,0) != 0) {
        perror("prctl(PR_SET_NO_NEW_PRIVS)");
        return EXIT_FAILURE;
    }
    if (prctl(PR_SET_SECCOMP,SECCOMP_MODE_FILTER,&prog) != 0) {
        perror("prctl(PR_SET_SECCOMP)");
        return EXIT_FAILURE;
    }

    write(STDOUT_FILENO,"test\n",5);
    system("/bin/sh");
    return 0;
}
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/unistd.h>
/*
 * Install a seccomp filter that allows write() unconditionally, allows
 * open() only when the 32-bit word loaded from its second argument equals
 * O_RDONLY, and returns KILL for every other system call.
 *
 * Both prctl() results are checked; on failure the error is reported and
 * the program stops instead of continuing without the filter.
 */
void configure_seccomp() {
    struct sock_filter filter [] = {
        /* load the syscall number into the accumulator */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
        /* __NR_write: fall through to ALLOW, otherwise skip the next instruction */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        /* __NR_open: continue, otherwise jump 3 instructions ahead to KILL */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_open, 0, 3),
        /* load a 32-bit word from the second syscall argument */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, args[1]))),
        /* O_RDONLY: fall through to ALLOW, otherwise skip to KILL */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, O_RDONLY, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL)
    };
    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter) / sizeof (filter[0])),
        .filter = filter,
    };

    printf("Configuring seccomp\n");
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
        perror("prctl(PR_SET_NO_NEW_PRIVS)");
        /* _exit() does not flush stdio, so push out the banner first. */
        fflush(stdout);
        _exit(1);
    }
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
        fflush(stdout);
        _exit(1);
    }
}
/*
 * Copy the file named by argv[1] to argv[2] after installing the seccomp
 * filter.
 *
 * Returns -1 on bad usage, 1 when a file cannot be opened or the copy
 * fails, and 0 otherwise. Failures are reported with printf() so that no
 * system call other than write() is needed for the message. Descriptors
 * are only closed if they were actually opened.
 */
int main(int argc, char* argv[]) {
    int infd = -1;
    int outfd = -1;
    int status = 1;
    ssize_t read_bytes;
    char buffer[1024];

    if (argc < 3) {
        printf("Usage:\n\tdup_file <input path> <output_path>\n");
        return -1;
    }
    printf("Ducplicating file '%s' to '%s'\n", argv[1], argv[2]);

    configure_seccomp(); /* configure seccomp */

    printf("Opening '%s' for reading\n", argv[1]);
    infd = open(argv[1], O_RDONLY);
    if (infd < 0) {
        printf("Cannot open '%s' for reading\n", argv[1]);
        goto out;
    }

    printf("Opening '%s' for writing\n", argv[2]);
    outfd = open(argv[2], O_WRONLY | O_CREAT, 0644);
    if (outfd < 0) {
        printf("Cannot open '%s' for writing\n", argv[2]);
        goto out;
    }

    status = 0;
    while ((read_bytes = read(infd, buffer, sizeof buffer)) > 0) {
        ssize_t done = 0;

        /* write() may accept fewer bytes than requested; keep going. */
        while (done < read_bytes) {
            ssize_t n = write(outfd, buffer + done, (size_t)(read_bytes - done));
            if (n <= 0) {
                status = 1;
                goto out;
            }
            done += n;
        }
    }
    if (read_bytes < 0) {
        status = 1;
    }

out:
    if (outfd >= 0) {
        close(outfd);
    }
    if (infd >= 0) {
        close(infd);
    }
    return status;
}
$ ./seccomp_filter /etc/passwd output
Ducplicating file '/etc/passwd' to 'output'
Configuring seccomp
Opening '/etc/passwd' for reading
Opening 'output' for writing
Bad system call
libseccomp
null@ubuntu:~/seccomp$ sudo apt install libseccomp-dev libseccomp2 seccomp
//gcc -g simple_syscall_seccomp.c -o simple_syscall_seccomp -lseccomp
#include <unistd.h>
#include <seccomp.h>
#include <linux/seccomp.h>
/*
 * Build a libseccomp filter that allows everything except execve, load it,
 * and then issue syscall 59 (execve) directly to show the filter firing.
 *
 * Returns 1 if the filter cannot be created, populated or loaded.
 */
int main(void){
    static const char msg[] = "i will give you a shell\n";
    char * filename = "/bin/sh";
    char * argv[] = {"/bin/sh",NULL};
    char * envp[] = {NULL};
    scmp_filter_ctx ctx;

    ctx = seccomp_init(SCMP_ACT_ALLOW);
    if (ctx == NULL) {
        return 1;
    }
    if (seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(execve), 0) < 0 ||
        seccomp_load(ctx) < 0) {
        seccomp_release(ctx);
        return 1;
    }
    /* The context is not used again after seccomp_load(); free it. */
    seccomp_release(ctx);

    write(1, msg, sizeof msg - 1);
    syscall(59,filename,argv,envp); /* execve */
    return 0;
}
null@ubuntu:~/seccomp$ gcc -g simple_syscall_seccomp.c -o simple_syscall_seccomp -lseccomp
null@ubuntu:~/seccomp$ ./simple_syscall_seccomp
i will give you a shell
Bad system call (core dumped)
seccomp_init(uint32_t def_action)
SCMP_ACT_ALLOW:即初始化为允许所有系统调用,过滤为黑名单模式;
SCMP_ACT_KILL:则为白名单模式过滤。
SCMP_ACT_KILL_PROCESS:整个进程将被内核终止
SCMP_ACT_TRAP:如果所有系统调用都不匹配,则给线程发送一个SIGSYS信号
SCMP_ACT_TRACE(uint16_t msg_num):在使用ptrace跟踪进程时的相关选项
SCMP_ACT_ERRNO(uint16_t errno):不匹配会收到errno的返回值
SCMP_ACT_LOG:不影响系统调用,但是会被记录;
int seccomp_rule_add(scmp_filter_ctx ctx, uint32_t action,int syscall, unsigned int arg_cnt, ...);
#include <unistd.h>
#include <seccomp.h>
#include <linux/seccomp.h>
/*
 * Build a libseccomp filter that kills on write() when the argument at
 * index 2 (counting from 0) is greater than 0x10, load it, and issue one
 * write() below the limit and one above it.
 *
 * Returns 1 if the filter cannot be created, populated or loaded.
 */
int main(void){
    scmp_filter_ctx ctx;

    ctx = seccomp_init(SCMP_ACT_ALLOW);
    if (ctx == NULL) {
        return 1;
    }
    if (seccomp_rule_add(ctx, SCMP_ACT_KILL, SCMP_SYS(write),1,SCMP_A2(SCMP_CMP_GT,0x10)) < 0 ||
        seccomp_load(ctx) < 0) {
        seccomp_release(ctx);
        return 1;
    }
    /* The context is not used again after seccomp_load(); free it. */
    seccomp_release(ctx);

    write(1,"1234567812345678",0x10);         /* length is 0x10: not blocked */
    write(1,"i will give you a shell\n",24);  /* length is 24: blocked */
    return 0;
}
null@ubuntu:~/seccomp$ gcc -g seccomp_write_limit.c -o seccomp_write_limit -lseccomp
null@ubuntu:~/seccomp$ ./seccomp_write_limit
1234567812345678Bad system call (core dumped)
...
...
/**
* Comparison operators
*/
enum scmp_compare {
_SCMP_CMP_MIN = 0,
SCMP_CMP_NE = 1, /**< not equal */
SCMP_CMP_LT = 2, /**< less than */
SCMP_CMP_LE = 3, /**< less than or equal */
SCMP_CMP_EQ = 4, /**< equal */
SCMP_CMP_GE = 5, /**< greater than or equal */
SCMP_CMP_GT = 6, /**< greater than */
SCMP_CMP_MASKED_EQ = 7, /**< masked equality */
_SCMP_CMP_MAX,
};
...
struct scmp_arg_cmp {
unsigned int arg; /**< argument number, starting at 0 */
enum scmp_compare op; /**< the comparison op, e.g. SCMP_CMP_* */
scmp_datum_t datum_a;
scmp_datum_t datum_b;
};
....
/**
* Specify a 32-bit argument comparison struct for use in declaring rules
* @param arg the argument number, starting at 0
* @param op the comparison operator, e.g. SCMP_CMP_*
* @param datum_a dependent on comparison (32-bits)
* @param datum_b dependent on comparison, optional (32-bits)
*/
#define SCMP_CMP32(x, y, ...) \
_SCMP_MACRO_DISPATCHER(_SCMP_CMP32_, __VA_ARGS__)(x, y, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 0
*/
#define SCMP_A0_64(...) SCMP_CMP64(0, __VA_ARGS__)
#define SCMP_A0 SCMP_A0_64
/**
* Specify a 32-bit argument comparison struct for argument 0
*/
#define SCMP_A0_32(x, ...) SCMP_CMP32(0, x, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 1
*/
#define SCMP_A1_64(...) SCMP_CMP64(1, __VA_ARGS__)
#define SCMP_A1 SCMP_A1_64
/**
* Specify a 32-bit argument comparison struct for argument 1
*/
#define SCMP_A1_32(x, ...) SCMP_CMP32(1, x, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 2
*/
#define SCMP_A2_64(...) SCMP_CMP64(2, __VA_ARGS__)
#define SCMP_A2 SCMP_A2_64
/**
* Specify a 32-bit argument comparison struct for argument 2
*/
#define SCMP_A2_32(x, ...) SCMP_CMP32(2, x, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 3
*/
#define SCMP_A3_64(...) SCMP_CMP64(3, __VA_ARGS__)
#define SCMP_A3 SCMP_A3_64
/**
* Specify a 32-bit argument comparison struct for argument 3
*/
#define SCMP_A3_32(x, ...) SCMP_CMP32(3, x, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 4
*/
#define SCMP_A4_64(...) SCMP_CMP64(4, __VA_ARGS__)
#define SCMP_A4 SCMP_A4_64
/**
* Specify a 32-bit argument comparison struct for argument 4
*/
#define SCMP_A4_32(x, ...) SCMP_CMP32(4, x, __VA_ARGS__)
/**
* Specify a 64-bit argument comparison struct for argument 5
*/
#define SCMP_A5_64(...) SCMP_CMP64(5, __VA_ARGS__)
#define SCMP_A5 SCMP_A5_64
/**
* Specify a 32-bit argument comparison struct for argument 5
*/
#define SCMP_A5_32(x, ...) SCMP_CMP32(5, x, __VA_ARGS__)
...
...
/* libseccomp prototypes: load a filter context, and reset a context to a new default action. */
int seccomp_load(scmp_filter_ctx ctx);
int seccomp_reset(scmp_filter_ctx ctx ,uint32_t def_action );
三
其他工具
seccomp-bpf.h
...
/*
 * Load the word at arch_nr and compare it with ARCH_NR: on a match skip
 * the following instruction, otherwise return SECCOMP_RET_KILL.
 */
#define VALIDATE_ARCHITECTURE \
    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, arch_nr), \
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, 1, 0), \
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
/* Load the word at syscall_nr into the accumulator. */
#define EXAMINE_SYSCALL \
    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr)
/* If the accumulator equals __NR_<name> return ALLOW, otherwise skip past the return. */
#define ALLOW_SYSCALL(name) \
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/* Unconditionally return SECCOMP_RET_KILL. */
#define KILL_PROCESS \
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
...
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include "seccomp-bpf.h"
/*
 * Install a seccomp filter built from the seccomp-bpf.h helper macros.
 *
 * The allow-list is rt_sigreturn, sigreturn (when __NR_sigreturn is
 * defined), exit_group, exit, read, write and open; anything else reaches
 * KILL_PROCESS.
 *
 * The prctl() calls are made outside assert() so they still run when the
 * program is built with NDEBUG; a failure is reported and the program
 * stops.
 */
void install_syscall_filter()
{
    struct sock_filter filter[] = {
        /* Validate architecture. */
        VALIDATE_ARCHITECTURE,
        /* Grab the system call number. */
        EXAMINE_SYSCALL,
        /* List allowed syscalls. We add open() to the set of
           allowed syscalls by the strict policy, but not
           close(). */
        ALLOW_SYSCALL(rt_sigreturn),
#ifdef __NR_sigreturn
        ALLOW_SYSCALL(sigreturn),
#endif
        ALLOW_SYSCALL(exit_group),
        ALLOW_SYSCALL(exit),
        ALLOW_SYSCALL(read),
        ALLOW_SYSCALL(write),
        ALLOW_SYSCALL(open),
        KILL_PROCESS,
    };
    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
        .filter = filter,
    };

    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
        perror("prctl(PR_SET_NO_NEW_PRIVS)");
        /* _exit() does not flush stdio, so push out pending output first. */
        fflush(stdout);
        _exit(1);
    }
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
        fflush(stdout);
        _exit(1);
    }
}
/*
 * Demonstrate the filter: write() to an already open descriptor and
 * open() are allowed by the filter, close() is not.
 *
 * output.txt is created if it does not exist yet, and the program stops
 * with status 1 if it cannot be opened, so the later messages are never
 * printed on top of a failed open().
 */
int main(int argc, char **argv)
{
    const char *val = "test";
    int output;
    int input;

    (void)argc;
    (void)argv;

    output = open("output.txt", O_WRONLY | O_CREAT, 0644);
    if (output < 0) {
        perror("open(output.txt)");
        return 1;
    }

    printf("Calling prctl() to set seccomp with filter...\n");
    install_syscall_filter();

    printf("Writing to an already open file...\n");
    write(output, val, strlen(val)+1);

    printf("Trying to open file for reading...\n");
    input = open("output.txt", O_RDONLY);
    if (input < 0) {
        /* printf() only: the message must not need more than write(). */
        printf("open() for reading failed\n");
    } else {
        printf("Note that open() worked. However, close() will not\n");
    }

    /* close() is not in the allow-list, so the filter stops the process here. */
    close(input);
    printf("You will not see this message--the process will be killed first\n");
    return 0;
}
$ ./seccomp_policy
Calling prctl() to set seccomp with filter...
Writing to an already open file...
Trying to open file for reading...
Note that open() worked. However, close() will not
Bad system call
seccomp-tools
Dump:从可执行文件中自动转储 seccomp BPF
Disasm:将 seccomp BPF 转换为人类可读的格式
Asm:使编写seccomp规则类似于编写代码
Emu:模拟 seccomp 规则
sudo apt install gcc ruby-dev
gem install seccomp-tools
null@ubuntu:~/seccomp$ seccomp-tools dump ./simple_syscall_seccomp
line CODE JT JF K
=================================
0000: 0x20 0x00 0x00 0x00000004 A = arch
0001: 0x15 0x00 0x05 0xc000003e if (A != ARCH_X86_64) goto 0007
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x35 0x00 0x01 0x40000000 if (A < 0x40000000) goto 0005
0004: 0x15 0x00 0x02 0xffffffff if (A != 0xffffffff) goto 0007
0005: 0x15 0x01 0x00 0x0000003b if (A == execve) goto 0007
0006: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0007: 0x06 0x00 0x00 0x00000000 return KILL
四
使用Seccomp保护Docker的安全
null@ubuntu:~$ grep CONFIG_SECCOMP= /boot/config-$(uname -r)
CONFIG_SECCOMP=y
null@ubuntu:~$ sudo docker run --rm -it ubuntu /bin/bash
root@85e01c28bd2c:/# bash
root@85e01c28bd2c:/# ps
PID TTY TIME CMD
1 pts/0 00:00:00 bash
10 pts/0 00:00:00 bash
13 pts/0 00:00:00 ps
root@85e01c28bd2c:/# grep -i seccomp /proc/1/status
Seccomp: 2
{
"defaultAction": "SCMP_ACT_ERRNO",
"architectures": [
"SCMP_ARCH_X86_64",
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
],
"syscalls": [
{
"names": [
"arch_prctl",
"sched_yield",
"futex",
"write",
"mmap",
"exit_group",
"madvise",
"rt_sigprocmask",
"getpid",
"gettid",
"tgkill",
"rt_sigaction",
"read",
"getpgrp"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {},
"excludes": {}
}
]
}
$ docker run --rm -it --security-opt seccomp=/path/to/seccomp/example.json hello-world
{
"defaultAction": "SCMP_ACT_ALLOW",
"syscalls": [
{
"name": "mkdir",
"action": "SCMP_ACT_ERRNO",
"args": []
}
]
}
null@ubuntu:~/seccomp/docker$ sudo docker run --rm -it --security-opt seccomp=seccomp_mkdir.json busybox /bin/sh
/ # ls
bin dev etc home proc root sys tmp usr var
/ # mkdir test
mkdir: can't create directory 'test': Operation not permitted
zaz
zaz seccomp docker IMAGE COMMAND
$ sudo ./zaz seccomp docker alpine "ping -c5 8.8.8.8" > seccomp_ping.json
$ cat seccomp_ping.json | jq '.'
{
"defaultAction": "SCMP_ACT_ERRNO",
"architectures": [
"SCMP_ARCH_X86_64",
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
],
"syscalls": [
{
"names": [
"arch_prctl",
"bind",
"clock_gettime",
"clone",
"close",
"connect",
"dup2",
"epoll_pwait",
"execve",
"exit",
"exit_group",
"fcntl",
"futex",
"getpid",
"getsockname",
"getuid",
"ioctl",
"mprotect",
"nanosleep",
"open",
"poll",
"read",
"recvfrom",
"rt_sigaction",
"rt_sigprocmask",
"rt_sigreturn",
"sendto",
"set_tid_address",
"setitimer",
"setsockopt",
"socket",
"write",
"writev"
],
"action": "SCMP_ACT_ALLOW"
}
]
}
FROM alpine:latest
CMD ["ping","-c5","8.8.8.8"]
$ sudo docker build -t pingtest .
$ sudo docker run --rm -it pingtest
PING 8.8.8.8 (8.8.8.8): 56 data bytes
64 bytes from 8.8.8.8: seq=0 ttl=127 time=42.139 ms
64 bytes from 8.8.8.8: seq=1 ttl=127 time=42.646 ms
64 bytes from 8.8.8.8: seq=2 ttl=127 time=42.098 ms
64 bytes from 8.8.8.8: seq=3 ttl=127 time=42.484 ms
64 bytes from 8.8.8.8: seq=4 ttl=127 time=42.007 ms
--- 8.8.8.8 ping statistics ---
5 packets transmitted, 5 packets received, 0% packet loss
round-trip min/avg/max = 42.007/42.274/42.646 ms
$ sudo docker run --rm -it --security-opt seccomp=seccomp_ping.json pingtest
docker: Error response from daemon: failed to create shim: OCI runtime create failed: container_linux.go:380: starting container process caused: close exec fds: open /proc/self/fd: operation not permitted: unknown.
$ sudo docker run --rm -it --security-opt seccomp=seccomp_ping.json pingtest
docker: Error response from daemon: failed to create shim: OCI runtime create failed: runc did not terminate successfully: exit status 2: panic: No error following JSON procError payload.
goroutine 1 [running]:
github.com/opencontainers/runc/libcontainer.parseSync(0x56551adf30b8, 0xc000010b20, 0xc0002268a0, 0xc00027f9e0, 0x0)
github.com/opencontainers/runc/libcontainer/sync.go:93 +0x307
github.com/opencontainers/runc/libcontainer.(*initProcess).start(0xc000297cb0, 0x0, 0x0)
github.com/opencontainers/runc/libcontainer/process_linux.go:440 +0x5ef
github.com/opencontainers/runc/libcontainer.(*linuxContainer).start(0xc000078700, 0xc000209680, 0x0, 0x0)
github.com/opencontainers/runc/libcontainer/container_linux.go:379 +0xf5
github.com/opencontainers/runc/libcontainer.(*linuxContainer).Start(0xc000078700, 0xc000209680, 0x0, 0x0)
github.com/opencontainers/runc/libcontainer/container_linux.go:264 +0xb4
main.(*runner).run(0xc0002274c8, 0xc0000200f0, 0x0, 0x0, 0x0)
github.com/opencontainers/runc/utils_linux.go:312 +0xd2a
main.startContainer(0xc00025c160, 0xc000076400, 0x1, 0x0, 0x0, 0xc0002275b8, 0x6)
github.com/opencontainers/runc/utils_linux.go:455 +0x455
main.glob..func2(0xc00025c160, 0xc000246000, 0xc000246120)
github.com/opencontainers/runc/create.go:65 +0xbb
github.com/urfave/cli.HandleAction(0x56551ad3b040, 0x56551ade81e8, 0xc00025c160, 0xc00025c160, 0x0)
github.com/urfave/cli@v1.22.1/app.go:523 +0x107
github.com/urfave/cli.Command.Run(0x56551aa566f5, 0x6, 0x0, 0x0, 0x0, 0x0, 0x0, 0x56551aa5f509, 0x12, 0x0, ...)
github.com/urfave/cli@v1.22.1/command.go:174 +0x579
github.com/urfave/cli.(*App).Run(0xc000254000, 0xc000132000, 0xf, 0xf, 0x0, 0x0)
github.com/urfave/cli@v1.22.1/app.go:276 +0x7e8
main.main()
github.com/opencontainers/runc/main.go:163 +0xd3f
: unknown.
Dockerd -> containerd -> containerd-shim -> runc
null@ubuntu:~$ sudo docker run --rm -it ubuntu /bin/bash
root@ef57fff95b80:/# bash
root@ef57fff95b80:/# ps
PID TTY TIME CMD
1 pts/0 00:00:00 bash
9 pts/0 00:00:00 bash
12 pts/0 00:00:00 ps
root@ubuntu:/home/null# pstree -p | grep containerd-shim
|-containerd-shim(28051)-+-bash(28075)---bash(28126)
| |-{containerd-shim}(28052)
| |-{containerd-shim}(28053)
| |-{containerd-shim}(28054)
| |-{containerd-shim}(28055)
| |-{containerd-shim}(28056)
| |-{containerd-shim}(28057)
| |-{containerd-shim}(28058)
| |-{containerd-shim}(28059)
| |-{containerd-shim}(28060)
| `-{containerd-shim}(28129)
root@ubuntu:/home/null# grep -i seccomp /proc/28051/status
Seccomp: 0
root@ubuntu:/home/null# grep -i seccomp /proc/28075/status
Seccomp: 2
root@ubuntu:/home/null# grep -i seccomp /proc/28126/status
Seccomp: 2
root@ubuntu:/home/null# grep -i seccomp /proc/28052/status
Seccomp: 0
...
...
root@ubuntu:/home/null# grep -i seccomp /proc/28129/status
Seccomp: 0
Sysdig
$ sysdig
285304 01:21:51.270700399 7 sshd (50485) > select
285306 01:21:51.270701716 7 sshd (50485) < select res=2
285307 01:21:51.270701982 7 sshd (50485) > rt_sigprocmask
285308 01:21:51.270702258 7 sshd (50485) < rt_sigprocmask
285309 01:21:51.270702473 7 sshd (50485) > rt_sigprocmask
285310 01:21:51.270702660 7 sshd (50485) < rt_sigprocmask
285312 01:21:51.270702983 7 sshd (50485) > read fd=13(<f>/dev/ptmx) size=16384
285313 01:21:51.270703971 1 sysdig (59131) > switch next=59095 pgft_maj=0 pgft_min=1759 vm_size=280112 vm_rss=18048 vm_swap=0
...
evt.num 是递增的事件编号
evt.time 是事件时间戳
evt.cpu 是捕获事件的 CPU 编号
proc.name 是生成事件的进程的名称
thread.tid 是产生事件的TID,对应单线程进程的PID
evt.dir 是事件方向,> 表示进入事件,< 表示退出事件
evt.type 是事件的名称,例如“open”或“read”
evt.args 是事件参数的列表。在系统调用的情况下,这些往往对应于系统调用参数,但情况并非总是如此:出于简单或性能原因,某些系统调用参数被排除在外。
$ sysdig -w runc.scap "container.name=ping and proc.name=runc"
$sudo docker run --rm -it --name=ping pingtest
PING 8.8.8.8 (8.8.8.8): 56 data bytes
64 bytes from 8.8.8.8: seq=0 ttl=127 time=44.032 ms
64 bytes from 8.8.8.8: seq=1 ttl=127 time=42.069 ms
64 bytes from 8.8.8.8: seq=2 ttl=127 time=42.066 ms
64 bytes from 8.8.8.8: seq=3 ttl=127 time=42.073 ms
64 bytes from 8.8.8.8: seq=4 ttl=127 time=42.112 ms
--- 8.8.8.8 ping statistics ---
5 packets transmitted, 5 packets received, 0% packet loss
round-trip min/avg/max = 42.066/42.470/44.032 ms
$ sysdig -p "%syscall.type" -r runc.scap > runc_syscall.txt
$ cat -n runc_syscall.txt
...
3437 rt_sigaction
3438 exit_group
3439 procexit
$ python analyse.py runc_syscall.txt
Filter syscall num: 72
filter syscall:['clone', 'close', 'prctl', 'getpid', 'write', 'unshare', 'read', 'exit_group', 'procexit', 'setsid', 'setuid', 'setgid', 'sched_getaffinity', 'openat', 'mmap', 'rt_sigprocmask', 'sigaltstack', 'gettid', 'rt_sigaction', 'mprotect', 'futex', 'set_robust_list', 'munmap', 'nanosleep', 'readlinkat', 'fcntl', 'epoll_create1', 'pipe', 'epoll_ctl', 'fstat', 'pread', 'getdents64', 'capget', 'epoll_pwait', 'newfstatat', 'statfs', 'getppid', 'keyctl', 'socket', 'bind', 'sendto', 'getsockname', 'recvfrom', 'mount', 'fchmodat', 'mkdirat', 'symlinkat', 'umask', 'mknodat', 'fchownat', 'unlinkat', 'chdir', 'fchdir', 'pivot_root', 'umount', 'dup', 'sethostname', 'fstatfs', 'seccomp', 'brk', 'fchown', 'setgroups', 'capset', 'execve', 'signaldeliver', 'access', 'arch_prctl', 'getuid', 'getgid', 'geteuid', 'getcwd', 'getegid']
{
"defaultAction": "SCMP_ACT_ERRNO",
"architectures": [
"SCMP_ARCH_X86_64",
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
],
"syscalls": [
{
"names": [
"clone",
"close",
"prctl",
"getpid",
"write",
"unshare",
"read",
"exit_group",
"procexit",
"setsid",
"setuid",
"setgid",
"sched_getaffinity",
"openat",
"mmap",
"rt_sigprocmask",
"sigaltstack",
"gettid",
"rt_sigaction",
"mprotect",
"futex",
"set_robust_list",
"munmap",
"nanosleep",
"readlinkat",
"fcntl",
"epoll_create1",
"pipe",
"epoll_ctl",
"fstat",
"pread",
"getdents64",
"capget",
"epoll_pwait",
"newfstatat",
"statfs",
"getppid",
"keyctl",
"socket",
"bind",
"sendto",
"getsockname",
"recvfrom",
"mount",
"fchmodat",
"mkdirat",
"symlinkat",
"umask",
"mknodat",
"fchownat",
"unlinkat",
"chdir",
"fchdir",
"pivot_root",
"umount",
"dup",
"sethostname",
"fstatfs",
"seccomp",
"brk",
"fchown",
"setgroups",
"capset",
"signaldeliver",
"access",
"getuid",
"getgid",
"geteuid",
"getcwd",
"getegid",
"arch_prctl",
"clock_gettime",
"connect",
"dup2",
"execve",
"exit",
"ioctl",
"open",
"poll",
"rt_sigreturn",
"set_tid_address",
"setitimer",
"setsockopt",
"socket",
"writev"
],
"action": "SCMP_ACT_ALLOW"
}
]
}
null@ubuntu:~/seccomp/docker/zaz/cmd$ sudo docker run -it --rm --security-opt seccomp=seccomp_ping.json pingtest
PING 8.8.8.8 (8.8.8.8): 56 data bytes
64 bytes from 8.8.8.8: seq=0 ttl=127 time=43.424 ms
64 bytes from 8.8.8.8: seq=1 ttl=127 time=42.873 ms
64 bytes from 8.8.8.8: seq=2 ttl=127 time=42.336 ms
64 bytes from 8.8.8.8: seq=3 ttl=127 time=48.164 ms
64 bytes from 8.8.8.8: seq=4 ttl=127 time=42.260 ms
--- 8.8.8.8 ping statistics ---
5 packets transmitted, 5 packets received, 0% packet loss
round-trip min/avg/max = 42.260/43.811/48.164 ms
$ sudo docker run -it --rm --security-opt seccomp=seccomp_ping.json pingtest ls
ls: .: Operation not permitted
$ sudo docker run -it --rm --security-opt seccomp=seccomp_ping.json pingtest mkdir test
mkdir: can't create directory 'test': Operation not permitted
五
总结
参考链接
https://github.com/seccomp/libseccomp/blob/3f0e47fe2717b73ccef68ca18f9f7297ee73ebb2/include/seccomp.h.in
docker seccomp
https://docs.docker.com/engine/security/seccomp/
Docker seccomp 与OCI
https://forums.mobyproject.org/t/docker-seccomp-prevents-system-calls-issued-by-oci-runtime/297/9
看雪ID:ZxyNull
https://bbs.pediy.com/user-home-921173.htm
# 往期推荐
球分享
球点赞
球在看
点击“阅读原文”,了解更多!