While on call during a major sales promotion, fully confident in the stability of our system, I suddenly received a report from an upstream team that a few requests out of every ten thousand to our endpoint were failing with Connection timeout. At that moment our apiserver cluster was running at roughly 40% of capacity, nowhere near its limit, and after an emergency scale-out of the cluster the error rate dropped to zero. A detailed post-mortem traced the root cause to the connection queues on our servers being full. Before this incident I knew very little about the TCP SYN queue and accept queue, only vaguely recalling that "TCP/IP Illustrated" mentions the two terms.
The material available online is scattered, and some of it is outdated or simply wrong, so I stepped into quite a few pitfalls while investigating. Having learned my lesson, I went through a large amount of documentation and ran many experiments to verify the details, and distilled the result into this deep dive on the TCP SYN queue and accept queue. Read it carefully and you should come away with a much clearer understanding of both queues.
Combining theory, kernel source code, and hands-on experiments, this article covers:
An introduction to the SYN queue and the accept queue
Commonly used commands
The accept queue in practice: maximum length control, overflow experiments, analysis of the results...
The SYN queue in practice: maximum length control, overflow experiments, analysis of the results...
...
The SYN queue and the accept queue
During the TCP three-way handshake, the Linux kernel maintains two queues:
the SYN queue (half-open connections)
the accept queue (fully established connections)
A normal TCP three-way handshake proceeds as follows:
1. The client sends a SYN to the server to start the handshake and enters the SYN_SENT state.
2. On receiving the SYN, the server enters the SYN_RECV state; the kernel stores the embryonic connection in the SYN queue and replies with SYN+ACK.
3. On receiving the SYN+ACK, the client replies with an ACK and enters the ESTABLISHED state.
4. On receiving the ACK, the kernel moves the connection from the SYN queue to the accept queue, and the server enters the ESTABLISHED state.
5. When the server application calls accept(), the connection is removed from the accept queue.
Both the SYN queue and the accept queue have a bounded length; when a queue is full the kernel either drops the incoming packet or replies with an RST.
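For reference, here is a minimal sketch (not code from the original incident) of a TCP server whose Accept loop is exactly what drains the accept queue in step 5; the experiment programs later in this article deliberately never call Accept, so their accept queue only fills up.
// Minimal TCP server: each Accept call removes one established
// connection from the kernel's accept queue.
package main

import (
	"log"
	"net"
)

func main() {
	l, err := net.Listen("tcp", ":8888")
	if err != nil {
		log.Fatalf("failed to listen: %v", err)
	}
	defer l.Close()
	for {
		conn, err := l.Accept() // take one connection off the accept queue
		if err != nil {
			log.Printf("accept error: %v", err)
			continue
		}
		go func(c net.Conn) {
			defer c.Close()
			buf := make([]byte, 1024)
			// Read whatever the client sends, then close the connection.
			if n, err := c.Read(buf); err == nil {
				log.Printf("received %d bytes: %q", n, buf[:n])
			}
		}(conn)
	}
}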
Checking the relevant metrics
The ss command
The ss command shows the state of the accept queue.
# -n do not resolve service names
# -t show TCP sockets only
# -l show listening (LISTEN) sockets
$ ss -lnt
State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 0 128 [::]:2380 [::]:*
LISTEN 0 128 [::]:80 [::]:*
LISTEN 0 128 [::]:8080 [::]:*
LISTEN 0 128 [::]:8090 [::]:*
$ ss -nt
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 [::ffff:33.9.95.134]:80 [::ffff:33.51.103.59]:47452
ESTAB 0 536 [::ffff:33.9.95.134]:80 [::ffff:33.43.108.144]:37656
ESTAB 0 0 [::ffff:33.9.95.134]:80 [::ffff:33.51.103.59]:38130
ESTAB 0 536 [::ffff:33.9.95.134]:80 [::ffff:33.51.103.59]:38280
ESTAB 0 0 [::ffff:33.9.95.134]:80 [::ffff:33.51.103.59]:38204
How Recv-Q and Send-Q are interpreted depends on the socket state. For a socket in the LISTEN state, Recv-Q is the current length of the accept queue and Send-Q is its maximum length; for other TCP sockets, Recv-Q is the number of bytes received but not yet read by the application and Send-Q is the number of bytes sent but not yet acknowledged. This can be seen in the kernel's tcp_diag_get_info:
// https://github.com/torvalds/linux/blob/master/net/ipv4/tcp_diag.c
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
			      void *_info)
{
	struct tcp_info *info = _info;

	if (inet_sk_state_load(sk) == TCP_LISTEN) { // socket is in the LISTEN state
		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);     // current accept queue length
		r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); // maximum accept queue length
	} else if (sk->sk_type == SOCK_STREAM) { // any other TCP socket
		const struct tcp_sock *tp = tcp_sk(sk);

		r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
					     READ_ONCE(tp->copied_seq), 0); // bytes received but not yet read by the application
		r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;     // bytes sent but not yet acknowledged
	}
	if (info)
		tcp_get_info(sk, info);
}
The netstat command
netstat -s prints cumulative counters (since boot): the first line below counts accept queue overflows, the second counts SYN packets dropped on listening sockets.
$ netstat -s | grep -i "listen"
189088 times the listen queue of a socket overflowed
30140232 SYNs to LISTEN sockets dropped
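These two counters correspond to the ListenOverflows and ListenDrops fields of /proc/net/netstat. As a rough sketch (the parsing code is mine, not from the original article), the Go program below reads them directly, which is convenient when you want to poll the counters and watch how fast they grow:
// Read the TcpExt counters from /proc/net/netstat and print the two
// listen-queue related ones.
package main

import (
	"fmt"
	"log"
	"os"
	"strings"
)

// readTcpExt returns the TcpExt counters as a name -> value map.
func readTcpExt() (map[string]string, error) {
	data, err := os.ReadFile("/proc/net/netstat")
	if err != nil {
		return nil, err
	}
	lines := strings.Split(string(data), "\n")
	counters := make(map[string]string)
	for i := 0; i+1 < len(lines); i++ {
		// The TcpExt stanza is a header line followed by a value line,
		// both prefixed with "TcpExt:".
		if !strings.HasPrefix(lines[i], "TcpExt:") || !strings.HasPrefix(lines[i+1], "TcpExt:") {
			continue
		}
		names := strings.Fields(lines[i])[1:]
		values := strings.Fields(lines[i+1])[1:]
		for j := range names {
			if j < len(values) {
				counters[names[j]] = values[j]
			}
		}
		break
	}
	return counters, nil
}

func main() {
	c, err := readTcpExt()
	if err != nil {
		log.Fatalf("read /proc/net/netstat: %v", err)
	}
	// ListenOverflows: times the listen (accept) queue of a socket overflowed.
	// ListenDrops: SYNs to LISTEN sockets dropped.
	fmt.Printf("ListenOverflows=%s ListenDrops=%s\n", c["ListenOverflows"], c["ListenDrops"])
}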
The accept queue in practice
Maximum length control
The maximum length of the accept queue is fixed when the application calls listen(): the kernel clamps the backlog argument to net.core.somaxconn, so the effective limit is min(somaxconn, backlog), as __sys_listen shows:
// https://github.com/torvalds/linux/blob/master/net/socket.c
/*
* Perform a listen. Basically, we allow the protocol to do anything
* necessary for a listen, and if that works, we mark the socket as
* ready for listening.
*/
int __sys_listen(int fd, int backlog)
{
	struct socket *sock;
	int err, fput_needed;
	int somaxconn;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; // /proc/sys/net/core/somaxconn
		if ((unsigned int)backlog > somaxconn)
			backlog = somaxconn; // the accept queue maximum length is min(somaxconn, backlog)

		err = security_socket_listen(sock, backlog);
		if (!err)
			err = sock->ops->listen(sock, backlog);

		fput_light(sock->file, fput_needed);
	}
	return err;
}
To see this in action, run the following Go program, which listens on TCP port 8888 (Go's net.Listen derives the listen backlog from net.core.somaxconn, so the accept queue maximum ends up equal to somaxconn):
package main

import (
	"log"
	"net"
	"time"
)

func main() {
	l, err := net.Listen("tcp", ":8888")
	if err != nil {
		log.Printf("failed to listen due to %v", err)
		return
	}
	defer l.Close()
	log.Println("listen :8888 success")
	for {
		time.Sleep(time.Second * 100)
	}
}
With net.core.somaxconn at 128, ss reports a maximum accept queue length (Send-Q) of 128 for the listening socket:
$ cat /proc/sys/net/core/somaxconn
128
$ ss -lnt | grep 8888
LISTEN 0 128 [::]:8888 [::]:*
After raising net.core.somaxconn to 1024 in /etc/sysctl.conf, applying it with sysctl -p, and restarting the listener (the maximum is fixed at listen() time), the reported maximum becomes 1024:
$ sudo sysctl -p
net.core.somaxconn = 1024
$ cat /proc/sys/net/core/somaxconn
1024
$ ss -lnt | grep 8888
LISTEN 0 1024 [::]:8888 [::]:*
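Go's standard library does not expose the backlog argument, so to illustrate the min(somaxconn, backlog) rule with an explicit backlog, here is a hypothetical sketch that builds the listener with raw syscalls (the port 9999 and the backlog of 7 are arbitrary values chosen for the illustration):
// Listen with an explicit backlog via the syscall package (Linux).
package main

import (
	"log"
	"syscall"
	"time"
)

func main() {
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_STREAM, 0)
	if err != nil {
		log.Fatalf("socket: %v", err)
	}
	defer syscall.Close(fd)

	// Bind to 0.0.0.0:9999 (arbitrary port for this sketch).
	if err := syscall.Bind(fd, &syscall.SockaddrInet4{Port: 9999}); err != nil {
		log.Fatalf("bind: %v", err)
	}

	// Ask for a backlog of 7; the kernel clamps it to net.core.somaxconn,
	// so `ss -lnt | grep 9999` should show Send-Q = min(somaxconn, 7).
	if err := syscall.Listen(fd, 7); err != nil {
		log.Fatalf("listen: %v", err)
	}
	log.Println("listening on :9999, check with: ss -lnt | grep 9999")
	for {
		time.Sleep(time.Hour) // keep the listener alive
	}
}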
Accept queue overflow experiment
The server listens on TCP port 8888 but never calls Accept, so the accept queue is never drained:
// Server: listen on TCP port 8888 and never accept
package main
import (
"log"
"net"
"time"
)
func main() {
l, err := net.Listen("tcp", ":8888")
	if err != nil {
		log.Printf("failed to listen due to %v", err)
		return
	}
defer l.Close()
log.Println("listen :8888 success")
for {
time.Sleep(time.Second * 100)
}
}
// Client: open 10 concurrent connections to the server and, once each TCP connection is established, send some data on it
package main
import (
"context"
"log"
"net"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
var wg sync.WaitGroup
func establishConn(ctx context.Context, i int) {
defer wg.Done()
	conn, err := net.DialTimeout("tcp", "33.9.192.157:8888", time.Second*5) // server address from the experiment output below
if err != nil {
log.Printf("%d, dial error: %v", i, err)
return
}
log.Printf("%d, dial success", i)
_, err = conn.Write([]byte("hello world"))
if err != nil {
log.Printf("%d, send error: %v", i, err)
return
}
	// Hold the connection open until the process is interrupted.
	<-ctx.Done()
	conn.Close()
	log.Printf("%d, dial close", i)
}
func main() {
ctx, cancel := context.WithCancel(context.Background())
for i := 0; i < 10; i++ {
wg.Add(1)
go establishConn(ctx, i)
}
go func() {
sc := make(chan os.Signal, 1)
signal.Notify(sc, syscall.SIGINT)
		<-sc
		cancel()
}()
wg.Wait()
log.Printf("client exit")
}
Lower net.core.somaxconn to 5, restart the server, and run the client. Nine of the ten dials report success on the client side, while one times out:
$ sudo sysctl -p
net.core.somaxconn = 5
2021/10/11 17:24:48 8, dial success
2021/10/11 17:24:48 3, dial success
2021/10/11 17:24:48 4, dial success
2021/10/11 17:24:48 6, dial success
2021/10/11 17:24:48 5, dial success
2021/10/11 17:24:48 2, dial success
2021/10/11 17:24:48 1, dial success
2021/10/11 17:24:48 0, dial success
2021/10/11 17:24:48 7, dial success
2021/10/11 17:24:53 9, dial error: dial tcp 33.9.192.157:8888: i/o timeout
netstat on the client (33.9.192.155) and on the server (33.9.192.157) shows only six connections in the ESTABLISHED state, each with the 11-byte "hello world" payload still unread in the server-side Recv-Q:
tcp 0 0 33.9.192.155:40372 33.9.192.157:8888 ESTABLISHED
tcp 0 0 33.9.192.155:40376 33.9.192.157:8888 ESTABLISHED
tcp 0 0 33.9.192.155:40370 33.9.192.157:8888 ESTABLISHED
tcp 0 0 33.9.192.155:40366 33.9.192.157:8888 ESTABLISHED
tcp 0 0 33.9.192.155:40374 33.9.192.157:8888 ESTABLISHED
tcp 0 0 33.9.192.155:40368 33.9.192.157:8888 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40376 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40370 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40368 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40372 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40374 ESTABLISHED
tcp6 11 0 33.9.192.157:8888 33.9.192.155:40366 ESTABLISHED
Meanwhile the listening socket on the server reports a current accept queue length (Recv-Q) of 6 even though its maximum (Send-Q) is 5:
tcp LISTEN 6 5 [::]:8888 [::]:* users:(("main",pid=84244,fd=3))
State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 6 5 [::]:8888 [::]:*
The queue can hold one more connection than Send-Q suggests because sk_acceptq_is_full compares with > rather than >=: the accept queue only counts as full once its length exceeds sk_max_ack_backlog. With somaxconn = 5 it can therefore hold 6 connections, which matches the ss output above.
/* Note: If you think the test should be:
 *	return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
	return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}
The SYN queue in practice
Maximum length control
The maximum SYN queue length is derived from the listen() backlog, net.core.somaxconn, and net.ipv4.tcp_max_syn_backlog. The calculation can be summarized as:
backlog = min(somaxconn, backlog)
nr_table_entries = backlog
nr_table_entries = min(backlog, sysctl_max_syn_backlog)
nr_table_entries = max(nr_table_entries, 8)
// roundup_pow_of_two: round the argument up to the nearest power of two; note the +1 here
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1)
max_qlen_log = max(3, log2(nr_table_entries))
max_queue_length = 2^max_qlen_log
For example, with somaxconn = 128, backlog = 128, and tcp_max_syn_backlog = 1024, the maximum SYN queue length works out to 256:
backlog = min(somaxconn, backlog) = min(128, 128) = 128
nr_table_entries = backlog = 128
nr_table_entries = min(backlog, sysctl_max_syn_backlog) = min(128, 1024) = 128
nr_table_entries = max(nr_table_entries, 8) = max(128, 8) = 128
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1) = 256
max_qlen_log = max(3, log2(nr_table_entries)) = max(3, 8) = 8
max_queue_length = 2^max_qlen_log = 2^8 = 256
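A small Go sketch (the helper names are my own) that reproduces the calculation steps listed above:
// synQueueMax mirrors the steps used above to size the SYN queue.
package main

import (
	"fmt"
	"math/bits"
)

// roundupPow2 rounds v up to the nearest power of two, like the kernel's roundup_pow_of_two.
func roundupPow2(v uint32) uint32 {
	if v <= 1 {
		return 1
	}
	return 1 << bits.Len32(v-1)
}

func synQueueMax(backlog, somaxconn, maxSynBacklog int) int {
	if backlog > somaxconn {
		backlog = somaxconn // backlog = min(somaxconn, backlog)
	}
	n := backlog
	if n > maxSynBacklog {
		n = maxSynBacklog // min(backlog, tcp_max_syn_backlog)
	}
	if n < 8 {
		n = 8 // at least 8 entries
	}
	n = int(roundupPow2(uint32(n + 1)))     // note the +1 before rounding up
	maxQlenLog := bits.Len32(uint32(n)) - 1 // log2(n); n is a power of two here
	if maxQlenLog < 3 {
		maxQlenLog = 3
	}
	return 1 << maxQlenLog
}

func main() {
	// The worked example from the text: somaxconn = 128, backlog = 128, tcp_max_syn_backlog = 1024.
	fmt.Println(synQueueMax(128, 128, 1024)) // prints 256
}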
So how does the server decide that the SYN queue is full? Besides the maximum length derived above, the decision also depends on /proc/sys/net/ipv4/tcp_syncookies (tcp_syncookies exists to mitigate SYN flood attacks; related links are given later in the article).
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
	// the SYN queue counts as full once its length reaches the accept queue maximum (sk_max_ack_backlog)
	return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
}
SYN queue overflow experiments
Experiment 1: tcp_syncookies disabled, tcp_max_syn_backlog (128) smaller than somaxconn (1024). Flood port 8888 with SYNs using hping3 and count the sockets left in SYN_RECV:
$ sudo sysctl -p
net.core.somaxconn = 1024
net.ipv4.tcp_max_syn_backlog = 128
net.ipv4.tcp_syncookies = 0
$ sudo hping3 -S 33.9.192.157 -p 8888 --flood
HPING 33.9.192.157 (eth0 33.9.192.157): S set, 40 headers + 0 data bytes
hping in flood mode, no replies will be shown
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
96
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
96
Experiment 2: tcp_syncookies disabled, somaxconn (128) smaller than tcp_max_syn_backlog (512):
$ sudo sysctl -p
net.core.somaxconn = 128
net.ipv4.tcp_max_syn_backlog = 512
net.ipv4.tcp_syncookies = 0
$ sudo hping3 -S 33.9.192.157 -p 8888 --flood
HPING 33.9.192.157 (eth0 33.9.192.157): S set, 40 headers + 0 data bytes
hping in flood mode, no replies will be shown
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
128
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
128
Experiment 3: the same limits as experiment 2, but with tcp_syncookies enabled:
$ sudo sysctl -p
net.core.somaxconn = 128
net.ipv4.tcp_max_syn_backlog = 512
net.ipv4.tcp_syncookies = 1
$ sudo hping3 -S 33.9.192.157 -p 8888 --flood
HPING 33.9.192.157 (eth0 33.9.192.157): S set, 40 headers + 0 data bytes
hping in flood mode, no replies will be shown
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
128
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
128
Experiment 4: tcp_syncookies enabled, somaxconn lowered to 5:
$ sudo sysctl -p
net.core.somaxconn = 5
net.ipv4.tcp_max_syn_backlog = 512
net.ipv4.tcp_syncookies = 1
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
5
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
5
Experiment 5: tcp_syncookies enabled, somaxconn (256) larger than tcp_max_syn_backlog (128):
$ sudo sysctl -p
net.core.somaxconn = 256
net.ipv4.tcp_max_syn_backlog = 128
net.ipv4.tcp_syncookies = 1
$ sudo hping3 -S 33.9.192.157 -p 8888 --flood
HPING 33.9.192.157 (eth0 33.9.192.157): S set, 40 headers + 0 data bytes
hping in flood mode, no replies will be shown
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
256
[zechen.hg@function-compute033009192157.na63 /home/zechen.hg]
$ sudo netstat -nat | grep :8888 | grep SYN_RECV | wc -l
256