TCP三次握手源码分析
TCP握手分为三个阶段,在握手开始之前,通信双方的套接字状态均为“TCP_CLOSE”,以下是这三个阶段:
(1)客户端发送一个标志位中SYN位为1的报文给服务端,并设套接字状态为“TCP_SYNSENT”
(2)服务端接到SYN报文,设套接字状态为“TCP_SYNRCV”,并回送一个SYN+ACK位均为1的报文
(3)客户端接到SYN+ACK报文,回送一个ACK位为1的报文,设套接字状态为“TCP_ESTABLISHED”,服务端接到ACK报文后,同样设置为“TCP_ESTABLISHED”

第一阶段
第一阶段客户端通过调用connect函数完成,connect实际上调用了内核中的__sys_connect函数。
以下代码是有关__sys_connect函数在文件net/scoket.c中的系统调用定义,由此可以看出,__sys_connect函数就是connect在内核中的实现。
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
int, addrlen)
{
return __sys_connect(fd, uservaddr, addrlen);
}从__sys_connect函数开始进入三次握手的第一阶段,以下是部分代码:
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{ ...
sock = sockfd_lookup_light(fd, &err, &fput_needed);
...
err = move_addr_to_kernel(uservaddr, addrlen, &address);
...
err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
sock->file->f_flags);
...
}代码中的sock->ops->connect即是tcp_v4_connect函数,现在转到tcp_v4_connect函数:
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
nexthop = inet_opt->opt.faddr;
}
orig_sport = inet->inet_sport;
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet_opt || !inet_opt->opt.srr)
daddr = fl4->daddr;
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr;
sk_rcv_saddr_set(sk, inet->inet_saddr);
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
if (likely(!tp->repair))
tp->write_seq = 0;
}
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
sk_set_txhash(sk);
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
goto failure;
}
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
rt = NULL;
if (likely(!tp->repair)) {
if (!tp->write_seq)
tp->write_seq = secure_tcp_seq(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
usin->sin_port);
tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
inet->inet_saddr,
inet->inet_daddr);
}
inet->inet_id = tp->write_seq ^ jiffies;
if (tcp_fastopen_defer_connect(sk, &err))
return err;
if (err)
goto failure;
err = tcp_connect(sk);
if (err)
goto failure;
return 0;
failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}在tcp_v4_connect函数中为套接字填充一些变量,将套接字的状态修改为“TCP_SYNSENT”,然后进入tcp_connect函数。
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
tcp_connect_init(sk);
if (unlikely(tp->repair)) {
tcp_finish_connect(sk, NULL);
return 0;
}
buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
...47
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}通过调用tcp_transmit_skb函数构造SYN报文并发送出去,并设立一个定时器。
这一阶段函数的调用栈:
__sys_connect -> inet_stream_connect -> __inet_stream_connect -> tcp_v4_connect -> tcp_connect -> tcp_transmit_skb
第二阶段
这一阶段从中通过tcp_v4_rcv函数从ip层接收数据开始,以下是tcp_v4_rcv的部分代码:
int tcp_v4_rcv(struct sk_buff *skb)
{
...
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v4_do_rcv(sk, skb);
goto put_and_return;
}
...
put_and_return:
if (refcounted)
sock_put(sk);
return ret;
...
}由于当前套接字状态为“TCP_LISTEN”,进入tcp_v4_do_rcv函数执行
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
...
if (sk->sk_state == TCP_LISTEN) {
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
...
}tcp_rcv_state_process函数专门用来处理套接字状态的转换,先贴出一张状态转换图:

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
...
switch (sk->sk_state) {
case TCP_LISTEN:
if (th->ack)
return 1;
if (th->rst)
goto discard;
if (th->syn) {
if (th->fin)
goto discard;
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
rcu_read_lock();
local_bh_disable();
acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
rcu_read_unlock();
if (!acceptable)
return 1;
consume_skb(skb);
return 0;
}
goto discard;
...
}这是tcp_rcv_state_process在“TCP_LISTEN”阶段执行的代码,核心在于22行的icsk->icsk_af_ops->conn_request,在此处一路执行tcp_v4_conn_request, tcp_conn_request。
以下是tcp_conn_request的部分代码:
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
&foc, TCP_SYNACK_FASTOPEN);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
sk->sk_data_ready(sk);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE);
if (want_cookie) {
reqsk_free(req);
return 0;
}
}主要执行了send_synack函数,send_synack函数用于将SYN+ACK报文发送出去。
这一阶段函数的调用栈:
tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_state_process -> tcp_v4_conn_request -> tcp_conn_request -> tcp_v4_send_synack
第三阶段
同上一阶段一样,从ip接收到报文后一路执行tcp_v4_rcv, tcp_v4_do_rcv,进入tcp_rcv_state_process函数:
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
...
switch (sk->sk_state) {
case TCP_SYN_SENT:
//进入到synack报文的处理流程
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
}
...
}在tcp_rcv_synsent_state_process函数中又调用了tcp_finish_connect函数,tcp_finish_connect函数做了三件事:
(1)将套接字状态设置为"TCP_ESTABLISHED"
(2)调用tcp_send_ack函数发送一个ACK包
(3)初始化一些参数
tcp_send_ack函数又调用tcp_transmit_skb将ACK报文从网络上发出去。
最后是服务端接收到ACK报文,依次执行tcp_v4_rcv,tcp_v4_do_rcv,tcp_rcv_state_process函数,将套接字的状态设置为"TCP_ESTABLISHED",至此,三次握手过程结束。
这一阶段函数的调用栈:
tcp_v4_rcv -> tcp_v4_do_rcv -> tcp_rcv_synsent_state_process -> tcp_send_ack -> tcp_transmit_skb
tcp_v4_rcv -> tcp_rcv_state_process