数据包在L4被挂入接收队列过程-liubangbo-ChinaUnix博客

在前一篇博文中，分析了数据包在IP层接收过程，如果是发给本机，最终数据包会送给L4来处理。下面以UDP协议为例来分析L4的处理过程：
1. udp_rcv是封装函数，直接调用__udp4_lib_rcv函数来处理，那么我们来看看这个函数：

点击(此处)折叠或打开

/*
 * __udp4_lib_rcv - IPv4 receive entry point shared by UDP and UDP-Lite.
 * @skb:        the received packet
 * @udptable:   hash table of sockets that is searched for the receiver
 * @is_udplite: non-zero when the packet is UDP-Lite rather than UDP
 *
 * All we need to do is get the socket, and then do a checksum.  When a
 * socket is found the skb is handed to udp_queue_rcv_skb(), which puts it
 * on that socket's receive queue.
 *
 * Returns whatever __udp4_lib_mcast_deliver() returns for broadcast or
 * multicast routes, -ret when udp_queue_rcv_skb() returns ret > 0, and 0
 * otherwise.  Every drop path below frees the skb before returning.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
		   int is_udplite)
{
	struct sock *sk;	/* receiving socket, if one matches */
	struct udphdr *uh;	/* UDP header: source/destination port, length */
	unsigned short ulen;
	struct rtable *rt = (struct rtable*)skb->dst;
	/* Source and destination IP addresses, copied out of the IP header. */
	__be32 saddr = skb->nh.iph->saddr;
	__be32 daddr = skb->nh.iph->daddr;

	/*
	 *	Validate the packet.
	 */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	/*
	 * Load the header pointer only after pskb_may_pull(): the pull may
	 * relocate the header data, so a pointer taken before it can be
	 * stale.  The same reload is done after pskb_trim_rcsum() below.
	 */
	uh = skb->h.uh;
	ulen = ntohs(uh->len);
	if (ulen > skb->len)
		goto short_packet;

	if (!is_udplite) {	/* UDP validates ulen. */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		uh = skb->h.uh;
		udp4_csum_init(skb, uh);
	} else {		/* UDP-Lite validates cscov. */
		if (udplite4_csum_init(skb, uh))
			goto csum_error;
	}

	/* Broadcast and multicast routes take their own delivery path. */
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);

	/*
	 * One of the main jobs of this function: find the socket that
	 * should receive the packet, keyed primarily on the destination port.
	 */
	sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
			       skb->dev->ifindex, udptable);

	if (sk != NULL) {
		/* Socket found: queue the skb on its receive queue. */
		int ret = udp_queue_rcv_skb(sk, skb);
		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		/* The packet's trip up from the NIC into the stack ends here. */
		return 0;
	}

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	/*
	 * No socket matched (this is not an enqueue failure): count it and
	 * send an ICMP port-unreachable back to the sender.
	 */
	UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
		       is_udplite? "-Lite" : "",
		       NIPQUAD(saddr),
		       ntohs(uh->source),
		       ulen,
		       skb->len,
		       NIPQUAD(daddr),
		       ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
		       is_udplite? "-Lite" : "",
		       NIPQUAD(saddr),
		       ntohs(uh->source),
		       NIPQUAD(daddr),
		       ntohs(uh->dest),
		       ulen);
drop:
	UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return 0;
}

   一百多行啊，呵呵！
    函数开头的注释正确地归纳了这个函数的处理过程：取得相应的socket，然后做校验和(checksum)检查。其实还应该加一句：把skb挂入socket的接收队列中。
2. 从上面的代码可以看出，此过程比较简单。分析一下根据端口号找socket的过程和将skb挂入socket接收队列的过程：

点击(此处)折叠或打开

/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 *
 * Pick the socket that best matches an incoming packet.
 *
 * Walks the hash chain selected by the low bits of the destination port.
 * A candidate must be hashed on exactly that port and must not be an
 * IPv6-only socket.  It scores 1 for being PF_INET plus 2 for each of its
 * four optional constraints (bound local address, connected remote address,
 * connected remote port, bound device) that is set and agrees with the
 * packet; a constraint that is set but disagrees rules the socket out.
 * A score of 9 (1 + 4 * 2) cannot be beaten, so the walk stops there.
 * Otherwise the first socket reaching the highest score is kept, because
 * later sockets must score strictly higher to replace it.
 *
 * Returns the chosen socket after sock_hold() has been called on it, or
 * NULL if nothing matched.  The whole walk runs under the read side of
 * udp_hash_lock.
 */
static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
				      __be32 daddr, __be16 dport,
				      int dif, struct hlist_head udptable[])
{
	struct sock *sk;
	struct sock *best = NULL;
	struct hlist_node *node;
	unsigned short hnum = ntohs(dport);
	int best_score = -1;

	read_lock(&udp_hash_lock);
	sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
		struct inet_sock *inet = inet_sk(sk);
		int score;

		/* Only sockets hashed on this very port are candidates. */
		if (sk->sk_hash != hnum || ipv6_only_sock(sk))
			continue;

		score = (sk->sk_family == PF_INET) ? 1 : 0;

		/* Bound local address must equal the packet's destination. */
		if (inet->rcv_saddr) {
			if (inet->rcv_saddr != daddr)
				continue;
			score += 2;
		}

		/* Socket's remote address must equal the packet's source. */
		if (inet->daddr) {
			if (inet->daddr != saddr)
				continue;
			score += 2;
		}

		/* Socket's remote port must equal the packet's source port. */
		if (inet->dport) {
			if (inet->dport != sport)
				continue;
			score += 2;
		}

		/* Bound device must be the one the packet arrived on. */
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				continue;
			score += 2;
		}

		if (score == 9) {
			/* Perfect match, no need to look further. */
			best = sk;
			break;
		}
		if (score > best_score) {
			best = sk;
			best_score = score;
		}
	}

	if (best)
		sock_hold(best);
	read_unlock(&udp_hash_lock);

	return best;
}

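从上面的代码中我们看到：先通过数据包的目的端口号，在udptable这个哈希表中找到bind了相同端口号的socket，然后从中找出一个最佳的socket(用score来计分，呵呵)。由于比较条件是严格的score > badness，分值相同时保留的是哈希链上最先遍历到的socket；而新bind的socket是插在链表头部的，所以分值一样时取到的就是最后bind的socket。现在我们知道了，可以建立多个socket来bind相同的端口号(但是得用setsockopt设置socket属性为SO_REUSEADDR，否则会bind失败)；如果这些socket属性一样(score分值一样)，那么对单播数据包来说只有最后bind的socket有效，也就是说接收到的数据包会传给这个socket，其他socket接收不到skb。广播和组播数据包走的是前面的__udp4_lib_mcast_deliver分支，不在此列。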

点击(此处)折叠或打开

/*
 * udp_queue_rcv_skb - run the per-socket receive checks and queue the skb.
 * @sk:  socket chosen to receive the packet
 * @skb: the packet
 *
 * Returns 0 when the packet was queued or eaten as an encapsulation
 * keepalive, the negated result of xfrm4_rcv_encap() when the packet was
 * handed to the IPsec input path, and -1 when the packet was dropped (the
 * skb is freed here in that case).
 */
int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
	struct udp_sock *up = udp_sk(sk);
	int rc;

	/*
	 *	Charge it to the socket, dropping if the queue is full.
	 */
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	if (up->encap_type) {
		/*
		 * This is an encapsulation socket, so let's see if this is
		 * an encapsulated packet.
		 * If it's a keepalive packet, then just eat it.
		 * If it's an encapsulated packet, then pass it to the
		 * IPsec xfrm input and return the response
		 * appropriately.  Otherwise, just fall through and
		 * pass this up the UDP socket.
		 */
		int encap = udp_encap_rcv(sk, skb);

		if (encap == 0) {
			/* Eat the packet .. */
			kfree_skb(skb);
			return 0;
		}
		if (encap < 0) {
			/* process the ESP packet */
			encap = xfrm4_rcv_encap(skb, up->encap_type);
			UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
			return -encap;
		}
		/* FALLTHROUGH -- it's a UDP Packet */
	}

	/*
	 *	UDP-Lite specific tests, ignored on UDP sockets
	 */
	if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
		/*
		 * MIB statistics other than incrementing the error count are
		 * disabled for the following two types of errors: these depend
		 * on the application settings, not on the functioning of the
		 * protocol stack as such.
		 *
		 * RFC 3828 here recommends (sec 3.3): "There should also be a
		 * way ... to ... at least let the receiving application block
		 * delivery of packets with coverage values less than a value
		 * provided by the application."
		 */
		if (up->pcrlen == 0) {		/* full coverage was set */
			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
				       "%d while full coverage %d requested\n",
				       UDP_SKB_CB(skb)->cscov, skb->len);
			goto drop;
		}

		/* The next case involves violating the min. coverage requested
		 * by the receiver. This is subtle: if receiver wants x and x is
		 * greater than the buffersize/MTU then receiver will complain
		 * that it wants x while sender emits packets of smaller size y.
		 * Therefore the above ...()->partial_cov statement is essential.
		 */
		if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
			LIMIT_NETDEBUG(KERN_WARNING
				       "UDPLITE: coverage %d too small, need min %d\n",
				       UDP_SKB_CB(skb)->cscov, up->pcrlen);
			goto drop;
		}
	}

	/*
	 * A socket filter is attached and the checksum has not been marked
	 * as verified yet: complete it now, dropping on failure.
	 */
	if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
		if (__udp_lib_checksum_complete(skb))
			goto drop;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	/* This is where the skb actually goes onto the receive queue. */
	rc = sock_queue_rcv_skb(sk, skb);
	if (rc < 0) {
		/* Note that an ENOMEM error is charged twice */
		if (rc == -ENOMEM)
			UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
		goto drop;
	}

	UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
	return 0;

drop:
	UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
	kfree_skb(skb);
	return -1;
}

点击(此处)折叠或打开

/*
 * sock_queue_rcv_skb - append an skb to a socket's receive queue.
 * @sk:  receiving socket
 * @skb: buffer to queue
 *
 * Returns 0 on success, -ENOMEM when the receive-memory accounting says
 * the buffer does not fit, or the non-zero result of sk_filter().  On
 * failure the skb is neither queued nor freed here.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int queued_len;

	/*
	 * sk_rmem_alloc is the running total of truesize for skbs charged to
	 * this socket (skb_set_owner_r() adds to it, sock_rfree() subtracts).
	 * Refuse the skb if adding it would reach sk_rcvbuf, the receive
	 * buffer limit.  The cast to unsigned is pointless, but reduces the
	 * number of warnings when compiling with -W.
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	skb->dev = NULL;
	/* Charge the skb to the socket's sk_rmem_alloc. */
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	queued_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, queued_len);

	return 0;
}

点击(此处)折叠或打开

/*
 * Make @sk the owner of @skb on the receive side: record the socket in the
 * skb, install sock_rfree() as the skb's destructor, and atomically add the
 * skb's truesize to the socket's sk_rmem_alloc counter.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	skb->sk         = sk;
	skb->destructor = sock_rfree;

	/* Atomic add to the socket's receive-memory counter. */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
}
/*
 * Destructor installed by skb_set_owner_r(): atomically subtracts the skb's
 * truesize from the sk_rmem_alloc counter of the socket recorded in the skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
}