数据包在L4被挂入接收队列过程

2090阅读 0评论2013-12-07 liubangbo
分类:LINUX

在前一篇博文中,分析了数据包在IP层接收过程,如果是发给本机,最终数据包会送给L4来处理。下面以UDP协议为例来分析L4的处理过程:
1. udp_rcv是封装函数,直接调用__udp4_lib_rcv函数来处理,那么我们来看看这个函数:

点击(此处)折叠或打开

  1. /*
  2.  *    All we need to do is get the socket, and then do a checksum.
  3.  */

  4. int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
  5.          int is_udplite)  // validates the UDP/UDP-Lite header, then hands skb to a matching socket or frees it
  6. {
  7.     struct sock *sk;  // author's note: struct sock is the key structure here; it is associated with struct socket, so either can be reached from the other
  8.     struct udphdr *uh = skb->h.uh; // UDP header of the skb; the source and destination ports are read through it
  9.     unsigned short ulen;
  10.     struct rtable *rt = (struct rtable*)skb->dst;
  11.     __be32 saddr = skb->nh.iph->saddr;  // source and destination IP addresses, taken from the skb's IP header
  12.     __be32 daddr = skb->nh.iph->daddr;

  13.     /*
  14.      * Validate the packet.
  15.      */
  16.     if (!pskb_may_pull(skb, sizeof(struct udphdr)))  // NOTE(review): uh was loaded before this call; if pskb_may_pull can relocate the header data, uh may need reloading afterwards - confirm
  17.         goto drop;        /* No space for header. */

  18.     ulen = ntohs(uh->len);
  19.     if (ulen > skb->len)
  20.         goto short_packet;

  21.     if(! is_udplite ) {        /* UDP validates ulen. */

  22.         if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
  23.             goto short_packet;
  24.         uh = skb->h.uh;  // header pointer is re-read after the trim

  25.         udp4_csum_init(skb, uh);

  26.     } else     {            /* UDP-Lite validates cscov. */
  27.         if (udplite4_csum_init(skb, uh))
  28.             goto csum_error;
  29.     }

  30.     if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))  // route is flagged broadcast or multicast: take the separate multicast delivery path
  31.         return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);

  32.     sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
  33.              skb->dev->ifindex, udptable );        // one of the main jobs of this function: look up the socket for this packet, keyed by destination port

  34.     if (sk != NULL) {
  35.         int ret = udp_queue_rcv_skb(sk, skb);    // a socket was found: queue the skb on that socket's receive queue
  36.         sock_put(sk);

  37.         /* a return value > 0 means to resubmit the input, but
  38.          * it wants the return to be -protocol, or 0
  39.          */
  40.         if (ret > 0)       // author's note: at this point the packet's trip from the NIC up into the protocol stack is finished
  41.             return -ret;
  42.         return 0;
  43.     }

  44.     if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
  45.         goto drop;
  46.     nf_reset(skb);

  47.     /* No socket. Drop packet silently, if checksum is wrong */
  48.     if (udp_lib_checksum_complete(skb))
  49.         goto csum_error;

  50.     UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
  51.     icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);   // only reached when no socket matched (sk == NULL) and the checksum is good: report port-unreachable via ICMP

  52.     /*
  53.      * Hmm. We got an UDP packet to a port to which we
  54.      * don't wanna listen. Ignore it.
  55.      */
  56.     kfree_skb(skb);  // free the skb; processing ends here
  57.     return(0);

  58. short_packet:
  59.     LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
  60.          is_udplite? "-Lite" : "",
  61.          NIPQUAD(saddr),
  62.          ntohs(uh->source),
  63.          ulen,
  64.          skb->len,
  65.          NIPQUAD(daddr),
  66.          ntohs(uh->dest));
  67.     goto drop;

  68. csum_error:
  69.     /*
  70.      * RFC1122: OK. Discards the bad packet silently (as far as
  71.      * the network is concerned, anyway) as per 4.1.3.4 (MUST).
  72.      */
  73.     LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
  74.          is_udplite? "-Lite" : "",
  75.          NIPQUAD(saddr),
  76.          ntohs(uh->source),
  77.          NIPQUAD(daddr),
  78.          ntohs(uh->dest),
  79.          ulen);
  80. drop:
  81.     UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
  82.     kfree_skb(skb);
  83.     return(0);
  84. }
   一百多行啊,呵呵!
    函数的注释,正确地归纳了这个函数的处理过程:取得相应的socket, 做一些检查。其实还应该加一句:找到socket就把skb挂入该socket的接收队列中;找不到socket且校验和正确时,则回送ICMP端口不可达报文并释放skb。
2. 从上面的代码可以看出,此过程比较简单。分析一下根据端口号找socket的过程和将skb挂入socket接收队列的过程:
  

点击(此处)折叠或打开

  1. /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  2.  * harder than this. -DaveM
  3.  */
  4. static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
  5.                  __be32 daddr, __be16 dport,
  6.                  int dif, struct hlist_head udptable[])  // returns the best-scoring socket in dport's hash chain (held via sock_hold), or NULL
  7. {
  8.     struct sock *sk, *result = NULL;
  9.     struct hlist_node *node;
  10.     unsigned short hnum = ntohs(dport);  // destination port in host byte order; used as the hash key
  11.     int badness = -1;  // best score seen so far

  12.     read_lock(&udp_hash_lock);  // the chain walk is done under the hash read lock
  13.     sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {    // author's note: the udptable hash array was already populated when ports were bound (presumably at bind() time - not shown here);
  14.                                                                         // here the port number selects the hlist chain whose sock entries are walked
  15.         struct inet_sock *inet = inet_sk(sk);

  16.         if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) {   // candidate: a socket hashed under the same port number
  17.             int score = (sk->sk_family == PF_INET ? 1 : 0);
  18.             if (inet->rcv_saddr) {              // socket has a local address set: it must equal the packet's destination IP
  19.                 if (inet->rcv_saddr != daddr)
  20.                     continue;
  21.                 score+=2;
  22.             }
  23.             if (inet->daddr) {            
  24.                 if (inet->daddr != saddr)  // socket's peer address vs. the packet's source address
  25.                     continue;
  26.                 score+=2;
  27.             }
  28.             if (inet->dport) {            
  29.                 if (inet->dport != sport)  // socket's peer port vs. the packet's source port
  30.                     continue;
  31.                 score+=2;
  32.             }
  33.             if (sk->sk_bound_dev_if) {     // socket is tied to an interface: it must equal the receiving interface index (dif)
  34.                 if (sk->sk_bound_dev_if != dif)
  35.                     continue;
  36.                 score+=2;
  37.             }
  38.             if(score == 9) {  // 1 + 4*2: PF_INET and all four optional fields matched, so no better match is possible
  39.                 result = sk;
  40.                 break;
  41.             } else if(score > badness) {  // strictly greater: on equal scores the entry met earlier in the chain is kept
  42.                 result = sk;
  43.                 badness = score;
  44.             }
  45.         }
  46.     }
  47.     if (result)
  48.         sock_hold(result);  // paired with the sock_put() that __udp4_lib_rcv issues after queueing
  49.     read_unlock(&udp_hash_lock);
  50.     return result;
  51. }
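  从上面的代码中我们看到,通过数据包的目的端口号,从udptable哈希表中找到bind了相同端口号的socket,然后从中选出一个最佳的socket(用score来计分,呵呵;score等于9表示完全匹配,直接跳出循环)。如果分值一样,由于比较条件是 score > badness(严格大于),保留的是链表中先遍历到的那个;而bind时socket是插到hlist链表头部的(sk_add_node最终调用hlist_add_head),所以先遍历到的恰好是最后bind的socket。现在我们知道了,可以建立多个socket
来bind相同的端口号(但是得用setsockopt设置socket属性为SO_REUSEADDR,否则会bind失败),如果这些socket属性一样(score分值一样,且都不是完全匹配),那么对单播报文来说只有最后bind的socket有效,也就是说接收到的数据包会传给这个socket,其他socket接收不到skb(广播/组播报文走的是前面的__udp4_lib_mcast_deliver分支,不经过这里的查找)。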

点击(此处)折叠或打开

  1. int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)  // queues skb on sk; returns 0 when queued or eaten, -1 when dropped, or -ret from xfrm4_rcv_encap on the encapsulation path
  2. {
  3.     struct udp_sock *up = udp_sk(sk);
  4.     int rc;

  5.     /*
  6.      *    Charge it to the socket, dropping if the queue is full.
  7.      */
  8.     if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
  9.         goto drop;
  10.     nf_reset(skb);

  11.     if (up->encap_type) {
  12.         /*
  13.          * This is an encapsulation socket, so let's see if this is
  14.          * an encapsulated packet.
  15.          * If it's a keepalive packet, then just eat it.
  16.          * If it's an encapsulateed packet, then pass it to the
  17.          * IPsec xfrm input and return the response
  18.          * appropriately. Otherwise, just fall through and
  19.          * pass this up the UDP socket.
  20.          */
  21.         int ret;

  22.         ret = udp_encap_rcv(sk, skb);
  23.         if (ret == 0) {
  24.             /* Eat the packet .. */
  25.             kfree_skb(skb);
  26.             return 0;
  27.         }
  28.         if (ret < 0) {
  29.             /* process the ESP packet */
  30.             ret = xfrm4_rcv_encap(skb, up->encap_type);
  31.             UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
  32.             return -ret;
  33.         }
  34.         /* FALLTHROUGH -- it's a UDP Packet */
  35.     }

  36.     /*
  37.      *     UDP-Lite specific tests, ignored on UDP sockets
  38.      */
  39.     if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {

  40.         /*
  41.          * MIB statistics other than incrementing the error count are
  42.          * disabled for the following two types of errors: these depend
  43.          * on the application settings, not on the functioning of the
  44.          * protocol stack as such.
  45.          *
  46.          * RFC 3828 here recommends (sec 3.3): "There should also be a
  47.          * way ... to ... at least let the receiving application block
  48.          * delivery of packets with coverage values less than a value
  49.          * provided by the application."
  50.          */
  51.         if (up->pcrlen == 0) { /* full coverage was set */
  52.             LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
  53.                 "%d while full coverage %d requested\n",
  54.                 UDP_SKB_CB(skb)->cscov, skb->len);
  55.             goto drop;
  56.         }
  57.         /* The next case involves violating the min. coverage requested
  58.          * by the receiver. This is subtle: if receiver wants x and x is
  59.          * greater than the buffersize/MTU then receiver will complain
  60.          * that it wants x while sender emits packets of smaller size y.
  61.          * Therefore the above ...()->partial_cov statement is essential.
  62.          */
  63.         if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
  64.             LIMIT_NETDEBUG(KERN_WARNING
  65.                 "UDPLITE: coverage %d too small, need min %d\n",
  66.                 UDP_SKB_CB(skb)->cscov, up->pcrlen);
  67.             goto drop;
  68.         }
  69.     }

  70.     if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {  // a socket filter is attached and the checksum is not yet marked verified: verify it first
  71.         if (__udp_lib_checksum_complete(skb))
  72.             goto drop;
  73.         skb->ip_summed = CHECKSUM_UNNECESSARY;
  74.     }

  75.     if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {    // this is where the skb actually gets queued; sock_queue_rcv_skb is worth a closer look
  76.         /* Note that an ENOMEM error is charged twice */
  77.         if (rc == -ENOMEM)
  78.             UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
  79.         goto drop;
  80.     }

  81.     UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
  82.     return 0;

  83. drop:
  84.     UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
  85.     kfree_skb(skb);
  86.     return -1;
  87. }


点击(此处)折叠或打开

  1. int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  // appends skb to sk's receive queue; returns 0 on success, -ENOMEM when over the receive-buffer limit, or sk_filter's non-zero result; skb is not freed here on failure
  2. {
  3.     int err = 0;
  4.     int skb_len;

  5.     /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
  6.      number of warnings when compiling with -W --ANK
  7.      */
  8.     if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=   // sk_rmem_alloc is the running total of truesize for received skbs: skb_set_owner_r adds to it on receive and sock_rfree subtracts when the skb is freed
  9.      (unsigned)sk->sk_rcvbuf) {                              // sk_rcvbuf is the receive-buffer limit; author's note: it can be set via setsockopt, and when the reader drains the queue more slowly than packets arrive,
  10.                                                              // raising sk_rcvbuf somewhat reduces packet loss to a degree.
  11.                                                             
  12.         err = -ENOMEM;
  13.         goto out;
  14.     }

  15.     err = sk_filter(sk, skb);
  16.     if (err)
  17.         goto out;

  18.     skb->dev = NULL;
  19.     skb_set_owner_r(skb, sk); // charges skb->truesize to sk->sk_rmem_alloc and records sk as the skb's owner

  20.     /* Cache the SKB length before we tack it onto the receive
  21.      * queue. Once it is added it no longer belongs to us and
  22.      * may be freed by other threads of control pulling packets
  23.      * from the queue.
  24.      */
  25.     skb_len = skb->len;

  26.     skb_queue_tail(&sk->sk_receive_queue, skb);    // append the skb to sk_receive_queue

  27.     if (!sock_flag(sk, SOCK_DEAD))
  28.         sk->sk_data_ready(sk, skb_len);  // invoke the socket's data-ready callback with the cached length
  29. out:
  30.     return err;
  31. }


点击(此处)折叠或打开

  1. static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)  // makes sk the owner of skb and charges the skb's truesize to the socket's receive accounting
  2. {
  3.     skb->sk = sk;
  4.     skb->destructor = sock_rfree;  // sock_rfree subtracts the same truesize again
  5.     atomic_add(skb->truesize, &sk->sk_rmem_alloc); // note that the counter is updated atomically
  6. }



  7. void sock_rfree(struct sk_buff *skb)  // installed as skb->destructor by skb_set_owner_r; author's note: called when the skb is freed
  8. {
  9.     struct sock *sk = skb->sk;

  10.     atomic_sub(skb->truesize, &sk->sk_rmem_alloc);  // undo the charge made by skb_set_owner_r
  11. }





上一篇:数据包在IP层接收流程简单分析
下一篇:socket创建过程