ip_append_data在/net/ipv4/ip_output.c中
|
/*
 * ip_append_data - append user data to the socket's pending (corked) send
 * queue, splitting it into MTU-sized skbs that are ready to become IP
 * fragments.
 *
 * @sk:          socket doing the send
 * @getfrag:     callback that copies @len bytes of payload from @from into
 *               the buffer at @to (e.g. ip_generic_getfrag)
 * @from:        opaque cursor passed through to @getfrag
 * @length:      number of payload bytes to append
 * @transhdrlen: transport header length; non-zero only on the first call
 * @ipc:         per-call control info (options, source address)
 * @rt:          route the packets will take
 * @flags:       MSG_* flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT, ...)
 *
 * Returns 0 on success or a negative errno.  The queued skbs are later
 * merged and sent by ip_push_pending_frames().
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	/* MSG_PROBE: set things up (e.g. for MTU probing) but send nothing. */
	if (flags & MSG_PROBE)
		return 0;

	/* First chunk for this cork cycle: record per-datagram state. */
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				/* +40 leaves room for the maximum option
				 * data following struct ip_options. */
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		/* Cache the MTU for the whole cork cycle.  With
		 * IP_PMTUDISC_PROBE the device MTU is used directly. */
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->u.dst.dev->mtu :
					    dst_mtu(rt->u.dst.path);
		inet->cork.dst = &rt->u.dst;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		/* Queue not empty: reuse the state cached when corking
		 * started; transport/ext headers were already emitted. */
		rt = (struct rtable *)inet->cork.dst;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	/* Headroom needed for the link-layer header. */
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	/* IP header size, including any IP options. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Largest fragment size: payload rounded down to a multiple of 8
	 * (fragment offsets are in 8-byte units), plus the IP header. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* An IP datagram's total length field is 16 bits. */
	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		/* Hardware can checksum: leave it to the NIC. */
		csummode = CHECKSUM_PARTIAL;

	inet->cork.length += length;
	/* UDP fragmentation offload: if the datagram exceeds the MTU (or we
	 * are appending to an existing queue) and the device supports UFO,
	 * let the hardware do the splitting. */
	if (((length > mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	/* Keep filling skbs until all of @length has been consumed. */
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			/* Current skb is full: start a new fragment. */
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				/* Bytes in the previous skb past the 8-byte
				 * fragment boundary; they migrate into the
				 * new skb. */
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				/* Clamp to the largest per-fragment payload. */
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				/* More data is coming and the device cannot
				 * do scatter/gather: allocate a full MTU. */
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				/* First skb: may sleep waiting for memory. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Later skbs: only allocate while within
				 * twice the send-buffer budget. The extra
				 * 15 bytes allow for head alignment. */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* Reserve headroom for the link-layer header. */
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Skip past the (not yet written) IP header. */
			data += fragheaderlen;

			/* Move the tail overhang of the previous skb into
			 * this one and adjust both checksums. */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/* Payload still to be copied into this skb. */
			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Transport/ext headers only exist in the first skb. */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			/* Linear device: append into the skb's data area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather device: append into page frags. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				/* Reuse the cached partially-filled page. */
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				/* Start a fresh page for this socket. */
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				/* No frag slots left. */
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			/* Account the copied bytes on page, skb and socket. */
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
|
因为我们的发送队列为空,来到alloc_new_skb,一路向下走到sock_wmalloc
sock_wmalloc负责从高速缓存中分配一个skb
sock_wmalloc在/net/core/sock.c中
|
/*
 * sock_wmalloc - allocate an skb charged to a socket's write allocation.
 *
 * @sk:       socket the buffer is charged to
 * @size:     data area size to allocate
 * @force:    non-zero to allocate even when the write budget is exhausted
 * @priority: allocation flags (GFP mask)
 *
 * Returns the new skb with @sk set as its write owner, or NULL when
 * allocation fails or the socket is over its send-buffer limit.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* Respect the send-buffer limit unless the caller insists. */
	if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
		return NULL;

	skb = alloc_skb(size, priority);
	if (skb == NULL)
		return NULL;

	/* Tie the skb to the socket so its memory is accounted and
	 * released via sock_wfree(). */
	skb_set_owner_w(skb, sk);
	return skb;
}
|
alloc_skb负责调用分配函数
alloc_skb在/include/linux/skbuff.h中
|
/*
 * alloc_skb - allocate a plain network buffer.
 *
 * Convenience wrapper around __alloc_skb(): no fclone companion (0) and
 * no NUMA node preference (-1).
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
	struct sk_buff *skb = __alloc_skb(size, priority, 0, -1);

	return skb;
}
|
__alloc_skb负责实际的分配
__alloc_skb在/net/core/skbuff.c中
|
/*
 * __alloc_skb - allocate an sk_buff head plus its data area.
 *
 * @size:     requested data area size (rounded up by SKB_DATA_ALIGN)
 * @gfp_mask: allocation flags
 * @fclone:   non-zero to allocate from the fast-clone cache (head is
 *            followed by a companion clone and a reference counter)
 * @node:     NUMA node to allocate on, or -1 for no preference
 *
 * The data allocation also holds a trailing struct skb_shared_info,
 * which skb_shinfo() locates at skb->end.  Returns the skb or NULL.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int fclone, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;

	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;

	/* Round the data area up for cache-line alignment. */
	size = SKB_DATA_ALIGN(size);
	/* One allocation covers the data area and the shared info. */
	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
			gfp_mask, node);
	if (!data)
		goto nodata;

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* truesize accounts the head structure plus the data area. */
	skb->truesize = size + sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);
	/* head/data/tail all start at the beginning of the data area;
	 * end marks where the payload space stops (shinfo begins there). */
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	atomic_set(&shinfo->dataref, 1);
	shinfo->nr_frags = 0;
	shinfo->gso_size = 0;
	shinfo->gso_segs = 0;
	shinfo->gso_type = 0;
	shinfo->ip6_frag_id = 0;
	shinfo->frag_list = NULL;

	if (fclone) {
		/* Fast-clone layout: [orig skb][child skb][atomic ref]. */
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);

		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
|
由于我们传递的fclone为0,所以是不会进入到if (fclone)中的
好,返回到sock_wmalloc,分配skb成功之后进入skb_set_owner_w
skb_set_owner_w在/include/net/sock.h中
|
static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { sock_hold(sk); //连接sock到sk_buff上 skb->sk = sk; //设置回收函数 skb->destructor = sock_wfree; //增加空间使用计数器 atomic_add(skb->truesize, &sk->sk_wmem_alloc); }
|
执行完后一个skb便分配好了
结构图如下
然后设置ip_summed和csum为0
skb_reserve(skb, hh_len)这句函数的作用是保留hh_len长度的空间,data和tail指针向后移动hh_len个单位,如下
然后到data = skb_put(skb, fraglen)
放入数据和IP报头,并且返回起初的data指针位置,skb的len加上放入的大小
如下图
skb_set_network_header(skb, exthdrlen);
接着设置IP层指针的位置,这里exthdrlen为0
如下图
skb->transport_header = (skb->network_header + fragheaderlen);
接着设置运输层指针的位置
如下图
然后到data += fragheaderlen;
使得data(不是skb的那个)指向运输层头部,如下图
因为我们没有碎片,不进入if (fraggap)
来到if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0)
这里getfrag为调用ip_append_data时所传递的参数ip_generic_getfrag
ip_generic_getfrag就不分析了,它负责把数据从用户空间拷贝到data指针(不是skb的那个)所指的位置
接着到__skb_queue_tail(&sk->sk_write_queue, skb);
把这个skb挂接到sk的发送队列中
然后continue回到while头,因为我们把length长的数据拷贝完成了,这里length为0,跳出while循环 return 0 退出
回到raw_sendmsg,现在进入到ip_push_pending_frames
ip_push_pending_frames负责skb的重新排列
ip_push_pending_frames在/net/ipv4/ip_output.c中
|
/*
 * ip_push_pending_frames - combine all skbs queued by ip_append_data()
 * into one datagram, build the IPv4 header on the first skb, and hand the
 * result to ip_local_out().
 *
 * Returns 0 on success or a negative errno; the cork state is released
 * in every case.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)inet->cork.dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	/* Nothing queued: just release the cork state. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain every remaining queued skb onto the head skb's frag_list,
	 * folding their length/truesize into the head and detaching them
	 * from the socket (the head skb now owns them). */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally.
	 */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	/* Multicast packets use the socket's multicast TTL. */
	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	/* Build the IPv4 header at the front of the (merged) skb. */
	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	/* ihl counts 32-bit words: 5 words = 20-byte basic header. */
	iph->ihl = 5;
	if (opt) {
		/* Options extend the header length (optlen is a multiple
		 * of 4 bytes). */
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	/* Pick the IP identification field for this datagram. */
	ip_select_ident(iph, &rt->u.dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/* The skb holds its own reference on the route. */
	skb->dst = dst_clone(&rt->u.dst);

	/* Outgoing ICMP messages are counted per type. */
	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets whole the not fragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		/* Positive values are congestion verdicts; report them
		 * only if the socket asked for errors (IP_RECVERR). */
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
|
由于skb_shared_info->frag_list为NULL,所以这里tail_skb指向的frag_list为空
这里data和network_header指针相等
所以不会进入到if (skb->data < skb_network_header(skb))中
因为我们的发送队列中只有一个skb,在之前已经出队了
所以这里不会进入到while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)中
接下来设置ip头部信息,设置好之后的结构如下
然后连接skb与rtable中的dst_entry
然后进入ip_local_out
ip_local_out在/net/ipv4/ip_output.c中
|
/*
 * ip_local_out - send a locally generated IP packet.
 *
 * Finalizes the header and runs the LOCAL_OUT netfilter hook via
 * __ip_local_out(); a return of 1 means the hook accepted the packet
 * and we must continue delivery ourselves through dst_output().
 * Any other value is propagated to the caller.
 */
int ip_local_out(struct sk_buff *skb)
{
	int err = __ip_local_out(skb);

	return likely(err == 1) ? dst_output(skb) : err;
}
|
继续进入__ip_local_out
__ip_local_out在/net/ipv4/ip_output.c中
|
/*
 * __ip_local_out - finalize the IP header and pass the packet through
 * the NF_INET_LOCAL_OUT netfilter hook.
 *
 * Fills in the total length and header checksum, then lets netfilter
 * decide; on acceptance the packet continues in dst_output().
 */
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	/* Total length covers header + payload of the whole datagram. */
	iph->tot_len = htons(skb->len);
	/* Compute the IP header checksum. */
	ip_send_check(iph);

	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}
|
nf_hook和Netfilter有关,我们不关心Netfilter,直接进入dst_output
dst_output在/include/net/dst.h中
|
/*
 * dst_output - hand the skb to the output function stored in its routing
 * entry (ip_output() for ordinary outbound IPv4 routes).
 */
static inline int dst_output(struct sk_buff *skb)
{
	return skb->dst->output(skb);
}
|
看看之前的结构图,得知dst->output为ip_output
ip_output在/net/ipv4/ip_output.c中
|
/*
 * ip_output - generic IPv4 output routine installed as dst->output.
 *
 * Binds the skb to the route's device, stamps the protocol, and runs
 * the POST_ROUTING netfilter hook before ip_finish_output() (the hook
 * is skipped for packets already rerouted, per the IPSKB_REROUTED flag).
 */
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
|
继续进入到ip_finish_output
ip_finish_output在/net/ipv4/ip_output.c中
|
static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb->dst->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } #endif //检测skb的中的数据大小是否超过MTU,超过则分片 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); }
|
这里我们的数据大小不会超过MTU,所以不需要分片,进入到ip_finish_output2
ip_finish_output2在/net/ipv4/ip_output.c中
|
/*
 * ip_finish_output2 - deliver the skb to the neighbour (L2) layer.
 *
 * Ensures the skb has enough headroom for the hardware header, then uses
 * the cached header (hh) fast path if available, falling back to the
 * neighbour's output function (which resolves the L2 address).
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);

	/* Be paranoid, rather than too clever. */
	/* Not enough headroom for the link-layer header: reallocate and
	 * transfer ownership accounting to the new skb. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	/* Fast path: copy the cached hardware header; otherwise let the
	 * neighbour entry build it (may trigger ARP resolution). */
	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
|
这里一开始我们的dst->hh为NULL,但是当第二次发送ICMP包的时候这里的hh就已经分配好了,也就是说只有第一次会进入dst->neighbour->output,以后都是neigh_hh_output
两个走向都会分析,现在先看hh为NULL时
neighbour->output为neigh_resolve_output
neigh_resolve_output在/net/core/neighbour.c中
|
/*
 * neigh_resolve_output - slow output path used while no cached hardware
 * header exists for the destination.
 *
 * Waits for / triggers neighbour (e.g. ARP) resolution, builds the
 * link-layer header, initializes the hh cache for devices that support
 * header caching, and finally queues the skb for transmission.
 * Returns the queue_xmit result, 0, or -EINVAL on failure.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct neighbour *neigh;
	int rc = 0;

	if (!dst || !(neigh = dst->neighbour))
		goto discard;

	/* Point skb->data at the network header. */
	__skb_pull(skb, skb_network_offset(skb));

	/* neigh_event_send() returns 0 when the neighbour is usable
	 * (otherwise the skb is queued until resolution completes). */
	if (!neigh_event_send(neigh, skb)) {
		int err;
		struct net_device *dev = neigh->dev;
		if (dev->header_ops->cache && !dst->hh) {
			/* Device caches headers but none exists yet:
			 * initialize dst->hh under the write lock
			 * (re-checked after acquiring it). */
			write_lock_bh(&neigh->lock);
			if (!dst->hh)
				neigh_hh_init(neigh, dst, dst->ops->protocol);
			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
					      neigh->ha, NULL, skb->len);
			write_unlock_bh(&neigh->lock);
		} else {
			/* Only reading neigh->ha: the read lock suffices. */
			read_lock_bh(&neigh->lock);
			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
					      neigh->ha, NULL, skb->len);
			read_unlock_bh(&neigh->lock);
		}
		if (err >= 0)
			/* Header built: queue for transmission
			 * (dev_queue_xmit on Ethernet). */
			rc = neigh->ops->queue_xmit(skb);
		else
			goto out_kfree_skb;
	}
out:
	return rc;
discard:
	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
		      dst, dst ? dst->neighbour : NULL);
	/* fall through: drop the skb and report -EINVAL */
out_kfree_skb:
	rc = -EINVAL;
	kfree_skb(skb);
	goto out;
}
|
由于现在data和network_header指针指向同一个位置,所以skb_network_offset(skb)为0,
__skb_pull(skb, skb_network_offset(skb))并没有实际改变skb结构
之后是创建hh结构,和arp有关,也和路由表查询有关,我这里就不分析了
创建好的hh如下
然后到err = dev_hard_header(skb, dev, ntohs(skb->protocol),neigh->ha, NULL, skb->len)
dev_hard_header在/include/linux/netdevice.h中
|
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { //检测设备是否有头部操作集 //检测操作集是否有创建操作 if (!dev->header_ops || !dev->header_ops->create) return 0; return dev->header_ops->create(skb, dev, type, daddr, saddr, len); }
|
我们的lo设备是有头部操作集的,eth_header_ops,结构如下
|
/* Link-layer header operations for Ethernet devices. */
const struct header_ops eth_header_ops ____cacheline_aligned = {
	.create		= eth_header,		/* build the Ethernet header */
	.parse		= eth_header_parse,
	.rebuild	= eth_rebuild_header,
	.cache		= eth_header_cache,	/* fill the hh fast-path cache */
	.cache_update	= eth_header_cache_update,
};
|
可以看见是有create函数的
eth_header在/net/ethernet/eth.c中
|
/*
 * eth_header - create the Ethernet header in front of the skb data.
 *
 * @skb:   buffer to prepend the header to
 * @dev:   source device (supplies the default source MAC)
 * @type:  Ethernet protocol type (or payload length for 802.3)
 * @daddr: destination MAC, or NULL if not yet known
 * @saddr: source MAC, or NULL to use dev->dev_addr
 * @len:   payload length, used as the 802.3 length field
 *
 * Returns ETH_HLEN when the header is complete, or -ETH_HLEN when the
 * destination address is still unknown and must be resolved.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev,
	       unsigned short type,
	       const void *daddr, const void *saddr, unsigned len)
{
	/* Prepend ETH_HLEN (14) bytes for the Ethernet header. */
	struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);

	/* Ethernet II frames carry the protocol type; 802.3 frames carry
	 * the payload length instead. */
	if (type != ETH_P_802_3)
		eth->h_proto = htons(type);
	else
		eth->h_proto = htons(len);

	/*
	 * Set the source hardware address.
	 */
	if (!saddr)
		saddr = dev->dev_addr;
	memcpy(eth->h_source, saddr, ETH_ALEN);

	if (daddr) {
		memcpy(eth->h_dest, daddr, ETH_ALEN);
		return ETH_HLEN;
	}

	/*
	 * Anyway, the loopback-device should never use this function...
	 */
	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
		memset(eth->h_dest, 0, ETH_ALEN);
		return ETH_HLEN;
	}

	/* Destination unknown: the negative return tells the caller the
	 * header is incomplete pending address resolution. */
	return -ETH_HLEN;
}
|
主要是注意struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
这里会进行数据的压入
压入后的结构如下
然后对以太网报头进行初始化
初始化完成后回到neigh_resolve_output中
来到rc = neigh->ops->queue_xmit(skb)
neigh->ops->queue_xmit为dev_queue_xmit
在进入dev_queue_xmit之前我们回到ip_finish_output2中看看有hh时的流程
neigh_hh_output在/include/net/neighbour.h中
|
/*
 * neigh_hh_output - fast output path using the cached hardware header.
 *
 * Copies the pre-built link-layer header from @hh into the skb headroom
 * and forwards the packet via hh->hh_output (dev_queue_xmit on Ethernet),
 * bypassing dev_hard_header() entirely.
 */
static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned seq;
	int hh_len;

	/* Copy the cached header under the hh seqlock; retry the copy if
	 * a concurrent update changed the cache mid-read. */
	do {
		int hh_alen;

		seq = read_seqbegin(&hh->hh_lock);
		hh_len = hh->hh_len;
		/* The cached data is stored aligned; copy the aligned size
		 * just in front of skb->data. */
		hh_alen = HH_DATA_ALIGN(hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
	} while (read_seqretry(&hh->hh_lock, seq));

	/* Expose only the real (unaligned) header length. */
	skb_push(skb, hh_len);
	return hh->hh_output(skb);
}
|
neigh_hh_output也会对skb进行数据压入的操作然后调用hh_output
在之前hh的结构中hh_output正是dev_queue_xmit
所以最后两边都会回到dev_queue_xmit