基于de4x5 驱动的PCI总线描述。所涉及的函数仅包含了报文接收的关键部分。
这篇文章是为了记录一下这几天看linux 网络报文接收过程,作为一个总结,也是一个记忆点。
一:de4x5初始化
点击(此处)折叠或打开
-
/* PCI driver descriptor for de4x5: name, supported-device table and the probe/remove callbacks. */
static struct pci_driver de4x5_pci_driver = {
-
.name = "de4x5",
-
.id_table = de4x5_pci_tbl,
-
.probe = de4x5_pci_probe, // Main point of interest: the entry point of the whole initialization; probes the devices on the bus and initializes them
-
.remove = __devexit_p (de4x5_pci_remove),
- };
点击(此处)折叠或打开
-
/*
 * Module entry point: register the de4x5 driver with every bus layer that
 * is configured in (PCI and/or EISA).
 *
 * Returns 0 on success, or the negative errno of the registration that
 * failed.  Registration is all-or-nothing: the two error codes are never
 * OR-ed together (that would yield a meaningless errno), and a PCI driver
 * that was already registered is unregistered again when the EISA
 * registration fails, so a failing init leaves nothing registered behind.
 */
static int __init de4x5_module_init (void)
{
    int err = 0;

#ifdef CONFIG_PCI
    /* Register the statically initialised PCI driver. */
    err = pci_register_driver(&de4x5_pci_driver);
    if (err)
        return err;
#endif

#ifdef CONFIG_EISA
    err = eisa_driver_register (&de4x5_eisa_driver);
#ifdef CONFIG_PCI
    /* Undo the PCI registration: init is about to report failure. */
    if (err)
        pci_unregister_driver(&de4x5_pci_driver);
#endif
#endif

    return err;
}
点击(此处)折叠或打开
-
/*
 * PCI probe callback (abridged excerpt: "......" marks code left out by the
 * article, including the local declarations and the error labels).
 * Enables the PCI device, allocates the net_device together with its
 * private area, records bus/chipset/IRQ information and finally calls
 * de4x5_hw_init().  Returns 0 on success, a negative errno otherwise.
 */
static int __devinit de4x5_pci_probe (struct pci_dev *pdev,
-
const struct pci_device_id *ent)
-
{
- ......
-
-
/* Ok, the device seems to be for us: a device this driver is responsible for was detected. */
-
if ((error = pci_enable_device (pdev)))
-
return error;
-
/* Allocate the net_device for this device and do some initialization. The private area (sizeof (struct de4x5_private)) is requested in the same call, which is why the code below can access it directly. Per the article author's reading, the TX queue is also allocated here, while the RX ring is allocated elsewhere. */
-
if (!(dev = alloc_etherdev (sizeof (struct de4x5_private)))) {
-
error = -ENOMEM;
-
goto disable_dev;
-
}
-
-
/* Get the address of the device's private area (per the author, an offset from the allocation above) and do some basic initialization on it. */
-
lp = netdev_priv(dev);
-
lp->bus = PCI;
-
lp->bus_num = 0;
-
-
/* Search for an SROM on this bus */
-
if (lp->bus_num != pb) {
-
lp->bus_num = pb;
-
srom_search(dev, pdev);
-
}
-
-
/* Get the chip configuration revision register */
-
lp->cfrv = pdev->revision;
-
-
/* Set the device number information */
-
lp->device = dev_num;
-
lp->bus_num = pb;
-
-
/* Set the chipset information */
-
if (is_DC2114x) {
-
device = ((lp->cfrv & CFRV_RN) < DC2114x_BRK ? DC21142 : DC21143);
-
}
-
lp->chipset = device;
-
-
/* Get the board I/O address (64 bits on sparc64) */
-
iobase = pci_resource_start(pdev, 0);
-
-
/* Fetch the IRQ to be used; 0, 0xff and -1 are rejected as "no usable IRQ" */
-
irq = pdev->irq;
-
if ((irq == 0) || (irq == 0xff) || ((int)irq == -1)) {
-
error = -ENODEV;
-
goto free_dev;
-
}
-
-
......
-
-
dev->irq = irq;
-
/* Further initialization of the device; continued in de4x5_hw_init() */
-
if ((error = de4x5_hw_init(dev, iobase, &pdev->dev))) {
-
goto release;
-
}
-
-
return 0;
-
......
- }
点击(此处)折叠或打开
-
/*
 * Hardware/software initialization of one adapter (abridged excerpt:
 * "......" marks code left out by the article).
 * Wakes and resets the chip, identifies the board, reads the hardware
 * address, initializes the private data, allocates the RX/TX descriptor
 * rings as one coherent DMA region and registers the net_device.
 * Returns 0 on success, a negative errno otherwise.
 */
static int __devinit
-
de4x5_hw_init(struct net_device *dev, u_long iobase, struct device *gendev)
-
{
-
char name[DE4X5_NAME_LENGTH + 1];
-
struct de4x5_private *lp = netdev_priv(dev);
-
struct pci_dev *pdev = NULL;
-
int i, status=0;
-
-
dev_set_drvdata(gendev, dev);
-
-
/* Ensure we're not sleeping */
-
if (lp->bus == EISA) {
-
outb(WAKEUP, PCI_CFPM);
-
} else {
-
pdev = to_pci_dev (gendev);
-
pci_write_config_byte(pdev, PCI_CFDA_PSM, WAKEUP);
-
}
-
mdelay(10);
-
-
RESET_DE4X5;
-
-
if ((inl(DE4X5_STS) & (STS_TS | STS_RS)) != 0) {
-
return -ENXIO; /* Hardware could not reset */
-
}
-
-
/*
-
** Now find out what kind of DC21040/DC21041/DC21140 board we have.
-
*/
-
lp->useSROM = false;
-
if (lp->bus == PCI) {
-
PCI_signature(name, lp);
-
} else {
-
EISA_signature(name, gendev);
-
}
-
-
if (*name == '\0') { /* Not found a board signature */
-
return -ENXIO;
-
}
-
-
dev->base_addr = iobase;
-
printk ("%s: %s at 0x%04lx", dev_name(gendev), name, iobase);
-
-
/* Read the hardware address; per the author, some chipset-dependent address handling and checks happen here */
-
status = get_hw_addr(dev);
-
printk(", h/w address %pM\n", dev->dev_addr);
-
-
if (status != 0) {
-
printk(" which has an Ethernet PROM CRC error.\n");/* CRC error */
-
return -ENXIO;
-
} else {
-
skb_queue_head_init(&lp->cache.queue);/* initialize the cached-skb queue (jymao) */
-
lp->cache.gepc = GEP_INIT;
-
lp->asBit = GEP_SLNK;
-
lp->asPolarity = GEP_SLNK;
-
lp->asBitValid = ~0;
-
lp->timeout = -1;
-
lp->gendev = gendev;
-
spin_lock_init(&lp->lock);
-
init_timer(&lp->timer);
-
lp->timer.function = (void (*)(unsigned long))de4x5_ast;
-
lp->timer.data = (unsigned long)dev;
-
de4x5_parse_params(dev);
-
- ......
-
lp->fdx = lp->params.fdx;
-
sprintf(lp->adapter_name,"%s (%s)", name, dev_name(gendev));
-
-
/* Size of the DMA ring descriptor (control) structures: RX plus TX descriptors */
-
lp->dma_size = (NUM_RX_DESC + NUM_TX_DESC) * sizeof(struct de4x5_desc);
-
-
/* Allocate the descriptors as one contiguous region; dma_rings receives the DMA address of its start */
-
lp->rx_ring = dma_alloc_coherent(gendev, lp->dma_size,
-
&lp->dma_rings, GFP_ATOMIC);
-
if (lp->rx_ring == NULL) {
-
return -ENOMEM;
-
}
-
-
/* The TX descriptors follow the RX descriptors in the same region, so tx_ring starts NUM_RX_DESC entries after rx_ring */
-
lp->tx_ring = lp->rx_ring + NUM_RX_DESC;
-
-
/*
-
** Set up the RX descriptor ring (Intels)
-
** Allocate contiguous receive buffers, long word aligned (Alphas)
-
*/
-
#if !defined(__alpha__) && !defined(__powerpc__) && !defined(CONFIG_SPARC) && !defined(DE4X5_DO_MEMCPY)
-
for (i=0; i<NUM_RX_DESC; i++) {
-
lp->rx_ring[i].status = 0; /* changes when a packet is received; used by later code */
-
lp->rx_ring[i].des1 = cpu_to_le32(RX_BUFF_SZ);
-
lp->rx_ring[i].buf = 0;
-
lp->rx_ring[i].next = 0;
-
lp->rx_skb[i] = (struct sk_buff *) 1; /* Dummy entry; used later when the initial ring buffers are allocated, i.e. in de4x5_open */
-
}
-
-
#else
- ......
- #endif
-
/* some initialization omitted */
- ......
-
lp->state = CLOSED;
-
-
/* The DE4X5-specific entries in the device structure. */
-
SET_NETDEV_DEV(dev, gendev);
-
dev->netdev_ops = &de4x5_netdev_ops;/* the set of all device operations; this static structure is shown below */
-
dev->mem_start = 0;
-
-
/* Fill in the generic fields of the device structure and announce the device to the kernel; the descriptor region is freed again if registration fails */
-
if ((status = register_netdev (dev))) {
-
dma_free_coherent (gendev, lp->dma_size,
-
lp->rx_ring, lp->dma_rings);
-
return status;
-
}
-
-
/* Let the adapter sleep to save power */
-
yawn(dev, SLEEP);
-
-
return status;
- }
点击(此处)折叠或打开
-
/* net_device operations table of the de4x5 driver; installed as dev->netdev_ops in de4x5_hw_init(). */
static const struct net_device_ops de4x5_netdev_ops = {
-
.ndo_open = de4x5_open, // the netdev "start" (open) function
-
.ndo_stop = de4x5_close,
-
.ndo_start_xmit = de4x5_queue_pkt,
-
.ndo_get_stats = de4x5_get_stats,
-
.ndo_set_multicast_list = set_multicast_list,
-
.ndo_do_ioctl = de4x5_ioctl,
-
.ndo_change_mtu = eth_change_mtu,
-
.ndo_set_mac_address= eth_mac_addr,
-
.ndo_validate_addr = eth_validate_addr,
- };
点击(此处)折叠或打开
-
/*
 * ndo_open callback (abridged excerpt: "......" marks code left out by the
 * article).  Allocates the RX buffers, wakes and re-initializes the
 * adapter, registers the interrupt handler and starts the chip.
 * Returns the status of de4x5_init(), or -EAGAIN when the RX buffers or
 * the IRQ cannot be obtained.
 */
static int
-
de4x5_open(struct net_device *dev)
-
{
-
struct de4x5_private *lp = netdev_priv(dev);
-
u_long iobase = dev->base_addr;
-
int i, status = 0;
-
s32 omr;
-
-
/* Allocate the buffers of the DMA ring, which the NIC fills with what the hardware receives; per the author, this memory is shared with the CPU */
-
/* Allocate the RX buffers; per the author, each one holds an Ethernet MTU + layer-2 header + padding, aligned to 32 */
- /* The number of buffers matches the number of descriptor control structures, i.e. they correspond one-to-one */
-
for (i=0; i<lp->rxRingSize; i++) {
-
if (de4x5_alloc_rx_buff(dev, i, 0) == NULL) {
-
de4x5_free_rx_buffs(dev);
-
return -EAGAIN;
-
}
-
}
-
-
/*
-
** Wake up the adapter
-
*/
-
yawn(dev, WAKEUP);
-
-
/*
-
** Re-initialize the DE4X5...
-
*/
-
status = de4x5_init(dev);
-
spin_lock_init(&lp->lock);
-
lp->state = OPEN;
-
de4x5_dbg_open(dev);
-
-
/* Interrupt registration: register the interrupt handler (IRQF_SHARED means multiple devices share the interrupt line) */
-
/* IRQF_DISABLED asks for a "fast" handler, i.e. one that runs with other interrupts disabled while it is handling the interrupt */
-
if (request_irq(dev->irq, de4x5_interrupt, IRQF_SHARED,
-
lp->adapter_name, dev)) {
-
printk("de4x5_open(): Requested IRQ%d is busy - attemping FAST/SHARE...", dev->irq);
-
if (request_irq(dev->irq, de4x5_interrupt, IRQF_DISABLED | IRQF_SHARED,
-
lp->adapter_name, dev)) {
-
printk("\n Cannot get IRQ- reconfigure your hardware.\n");
-
disable_ast(dev);
-
de4x5_free_rx_buffs(dev);
-
de4x5_free_tx_buffs(dev);
-
yawn(dev, SLEEP);
-
lp->state = CLOSED;
-
return -EAGAIN;
-
} else {
-
printk("\n Succeeded, but you should reconfigure your hardware to avoid this.\n");
-
printk("WARNING: there may be IRQ related problems in heavily loaded systems.\n");
-
}
-
}
-
-
lp->interrupt = UNMASK_INTERRUPTS;
-
dev->trans_start = jiffies;
-
-
START_DE4X5;
-
-
de4x5_setup_intr(dev);
-
- ......
-
-
return status;
- }
以上是驱动和设备的初始化,上面代码包含了主要的一些初始化工作:
1. 总线上驱动注册初始化
2. 驱动探测设备注册初始化
3. 开启设备,初始化接收缓存
下面的步骤是网卡从网络上接收到了数据报文并填充到设备的接收缓存中,然后触发了一个注册好的硬中断。
二:驱动中断处理过程
点击(此处)折叠或打开
- /* Interrupt handler registered in de4x5_open() via request_irq(). Abridged excerpt: "......" marks code left out by the article (the closing brace of the for loop is inside the elided part). Reads and acknowledges the status register up to 8 times, dispatching RX/TX/link/underrun events. */
-
static irqreturn_t
-
de4x5_interrupt(int irq, void *dev_id)
-
{
-
struct net_device *dev = dev_id;
-
struct de4x5_private *lp;
-
s32 imr, omr, sts, limit;
-
u_long iobase;
-
unsigned int handled = 0;
-
-
lp = netdev_priv(dev);
-
spin_lock(&lp->lock);
-
iobase = dev->base_addr;
-
-
DISABLE_IRQs; /* Ensure non re-entrancy */
-
/* Set the interrupt bit in the private data. A non-zero result means the bit was already set, i.e. the handler was entered again while still running (the article author wonders whether this is nested-interrupt detection). */
-
if (test_and_set_bit(MASK_INTERRUPTS, (void*) &lp->interrupt))
-
printk("%s: Re-entering the interrupt handler.\n", dev->name);
-
-
synchronize_irq(dev->irq);
-
-
for (limit=0; limit<8; limit++) {
-
sts = inl(DE4X5_STS); /* Read IRQ status */
-
outl(sts, DE4X5_STS); /* Reset the board interrupts */
-
-
if (!(sts & lp->irq_mask)) break;/* All done */
-
handled = 1;
-
-
/* Main focus of this article: the receive path, taken when an RX interrupt arrives */
-
if (sts & (STS_RI | STS_RU)) /* Rx interrupt (packet[s] arrived) */
-
de4x5_rx(dev);
-
-
if (sts & (STS_TI | STS_TU)) /* Tx interrupt (packet sent) */
-
de4x5_tx(dev);
-
-
if (sts & STS_LNF) { /* TP Link has failed */
-
lp->irq_mask &= ~IMR_LFM;
-
}
-
-
if (sts & STS_UNF) { /* Transmit underrun */
-
de4x5_txur(dev);
-
}
-
- ......
-
lp->interrupt = UNMASK_INTERRUPTS;
-
ENABLE_IRQs;
-
spin_unlock(&lp->lock);
-
-
return IRQ_RETVAL(handled);
- }
点击(此处)折叠或打开
-
/*
 * Receive handler called from de4x5_interrupt() (abridged excerpt:
 * "......" marks code left out by the article, including the closing brace
 * of the RD_LS branch).  Walks the RX descriptor ring from rx_new, updates
 * the statistics and hands every good frame to netif_rx().
 * Always returns 0.
 */
static int
-
de4x5_rx(struct net_device *dev)
-
{
-
struct de4x5_private *lp = netdev_priv(dev);
-
u_long iobase = dev->base_addr;
-
int entry;
-
s32 status;
-
-
/* Walk the RX ring entries, starting at rx_new, for as long as the status read as a signed 32-bit value is >= 0 (top bit clear). The article author notes that the entries are initialized with status 0. NOTE(review): presumably the top bit is the descriptor ownership flag set while the NIC still owns the entry -- confirm against the chip documentation. */
-
for (entry=lp->rx_new; (s32)le32_to_cpu(lp->rx_ring[entry].status)>=0;
-
entry=lp->rx_new) {
-
status = (s32)le32_to_cpu(lp->rx_ring[entry].status);
-
-
if (lp->rx_ovf) {
-
if (inl(DE4X5_MFC) & MFC_FOCM) {
-
de4x5_rx_ovfc(dev);
-
break;
-
}
-
}
-
-
/* Record the first descriptor of the frame as rx_old */
-
if (status & RD_FS) { /* Remember the start of frame */
-
lp->rx_old = entry;
-
}
-
-
if (status & RD_LS) { /* Valid frame status */
-
if (lp->tx_enable) lp->linkOK++;
-
/* Record some error statistics */
-
if (status & RD_ES) { /* There was an error. */
-
lp->stats.rx_errors++; /* Update the error stats. */
-
if (status & (RD_RF | RD_TL)) lp->stats.rx_frame_errors++;
-
if (status & RD_CE) lp->stats.rx_crc_errors++;
-
if (status & RD_OF) lp->stats.rx_fifo_errors++;
-
if (status & RD_TL) lp->stats.rx_length_errors++;
-
if (status & RD_RF) lp->pktStats.rx_runt_frames++;
-
if (status & RD_CS) lp->pktStats.rx_collision++;
-
if (status & RD_DB) lp->pktStats.rx_dribble++;
-
if (status & RD_OF) lp->pktStats.rx_overflow++;
-
} else { /* A valid frame received */
- /* A good frame was received; compute its length: the upper 16 bits of the status minus 4 (presumably the trailing CRC -- confirm) */
-
struct sk_buff *skb;
-
short pkt_len = (short)(le32_to_cpu(lp->rx_ring[entry].status)
-
>> 16) - 4;
-
-
/* The status check passed, so a packet was received and is already stored in memory; fetch an skb to put on the CPU's queue.
-
* Per the author, the skb allocated here replaces the one that was just filled: rx_skb[entry] holds the skb the NIC received into.
-
* That skb is taken for processing and a fresh, empty one is put in its place for the NIC's next reception.
-
*/
-
if ((skb = de4x5_alloc_rx_buff(dev, entry, pkt_len)) == NULL) {
-
printk("%s: Insufficient memory; nuking packet.\n",
-
dev->name);
-
lp->stats.rx_dropped++;
-
} else {
-
de4x5_dbg_rx(skb, pkt_len);
-
-
/* Push up the protocol stack */
-
skb->protocol=eth_type_trans(skb,dev);
-
de4x5_local_stats(dev, skb->data, pkt_len);
- /* Hand the skb over to the CPU's queue of packets waiting to be processed */
-
netif_rx(skb);
-
-
/* Update stats */
-
lp->stats.rx_packets++;
-
lp->stats.rx_bytes += pkt_len;
-
}
-
}
-
- ......
-
/*
-
** Update entry information: advance rx_new to the next ring entry.
-
** NOTE(review): the statement below both pre-increments and assigns rx_new
** in one expression, which is an unsequenced modification in C;
** (lp->rx_new + 1) % lp->rxRingSize is the well-defined equivalent.
-
*/
-
lp->rx_new = (++lp->rx_new) % lp->rxRingSize;
-
}
-
-
return 0;
- }
点击(此处)折叠或打开
-
/*
 * Queue a received skb on the current CPU's input_pkt_queue for later
 * processing by the NET_RX softirq (non-NAPI receive path).
 * Returns NET_RX_SUCCESS when the skb was queued, NET_RX_DROP when netpoll
 * consumed it or when the queue length exceeds netdev_max_backlog (the skb
 * is freed in the latter case).
 */
int netif_rx(struct sk_buff *skb)
-
{
-
struct softnet_data *queue;
-
unsigned long flags;
-
-
/* if netpoll wants it, pretend we never saw it. (Article author's note: netpoll is a simple network handling mechanism for when the kernel is not fully up; not studied in detail.) */
-
if (netpoll_rx(skb))
-
return NET_RX_DROP;
-
-
if (!skb->tstamp.tv64)
-
net_timestamp(skb);
-
-
/*
-
* The code is rearranged so that the path is the most
-
* short when CPU is congested, but is still operating.
-
*/
-
local_irq_save(flags);/* save the current local irq state in flags, and disable local irqs */
-
/* Get this CPU's receive queue; because NAPI is not used by the driver, the softnet_data queue is used directly */
-
queue = &__get_cpu_var(softnet_data);
-
-
__get_cpu_var(netdev_rx_stat).total++;
-
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {// per the author, netdev_max_backlog is 1000
-
/* For the first packet qlen is 0, so control falls through to napi_schedule() below */
-
if (queue->input_pkt_queue.qlen) {
-
enqueue:
-
__skb_queue_tail(&queue->input_pkt_queue, skb);// append to the receive queue
-
local_irq_restore(flags);/* restore the irq state saved by local_irq_save() above (this does not leave interrupt context) */
-
return NET_RX_SUCCESS;
-
}
-
/* qlen was 0 here: per the author, napi_schedule() adds the backlog instance to poll_list and raises the softirq; the skb is then queued via the enqueue label */
-
napi_schedule(&queue->backlog);
-
goto enqueue;
-
}
-
-
__get_cpu_var(netdev_rx_stat).dropped++;
-
local_irq_restore(flags);
-
-
kfree_skb(skb);
-
return NET_RX_DROP;
- }
三:设备软中断处理
net_dev_init是一个内核通用dev模块的初始化过程。由于de4x5没有使用NAPI机制,所以很多直接沿用这个初始化中的处理。
点击(此处)折叠或打开
-
/*
 * Generic network-device layer initialization (abridged excerpt: "......"
 * marks code left out by the article).  Sets up the per-CPU receive queue
 * and backlog NAPI instance, and registers the NET_TX/NET_RX softirq
 * handlers.  Returns rc (initialized to -ENOMEM; the success assignment is
 * in the elided part).
 */
static int __init net_dev_init(void)
-
{
-
int i, rc = -ENOMEM;
-
-
BUG_ON(!dev_boot_phase);
-
- ......
-
-
/*
-
* Initialise the packet receive queues: for every possible CPU, set up the receive queue and the backlog instance.
-
*/
-
-
for_each_possible_cpu(i) {
-
struct softnet_data *queue;
-
-
queue = &per_cpu(softnet_data, i);
-
skb_queue_head_init(&queue->input_pkt_queue);
-
queue->completion_queue = NULL;
-
INIT_LIST_HEAD(&queue->poll_list);
-
-
queue->backlog.poll = process_backlog;// drivers without NAPI end up calling this poll function
-
queue->backlog.weight = weight_p;
-
queue->backlog.gro_list = NULL;
-
queue->backlog.gro_count = 0;
-
}
-
-
dev_boot_phase = 0;
-
- ......
-
-
open_softirq(NET_TX_SOFTIRQ, net_tx_action);/* register the TX softirq handler */
-
open_softirq(NET_RX_SOFTIRQ, net_rx_action);/* register the RX softirq handler */
-
- ......
-
out:
-
return rc;
- }
点击(此处)折叠或打开
-
/*
 * NET_RX softirq handler: polls the NAPI instances on this CPU's poll_list
 * until the list is empty, the budget (netdev_budget) is used up, or the
 * time limit of two jiffies has passed; in the latter two cases the softirq
 * is raised again and time_squeeze is counted.
 */
static void net_rx_action(struct softirq_action *h)
-
{
-
struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
-
unsigned long time_limit = jiffies + 2;
-
int budget = netdev_budget;
-
void *have;
-
-
local_irq_disable();
-
-
while (!list_empty(list)) { // per the author, no endless loop here: napi_complete() removes a finished instance from the list; process_backlog does this too (via __napi_complete)
-
struct napi_struct *n;
-
int work, weight;
-
-
/* If softirq window is exhausted then punt.
-
* Allow this to run for 2 jiffies since which will allow
-
* an average latency of 1.5/HZ. The limit is two jiffies so the softirq does not run for too long; the author recalls it used to be one.
-
*/
-
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))// leave when the budget is used up or the time limit has passed
-
goto softnet_break;
-
-
local_irq_enable();
-
-
/* Even though interrupts have been re-enabled, this
-
* access is safe because interrupts can only add new
-
* entries to the tail of this list, and only ->poll()
-
* calls can remove this head entry from the list.
-
*/
-
/* Fetch the NAPI instance at the head of poll_list, i.e. the one added earlier */
-
n = list_entry(list->next, struct napi_struct, poll_list);
-
-
have = netpoll_poll_lock(n);
-
-
weight = n->weight;
-
-
/* This NAPI_STATE_SCHED test is for avoiding a race
-
* with netpoll's poll_napi(). Only the entity which
-
* obtains the lock and sees NAPI_STATE_SCHED set will
-
* actually make the ->poll() call. Therefore we avoid
-
* accidentally calling ->poll() when NAPI is not scheduled.
-
*/
-
work = 0;
-
/* Per the author, this is the poll function registered when the NAPI instance was added; for de4x5 (no NAPI) it is process_backlog */
-
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-
work = n->poll(n, weight);
-
trace_napi_poll(n);
-
}
-
-
WARN_ON_ONCE(work > weight);
-
-
budget -= work;
-
-
local_irq_disable();
-
-
/* Drivers must not modify the NAPI state if they
-
* consume the entire weight. In such cases this code
-
* still "owns" the NAPI instance and therefore can
-
* move the instance around on the list at-will.
-
*/
-
/* The whole weight, i.e. the maximum amount of work per poll, was consumed */
- if (unlikely(work == weight)) {
-
/* napi_disable_pending(): judging by its name, a disable of this instance is pending (the author describes it as "state was cleared") */
-
if (unlikely(napi_disable_pending(n))) {
-
local_irq_enable();
-
napi_complete(n);
-
local_irq_disable();
-
} else
-
list_move_tail(&n->poll_list, list);// there may be more packets: move the instance to the tail of the list
-
}
-
-
netpoll_poll_unlock(have);
-
}
-
out:
-
local_irq_enable();
-
-
#ifdef CONFIG_NET_DMA
-
/*
-
* There may not be any more sk_buffs coming right now, so push
-
* any pending DMA copies to hardware
-
*/
-
dma_issue_pending_all();
-
#endif
-
-
return;
-
-
softnet_break:
-
__get_cpu_var(netdev_rx_stat).time_squeeze++;
-
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-
goto out;
- }
点击(此处)折叠或打开
-
/*
 * Poll function of the per-CPU backlog NAPI instance (the non-NAPI receive
 * path).  Dequeues skbs from this CPU's input_pkt_queue and passes each to
 * netif_receive_skb() until the queue is empty, quota packets have been
 * handled, or jiffies has changed.  Returns the number of packets handled.
 */
static int process_backlog(struct napi_struct *napi, int quota)
-
{
-
int work = 0;
-
struct softnet_data *queue = &__get_cpu_var(softnet_data);// get this CPU's queue
-
unsigned long start_time = jiffies;
-
-
napi->weight = weight_p;
-
do {
-
struct sk_buff *skb;
-
-
local_irq_disable();
-
skb = __skb_dequeue(&queue->input_pkt_queue);// take an skb off the queue for processing
-
if (!skb) {// the queue is empty: complete the napi instance, which per the author removes it from poll_list
-
__napi_complete(napi);
-
local_irq_enable();
-
break;
-
}
-
local_irq_enable();
-
-
netif_receive_skb(skb); // hand the packet to the upper layers; per the author, it is dispatched to the handler each protocol registered
-
} while (++work < quota && jiffies == start_time);
-
-
return work;
- }
后续的netif_receive_skb就不列了,到达上层的处理已经看得蛮多的了,这次主要是弄清楚底层的一个接收过程。
下面附上NAPI 和非 NAPI的一个区别图:(来源网络)
下次可以再贴代码讲一下。