linux内核之把块放在页高速缓存中

2050阅读 0评论2013-06-27 double_lq
分类:LINUX

1.VFS(映射层)和各种文件系统以叫做“块”的逻辑单位组织磁盘数据。
2.从2.4.10的稳定版本开始,不再单独分配块缓冲区,而是将它们放在叫做“缓冲区页”的专门页中,而缓冲区页保存在页高速缓存中。
3.每个块缓冲区都有buffer_head类型的缓冲区首部描述符,该首部包含内核处理该块所需的全部信息,所以在对每个块进行操作之前,内核都要先检查它的缓冲区首部。
4.只要内核必须单独访问一个块,就要涉及存放块缓冲区的缓冲区页,并检查相应的缓冲区首部。
5.在一个缓冲区页内的所有块缓冲区大小必须相同。一个缓冲区页可以包括1~8个缓冲区。

缓冲区首部的字段如下:

 
  /* Buffer head: the descriptor attached to every block buffer stored in a buffer page. */
  1. struct buffer_head {
  2.           /* First cache line: */
  3.           unsigned long b_state; /* buffer state flags */
  4.           struct buffer_head *b_this_page;/* next element in the buffer page's list of buffer heads */
  5.           struct page *b_page; /* descriptor of the buffer page that holds this block */
  6.           atomic_t b_count; /* block usage counter */
  7.           u32 b_size; /* block size */
  8.   
  9.           sector_t b_blocknr; /* block number relative to the block device */
  10.           char *b_data; /* position of the block inside the buffer page */
  11.   
  12.           struct block_device *b_bdev; /* pointer to the block device descriptor */
  13.           bh_end_io_t *b_end_io;
  14.           void *b_private;
  15.           struct list_head b_assoc_buffers;
  16.   };
6.如果一个页作为缓冲区页使用,那么与它的块缓冲区相关的所有缓冲区首部都被收集在一个单向循环链表中。缓冲区页描述符的private字段指向页中第一个块的缓冲区首部;每个缓冲区首部在b_this_page字段中存放指向链表中下一个缓冲区首部的指针。此外,每个缓冲区首部还把缓冲区页描述符的地址存放在b_page字段中。下图显示了一个缓冲区页,其中包含四个块缓冲区和对应的缓冲区首部。
    
   



1、分配块设备缓冲区页
    当内核发现指定块的缓冲区所在的页不在页高速缓存中时,就分配一个新的块设备缓冲区页。
   内核调用函数grow_buffers()把块设备缓冲区页添加到页高速缓存中,函数代码如下:

   
  /*
   * Add the block device buffer page that contains the requested block to
   * the page cache. Returns 1 on success, 0 if grow_dev_page() returned NULL.
   */
  1. static inline int
  2. grow_buffers(struct block_device *bdev, sector_t block, int size)
  3. {
  4.         struct page *page;
  5.         pgoff_t index;
  6.         int sizebits;

  7.         sizebits = -1;
  8.         do {
  9.                 sizebits++;
  10.         } while ((size << sizebits) < PAGE_SIZE);   /* smallest shift with (size << sizebits) >= PAGE_SIZE */

  11.         index = block >> sizebits;         // page index, within the block device, of the page holding the requested block
  12.         block = index << sizebits;         /* round block down to the first block of that page */

  13.         /* Create a page with the proper size buffers.. */
  14.         page = grow_dev_page(bdev, block, index, size);       // create the new block device buffer page
  15.         if (!page)
  16.                 return 0;
  17.         unlock_page(page);         /* grow_dev_page() hands the page back locked */
  18.         page_cache_release(page);
  19.         return 1;
  20. }
  grow_dev_page()代码如下:
   
  /*
   * Find or create the page at `index` in the block device's page cache and
   * make sure it carries buffer heads for blocks of `size` bytes, initialised
   * starting at block number `block`. Returns the page, or NULL if
   * find_or_create_page() fails.
   */
  1. static struct page *
  2. grow_dev_page(struct block_device *bdev, sector_t block,
  3.                 pgoff_t index, int size)
  4. {
  5.         struct inode *inode = bdev->bd_inode;              // inode associated with this block device
  6.         struct page *page;
  7.         struct buffer_head *bh;

  8.         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);   // from here on the page is in the page cache
  9.         if (!page)
  10.                 return NULL;

  11.         if (!PageLocked(page))
  12.                 BUG();

  13.         if (page_has_buffers(page)) {                // tests PG_private; if it is clear the page is not yet a buffer page (it has no buffer heads)
  14.                 bh = page_buffers(page);             // first buffer head, taken from the page descriptor
  15.                 if (bh->b_size == size) {            // does the block size match the requested one? (all blocks in a buffer page share one size)
  16.                         init_page_buffers(page, bdev, block, size);
  17.                         return page;
  18.                 }
  19.                 if (!try_to_free_buffers(page))      /* wrong block size: the old buffer heads must go first */
  20.                         goto failed;
  21.         }

  22.         /*
  23.          * Allocate some buffers for this page
  24.         */
  25.         bh = alloc_page_buffers(page, size, 0);    // allocate the buffer heads for this page
  26.         if (!bh)
  27.                 goto failed;

  28.         /*
  29.          * Link the page to the buffers and initialise them. Take the
  30.          * lock to be atomic wrt __find_get_block(), which does not
  31.          * run under the page lock.
  32.          */
  33.         spin_lock(&inode->i_mapping->private_lock);
  34.         link_dev_buffers(page, bh);    // close the buffer head list into a circular list (red lines in the figure) and set page->private = head
  35.         init_page_buffers(page, bdev, block, size);
  36.         spin_unlock(&inode->i_mapping->private_lock);
  37.         return page;

  /* NOTE(review): BUG() runs before the cleanup, so the lines after it only matter if BUG() returns -- confirm. */
  38. failed:
  39.         BUG();
  40.         unlock_page(page);
  41.         page_cache_release(page);
  42.         return NULL;
  43. }

find_or_create_page()在mapping->page_tree对应的基树中查找对应索引的页,若没有,则增加一个页。返回指向该页的描述符。
  
  /*
   * Return the locked page at `index` in `mapping`, allocating a new page and
   * adding it to the page cache if none is found. Returns NULL if the
   * allocation fails or if adding the page fails with an error other than
   * -EEXIST.
   */
  1. struct page *find_or_create_page(struct address_space *mapping,
  2.                  unsigned long index, unsigned int gfp_mask)
  3.  {
  4.          struct page *page, *cached_page = NULL;
  5.          int err;
  6.  repeat:
  7.          page = find_lock_page(mapping, index);// looks the index up via radix_tree_lookup(); returns the page if found, NULL otherwise
  8.          if (!page) {                          // no page at this index yet
  9.                  if (!cached_page) {           /* allocate once; the page is reused if we loop back to repeat */
  10.                          cached_page = alloc_page(gfp_mask);
  11.                          if (!cached_page)
  12.                                  return NULL;
  13.                  }
  14.                  err = add_to_page_cache_lru(cached_page, mapping,
  15.                                          index, gfp_mask);
  16.                  if (!err) {
  17.                          page = cached_page;
  18.                          cached_page = NULL;   /* now owned by the page cache; must not be released below */
  19.                  } else if (err == -EEXIST)    /* presumably another path inserted a page at this index meanwhile; look it up again */
  20.                          goto repeat;
  21.          }
  22.          if (cached_page)                      /* the spare page was not used */
  23.                  page_cache_release(cached_page);
  24.          return page;
  25.  }

add_to_page_cache_lru()函数代码如下:
 
  /*
   * Add `page` to the page cache via add_to_page_cache() and, only if that
   * succeeded, also pass it to lru_cache_add(). Returns add_to_page_cache()'s
   * result unchanged.
   */
  1. int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
  2.                                  pgoff_t offset, int gfp_mask)
  3.  {
  4.          int ret = add_to_page_cache(page, mapping, offset, gfp_mask); // implementation covered in the previous post; it inserts the page into the page cache
  5.          if (ret == 0)
  6.                  lru_cache_add(page);
  7.          return ret;
  8.  }
init_page_buffers()函数初始化连接到页的缓冲区首部的字段b_bdev、b_blocknr和b_state。因为页中的所有块在磁盘上都是相邻的,所以它们的逻辑块号是连续的,很容易从参数block依次递增得出。

  
  /*
   * Walk the circular list of buffer heads attached to `page` and initialise
   * every buffer head that is not yet mapped: set its device and block number
   * and mark it mapped. `block` is the block number of the first buffer; it is
   * incremented by one for each following buffer head.
   */
  1. static void
  2. init_page_buffers(struct page *page, struct block_device *bdev,
  3.                         sector_t block, int size)
  4. {
  5.         struct buffer_head *head = page_buffers(page);
  6.         struct buffer_head *bh = head;
  7.         int uptodate = PageUptodate(page);

  8.         do {
  9.                 if (!buffer_mapped(bh)) {       /* buffers that are already mapped are left untouched */
  10.                         init_buffer(bh, NULL, NULL);
  11.                         bh->b_bdev = bdev;
  12.                         bh->b_blocknr = block;
  13.                         if (uptodate)           /* propagate the page's up-to-date state to the buffer */
  14.                                 set_buffer_uptodate(bh);
  15.                         set_buffer_mapped(bh);
  16.                 }
  17.                 block++;                        /* advanced for every buffer, mapped or not */
  18.                 bh = bh->b_this_page;
  19.         } while (bh != head);                   /* circular list: stop once back at the first buffer head */
  20. }

  bh = alloc_page_buffers(page, size, 0); 具体代码如下:

   
  /*
   * Allocate one buffer head for each `size`-byte block that fits in `page`
   * and chain them through b_this_page. Offsets are handed out from the end
   * of the page downwards, so the returned head is the buffer at the lowest
   * offset and the chain ends in NULL (it is not yet circular).
   * On allocation failure everything allocated so far is freed; the function
   * then returns NULL if `retry` is zero, otherwise it calls
   * free_more_memory() and starts over.
   */
  1. struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
  2.                 int retry)
  3. {
  4.         struct buffer_head *bh, *head;
  5.         long offset;

  6. try_again:
  7.         head = NULL;
  8.         offset = PAGE_SIZE;
  9.         while ((offset -= size) >= 0) {
  10.                 bh = alloc_buffer_head(GFP_NOFS);    // allocate a buffer head
  11.                 if (!bh)
  12.                         goto no_grow;

  13.                 bh->b_bdev = NULL;
  14.                 bh->b_this_page = head;              /* push onto the front of the chain built so far */
  15.                 bh->b_blocknr = -1;
  16.                 head = bh;

  17.                 bh->b_state = 0;
  18.                 atomic_set(&bh->b_count, 0);
  19.                 bh->b_size = size;

  20.                 /* Link the buffer to its page */
  21.                 set_bh_page(bh, page, offset);    // blue lines in the figure: ties the buffer head to the page descriptor and to its buffer

  22.                 bh->b_end_io = NULL;
  23.         }
  24.         return head;
  25. /*
  26.  * In case anything failed, we just free everything we got.
  27.  */
  28. no_grow:
  29.         if (head) {
  30.                 do {
  31.                         bh = head;
  32.                         head = head->b_this_page;
  33.                         free_buffer_head(bh);
  34.                 } while (head);
  35.         }

  36.         /*
  37.          * Return failure for non-async IO requests. Async IO requests
  38.          * are not allowed to fail, so we have to wait until buffer heads
  39.          * become available. But we don't want tasks sleeping with
  40.          * partially complete buffers, so all were released above.
  41.          */
  42.         if (!retry)
  43.                 return NULL;

  44.          /* We're _really_ low on memory. Now we just
  45.          * wait for old buffer heads to become free due to
  46.          * finishing IO. Since this is an async request and
  47.          * the reserve list is empty, we're sure there are
  48.          * async buffer heads in use.
  49.          */
  50.         free_more_memory();
  51.         goto try_again;
  52. }
    
  /*
   * Attach buffer head `bh` to `page` and record where its data lives.
   * `offset` is the byte offset of the buffer inside the page and must be
   * smaller than PAGE_SIZE (BUG() otherwise).
   */
  1. void set_bh_page(struct buffer_head *bh,
  2.                 struct page *page, unsigned long offset)
  3. {
  4.         bh->b_page = page;       // tie the buffer head to the page descriptor
  5.         if (offset >= PAGE_SIZE)
  6.                 BUG();
  7.         if (PageHighMem(page))     // tie the buffer head to its buffer; for a highmem page only the offset is stored
  8.                 /*
  9.                  * This catches illegal uses and preserves the offset:
  10.                  */
  11.                 bh->b_data = (char *)(0 + offset);
  12.         else
  13.                 bh->b_data = page_address(page) + offset;
  14. }

函数grow_buffers()执行完成后,结构之间的关系如图:

  
 
  到此,分配块设备缓冲区页已完成。

2、释放块设备缓冲区页
     当内核试图获得更多的空闲内存时,就释放块设备缓冲区页。显然,不可能释放有脏缓冲区或上锁的缓冲区的页。内核调用函数try_to_release_page()释放缓冲区页,该函数接受页描述符的地址page,具体代码如下:
      
   
  /*
   * Try to release the buffers of a locked buffer page. Returns 0 if the page
   * is under writeback; otherwise returns the result of the mapping's
   * releasepage method if one is defined, or of try_to_free_buffers().
   */
  1. int try_to_release_page(struct page *page, int gfp_mask)
  2. {
  3.         struct address_space * const mapping = page->mapping;

  4.         BUG_ON(!PageLocked(page));
  5.         if (PageWriteback(page))     // the page is being written back to disk, so it cannot be released
  6.                 return 0;
  7.         
  8.         if (mapping && mapping->a_ops->releasepage)    // call the block device's releasepage method if one is defined (usually it is not)
  9.                 return mapping->a_ops->releasepage(page, gfp_mask);
  10.         return try_to_free_buffers(page);
  11. }

 调用函数try_to_free_buffers()
   
  /*
   * Try to detach and free all buffer heads of a locked page. Returns 0 if
   * the page is under writeback; otherwise returns the result of
   * drop_buffers() (non-zero when the buffers were dropped).
   */
  1. int try_to_free_buffers(struct page *page)
  2. {
  3.         struct address_space * const mapping = page->mapping;
  4.         struct buffer_head *buffers_to_free = NULL;
  5.         int ret = 0;

  6.         BUG_ON(!PageLocked(page));
  7.         if (PageWriteback(page))
  8.                 return 0;

  9.         if (mapping == NULL) { /* can this still happen? */
  10.                 ret = drop_buffers(page, &buffers_to_free);   /* no mapping, hence no private_lock to take */
  11.                 goto out;
  12.         }

  13.         spin_lock(&mapping->private_lock);
  14.         ret = drop_buffers(page, &buffers_to_free);
  15.         if (ret) {
  16.                 /*
  17.                  * If the filesystem writes its buffers by hand (eg ext3)
  18.                  * then we can have clean buffers against a dirty page. We
  19.                  * clean the page here; otherwise later reattachment of buffers
  20.                  * could encounter a non-uptodate page, which is unresolvable.
  21.                  * This only applies in the rare case where try_to_free_buffers
  22.                  * succeeds but the page is not freed.
  23.                  */
  24.                 clear_page_dirty(page);
  25.         }
  26.         spin_unlock(&mapping->private_lock);
  27. out:
  28.         if (buffers_to_free) {                 /* the buffer heads are freed only after the lock has been dropped */
  29.                 struct buffer_head *bh = buffers_to_free;

  30.                 do {
  31.                         struct buffer_head *next = bh->b_this_page;   /* read the link before bh is freed */
  32.                         free_buffer_head(bh);
  33.                         bh = next;
  34.                 } while (bh != buffers_to_free);   /* circular list: stop once back at the first buffer head */
  35.         }
  36.         return ret;
  37. }
 执行操作如下:
 1. 检查页中所有缓冲区的缓冲区首部的标志。如果有些缓冲区首部的BH_Dirty或BH_Locked标志被置位,说明函数不可能释放这些缓冲区,所以函数终止并返回0(失败)。
 2. 如果缓冲区首部在间接缓冲区的链表中,该函数就从链表中删除它。
 3. 清除页描述符的PG_private标记,把private字段设置为NULL,并递减页的使用计数器。
 4. 清除页的PG_dirty标记。
5. 反复调用free_buffer_head(),以释放页的所有缓冲区首部。
6. 返回1(成功)。

 


上一篇:linux内核之页高速缓存
下一篇:C 语言中的运算符优先级