linux内核之页高速缓存

1770阅读 0评论2013-06-27 double_lq
分类:LINUX

一、页高速缓存中涉及的数据结构

 1、页高速缓存的核心数据结构address_space对象,它是一个嵌入在页所有者的索引节点对象的数据结构。高速缓存中的许多页可能属于同一个所有者,从而可以链接到同一个address_space对象
    
  1. struct address_space {
  2.          struct inode *host; /* 指向拥有该对象的索引节点的指针 */
  3.          struct radix_tree_root page_tree; /* 拥有者页的基树的根*/
  4.          rwlock_t tree_lock; /* 保护基树的自旋锁 */
  5.          unsigned int i_mmap_writable;/* 地址空间共享内存映射的个数 */
  6.          struct prio_tree_root i_mmap;
  7.          struct list_head i_mmap_nonlinear;
  8.          spinlock_t i_mmap_lock;
  9.          unsigned int truncate_count;
  10.          unsigned long nrpages; /* 所有者的总页数 */
  11.          pgoff_t writeback_index;
  12.          struct address_space_operations *a_ops; /* 对页操作的方法 */
  13.          unsigned long flags;
  14.          struct backing_dev_info *backing_dev_info;
  15.          spinlock_t private_lock;
  16.          struct list_head private_list;
  17.          struct address_space *assoc_mapping;
  18. };

2、基树

   radix_tree_root数据结构:
   
  1. struct radix_tree_root {
  2.           unsigned int height;
  3.           int gfp_mask;
  4.           struct radix_tree_node *rnode;
  5. };

radix_tree_node数据结构:
 
  1. struct radix_tree_node {
  2.           unsigned int count;
  3.           void *slots[RADIX_TREE_MAP_SIZE];
  4.           unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
  5. };

内核中的基树结构:


    


   总结:address_space,inode,基树以及页描述符之间的关系
    1、每个adrres_space对象对应一颗搜索树。他们之间的联系是通过address_space对象中的page_tree字段指向该address_space对象对应的基树。
    2、一个inode节点对应一个address_space对象,其中inode节点对象的i_mapping和i_data字段指向相应的 address_space对象,而address_space对象的host字段指向对应的inode节点对象。
   3、)一般情况下一个inode节点对象对应的文件或者是块设备都会包含多个页面的内容,所以一个inode对象对应多个page描述符。同一个文件拥有的所有page描述符都可以在该文件对应的基树中找到。
 它们之间的关系如下图:
  
 
 


二、页高速缓存的处理函数
      对页高速缓存操作的基本高级函数有查找、添加和删除页。
  
  1. #ifdef __KERNEL__
  2. #define RADIX_TREE_MAP_SHIFT 6
  3. #else
  4. #define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
  5. #endif
  6. #define RADIX_TREE_TAGS 2

  7. #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
  8. #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)

  9. #define RADIX_TREE_TAG_LONGS \
  10.         ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

  2.1  查找页
      查找页主要是通过find_get_page()接受参数为指向address_space对象的指针和偏移量,具体代码如下:
     
  1. struct page * find_get_page(struct address_space *mapping, unsigned long offset)
  2. {
  3.          struct page *page;
  4.  
  5.          read_lock_irq(&mapping->tree_lock);
  6.          page = radix_tree_lookup(&mapping->page_tree, offset);
  7.          if (page)
  8.                  page_cache_get(page);   //增加该页的使用计数器
  9.          read_unlock_irq(&mapping->tree_lock);
  10.          return page;
  11. }
该函数由radix_tree_lookup()和page_cache_get()函数实现,首先看一下radix_tree_lookup()的代码:
  
  1. void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
  2.  {
  3.          unsigned int height, shift;
  4.          struct radix_tree_node **slot;
  5.  
  6.          height = root->height;
  7.          if (index > radix_tree_maxindex(height))  //索引是否大于当前深度对应的最大索引值
  8.                  return NULL;
  9.  
  10.          shift = (height-1) * RADIX_TREE_MAP_SHIFT;
  11.          slot = &root->rnode;
  12.  
  13.          while (height > 0) {
  14.                  if (*slot == NULL)
  15.                          return NULL;
  16.  
  17.                  slot = (struct radix_tree_node **)
  18.                          ((*slot)->slots +
  19.                                  ((index >> shift) & RADIX_TREE_MAP_MASK));
  20.                  shift -= RADIX_TREE_MAP_SHIFT;
  21.                  height--;
  22.          }
  23.  
  24.          return *slot;
  25.  }

该函数主要是找到基树中与index对应的页描述符,并返回指向该页描述符的地址。
 对于基树查找的说明:对于一个index比如131,其二进制表示为:000010 000011,从低位到高位以每六位为一组表示一层。最低六位表示page在最下面一层的位置,这里为000011表示最下面一层的slots下标为3表示的page。次低六位表示的是次下面一层的位置。例如图
    
  

从上图看:首先根据000010表示要找的页在根节点的下一层中的第三个位置即slots[2]表示的子树下面。因此我们顺着slots[2]的子树,再根据000011表示要找的页在此层中第四个位置即slots[3]下面。然后在顺着slots[3]就可以找到我们index=131的page了。
有了上面的基础就很容易的理解slot = (struct radix_tree_node **)((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));这条语句的意思了。首先这条语句的关键就是((index >> shift) & RADIX_TREE_MAP_MASK)寻找每层的下标。例如height为2,RADIX_TREE_MAP_SHIFT的值为6,那么一开始shift的值为6,那么(index>>6)&111111表示index右移6位,然后和111111按位与,就可以取得index的最高的六位的数字即第一层的下标了。当第二次循环的时候shift为0,此时index&111111就可以取得最底层的下标这样就可以找到相应的页了。
page_cache_get()函数增加该页的使用计数器。

 2. 2  增加页
   函数add_to_page_cache()把一个新页的描述符插入到页高速缓存。具体代码如下:

 
  1. int add_to_page_cache(struct page *page, struct address_space *mapping,
  2.                  pgoff_t offset, int gfp_mask)
  3.  {
  4.          int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
  5.  
  6.          if (error == 0) {
  7.                  write_lock_irq(&mapping->tree_lock);
  8.                  error = radix_tree_insert(&mapping->page_tree, offset, page);
  9.                  if (!error) {
  10.                          page_cache_get(page);
  11.                          SetPageLocked(page);
  12.                          page->mapping = mapping;
  13.                          page->index = offset;
  14.                          mapping->nrpages++;
  15.                          pagecache_acct(1);
  16.                  }
  17.                  write_unlock_irq(&mapping->tree_lock);
  18.                  radix_tree_preload_end();
  19.          }
  20.          return error;
  21.  }
  函数radix_tree_preload()函数禁止内核抢占,并把一些空的radix_tree_node结构赋给每cpu变量radix_tree_preloads.radix_tree_node结构的分配由slab分配器高速缓存radix_tree_node_cachep来完成。如果radix_tree_preload()预分配radix_tree_node结构不成功,函数add_to_page_cache就终止返回。否则,如果radix_tree_preload()成功地完成预分配,add_to_page_cache()函数肯定不会因为缺乏空闲内存或因为文件的大小达到了64GB而无法完成新页描述符的插入。
radix_tree_inser()在树中插入新节点,
  1. /* Make sure the tree is high enough. */
  2.          if ((!index && !root->rnode) ||
  3.                          index > radix_tree_maxindex(root->height)) {
  4.                  error = radix_tree_extend(root, index);
  5.                  if (error)
  6.                          return error;
  7.          }
上面一段代码主要是判断要插入的页的index是否小于当前页的深度对应的最大索引值。若大于,则通过radix_tree_extend()扩展基树的深度,并分配radix_tree_node,而且将该radix_tree_node和原有的基树建立连接。新分配的radix_tree_node添加在radix_tree_root之下,所以radix_tree_node之上。代码如下:
  
  1. do {
  2.                  if (!(node = radix_tree_node_alloc(root)))
  3.                          return -ENOMEM;

  4.                  /* Increase the height. */
  5.                  node->slots[0] = root->rnode;
  6.  
  7.                  /* Propagate the aggregated tag info into the new root */
  8.                  for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
  9.                          if (tags[tag])
  10.                                  tag_set(node, tag, 0);
  11.                  }
  12.  
  13.                  node->count = 1;
  14.                  root->rnode = node;
  15.                  root->height++;
  16.          } while (height > root->height);
node->slots[0] = root->rnode;则是将原来的radix_tree_node的地址放在新分配的radix_tree_node的slots[0]中。
root->rnode = node;则是root的rnode指向新分配的radix_tree_node;

 
  1.          slot = &root->rnode;
  2.          height = root->height;
  3.          shift = (height-1) * RADIX_TREE_MAP_SHIFT;
  4.  
  5.          offset = 0; /* uninitialised var warning */
  6.          while (height > 0) {
  7.                  if (*slot == NULL) {
  8.                          /* Have to add a child node. */
  9.                          if (!(tmp = radix_tree_node_alloc(root)))
  10.                                  return -ENOMEM;
  11.                          *slot = tmp;
  12.                          if (node)
  13.                                  node->count++;
  14.                  }
  15.  
  16.                  /* Go a level down */
  17.                  offset = (index >> shift) & RADIX_TREE_MAP_MASK;
  18.                  node = *slot;
  19.                  slot = (struct radix_tree_node **)(node->slots + offset);
  20.                  shift -= RADIX_TREE_MAP_SHIFT;
  21.                  height--;
  22.          }
  23.  
  24.          if (*slot != NULL)
  25.                  return -EEXIST;
  26.          if (node) {
  27.                  node->count++;
  28.                  BUG_ON(tag_get(node, 0, offset));
  29.                  BUG_ON(tag_get(node, 1, offset));
  30.          }
  31.  
  32.          *slot = item;
  33.          return 0;
以下面的图形例子来对上面的函数进行说明:
 
            

 
   slot = &root->rnode;使slot指向radix_tree_root的rnode
   height=2;
  进入while循环:(此时还只有黑色线条部分)
      *slot表示rnode中存储的地址,此时不为空,则执行下面语句段:以index=131为例,其二进制为:000010 000011
       
  1. /* Go a level down */
  2.                  offset = (index >> shift) & RADIX_TREE_MAP_MASK
  3.                  node = *slot;
  4.                  slot = (struct radix_tree_node **)(node->slots + offset);
  5.                  shift -= RADIX_TREE_MAP_SHIFT;
  6.                  height--;
此时offset的值为:000010;
node = *slot;使node指向第一个radix_tree_node
slot = (struct radix_tree_node **)(node->slots + offset);使slot指向第一个radix_tree_node的slot[2];
再一次进入while循环:(实现上图中的绿色部分)
   此时*slot==NULL,故进入下面的程序段:
    
  1. if (*slot == NULL) {
  2.                          /* Have to add a child node. */
  3.                          if (!(tmp = radix_tree_node_alloc(root)))
  4.                                  return -ENOMEM;
  5.                          *slot = tmp;
  6.                          if (node)
  7.                                  node->count++;
  8.                  }


radix_tree_node_alloc(root);分配一个radix_tree_node
*slot = tmp;将该radix_tree_node的地址放在slot中
然后执行:
  
  1. /* Go a level down */
  2.                  offset = (index >> shift) & RADIX_TREE_MAP_MASK;
  3.                  node = *slot;
  4.                  slot = (struct radix_tree_node **)(node->slots + offset);
  5.                  shift -= RADIX_TREE_MAP_SHIFT;
  6.                  height--;
最后执行:
 
  1. if (*slot != NULL)
  2.                  return -EEXIST;
  3.          if (node) {
  4.                  node->count++;
  5.                  BUG_ON(tag_get(node, 0, offset));
  6.                  BUG_ON(tag_get(node, 1, offset));
  7.          }
  8.  
  9.          *slot = item;
  10.          return 0;
此时node指向第二层那个radix_tree_node,
 *slot = item;将要添加的page加入的地址赋给*slot;

 2.3 删除页
    函数remove_from_page_cache()实现从高速缓存中删除页描述符,代码如下:
    
  1. void remove_from_page_cache(struct page *page)
  2.  {
  3.          struct address_space *mapping = page->mapping;   //获得page对应的address_space对象
  4.  
  5.          BUG_ON(!PageLocked(page));
  6.  
  7.          write_lock_irq(&mapping->tree_lock);
  8.          __remove_from_page_cache(page);
  9.          write_unlock_irq(&mapping->tree_lock);
  10.  }
 __remove_from_page_cache()代码如下:
 
  1. void __remove_from_page_cache(struct page *page)
  2.  {
  3.          struct address_space *mapping = page->mapping;
  4.  
  5.          radix_tree_delete(&mapping->page_tree, page->index);
  6.          page->mapping = NULL;
  7.          mapping->nrpages--;
  8.          pagecache_acct(-1);
  9.  }
  
  1.  page->mapping = NULL;把page的mapping字段置为NULL
  2.  mapping->nrpages--; 把所有者的页总数减1


radix_tree_delete()代码如下:
   

  1. void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
  2.  {
  3.          struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;   //path为结构体数组,将结构体数组的首元的首地址赋值给pathp
  4.          struct radix_tree_path *orig_pathp;
  5.          unsigned int height, shift;
  6.          void *ret = NULL;
  7.          char tags[RADIX_TREE_TAGS];
  8.          int nr_cleared_tags;
  9.  
  10.          height = root->height;                                               //以height=2为例
  11.          if (index > radix_tree_maxindex(height))                             //保证要删除的页索引小于当前深度所允许的最大索引值
  12.                  goto out;
  13.  
  14.          shift = (height - 1) * RADIX_TREE_MAP_SHIFT;                          //6
  15.          pathp->node = NULL;                                                   //初始条件以下图1中黑色线条表示
  16.          pathp->slot = &root->rnode;
  17.  
  18.          while (height > 0) {                                                   //height=2以下图1中蓝色线条表示
  19.                  int offset;                                                     //height=1以下图1中红色线条表示
  20.  
  21.                  if (*pathp->slot == NULL)
  22.                          goto out;
  23.  
  24.                  offset = (index >> shift) & RADIX_TREE_MAP_MASK;
  25.                  pathp[1].offset = offset;
  26.                  pathp[1].node = *pathp[0].slot;
  27.                  pathp[1].slot = (struct radix_tree_node **)
  28.                                  (pathp[1].node->slots + offset);
  29.                  pathp++;
  30.                  shift -= RADIX_TREE_MAP_SHIFT;
  31.                  height--;
  32.          }
  33.  
  34.          ret = *pathp[0].slot;
  35.          if (ret == NULL)
  36.                  goto out;
  37.  
  38.          orig_pathp = pathp;
  39.  
  40.          /*
  41.           * Clear all tags associated with the just-deleted item
  42.           */
  43.          memset(tags, 0, sizeof(tags));                                    //这里没太看懂
  44.          do {
  45.                  int tag;
  46.  
  47.                  nr_cleared_tags = RADIX_TREE_TAGS;
  48.                  for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
  49.                          int idx;
  50.  
  51.                          if (tags[tag])
  52.                                  continue;
  53.  
  54.                          tag_clear(pathp[0].node, tag, pathp[0].offset);
  55.  
  56.                          for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
  57.                                  if (pathp[0].node->tags[tag][idx]) {
  58.                                          tags[tag] = 1;
  59.                                          nr_cleared_tags--;
  60.                                          break;
  61.                                  }
  62.                          }
  63.                  }
  64.                  pathp--;
  65.          } while (pathp[0].node && nr_cleared_tags);
  66.  
  67.          pathp = orig_pathp;                                                   //从最后一个节点开始,对路径数组中的节点开始循环操作。对每个节点,把指向
  68.          *pathp[0].slot = NULL;                                                //下一个节点(或页描述符)位置数组的元素置为NULL,并递减count字段。如果
  69.          while (pathp[0].node && --pathp[0].node->count == 0) {                //count变为0,就从树中删除节点并把radix_tree_node结构释放给slab分配器
  70.                  pathp--;                                                      //高速缓存。然后继续循环处理路径数组中的节点
  71.                  BUG_ON(*pathp[0].slot == NULL);
  72.                  *pathp[0].slot = NULL;
  73.                  radix_tree_node_free(pathp[1].node);
  74.          }
  75.          if (root->rnode == NULL)
  76.                  root->height = 0;
  77.  out:
  78.          return ret;                                                             //返回已经从树中删除的页描述符指针
  79.  }

    


  至此,完成了页的删除。








上一篇:linux内核文件系统的注册与安装
下一篇:linux内核之把块放在页高速缓存中