2.从2.4.10的稳定版本开始,不再单独分配块缓冲区,而是将它们放在叫做“缓冲区页”的专门页中,而缓冲区页保存在页高速缓存中。
3.每个块缓冲区都有buffer_head类型的缓冲区首部描述符,该首部包含了内核处理这个块所需的全部信息,所以在对每个块进行操作之前,内核都要先检查它的缓冲区首部。
4.只要内核必须单独访问一个块,就要涉及存放块缓冲区的缓冲区页,并检查相应缓冲区的缓冲区首部。
5.在一个缓冲区页内的所有块缓冲区大小必须相同。一个缓冲区页可以包括1~8个缓冲区。
缓冲区首部的字段如下:
-
struct buffer_head {
-
/* First cache line: */
-
unsigned long b_state; /* 缓冲区状态标志 */
-
struct buffer_head *b_this_page;/* 指向缓冲区页的链表中的下一个元素 */
-
struct page *b_page; /* 指向拥有该缓冲区页的描述符的指针 */
-
atomic_t b_count; /* 块使用计数器 */
-
u32 b_size; /* 块大小 */
-
-
sector_t b_blocknr; /* 与块设备相关的块号 */
-
char *b_data; /* 块在缓冲区页内的位置 */
-
-
struct block_device *b_bdev; /* 指向块设备描述符的指针*/
-
bh_end_io_t *b_end_io;
-
void *b_private;
-
struct list_head b_assoc_buffers;
- };
1、分配块设备缓冲区页
当内核发现指定块的缓冲区所在的页不在页高速缓存中时,就分配一个新的块设备缓冲区页。
内核调用函数grow_buffers()把块设备缓冲区页添加到页高速缓存中,函数代码如下:
-
static inline int
-
grow_buffers(struct block_device *bdev, sector_t block, int size)
-
{
-
struct page *page;
-
pgoff_t index;
-
int sizebits;
-
-
sizebits = -1;
-
do {
-
sizebits++;
-
} while ((size << sizebits) < PAGE_SIZE);
-
-
index = block >> sizebits; //计算出数据页在所请求块的块设备中的偏移量index
-
block = index << sizebits;
-
-
/* Create a page with the proper size buffers.. */
-
page = grow_dev_page(bdev, block, index, size); //创建新的块设备缓冲区页
-
if (!page)
-
return 0;
-
unlock_page(page);
-
page_cache_release(page);
-
return 1;
- }
-
static struct page *
-
grow_dev_page(struct block_device *bdev, sector_t block,
-
pgoff_t index, int size)
-
{
-
struct inode *inode = bdev->bd_inode; //与该块设备对应的索引节点
-
struct page *page;
-
struct buffer_head *bh;
-
-
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); //此时页已经在页高速缓存中了
-
if (!page)
-
return NULL;
-
-
if (!PageLocked(page))
-
BUG();
-
- if (page_has_buffers(page)) { //检查其PG_private标志,若为空,则页还不是一个缓冲区页,因为没有相关的缓冲区首部
-
bh = page_buffers(page); //从页描述符中取得第一个缓冲区首部的地址
-
if (bh->b_size == size) { //检验块大小是否等于所请求的块大小(缓冲区页中所有块大小一样)
-
init_page_buffers(page, bdev, block, size);
-
return page;
-
}
-
if (!try_to_free_buffers(page))
-
goto failed;
-
}
-
-
/*
-
* Allocate some buffers for this page
-
*/
-
bh = alloc_page_buffers(page, size, 0); //为该页分配缓冲区首部
-
if (!bh)
-
goto failed;
-
-
/*
-
* Link the page to the buffers and initialise them. Take the
-
* lock to be atomic wrt __find_get_block(), which does not
-
* run under the page lock.
-
*/
-
spin_lock(&inode->i_mapping->private_lock);
-
link_dev_buffers(page, bh); //将缓冲区的头尾相连形成循环链表,如图中红色线条所示,并将page->private=head
-
init_page_buffers(page, bdev, block, size);
-
spin_unlock(&inode->i_mapping->private_lock);
-
return page;
-
-
failed:
-
BUG();
-
unlock_page(page);
-
page_cache_release(page);
-
return NULL;
- }
find_or_create_page()在mapping->page_tree对应的基树中查找对应索引的页,若没有找到,则分配一个新页并将其加入页高速缓存。函数返回指向该页描述符的指针;若分配新页失败,则返回NULL。
-
struct page *find_or_create_page(struct address_space *mapping,
-
unsigned long index, unsigned int gfp_mask)
-
{
-
struct page *page, *cached_page = NULL;
-
int err;
-
repeat:
-
page = find_lock_page(mapping, index);//通过radix_tree_lookup()函数查找对应索引的页,若找到,则返回page,没找到,返回NULL
-
if (!page) { //在page==NULL的情况下
-
if (!cached_page) {
-
cached_page = alloc_page(gfp_mask);
-
if (!cached_page)
-
return NULL;
-
}
-
err = add_to_page_cache_lru(cached_page, mapping,
-
index, gfp_mask);
-
if (!err) {
-
page = cached_page;
-
cached_page = NULL;
-
} else if (err == -EEXIST)
-
goto repeat;
-
}
-
if (cached_page)
-
page_cache_release(cached_page);
-
return page;
- }
add_to_page_cache_lru()函数代码如下:
-
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-
pgoff_t offset, int gfp_mask)
-
{
-
int ret = add_to_page_cache(page, mapping, offset, gfp_mask); //该函数的具体实现在前一篇博文中有,主要是增加页
-
if (ret == 0)
-
lru_cache_add(page);
-
return ret;
- }
-
static void
-
init_page_buffers(struct page *page, struct block_device *bdev,
-
sector_t block, int size)
-
{
-
struct buffer_head *head = page_buffers(page);
-
struct buffer_head *bh = head;
-
int uptodate = PageUptodate(page);
-
-
do {
-
if (!buffer_mapped(bh)) {
-
init_buffer(bh, NULL, NULL);
-
bh->b_bdev = bdev;
-
bh->b_blocknr = block;
-
if (uptodate)
-
set_buffer_uptodate(bh);
-
set_buffer_mapped(bh);
-
}
-
block++;
-
bh = bh->b_this_page;
-
} while (bh != head);
- }
bh = alloc_page_buffers(page, size, 0); 具体代码如下:
-
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-
int retry)
-
{
-
struct buffer_head *bh, *head;
-
long offset;
-
-
try_again:
-
head = NULL;
-
offset = PAGE_SIZE;
-
while ((offset -= size) >= 0) {
-
bh = alloc_buffer_head(GFP_NOFS); //分配缓冲区首部
-
if (!bh)
-
goto no_grow;
-
-
bh->b_bdev = NULL;
-
bh->b_this_page = head;
-
bh->b_blocknr = -1;
-
head = bh;
-
-
bh->b_state = 0;
-
atomic_set(&bh->b_count, 0);
-
bh->b_size = size;
-
-
/* Link the buffer to its page */
-
set_bh_page(bh, page, offset); //该函数实现的功能在图中以蓝色线条表示 将缓冲区首部和页描述符连接起来,和缓冲区连接起来
-
-
bh->b_end_io = NULL;
-
}
-
return head;
-
/*
-
* In case anything failed, we just free everything we got.
-
*/
-
no_grow:
-
if (head) {
-
do {
-
bh = head;
-
head = head->b_this_page;
-
free_buffer_head(bh);
-
} while (head);
-
}
-
-
/*
-
* Return failure for non-async IO requests. Async IO requests
-
* are not allowed to fail, so we have to wait until buffer heads
-
* become available. But we don't want tasks sleeping with
-
* partially complete buffers, so all were released above.
-
*/
-
if (!retry)
-
return NULL;
-
-
/* We're _really_ low on memory. Now we just
-
* wait for old buffer heads to become free due to
-
* finishing IO. Since this is an async request and
-
* the reserve list is empty, we're sure there are
-
* async buffer heads in use.
-
*/
-
free_more_memory();
-
goto try_again;
- }
-
void set_bh_page(struct buffer_head *bh,
-
struct page *page, unsigned long offset)
-
{
-
bh->b_page = page; //将缓冲区首部和页描述符连接起来
-
if (offset >= PAGE_SIZE)
-
BUG();
-
if (PageHighMem(page)) //将缓冲区首部和对应的缓冲区连接起来
-
/*
-
* This catches illegal uses and preserves the offset:
-
*/
-
bh->b_data = (char *)(0 + offset);
-
else
-
bh->b_data = page_address(page) + offset;
- }
函数grow_buffers()执行完成后,结构之间的关系如图:
到此,分配块设备缓冲区页已完成。
2、释放块设备缓冲区页
当内核试图获得更多的空闲内存时,就释放块设备缓冲区页。显然,不可能释放有脏缓冲区或上锁的缓冲区的页。内核调用函数try_to_release_page()释放缓冲区页,该函数接受页描述符的地址page,具体代码如下:
-
int try_to_release_page(struct page *page, int gfp_mask)
-
{
-
struct address_space * const mapping = page->mapping;
-
-
BUG_ON(!PageLocked(page));
-
if (PageWriteback(page)) //正在把页写回磁盘,所以不可能释放该页
-
return 0;
-
-
if (mapping && mapping->a_ops->releasepage) //如果定义了块设备的releasepage方法,就调用它,通常没有定义
-
return mapping->a_ops->releasepage(page, gfp_mask);
-
return try_to_free_buffers(page);
- }
调用函数try_to_free_buffers()
-
int try_to_free_buffers(struct page *page)
-
{
-
struct address_space * const mapping = page->mapping;
-
struct buffer_head *buffers_to_free = NULL;
-
int ret = 0;
-
-
BUG_ON(!PageLocked(page));
-
if (PageWriteback(page))
-
return 0;
-
-
if (mapping == NULL) { /* can this still happen? */
-
ret = drop_buffers(page, &buffers_to_free);
-
goto out;
-
}
-
-
spin_lock(&mapping->private_lock);
-
ret = drop_buffers(page, &buffers_to_free);
-
if (ret) {
-
/*
-
* If the filesystem writes its buffers by hand (eg ext3)
-
* then we can have clean buffers against a dirty page. We
-
* clean the page here; otherwise later reattachment of buffers
-
* could encounter a non-uptodate page, which is unresolvable.
-
* This only applies in the rare case where try_to_free_buffers
-
* succeeds but the page is not freed.
-
*/
-
clear_page_dirty(page);
-
}
-
spin_unlock(&mapping->private_lock);
-
out:
-
if (buffers_to_free) {
-
struct buffer_head *bh = buffers_to_free;
-
-
do {
-
struct buffer_head *next = bh->b_this_page;
-
free_buffer_head(bh);
-
bh = next;
-
} while (bh != buffers_to_free);
-
}
-
return ret;
- }
1. 检查页中所有缓冲区的缓冲区首部的标志。如果有些缓冲区首部的BH_Dirty或BH_Locked标志被置位,说明函数不可能释放这些缓冲区,所以函数终止并返回0(失败)。
2. 如果缓冲区首部在间接缓冲区的链表中,该函数就从链表中删除它。
3. 清除页描述符的PG_private标记,把private字段设置为NULL,并递减页的使用计数器。
4. 清除页的PG_dirty标记。
5. 反复调用free_buffer_head(),以释放页的所有缓冲区首部。
6. 返回1(成功)。