在linux中同步的概念和换出有不同:
同步就是将物理内存中dirty的页写入到磁盘中,保证磁盘和物理页之间的内容一致。
而换出是指从物理内存中换出数据,将物理页释放,用于优先级更高的工作。
触发同步操作的时机:
1、周期性内核线程,扫描脏页,根据一定的规则选择脏页,将页写回到磁盘。
2、如果内核中的脏页过多,会触发同步
3、内核中其它组件触发同步操作
在逐个介绍上面三个机制之前,我们首先要介绍一下pdflush机制
pdflush是一种内核线程,专门处理页面刷出相关的一些事务。pdflush只是页同步机制中内核使用的一种内核线程机制,这种机制能动态调整后台运行的pdflush线程个数。
|
/*
 * Worker body shared by every pdflush thread.  The thread parks itself on
 * pdflush_list and sleeps until it is dispatched; when woken with a
 * callback installed in my_work->fn it runs that writeback job, then
 * decides whether the thread pool should grow (spawn another thread) or
 * shrink (this thread exits) before going back to sleep.
 */
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER | PF_SWAPWRITE;
set_freezable();
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
spin_lock_irq(&pdflush_lock);
/* The system always keeps between 2 and 8 pdflush threads: the pool
 never grows beyond that limit, and never shrinks below it */
nr_pdflush_threads++;
for ( ; ; )
{
struct pdflush_work *pdf;
/* Park this thread on pdflush_list and sleep until woken */
set_current_state(TASK_INTERRUPTIBLE);
list_move(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
spin_unlock_irq(&pdflush_lock);
schedule();
try_to_freeze();
spin_lock_irq(&pdflush_lock);
if (!list_empty(&my_work->list))
{
/*
 * Someone woke us up, but without removing our control
 * structure from the global list. swsusp will do this
 * in try_to_freeze()->refrigerator(). Handle it.
 */
my_work->fn = NULL;
continue;
}
if (my_work->fn == NULL)
{
printk("pdflush: bogus wakeup\n");
continue;
}
spin_unlock_irq(&pdflush_lock);
/* Run the callback, i.e. perform the actual flush-out job */
(*my_work->fn)(my_work->arg0);
/*
 * Thread creation: For how long have there been zero
 * available threads?
 */
/* If there has been no idle pdflush thread available for more
 than one second, create another one */
if (time_after(jiffies, last_empty_jifs + 1 * HZ))
{
/* unlocked list_empty() test is OK here */
if (list_empty(&pdflush_list))
{
/* unlocked test is OK here */
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
start_one_pdflush_thread();
}
}
spin_lock_irq(&pdflush_lock);
my_work->fn = NULL;
/*
 * Thread destruction: For how long has the sleepiest
 * thread slept?
 */
/* If pdflush_list is empty, do not destroy this pdflush thread;
 likewise keep it when the thread count is already at the
 minimum of MIN_PDFLUSH_THREADS */
if (list_empty(&pdflush_list))
continue;
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
/* The sleepiest thread has been idle for over a second: exit */
if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ))
{
/* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */
}
}
nr_pdflush_threads--;
spin_unlock_irq(&pdflush_lock);
return 0;
}
|
内核中循环刷新内存中脏页的过程。
1、系统初始化的时候会启动wb_timer定时器,这个定时器函数wb_timer_fn会试图唤醒一个pdflush内核线程来执行wb_kupdate处理函数。如果失败了,再延迟一秒继续尝试唤醒。
2、wb_kupdate函数在执行完了具体的换出任务后,又会重新设置wb_timer定时器,这样就会在系统里形成一种循环定时换出脏页的机制。
wb_kupdate函数就是内核中完成循环定时同步脏页工作的函数。这么重要的任务就是由wb_kupdate这个函数来担当的,下面我们就以这个函数作为切入点来做具体的分析。
|
/*
* Periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static void wb_kupdate(unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
/* Non-blocking, best-effort writeback of "old" dirty data only */
struct writeback_control wbc =
{
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
.range_cyclic = 1,
};
sync_supers();
oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
next_jif = start_jif + dirty_writeback_interval;
/* First estimate how many dirty pages in the system need to be
 written back to disk */
nr_to_write = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0)
{
wbc.more_io = 0;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
/* Write these dirty pages back */
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0)
{
/* Part of this pass's write quota is left over.  Two
 possibilities: either the queue was congested (or more
 I/O is still pending), in which case wait a while on
 the congested queue before retrying; or all the old
 dirty pages really have been written to disk already,
 in which case we are done */
if (wbc.encountered_congestion || wbc.more_io)
congestion_wait(WRITE, HZ / 10);
else
break; /* All the old data is written */
}
/* Account for the pages actually written this pass; if the
 whole quota was consumed, plenty of dirty pages remain, so
 keep looping and writing */
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
/* Arm the timer for the next periodic run of this flush function */
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
|
文件同步过程函数调用
1、内核定时调用wb_kupdate函数,进行内核同步文件的工作,这个调用的周期默认可能长达30秒之久。
2、在wb_kupdate函数中,还调用了sync_supers函数,这个函数负责同步超级块的数据。
3、之后调用writeback_inodes函数扫描所有的超级块,如果超级块中有脏页,就同步这个超级块中的脏页。
4、对于每个超级块调用sync_sb_inodes函数,对于一个超级块中的所有脏inode,逐一刷出同步这些inode中的脏页。这个过程是扫描一个super block中的每一个inode,如果这个inode是脏的inode,调用__writeback_single_inode函数将这个inode中的脏页刷出。
5、最后,对于一个inode中的脏页,调用do_writepages将这些脏页逐一写入到磁盘上面。
writeback_inodes(struct writeback_control *wbc)函数实际上就是遍历所有的super_blocks链表上的所有的super_block,然后将每个super_block上的脏页逐一换出。
|
/*
 * Walk the global super_blocks list (newest first) and write back dirty
 * inodes from every superblock that has any, stopping once the write
 * quota in wbc->nr_to_write is exhausted.
 */
void
writeback_inodes(struct writeback_control *wbc)
{
struct super_block *sb;
might_sleep();
spin_lock(&sb_lock);
restart:
list_for_each_entry_reverse(sb, &super_blocks, s_list)
{
if (sb_has_dirty_inodes(sb))
{
/* we're making our own get_super here */
sb->s_count++;
spin_unlock(&sb_lock);
/*
 * If we can't get the readlock, there's no sense in
 * waiting around, most of the time the FS is going to
 * be unmounted by the time it is released.
 */
if (down_read_trylock(&sb->s_umount))
{
if (sb->s_root)
sync_sb_inodes(sb, wbc);
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
/* Dropping the reference may invalidate our list position;
 if so, restart the scan from the list head */
if (__put_super_and_need_restart(sb))
goto restart;
}
if (wbc->nr_to_write <= 0)
break;
}
spin_unlock(&sb_lock);
}
|
generic_sync_sb_inodes函数是在一个super block上面扫描所有的变脏的inode,并且对每一个脏的inode做刷出的操作,具体分析如下。
|
/*
 * Scan one superblock for dirty inodes and write each one out, honouring
 * the limits and flags in *wbc.  When wbc->sync_mode is WB_SYNC_ALL, a
 * second pass additionally waits for writeback to complete on every
 * inode with pages, providing data-integrity sync semantics.
 */
void generic_sync_sb_inodes(struct super_block *sb,
struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */
int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
/*
 * If this is not the periodic (kupdate) flush, splice the inodes on
 * s_dirty and s_more_io onto s_io, so the loop below processes them.
 * For the periodic flush, if s_io is non-empty we first finish
 * flushing the inodes already on s_io, and only once it is empty do
 * we refill s_io from s_dirty and s_more_io.
 */
if (!wbc->for_kupdate || list_empty(&sb->s_io))
queue_io(sb, wbc->older_than_this);
while (!list_empty(&sb->s_io))
{
struct inode *inode = list_entry(sb->s_io.prev,
struct inode, i_list);
struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
/* The inode's backing device has the BDI_CAP_NO_WRITEBACK
 capability (e.g. a ram disk), so there is nothing to write
 back: skip just this inode for blockdev inodes, or the whole
 superblock otherwise
 */
if (!bdi_cap_writeback_dirty(bdi))
{
redirty_tail(inode);
if (sb_is_blkdev_sb(sb))
{
/*
 * Dirty memory-backed blockdev: the ramdisk
 * driver does this. Skip just this inode
 */
continue;
}
/*
 * Dirty memory-backed inode against a filesystem other
 * than the kernel-internal bdev filesystem. Skip the
 * entire superblock.
 */
break;
}
/* The inode carries I_NEW: requeue it on s_more_io and deal
 with it on a later pass */
if (inode->i_state & I_NEW)
{
requeue_io(inode);
continue;
}
/* The backing device is congested; two cases:
 1. a regular filesystem inode: skip this whole superblock;
 2. a blockdev inode: requeue it on s_more_io and continue
 with the next inode */
if (wbc->nonblocking && bdi_write_congested(bdi))
{
wbc->encountered_congestion = 1;
if (!sb_is_blkdev_sb(sb))
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
/*
 * Writeback was requested for a specific queue (wbc->bdi) and
 * this inode's queue does not match; again two cases:
 * 1. a regular filesystem inode: skip this whole superblock;
 * 2. a blockdev inode: requeue it on s_more_io and continue
 * with the next inode
 */
if (wbc->bdi && bdi != wbc->bdi)
{
if (!sb_is_blkdev_sb(sb))
break; /* fs has the wrong queue */
requeue_io(inode);
continue; /* blockdev has wrong queue */
}
/* Was this inode dirtied after sync_sb_inodes was called? */
/*
 * The inode was dirtied after this scan started; stop here so
 * we do not livelock chasing freshly dirtied inodes (a later
 * pass will pick them up)
 */
if (time_after(inode->dirtied_when, start))
break;
/* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
pages_skipped = wbc->pages_skipped;
__writeback_single_inode(inode, wbc);
if (current_is_pdflush())
writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped)
{
/*
 * writeback is not making progress due to locked
 * buffers. Skip this inode for now.
 */
redirty_tail(inode);
}
spin_unlock(&inode_lock);
iput(inode);
cond_resched();
spin_lock(&inode_lock);
if (wbc->nr_to_write <= 0)
{
wbc->more_io = 1;
break;
}
if (!list_empty(&sb->s_more_io))
wbc->more_io = 1;
}
if (sync)
{
struct inode *inode, *old_inode = NULL;
/*
 * Data integrity sync. Must wait for all pages under writeback,
 * because there may have been pages dirtied before our sync
 * call, but which had writeout started before we write it out.
 * In which case, the inode may not be on the dirty list, but
 * we still have to wait for that writeout.
 */
list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
{
struct address_space *mapping;
if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))
continue;
mapping = inode->i_mapping;
if (mapping->nrpages == 0)
continue;
__iget(inode);
spin_unlock(&inode_lock);
/*
 * We hold a reference to 'inode' so it couldn't have
 * been removed from s_inodes list while we dropped the
 * inode_lock. We cannot iput the inode now as we can
 * be holding the last reference and we cannot iput it
 * under inode_lock. So we keep the reference and iput
 * it later.
 */
iput(old_inode);
old_inode = inode;
filemap_fdatawait(mapping);
cond_resched();
spin_lock(&inode_lock);
}
spin_unlock(&inode_lock);
iput(old_inode);
}
else
spin_unlock(&inode_lock);
return; /* Leave any unwritten inodes on s_io */
}
|