文件系统同步机制分析-xiong9937-ChinaUnix博客

在linux中同步的概念和换出有不同：

同步就是将物理内存中dirty的页写入到磁盘中，保证磁盘和物理页之间的内容一致。

而换出是指从物理内存中换出数据，将物理页释放，用于优先级更高的工作。

触发同步操作的时机：

1、周期性内核线程，扫描脏页，根据一定的规则选择脏页，将页写回到磁盘。

2、如果内核中的脏页过多，会触发同步

3、内核中其它组件触发同步操作

在逐个介绍上面三个机制之前，我们首先要介绍一下pdflush机制

pdflush是一种内核线程，专门处理与页面刷出相关的事务。它只是页同步机制所使用的一种内核线程机制，这种机制能够动态调整后台运行的pdflush线程个数。

/*
 * Main loop of one pdflush worker thread.
 *
 * The thread parks its control structure (my_work) on the global
 * pdflush_list and sleeps.  Whoever wants work done is expected to take the
 * structure off the list, fill in my_work->fn / my_work->arg0 and wake the
 * thread.  After running the callback the thread decides whether another
 * worker should be started or whether this one should exit, and otherwise
 * goes back to sleep.
 *
 * @my_work: per-thread control structure; it is (re)initialised here.
 *
 * Returns 0 once the thread has decided to exit.
 */
static int __pdflush(struct pdflush_work *my_work)
{
    current->flags |= PF_FLUSHER | PF_SWAPWRITE;
    set_freezable();
    my_work->fn = NULL;
    my_work->who = current;
    INIT_LIST_HEAD(&my_work->list);

    spin_lock_irq(&pdflush_lock);
    /*
     * Account for this thread.  The checks further down keep the number of
     * workers between MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS (2 and 8
     * according to the original author's note; the definitions are not
     * visible here).
     */
    nr_pdflush_threads++;
    for ( ; ; )
    {
        struct pdflush_work *pdf;
        /*
         * Put ourselves on pdflush_list and sleep until woken.
         * pdflush_lock is held on entry to every iteration and is dropped
         * just before schedule().
         */
        set_current_state(TASK_INTERRUPTIBLE);
        list_move(&my_work->list, &pdflush_list);
        my_work->when_i_went_to_sleep = jiffies;
        spin_unlock_irq(&pdflush_lock);
        schedule();
        try_to_freeze();
        spin_lock_irq(&pdflush_lock);
        if (!list_empty(&my_work->list))
        {
            /*
             * Someone woke us up, but without removing our control
             * structure from the global list.  swsusp will do this
             * in try_to_freeze()->refrigerator().  Handle it.
             */
            my_work->fn = NULL;
            continue;    /* lock is still held, as the loop top expects */
        }
        if (my_work->fn == NULL)
        {
            /* Taken off the list but given nothing to do. */
            printk("pdflush: bogus wakeup\n");
            continue;
        }
        spin_unlock_irq(&pdflush_lock);
        /* Run the requested callback (the actual flushing work), unlocked. */
        (*my_work->fn)(my_work->arg0);

        /*
         * Thread creation: For how long have there been zero
         * available threads?
         */
        /*
         * If more than one second has passed since last_empty_jifs and the
         * idle list is still empty, start one more worker, provided the
         * maximum has not been reached.
         */
        if (time_after(jiffies, last_empty_jifs + 1 * HZ))
        {
            /* unlocked list_empty() test is OK here */
            if (list_empty(&pdflush_list))
            {
                /* unlocked test is OK here */
                if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
                    start_one_pdflush_thread();
            }
        }

        spin_lock_irq(&pdflush_lock);
        my_work->fn = NULL;

        /*
         * Thread destruction: For how long has the sleepiest
         * thread slept?
         */
        /*
         * Do not exit if no worker is idle (pdflush_list is empty) or if
         * the thread count is already at or below MIN_PDFLUSH_THREADS.
         */
        if (list_empty(&pdflush_list))
            continue;
        if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
            continue;
        /* The entry at the tail of pdflush_list (list head's ->prev). */
        pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
        /*
         * If that worker went to sleep more than one second ago, this
         * thread leaves the loop and exits.
         */
        if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ))
        {
            /* Limit exit rate */
            pdf->when_i_went_to_sleep = jiffies;
            break;                    /* exeunt */
        }
    }
    /* Still holding pdflush_lock here: drop our count, then unlock. */
    nr_pdflush_threads--;
    spin_unlock_irq(&pdflush_lock);
    return 0;
}

内核中循环刷新内存中脏页的过程。

1、系统初始化的时候会启动wb_timer定时器，这个定时器的处理函数wb_timer_fn会试图唤醒一个pdflush内核线程来执行wb_kupdate函数。如果唤醒失败，就延迟一秒后再去尝试。

2、wb_kupdate函数在执行完具体的刷出任务后，又会重新设置wb_timer定时器，这样就在系统里形成了一种循环定时回写脏页的机制。

wb_kupdate函数就是内核中完成循环定时同步脏页工作的函数。这么重要的任务就是由wb_kupdate这个函数来担当的，下面我们就以这个函数作为切入点来做具体的分析。

/*
 * Periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 *
 * @arg: not used by this function.
 */
static void wb_kupdate(unsigned long arg)
{
    unsigned long oldest_jif;
    unsigned long start_jif;
    unsigned long next_jif;
    long nr_to_write;
    /*
     * Non-blocking, non-integrity (WB_SYNC_NONE) writeback across all
     * backing devices (.bdi == NULL), restricted to data older than
     * oldest_jif, which is filled in below before wbc is used.
     */
    struct writeback_control wbc =
    {
        .bdi        = NULL,
        .sync_mode    = WB_SYNC_NONE,
        .older_than_this = &oldest_jif,
        .nr_to_write    = 0,
        .nonblocking    = 1,
        .for_kupdate    = 1,
        .range_cyclic    = 1,
    };

    sync_supers();

    oldest_jif = jiffies - dirty_expire_interval;
    start_jif = jiffies;
    next_jif = start_jif + dirty_writeback_interval;
    /*
     * Work budget for this run: the NR_FILE_DIRTY and NR_UNSTABLE_NFS page
     * counters plus the number of in-use inodes (nr_inodes - nr_unused).
     */
    nr_to_write = global_page_state(NR_FILE_DIRTY) +
                  global_page_state(NR_UNSTABLE_NFS) +
                  (inodes_stat.nr_inodes - inodes_stat.nr_unused);
    while (nr_to_write > 0)
    {
        /* Reset the per-pass feedback flags and hand out a fresh quota. */
        wbc.more_io = 0;
        wbc.encountered_congestion = 0;
        wbc.nr_to_write = MAX_WRITEBACK_PAGES;
        /* Write back dirty pages, consuming quota from wbc.nr_to_write. */
        writeback_inodes(&wbc);
        if (wbc.nr_to_write > 0)
        {
            /*
             * Part of the quota was left unused.  Either the pass ran into
             * congestion or reported that more I/O is pending, in which
             * case wait briefly (up to HZ/10) and try again, or there is
             * simply no old data left to write and we are done.
             */
            if (wbc.encountered_congestion || wbc.more_io)
                congestion_wait(WRITE, HZ / 10);
            else
                break;    /* All the old data is written */
        }
        /* Charge the pages consumed in this pass against the budget. */
        nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
    }
    /*
     * Schedule the next run: never sooner than one second from now, and
     * only re-arm the timer if dirty_writeback_interval is non-zero.
     */
    if (time_before(next_jif, jiffies + HZ))
        next_jif = jiffies + HZ;
    if (dirty_writeback_interval)
        mod_timer(&wb_timer, next_jif);
}

文件同步过程函数调用

1、内核定时调用wb_kupdate函数，进行内核同步文件的工作，这个调用的周期默认可能长达30秒之久。

2、在wb_kupdate函数中，还调用了sync_supers函数，这个函数负责同步超级块的数据。

3、之后调用writeback_inodes函数扫描所有的超级块，如果超级块中有脏的inode，就同步这个超级块中的脏页。

4、对于每个超级块调用sync_sb_inodes函数，对于一个超级块中的所有脏inode，逐一刷出同步这些inode中的脏页。这个过程是扫描一个super block中的每一个inode，如果这个inode是脏的inode，调用__writeback_single_inode函数将这个inode中的脏页刷出。

5、最后，对于一个inode中的脏页，调用do_writepages将这些脏页逐一写入到磁盘上面。

writeback_inodes(struct writeback_control *wbc)函数实际上就是遍历super_blocks链表上的所有super_block，然后将每个super_block上的脏页逐一写回磁盘。

/*
 * Walk the global super_blocks list from tail to head and, for every
 * superblock that has dirty inodes, call sync_sb_inodes() on it.  The walk
 * stops as soon as the quota in wbc->nr_to_write is used up.
 *
 * @wbc: writeback parameters; passed through to sync_sb_inodes().
 *
 * May sleep (see might_sleep()).  sb_lock protects the list walk and is
 * dropped while an individual superblock is being processed.
 */
void
writeback_inodes(struct writeback_control *wbc)
{
    struct super_block *sb;

    might_sleep();
    spin_lock(&sb_lock);
restart:
    list_for_each_entry_reverse(sb, &super_blocks, s_list)
    {
        if (sb_has_dirty_inodes(sb))
        {
            /* we're making our own get_super here */
            /* Bump s_count before sb_lock is dropped below. */
            sb->s_count++;
            spin_unlock(&sb_lock);
            /*
             * If we can't get the readlock, there's no sense in
             * waiting around, most of the time the FS is going to
             * be unmounted by the time it is released.
             */
            if (down_read_trylock(&sb->s_umount))
            {
                /* Only sync superblocks that have a root dentry set. */
                if (sb->s_root)
                    sync_sb_inodes(sb, wbc);
                up_read(&sb->s_umount);
            }
            spin_lock(&sb_lock);
            /*
             * Drop the count taken above; if the helper reports that a
             * restart is needed, begin the list walk again from the start.
             */
            if (__put_super_and_need_restart(sb))
                goto restart;
        }
        /* Quota exhausted: stop scanning further superblocks. */
        if (wbc->nr_to_write <= 0)
            break;
    }
    spin_unlock(&sb_lock);
}

generic_sync_sb_inodes函数是在一个super block上面扫描所有的变脏的inode，并且对每一个脏的inode做刷出的操作，具体分析如下。

/*
 * Write back the dirty inodes of one superblock.
 *
 * Inodes are taken one at a time from the tail of sb->s_io and written with
 * __writeback_single_inode() until the list is empty, the quota in
 * wbc->nr_to_write runs out, or one of the early-exit conditions below is
 * hit.  For WB_SYNC_ALL the function afterwards waits for writeback on every
 * inode of the superblock that has cached pages.
 *
 * @sb:  superblock whose inodes are to be written back.
 * @wbc: writeback parameters; nr_to_write, pages_skipped, more_io and
 *       encountered_congestion are read and/or updated here.
 *
 * Takes inode_lock itself and releases it before returning.
 */
void generic_sync_sb_inodes(struct super_block *sb,
                            struct writeback_control *wbc)
{
    const unsigned long start = jiffies;    /* livelock avoidance */
    int sync = wbc->sync_mode == WB_SYNC_ALL;

    spin_lock(&inode_lock);
    /*
     * Refill s_io unless this is periodic (kupdate) writeback and s_io
     * still holds inodes from an earlier pass; in that case the leftover
     * inodes are finished first.  Per the original author's note,
     * queue_io() moves inodes from s_dirty and s_more_io onto s_io; its
     * body is not visible here.
     */
    if (!wbc->for_kupdate || list_empty(&sb->s_io))
        queue_io(sb, wbc->older_than_this);

    while (!list_empty(&sb->s_io))
    {
        /* Always take the inode at the tail of s_io. */
        struct inode *inode = list_entry(sb->s_io.prev,
                                         struct inode, i_list);
        struct address_space *mapping = inode->i_mapping;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        long pages_skipped;
        /*
         * The backing device does not do writeback of dirty pages (the
         * original note cites BDI_CAP_NO_WRITEBACK / ramdisk-like
         * devices).  Put the inode back on the dirty list and skip it,
         * or skip the whole superblock, as explained below.
         */
        if (!bdi_cap_writeback_dirty(bdi))
        {
            redirty_tail(inode);
            if (sb_is_blkdev_sb(sb))
            {
                /*
                 * Dirty memory-backed blockdev: the ramdisk
                 * driver does this.  Skip just this inode
                 */
                continue;
            }
            /*
             * Dirty memory-backed inode against a filesystem other
             * than the kernel-internal bdev filesystem.  Skip the
             * entire superblock.
             */
            break;
        }
        /*
         * Inode is still marked I_NEW: requeue it for a later pass
         * (onto s_more_io, per the original author's note) and move on.
         */
        if (inode->i_state & I_NEW)
        {
            requeue_io(inode);
            continue;
        }
        /*
         * Non-blocking writeback hit a congested device.  Record that in
         * wbc, then:
         *  1. for a regular filesystem, give up on this superblock;
         *  2. for the blockdev superblock, requeue this inode and carry
         *     on with the next one.
         */
        if (wbc->nonblocking && bdi_write_congested(bdi))
        {
            wbc->encountered_congestion = 1;
            if (!sb_is_blkdev_sb(sb))
                break;        /* Skip a congested fs */
            requeue_io(inode);
            continue;        /* Skip a congested blockdev */
        }
        /*
         * The caller asked for one specific backing device (wbc->bdi) and
         * this inode belongs to a different one:
         *  1. for a regular filesystem, give up on this superblock;
         *  2. for the blockdev superblock, requeue this inode and carry
         *     on with the next one.
         */
        if (wbc->bdi && bdi != wbc->bdi)
        {
            if (!sb_is_blkdev_sb(sb))
                break;        /* fs has the wrong queue */
            requeue_io(inode);
            continue;        /* blockdev has wrong queue */
        }

        /* Was this inode dirtied after sync_sb_inodes was called? */
        /*
         * The inode was dirtied after this call began ('start').  Stop
         * here instead of chasing newly dirtied inodes; this is the
         * livelock avoidance 'start' is recorded for.
         */
        if (time_after(inode->dirtied_when, start))
            break;

        /* Is another pdflush already flushing this queue? */
        if (current_is_pdflush() && !writeback_acquire(bdi))
            break;

        BUG_ON(inode->i_state & I_FREEING);
        /* Hold a reference across the write; dropped by iput() below. */
        __iget(inode);
        /* Snapshot so we can tell whether this inode had pages skipped. */
        pages_skipped = wbc->pages_skipped;
        __writeback_single_inode(inode, wbc);
        if (current_is_pdflush())
            writeback_release(bdi);
        if (wbc->pages_skipped != pages_skipped)
        {
            /*
             * writeback is not making progress due to locked
             * buffers.  Skip this inode for now.
             */
            redirty_tail(inode);
        }
        /* iput() and cond_resched() are called with inode_lock dropped. */
        spin_unlock(&inode_lock);
        iput(inode);
        cond_resched();
        spin_lock(&inode_lock);
        /* Quota used up: tell the caller there is more to do and stop. */
        if (wbc->nr_to_write <= 0)
        {
            wbc->more_io = 1;
            break;
        }
        /* Requeued inodes are still pending: report that as well. */
        if (!list_empty(&sb->s_more_io))
            wbc->more_io = 1;
    }

    if (sync)
    {
        struct inode *inode, *old_inode = NULL;

        /*
         * Data integrity sync. Must wait for all pages under writeback,
         * because there may have been pages dirtied before our sync
         * call, but which had writeout started before we write it out.
         * In which case, the inode may not be on the dirty list, but
         * we still have to wait for that writeout.
         */
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
        {
            struct address_space *mapping;

            /* Skip inodes being freed or not yet fully set up. */
            if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))
                continue;
            mapping = inode->i_mapping;
            /* No cached pages, so nothing to wait for. */
            if (mapping->nrpages == 0)
                continue;
            __iget(inode);
            spin_unlock(&inode_lock);
            /*
             * We hold a reference to 'inode' so it couldn't have
             * been removed from s_inodes list while we dropped the
             * inode_lock.  We cannot iput the inode now as we can
             * be holding the last reference and we cannot iput it
             * under inode_lock. So we keep the reference and iput
             * it later.
             */
            iput(old_inode);
            old_inode = inode;

            filemap_fdatawait(mapping);

            cond_resched();

            spin_lock(&inode_lock);
        }
        spin_unlock(&inode_lock);
        /* Drop the reference kept on the last inode visited. */
        iput(old_inode);
    }
    else
        spin_unlock(&inode_lock);

    return;        /* Leave any unwritten inodes on s_io */
}