文件写入过程分析-alloysystem-ChinaUnix博客

Linux系统中将数据写入到一个文件是一个很复杂的过程。如果我们只对这个过程做一个理论和概要的理解，可以总结为下面处理过程。

1、首先，用户态程序需要将数据所在地址、写入文件位置、写入长度传递给内核空间。

2、然后，内核需要检查用户态程序传入的参数：一方面要检查用户态空间中存储数据的那段地址是否可读，另一方面还需要检查写入文件的位置和写入长度是否超过了文件的限制。

3、通过检查后，进入一个循环。这个循环的总体流程是从写入文件位置开始，逐page进行操作：首先根据写入位置计算对应page的索引，然后找到或者分配一个page，再找到这个page中各逻辑块对应的物理块，完成该page中需要写入的块的物理磁盘映射。

4、将数据拷贝到这个page中

5、将这个page中需要写入的块标记为dirty。

6、修改写入文件的位置，开始操作需要写入的下一个page，过程还是如上面所述。

下面将linux系统写入文件相关的系统调用总结如下，比较关键的函数做了逐一分析。

/*
 * generic_perform_write - copy the data described by an iov_iter into the
 * page cache of a file, one page-sized chunk at a time.
 *
 * @file: file being written; file->f_mapping supplies the address_space
 *        whose ->write_begin()/->write_end() operations are used below.
 * @i:    iterator over the source data; it is advanced by the number of
 *        bytes accepted on every pass.
 * @pos:  byte position in the file at which writing starts.
 *
 * Each pass of the loop:
 *   1. derives the offset inside the current page from pos and limits the
 *      chunk to the remainder of that page (or of the iterator);
 *   2. faults in the source bytes, then calls ->write_begin() to obtain the
 *      page that will receive them;
 *   3. copies the data into that page with page faults disabled;
 *   4. calls ->write_end(), whose non-negative return value is the number
 *      of bytes actually accepted;
 *   5. advances pos and the running total, then calls
 *      balance_dirty_pages_ratelimited() for the mapping.
 *
 * Returns the number of bytes written if any were written; otherwise the
 * last status (0, or a negative errno such as -EFAULT).
 */
static ssize_t generic_perform_write(struct file *file,
                                     struct iov_iter *i, loff_t pos)
{
    struct address_space *mapping = file->f_mapping;
    const struct address_space_operations *a_ops = mapping->a_ops;
    long status = 0;
    ssize_t written = 0;
    unsigned int flags = 0;

    /*
     * Copies from kernel address space cannot fail (NFSD is a big user).
     */
    if (segment_eq(get_fs(), KERNEL_DS))
        flags |= AOP_FLAG_UNINTERRUPTIBLE;

    do
    {
        struct page *page;
        unsigned long offset;    /* Offset into pagecache page */
        unsigned long bytes;    /* Bytes to write to page */
        size_t copied;        /* Bytes copied from user */
        void *fsdata;

        /*
         * Step 1: locate the write inside the current page.  The page
         * itself is looked up by ->write_begin() from pos, so no
         * separate page index is needed here.
         */
        offset = (pos & (PAGE_CACHE_SIZE - 1));
        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                      iov_iter_count(i));

again:

        /*
         * Bring in the user page that we will copy from _first_.
         * Otherwise there's a nasty deadlock on copying from the
         * same page as we're writing to, without it being marked
         * up-to-date.
         *
         * Not only is this an optimisation, but it is also required
         * to check that the address is actually valid, when atomic
         * usercopies are used, below.
         */
        if (unlikely(iov_iter_fault_in_readable(i, bytes)))
        {
            status = -EFAULT;
            break;
        }

        /*
         * Step 2: ask the filesystem to prepare the page that covers
         * [pos, pos + bytes); it is handed back through &page.
         */
        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                    &page, &fsdata);
        if (unlikely(status))
            break;

        /*
         * Step 3: copy the source data into the page.  Page faults are
         * disabled around the copy, so it may copy fewer bytes than
         * requested (possibly zero).
         */
        pagefault_disable();
        copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
        pagefault_enable();
        flush_dcache_page(page);

        /*
         * Step 4: hand the copied range back to the filesystem.
         * NOTE(review): this is presumably where the written data gets
         * marked dirty -- confirm against the ->write_end() in use.
         */
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                  page, fsdata);
        if (unlikely(status < 0))
            break;
        copied = status;

        cond_resched();

        iov_iter_advance(i, copied);
        if (unlikely(copied == 0))
        {
            /*
             * If we were unable to copy any data at all, we must
             * fall back to a single segment length write.
             *
             * If we didn't fallback here, we could livelock
             * because not all segments in the iov can be copied at
             * once without a pagefault.
             */
            bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                          iov_iter_single_seg_count(i));
            goto again;
        }

        /*
         * Step 5: move the file position forward and account for the
         * bytes written, then continue with the next page.
         */
        pos += copied;
        written += copied;

        /*
         * Give the dirty-page balancing code a chance to run after each
         * chunk.  NOTE(review): whether and when data is flushed to
         * disk is decided inside the helper, not here.
         */
        balance_dirty_pages_ratelimited(mapping);

    }
    while (iov_iter_count(i));

    return written ? written : status;
}

/*
 * block_write_begin - get a page ready to receive a buffered write.
 *
 * @file:      not used by this function itself.
 * @mapping:   address_space the page belongs to; mapping->host is the inode.
 * @pos:       file position of the write.
 * @len:       number of bytes to be written, starting at @pos.
 * @flags:     forwarded to grab_cache_page_write_begin().
 * @pagep:     in/out.  If *pagep is NULL a page is looked up or created
 *             here and stored in *pagep.  If *pagep is non-NULL it must
 *             already be locked and is used as is.
 * @fsdata:    not used by this function itself.
 * @get_block: filesystem callback forwarded to __block_prepare_write().
 *
 * Returns 0 on success, -ENOMEM if no page could be obtained, or the error
 * returned by __block_prepare_write().
 *
 * On a prepare failure the page's uptodate flag is cleared.  A page that
 * was obtained here is additionally unlocked and released, *pagep is reset
 * to NULL, and anything instantiated beyond i_size is trimmed off again.
 * A caller-supplied page is neither unlocked nor released.
 */
int block_write_begin(struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned len, unsigned flags,
                      struct page **pagep, void **fsdata,
                      get_block_t *get_block)
{
    struct inode *host = mapping->host;
    pgoff_t pg_index = pos >> PAGE_CACHE_SHIFT;
    unsigned from = pos & (PAGE_CACHE_SIZE - 1);
    unsigned to = from + len;
    struct page *pg = *pagep;
    int grabbed_here = 0;
    int err;

    if (pg != NULL) {
        /* Caller-provided page: it has to be locked already. */
        BUG_ON(!PageLocked(pg));
    } else {
        /* Find the page in the page cache, or create it. */
        pg = grab_cache_page_write_begin(mapping, pg_index, flags);
        if (!pg)
            return -ENOMEM;
        grabbed_here = 1;
        *pagep = pg;
    }

    /* Set up the page's buffers for the byte range [from, to). */
    err = __block_prepare_write(host, pg, from, to, get_block);
    if (!unlikely(err))
        return err;

    ClearPageUptodate(pg);

    /* A page handed in by the caller stays locked and referenced. */
    if (!grabbed_here)
        return err;

    unlock_page(pg);
    page_cache_release(pg);
    *pagep = NULL;

    /*
     * prepare_write() may have instantiated a few blocks outside
     * i_size.  Trim these off again.  Don't need i_size_read because
     * we hold i_mutex.
     */
    if (pos + len > host->i_size)
        vmtruncate(host, host->i_size);

    return err;
}

/*
 * __block_prepare_write - prepare the buffers of a locked page for a write
 * to the byte range [from, to) inside that page.
 *
 * @inode:     inode owning the page; i_blkbits gives the block size.
 * @page:      the page to prepare; must be locked (checked with BUG_ON).
 * @from:      first byte of the write within the page.
 * @to:        one past the last byte of the write within the page.
 * @get_block: filesystem callback used to map a buffer that is not yet
 *             mapped.
 *
 * Returns 0 on success, the error returned by get_block(), or -EIO when a
 * read issued here did not leave its buffer uptodate.  On any error,
 * page_zero_new_buffers() is called for the range before returning.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
                                 unsigned from, unsigned to, get_block_t *get_block)
{
    unsigned block_start, block_end;
    sector_t block;
    int err = 0;
    unsigned blocksize, bbits;
    /*
     * wait[] records the buffers for which a read was issued below.  Only
     * a buffer that overlaps the range without being fully covered can
     * qualify, i.e. at most the one containing 'from' and the one
     * containing 'to'.
     */
    struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;

    BUG_ON(!PageLocked(page));
    BUG_ON(from > PAGE_CACHE_SIZE);
    BUG_ON(to > PAGE_CACHE_SIZE);
    BUG_ON(from > to);

    blocksize = 1 << inode->i_blkbits;
    /* Attach empty buffer heads to the page if it has none yet. */
    if (!page_has_buffers(page))
        create_empty_buffers(page, blocksize, 0);
    head = page_buffers(page);

    bbits = inode->i_blkbits;
    /* Number (in blocksize units) of the first block covered by this page. */
    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

    /*
     * Walk every buffer of the page.  The buffers are linked in a ring
     * through b_this_page; the '!block_start' term lets the first pass run
     * even though bh == head at that point.
     */
    for(bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page)
    {
        block_end = block_start + blocksize;
        /*
         * Buffer lies completely outside [from, to): nothing to prepare.
         * Just propagate the page's uptodate state to the buffer.
         */
        if (block_end <= from || block_start >= to)
        {
            if (PageUptodate(page))
            {
                if (!buffer_uptodate(bh))
                    set_buffer_uptodate(bh);
            }
            continue;
        }
        /* Drop a 'new' flag that may be left on the buffer. */
        if (buffer_new(bh))
            clear_buffer_new(bh);
        /*
         * Buffer has no mapping yet: ask the filesystem for one.
         * NOTE(review): the final argument 1 presumably requests block
         * allocation -- confirm against the get_block_t contract.
         */
        if (!buffer_mapped(bh))
        {
            WARN_ON(bh->b_size != blocksize);
            err = get_block(inode, block, bh, 1);
            if (err)
                break;
            if (buffer_new(bh))
            {
                /*
                 * get_block() flagged the buffer as new.
                 * NOTE(review): unmap_underlying_metadata() presumably
                 * discards stale aliases of this block number on the
                 * block device -- confirm in its definition.
                 */
                unmap_underlying_metadata(bh->b_bdev,
                                          bh->b_blocknr);
                /*
                 * Page already uptodate: the buffer can be marked
                 * uptodate and dirty right away.
                 */
                if (PageUptodate(page))
                {
                    clear_buffer_new(bh);
                    set_buffer_uptodate(bh);
                    mark_buffer_dirty(bh);
                    continue;
                }
                /*
                 * Write covers only part of the new block: zero the
                 * parts of it that lie outside [from, to).
                 */
                if (block_end > to || block_start < from)
                    zero_user_segments(page,
                                       to, block_end,
                                       block_start, from);
                continue;
            }
        }
        /*
         * Page is uptodate, so this buffer can be marked uptodate as
         * well; no read is needed.
         */
        if (PageUptodate(page))
        {
            if (!buffer_uptodate(bh))
                set_buffer_uptodate(bh);
            continue;
        }
        /*
         * Buffer is not uptodate, is neither delayed nor unwritten, and
         * the write covers only part of it: issue a read for the block
         * and remember the buffer so the read can be waited for below.
         */
        if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                !buffer_unwritten(bh) &&
                (block_start < from || block_end > to))
        {
            ll_rw_block(READ, 1, &bh);
            *wait_bh++ = bh;
        }
    }
    /*
     * If we issued read requests - let them complete.  A buffer that is
     * still not uptodate afterwards turns the result into -EIO.
     */
    while(wait_bh > wait)
    {
        wait_on_buffer(*--wait_bh);
        if (!buffer_uptodate(*wait_bh))
            err = -EIO;
    }
    /* On failure, clean up new buffers in the range via the helper. */
    if (unlikely(err))
        page_zero_new_buffers(page, from, to);
    return err;
}

/*
 * __block_commit_write - finish a write to the byte range [from, to) of a
 * page by updating the state of the page's buffers.
 *
 * @inode: inode owning the page; i_blkbits gives the block size.
 * @page:  page whose buffers are walked.
 * @from:  first byte of the written range within the page.
 * @to:    one past the last byte of the written range within the page.
 *
 * Every buffer overlapping the range is marked uptodate and dirty.  Buffers
 * outside the range are left alone, but it is noted if any of them is not
 * uptodate.  The 'new' flag is cleared on all buffers.  Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
                                unsigned from, unsigned to)
{
    unsigned bsize = 1 << inode->i_blkbits;
    struct buffer_head *first = page_buffers(page);
    struct buffer_head *cur = first;
    unsigned lo = 0;
    int has_stale = 0;

    /* The page's buffers form a ring through b_this_page; go round once. */
    do {
        unsigned hi = lo + bsize;

        if (hi > from && lo < to) {
            /* Buffer was (at least partly) written: uptodate and dirty. */
            set_buffer_uptodate(cur);
            mark_buffer_dirty(cur);
        } else if (!buffer_uptodate(cur)) {
            /* Untouched buffer that is not uptodate. */
            has_stale = 1;
        }
        clear_buffer_new(cur);

        lo = hi;
        cur = cur->b_this_page;
    } while (cur != first);

    /*
     * If this (potentially partial) write happened to leave every buffer
     * uptodate, mark the whole page uptodate so that the next read() does
     * not need a bogus readpage().
     */
    if (!has_stale)
        SetPageUptodate(page);

    return 0;
}