Preface
The previous articles covered reading the paper and investigating the mkfs.f2fs on-disk initialization. This one looks at the F2FS runtime from the kernel's point of view: file creation, reads and writes, GC, and crash recovery. As noted before, the source used here is an early commit; it is certainly less stable than today's code, but it makes the features described in the paper easier to follow.
The topics covered here are fairly broad and my own knowledge is limited, so corrections are welcome.
Initialization
The previous article analyzed how mkfs.f2fs formats the device. On the kernel side, mount() drives the initialization from the VFS down into the concrete filesystem, which essentially means parsing the on-disk data structures. The brief description of the key metadata structures from f2fs.txt follows:
File System Metadata Structure
------------------------------
F2FS adopts the checkpointing scheme to maintain file system consistency. At
mount time, F2FS first tries to find the last valid checkpoint data by scanning
CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
One of them always indicates the last valid data, which is called as shadow copy
mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
For file system consistency, each CP points to which NAT and SIT copies are
valid, as shown as below.
+--------+----------+---------+
| CP | SIT | NAT |
+--------+----------+---------+
. . . .
. . . .
. . . .
+-------+-------+--------+--------+--------+--------+
| CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
+-------+-------+--------+--------+--------+--------+
| ^ ^
| | |
`----------------------------------------'
Callback registration
// mount操作的注册回调为f2fs_mount()
static struct file_system_type f2fs_fs_type = {
.owner = THIS_MODULE,
.name = "f2fs",
.mount = f2fs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
// 具体文件系统操作为f2fs_fill_super()
return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
}
These registrations hook F2FS into the VFS; every filesystem does essentially the same. The basic flow of mount() at the VFS level can be found here.
Main flow
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
int i;
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
/* set a block size */
if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) {
f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
goto free_sbi;
}
/* read f2fs raw super block */
// 通过sb_bread()直接读取block #0来获取外存super block信息
// 其数据保存在raw_super_buf->b_data
// F2FS_SUPER_OFFSET = 1024
raw_super_buf = sb_bread(sb, 0);
if (!raw_super_buf) {
err = -EIO;
f2fs_msg(sb, KERN_ERR, "unable to read superblock");
goto free_sbi;
}
raw_super = (struct f2fs_super_block *)
((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
/* init some FS parameters */
// main area的log数目(6)
sbi->active_logs = NR_CURSEG_TYPE;
// 用于确认是否允许gc_thread_func()
set_opt(sbi, BG_GC);
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
if (parse_options(sb, sbi, (char *)data))
goto free_sb_buf;
/* sanity checking of raw super */
if (sanity_check_raw_super(sb, raw_super)) {
f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem");
goto free_sb_buf;
}
// 填充VFS层struct super_block
sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
// sops初始化
sb->s_op = &f2fs_sops;
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
sb->s_fs_info = sbi;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
// 填充具体文件系统的super block info
sbi->sb = sb;
sbi->raw_super = raw_super;
sbi->raw_super_buf = raw_super_buf;
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->write_inode);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
for (i = 0; i < NR_LOCK_TYPE; i++)
mutex_init(&sbi->fs_lock[i]);
sbi->por_doing = 0;
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->bio_sem);
// 见下方调用
init_sb_info(sbi);
/* get an inode for meta space */
// meta inode的初始化,实际执行3步:
// 0. inode = iget_locked(ino);
// 1. inode->i_mapping->a_ops = &f2fs_meta_aops;
// 2. mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
goto free_sb_buf;
}
// 见后续recovery章节
err = get_valid_checkpoint(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
goto free_meta_inode;
}
/* sanity checking of checkpoint */
err = -EINVAL;
if (sanity_check_ckpt(raw_super, sbi->ckpt)) {
f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
goto free_cp;
}
// 从check point获取信息填充到super block
// 可以参考上文mkfs.f2fs填入的信息
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
sbi->total_valid_inode_count =
le32_to_cpu(sbi->ckpt->valid_inode_count);
sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
sbi->total_valid_block_count =
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
sbi->alloc_valid_block_count = 0;
INIT_LIST_HEAD(&sbi->dir_inode_list);
spin_lock_init(&sbi->dir_inode_lock);
// 初始化空的sbi->orphan_inode_list
init_orphan_info(sbi);
/* setup f2fs internal modules */
// 关联sbi->sm_info并初始化SIT
err = build_segment_manager(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR,
"Failed to initialize F2FS segment manager");
goto free_sm;
}
// 分配并初始化sbi->nm_info
err = build_node_manager(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR,
"Failed to initialize F2FS node manager");
goto free_nm;
}
// 初始化struct victim_selection
build_gc_manager(sbi);
/* get an inode for node space */
sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
if (IS_ERR(sbi->node_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
err = PTR_ERR(sbi->node_inode);
goto free_nm;
}
/* if there are nt orphan nodes free them */
err = -EINVAL;
if (recover_orphan_inodes(sbi))
goto free_node_inode;
/* read root inode and dentry */
// 分配root inode
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
if (IS_ERR(root)) {
f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
err = PTR_ERR(root);
goto free_node_inode;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
goto free_root_inode;
// 为VFS分配root dentry
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
err = -ENOMEM;
goto free_root_inode;
}
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
// 见后续recovery章节
recover_fsync_data(sbi);
/* After POR, we can run background GC thread */
err = start_gc_thread(sbi);
if (err)
goto fail;
// 非debug为空实现
err = f2fs_build_stats(sbi);
if (err)
goto fail;
return 0;
fail:
stop_gc_thread(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
free_node_inode:
iput(sbi->node_inode);
free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
free_cp:
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
free_sb_buf:
brelse(raw_super_buf);
free_sbi:
kfree(sbi);
return err;
}
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
sbi->blocksize = 1 << sbi->log_blocksize;
sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
sbi->total_sections = le32_to_cpu(raw_super->section_count);
sbi->total_node_count =
(le32_to_cpu(raw_super->segment_count_nat) / 2)
* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
}
Beyond the compile-time macros the kernel already provides, the entry point for super block initialization is the predefined address of the SB area: sb_bread() fetches the given block from disk, and parsing/initialization proceeds from there.
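A minimal userspace sketch (not kernel code) of the same lookup, assuming a hypothetical image ./f2fs.img produced by mkfs.f2fs: read block #0 and check the little-endian magic at F2FS_SUPER_OFFSET, which is what f2fs_fill_super() does through sb_bread().
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define F2FS_BLKSIZE      4096
#define F2FS_SUPER_OFFSET 1024          /* same constant as in the kernel */
#define F2FS_SUPER_MAGIC  0xF2F52010u

int main(void)
{
	uint8_t blk[F2FS_BLKSIZE];
	uint32_t magic;
	int fd = open("./f2fs.img", O_RDONLY);   /* hypothetical image path */

	if (fd < 0 || pread(fd, blk, sizeof(blk), 0) != sizeof(blk)) {
		perror("read block #0");
		return 1;
	}
	/* the first field of struct f2fs_super_block is __le32 magic;
	 * this sketch assumes a little-endian host */
	memcpy(&magic, blk + F2FS_SUPER_OFFSET, sizeof(magic));
	printf("magic = 0x%08x (%s)\n", magic,
	       magic == F2FS_SUPER_MAGIC ? "looks like F2FS" : "not F2FS");
	close(fd);
	return 0;
}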
Segment manager
The main flow initializes smi (the segment manager info) through build_segment_manager(), expanded below:
int build_segment_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_sm_info *sm_info;
int err;
sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
if (!sm_info)
return -ENOMEM;
/* init sm info */
sbi->sm_info = sm_info;
// 初始化writeback list
INIT_LIST_HEAD(&sm_info->wblist_head);
spin_lock_init(&sm_info->wblist_lock);
// 从check point获取segment相关的信息
sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
// 构建sit,LFS-style更新需要维护这些信息
err = build_sit_info(sbi);
if (err)
return err;
// 构建free_i,可用于分配新的segment
err = build_free_segmap(sbi);
if (err)
return err;
// log信息
err = build_curseg(sbi);
if (err)
return err;
/* reinit free segmap based on SIT */
build_sit_entries(sbi);
init_free_segmap(sbi);
err = build_dirty_segmap(sbi);
if (err)
return err;
init_min_max_mtime(sbi);
return 0;
}
static int build_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct sit_info *sit_i;
unsigned int sit_segs, start;
char *src_bitmap, *dst_bitmap;
unsigned int bitmap_size;
/* allocate memory for SIT information */
sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
if (!sit_i)
return -ENOMEM;
SM_I(sbi)->sit_info = sit_i;
// TOTAL_SEGS指的是sbi->main_segments,也就是main area的segment个数
// SIT在内存中分配对应长度到sentries
sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
// 遍历segment,为每一个segment entry初始化bitmap
for (start = 0; start < TOTAL_SEGS(sbi); start++) {
sit_i->sentries[start].cur_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
if (!sit_i->sentries[start].cur_valid_map
|| !sit_i->sentries[start].ckpt_valid_map)
return -ENOMEM;
}
// 非默认情况,略
if (sbi->segs_per_sec > 1) {
sit_i->sec_entries = vzalloc(sbi->total_sections *
sizeof(struct sec_entry));
if (!sit_i->sec_entries)
return -ENOMEM;
}
/* get information related with SIT */
// 得到SIT本身需要的segment,因为有副本(pair segment),所以取一半
sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
/* setup SIT bitmap from ckeckpoint pack */
bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
// 从check point获取一定偏移得到sit bitmap
src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
if (!dst_bitmap)
return -ENOMEM;
memcpy(dst_bitmap, src_bitmap, bitmap_size);
/* init SIT information */
// s_ops只有一个allocate_segment策略(allocate_segment_by_default)
sit_i->s_ops = &default_salloc_ops;
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
sit_i->sit_bitmap = dst_bitmap;
sit_i->bitmap_size = bitmap_size;
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
mutex_init(&sit_i->sentry_lock);
return 0;
}
static int build_free_segmap(struct f2fs_sb_info *sbi)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
// 字面意思,管理free segmap
// 它会决定何时使用in-place write
struct free_segmap_info *free_i;
unsigned int bitmap_size, sec_bitmap_size;
/* allocate memory for free segmap information */
free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
if (!free_i)
return -ENOMEM;
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
/* set all segments as dirty temporarily */
// 全1,全都是dirty
memset(free_i->free_segmap, 0xff, bitmap_size);
memset(free_i->free_secmap, 0xff, sec_bitmap_size);
/* init free segmap information */
free_i->start_segno =
(unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
free_i->free_segments = 0;
free_i->free_sections = 0;
rwlock_init(&free_i->segmap_lock);
return 0;
}
static int build_curseg(struct f2fs_sb_info *sbi)
{
// 管理6个log
struct curseg_info *array;
int i;
array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL);
if (!array)
return -ENOMEM;
SM_I(sbi)->curseg_array = array;
for (i = 0; i < NR_CURSEG_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
// 数据结构struct f2fs_summary_block*,cached summary block
// 每一个log有一个形式的summary block
// NOTE:
// 最终这个block写到哪里需要参考write_sum_page(..., blk_addr)
// 一般来说,blk_addr就是SSA起始地址+segno
//
// NOTE2:
// 一个summary block前半是summary entry,后半是SIT/NAT journal
array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
array[i].segno = NULL_SEGNO;
array[i].next_blkoff = 0;
}
// 从check point得到summary
// 填充SM_I(sbi)->curseg_array
// 有点长,先略了
return restore_curseg_summaries(sbi);
}
static void build_sit_entries(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
unsigned int start;
for (start = 0; start < TOTAL_SEGS(sbi); start++) {
struct seg_entry *se = &sit_i->sentries[start];
struct f2fs_sit_block *sit_blk;
struct f2fs_sit_entry sit;
struct page *page;
int i;
mutex_lock(&curseg->curseg_mutex);
for (i = 0; i < sits_in_cursum(sum); i++) {
if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
// 通过summary block找到sit entry
sit = sit_in_journal(sum, i);
mutex_unlock(&curseg->curseg_mutex);
goto got_it;
}
}
mutex_unlock(&curseg->curseg_mutex);
page = get_current_sit_page(sbi, start);
sit_blk = (struct f2fs_sit_block *)page_address(page);
sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
f2fs_put_page(page, 1);
got_it:
check_block_count(sbi, start, &sit);
seg_info_from_raw_sit(se, &sit);
if (sbi->segs_per_sec > 1) {
struct sec_entry *e = get_sec_entry(sbi, start);
e->valid_blocks += se->valid_blocks;
}
}
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
{
unsigned int start;
int type;
for (start = 0; start < TOTAL_SEGS(sbi); start++) {
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
}
/* set use the current segments */
for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
struct curseg_info *curseg_t = CURSEG_I(sbi, type);
__set_test_and_inuse(sbi, curseg_t->segno);
}
}
static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int secno = segno / sbi->segs_per_sec;
write_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
free_i->free_sections--;
}
write_unlock(&free_i->segmap_lock);
}
static int build_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i;
unsigned int bitmap_size, i;
/* allocate memory for dirty segments list information */
dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
if (!dirty_i)
return -ENOMEM;
SM_I(sbi)->dirty_info = dirty_i;
mutex_init(&dirty_i->seglist_lock);
bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
// NOTE:
// 可以通过dirty_segmap[PRE]得到prefree segment
// 详见check_prefree_segments()
dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
init_dirty_segmap(sbi);
return init_victim_segmap(sbi);
}
static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno = 0, offset = 0;
unsigned short valid_blocks;
while (segno < TOTAL_SEGS(sbi)) {
/* find dirty segment based on free segmap */
segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
if (segno >= TOTAL_SEGS(sbi))
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
continue;
mutex_lock(&dirty_i->seglist_lock);
__locate_dirty_segment(sbi, segno, DIRTY);
mutex_unlock(&dirty_i->seglist_lock);
}
}
static int init_victim_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
// 用于GC的victim标记,见get_victim_by_default()
dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
return -ENOMEM;
return 0;
}
Most of this machinery only comes into play during reads and writes; at this stage the goal is just to get familiar with the names, and we will come back to them later.
Node manager
The node manager is comparatively simple: it mostly just parses the NAT information.
int build_node_manager(struct f2fs_sb_info *sbi)
{
int err;
sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
if (!sbi->nm_info)
return -ENOMEM;
err = init_node_manager(sbi);
if (err)
return err;
// 这里决定node id的分配
// 调用方见Create operation章节
// 分析见下面
build_free_nids(sbi);
return 0;
}
// 整个函数基本就是直接解析
static int init_node_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned char *version_bitmap;
unsigned int nat_segs, nat_blocks;
nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
/* segment_count_nat includes pair segment so divide to 2. */
nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
rwlock_init(&nm_i->nat_tree_lock);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
// 从root inode号(3)之后开始,因此ckpt->next_free_nid = 4
// 这里用于node id的分配算法优化
nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
if (!nm_i->nat_bitmap)
return -ENOMEM;
version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
if (!version_bitmap)
return -EFAULT;
/* copy version bitmap */
memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
return 0;
}
// 组织free node的过程
static void build_free_nids(struct f2fs_sb_info *sbi)
{
struct free_nid *fnid, *next_fnid;
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
nid_t nid = 0;
bool is_cycled = false;
// 含义:the number of free node id
int fcnt = 0;
int i;
// 决定nid的扫描范围
// next_scan_nid会在完成后再次赋值,下一次扫描就从这里开始
nid = nm_i->next_scan_nid;
// init_scan_nid记录这一次开始扫描开始用到的nid
// 主要是拿来判断取数到最大再循环到0的情况
nm_i->init_scan_nid = nid;
// 对NAT页进行readahead预读操作,避免后续多次IO
// 大概4个page(见FREE_NID_PAGES)
ra_nat_pages(sbi, nid);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
// 进行per-block的entry遍历
// 如果nat_blk->entries[i].block_addr为空地址
// 那么fcnt += 1
// NOTE:
// 也有些原因导致+=0,比如超过了扫描阈值(由MAX_FREE_NIDS宏决定)
//
// /* maximum # of free node ids to produce during build_free_nids */
// #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) = 455 * 4 = 1820
//
// #define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry))
//
// /* # of pages to perform readahead before building free nids */
// #define FREE_NID_PAGES 4
fcnt += scan_nat_page(nm_i, page, nid);
f2fs_put_page(page, 1);
// 跨到下一个block
nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
// 模拟循环复用
if (nid >= nm_i->max_nid) {
nid = 0;
is_cycled = true;
}
// 这里扫描到一定数值就立刻停手,下一次再通过next_scan_nid扫描
if (fcnt > MAX_FREE_NIDS)
break;
// 这里表示完整跑了一圈都没有凑够fcnt最大数,但是已经是目前能收集最多的情况了
if (is_cycled && nm_i->init_scan_nid <= nid)
break;
}
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
mutex_lock(&curseg->curseg_mutex);
for (i = 0; i < nats_in_cursum(sum); i++) {
block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
nid = le32_to_cpu(nid_in_journal(sum, i));
if (addr == NULL_ADDR)
add_free_nid(nm_i, nid);
else
remove_free_nid(nm_i, nid);
}
mutex_unlock(&curseg->curseg_mutex);
/* remove the free nids from current allocated nids */
list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
struct nat_entry *ne;
read_lock(&nm_i->nat_tree_lock);
// 查找对应nid的nat entry
ne = __lookup_nat_cache(nm_i, fnid->nid);
if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
remove_free_nid(nm_i, fnid->nid);
read_unlock(&nm_i->nat_tree_lock);
}
}
The key part of the node manager is how it builds its nid freelist (build_free_nids). The structure is intentionally incomplete: it only collects a bounded batch at a time (fcnt counts the free nids gathered so far) and rebuilds when the cache runs low, which is a nice little trick. A quick check of the batch-size constants follows.
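A quick sanity check of the constants quoted in the comments above, as a standalone sketch; the packed 9-byte struct mirrors the on-disk f2fs_nat_entry layout (version + ino + block_addr), so one 4KB NAT block holds 455 entries and one build_free_nids() pass scans at most FREE_NID_PAGES blocks worth of nids.
#include <stdio.h>
#include <stdint.h>

#pragma pack(push, 1)
struct f2fs_nat_entry {
	uint8_t  version;    /* latest version of the cached nat entry */
	uint32_t ino;        /* inode number of the node's owner */
	uint32_t block_addr; /* block address of the node block */
};
#pragma pack(pop)

#define PAGE_CACHE_SIZE     4096
#define FREE_NID_PAGES      4
#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry))
#define MAX_FREE_NIDS       (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)

int main(void)
{
	printf("sizeof(f2fs_nat_entry) = %zu\n", sizeof(struct f2fs_nat_entry)); /* 9 */
	printf("NAT_ENTRY_PER_BLOCK    = %zu\n", (size_t)NAT_ENTRY_PER_BLOCK);   /* 455 */
	printf("MAX_FREE_NIDS          = %zu\n", (size_t)MAX_FREE_NIDS);         /* 1820 */
	return 0;
}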
Create operation
Main flow
Creating a file with open(..., O_CREAT) starts in the VFS at do_sys_open(), which eventually calls the filesystem-specific f2fs_create() to perform the actual creation.
The VFS call chain:
do_sys_open → do_filp_open → path_openat → do_last → lookup_open → vfs_create → dir->i_op->create → f2fs_create
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
struct super_block *sb = dir->i_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
nid_t ino = 0;
int err;
// 见后续gc章节
f2fs_balance_fs(sbi);
// 获得inode实例:kmemcache分配inode,且从freelist申请nid
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
// 设备为cold
set_cold_file(sbi, inode, dentry->d_name.name);
// 赋值iop fop aop
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
// 关联到dentry
// 这里会更新parent metadata
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
// 从sbi获取nm_i,并从中关联的free list删除对应空间,让给ino使用
alloc_nid_done(sbi, ino);
if (!sbi->por_doing)
// VFS层函数,使用inode信息,构造完整的dentry
d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
alloc_nid_failed(sbi, ino);
return err;
}
The overall logic is clear: create an inode, then link it to a dentry. The helper functions are analyzed step by step below; first, a tiny userspace program that triggers this path.
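For reference, the userspace trigger for this whole path is just an open() with O_CREAT on an f2fs mount; /mnt/f2fs below is a hypothetical mount point.
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* do_sys_open() -> ... -> dir->i_op->create, i.e. f2fs_create() */
	int fd = open("/mnt/f2fs/newfile", O_CREAT | O_WRONLY | O_EXCL, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);
	return 0;
}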
Inode creation
In the main flow, the inode instance is created by f2fs_new_inode():
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
nid_t ino;
struct inode *inode;
bool nid_free = false;
int err;
// VFS函数,通过kmalloc返回inode实例
// 内部会调用s_op->alloc_inode()进行初始化
// 对应到F2FS就是f2fs_alloc_inode()
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
mutex_lock_op(sbi, NODE_NEW);
// 分配inode号,后续存放于inode->i_ino
if (!alloc_nid(sbi, &ino)) {
mutex_unlock_op(sbi, NODE_NEW);
err = -ENOSPC;
goto fail;
}
mutex_unlock_op(sbi, NODE_NEW);
inode->i_uid = current_fsuid();
if (dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
if (S_ISDIR(mode))
mode |= S_ISGID;
} else {
inode->i_gid = current_fsgid();
}
inode->i_ino = ino;
inode->i_mode = mode;
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = sbi->s_next_generation++;
// 这是VFS层的函数
// inode插入到VFS层私有的inode_hashtable,自身关联inode->i_hash
// 这样以后可以在内存中通过inode号快速查找到inode
// NOTE:
// 我觉得需要区分一下inode number(ino)和node number(nid)
// inode number是VFS可以访问的,我个人理解是一个file对应一个inode number
// 但是F2FS是维护node number来完成寻址(用一个ID来替代绝对地址,解决wandering tree)
// 因此只要需要寻址的node block都会有一个nid(相比之下data block不需要nid)
// 这里的交集在于ino的分配也是通过nid得到的(见上方,alloc_nid(..., &ino))
err = insert_inode_locked(inode);
if (err) {
err = -EINVAL;
nid_free = true;
goto out;
}
// 标记脏页,以后回写
mark_inode_dirty(inode);
return inode;
out:
clear_nlink(inode);
unlock_new_inode(inode);
fail:
iput(inode);
if (nid_free)
alloc_nid_failed(sbi, ino);
return ERR_PTR(err);
}
In short, f2fs_new_inode() goes through the following steps:
- obtain an inode instance via the VFS new_inode(), which allocates it with kmalloc
- fill in the necessary filesystem-specific fields of the inode
- allocate inode->i_ino with alloc_nid()
- insert the inode into the VFS-private inode_hashtable, linked through inode->i_hash
- mark the inode dirty
- return the inode instance
Next comes the sub-flow that allocates a node id (which, here in f2fs_new_inode(), doubles as the inode number): alloc_nid().
/*
* If this function returns success, caller can obtain a new nid
* from second parameter of this function.
* The returned nid could be used ino as well as nid when inode is created.
*/
bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
// NOTE: 在前面fill super block流程中,通过build_node_manager()来初始化nmi
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
struct list_head *this;
retry:
mutex_lock(&nm_i->build_lock);
// fcnt含义:the number of free node id
if (!nm_i->fcnt) {
/* scan NAT in order to build free nid list */
// 没有free node就需要需要通过NAT构建
// 这里已经在上面分析过
build_free_nids(sbi);
if (!nm_i->fcnt) {
mutex_unlock(&nm_i->build_lock);
return false;
}
}
mutex_unlock(&nm_i->build_lock);
/*
* We check fcnt again since previous check is racy as
* we didn't hold free_nid_list_lock. So other thread
* could consume all of free nids.
*/
spin_lock(&nm_i->free_nid_list_lock);
if (!nm_i->fcnt) {
spin_unlock(&nm_i->free_nid_list_lock);
goto retry;
}
BUG_ON(list_empty(&nm_i->free_nid_list));
// 遍历freelist查找free_nid
list_for_each(this, &nm_i->free_nid_list) {
i = list_entry(this, struct free_nid, list);
// 用了这个i,后续i->state改为NID_ALLOC
if (i->state == NID_NEW)
break;
}
BUG_ON(i->state != NID_NEW);
*nid = i->nid;
i->state = NID_ALLOC;
nm_i->fcnt--;
spin_unlock(&nm_i->free_nid_list_lock);
return true;
}
In short, inode numbers are reused from the nid freelist. The allocation policy really lives in build_free_nids(), analyzed above: a construction algorithm with caching and a cut-off per scan, which keeps the latency of finding a free nid low. A toy model of this batch-refill pattern is sketched below.
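A toy model of that batch-refill pattern (locking omitted, and refill() is only a stand-in for the NAT scan done by build_free_nids(); the point is the control flow, not the data source).
#include <stdbool.h>
#include <stdio.h>

#define MAX_FREE_NIDS 1820

static unsigned int free_nids[MAX_FREE_NIDS];
static int fcnt;                 /* number of cached free nids */
static unsigned int next_scan;   /* where the next scan resumes */

static void refill(void)         /* stand-in for build_free_nids() */
{
	while (fcnt < MAX_FREE_NIDS)
		free_nids[fcnt++] = next_scan++;
}

static bool alloc_nid(unsigned int *nid)
{
	if (!fcnt)
		refill();            /* scan only when the cache runs dry */
	if (!fcnt)
		return false;        /* no free nid available at all */
	*nid = free_nids[--fcnt];
	return true;
}

int main(void)
{
	unsigned int nid;
	for (int i = 0; i < 3; i++)
		if (alloc_nid(&nid))
			printf("allocated nid %u\n", nid);
	return 0;
}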
Linking the dentry
Analyzing f2fs_add_link() requires some background on F2FS's dentry design (the brief dentry discussion in the earlier article is also worth a look):
Directory Structure
-------------------
A directory entry occupies 11 bytes, which consists of the following attributes.
- hash hash value of the file name
- ino inode number
- len the length of file name
- type file type such as directory, symlink, etc
A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
used to represent whether each dentry is valid or not. A dentry block occupies
4KB with the following composition.
Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
dentries(11 * 214 bytes) + file name (8 * 214 bytes)
[Bucket]
+--------------------------------+
|dentry block 1 | dentry block 2 |
+--------------------------------+
. .
. .
. [Dentry Block Structure: 4KB] .
+--------+----------+----------+------------+
| bitmap | reserved | dentries | file names |
+--------+----------+----------+------------+
[Dentry Block: 4KB] . .
. .
. .
+------+------+-----+------+
| hash | ino | len | type |
+------+------+-----+------+
[Dentry Structure: 11 bytes]
F2FS implements multi-level hash tables for directory structure. Each level has
a hash table with dedicated number of hash buckets as shown below. Note that
"A(2B)" means a bucket includes 2 data blocks.
----------------------
A : bucket
B : block
N : MAX_DIR_HASH_DEPTH
----------------------
level #0 | A(2B)
|
level #1 | A(2B) - A(2B)
|
level #2 | A(2B) - A(2B) - A(2B) - A(2B)
. | . . . .
level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
. | . . . .
level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
The number of blocks and buckets are determined by,
,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
# of blocks in level #n = |
`- 4, Otherwise
,- 2^n, if n < MAX_DIR_HASH_DEPTH / 2,
# of buckets in level #n = |
`- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), Otherwise
When F2FS finds a file name in a directory, at first a hash value of the file
name is calculated. Then, F2FS scans the hash table in level #0 to find the
dentry consisting of the file name and its inode number. If not found, F2FS
scans the next hash table in level #1. In this way, F2FS scans hash tables in
each levels incrementally from 1 to N. In each levels F2FS needs to scan only
one bucket determined by the following equation, which shows O(log(# of files))
complexity.
bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
In the case of file creation, F2FS finds empty consecutive slots that cover the
file name. F2FS searches the empty slots in the hash tables of whole levels from
1 to N in the same way as the lookup operation.
The following figure shows an example of two cases holding children.
--------------> Dir <--------------
| |
child child
child - child [hole] - child
child - child - child [hole] - [hole] - child
Case 1: Case 2:
Number of children = 6, Number of children = 3,
File size = 7 File size = 7
Besides the dentry structure analyzed earlier, the document also describes the multi-level hash table used for file lookup:
- the filename is used as the key of the hash computation, producing a hash value
- that value is used to walk at most \(O(log(\# \ of \ files))\) levels of hash tables, never more than \(64\) in practice
- a low-level table has \(2^n\) buckets, a high-level one has \(2^{\frac{MAX\_DEPTH}{2}-1}\), where \(n\) is the level and \(MAX\_DEPTH\) is its maximum
- a bucket covers \(2\) blocks at low levels and \(4\) at high levels
- during lookup:
  - each level uses \(value \ \% \ nbucket\) to pick the bucket index, then searches that bucket for a dentry whose filename and ino match
  - if there is no match, the search moves on to the next, higher level
- during creation:
  - the bucket is located the same way as in lookup
  - but the goal is an empty slot (each dentry block holds 214 dentry slots, i.e. struct f2fs_dir_entry); the per-level bucket math is sketched right after this list
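The two formulas above transcribed into a standalone sketch (MAX_DIR_HASH_DEPTH is 64 in this source tree); given a name hash it prints which bucket each level would probe, which is what both lookup and f2fs_add_link() compute.
#include <stdio.h>
#include <stdint.h>

#define MAX_DIR_HASH_DEPTH 64

static unsigned int dir_buckets(unsigned int level)
{
	if (level < MAX_DIR_HASH_DEPTH / 2)
		return 1u << level;
	return 1u << (MAX_DIR_HASH_DEPTH / 2 - 1);
}

static unsigned int bucket_blocks(unsigned int level)
{
	return level < MAX_DIR_HASH_DEPTH / 2 ? 2 : 4;
}

int main(void)
{
	uint32_t hash = 0xdeadbeef;  /* stand-in for f2fs_dentry_hash(name) */

	for (unsigned int level = 0; level < 4; level++)
		printf("level %u: %u buckets x %u blocks, probe bucket %u\n",
		       level, dir_buckets(level), bucket_blocks(level),
		       hash % dir_buckets(level));
	return 0;
}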
int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
unsigned int bit_pos;
unsigned int level;
unsigned int current_depth;
unsigned long bidx, block;
f2fs_hash_t dentry_hash;
struct f2fs_dir_entry *de;
unsigned int nbucket, nblock;
// parent directory对应的inode
struct inode *dir = dentry->d_parent->d_inode;
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
const char *name = dentry->d_name.name;
size_t namelen = dentry->d_name.len;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
// #define GET_DENTRY_SLOTS(x) ((x + F2FS_NAME_LEN - 1) >> F2FS_NAME_LEN_BITS)
int slots = GET_DENTRY_SLOTS(namelen);
int err = 0;
int i;
// 使用filename作为hash key
dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
level = 0;
// 记录能到达的最高depth
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
level = F2FS_I(dir)->clevel;
F2FS_I(dir)->chash = 0;
}
// 进入到多级哈希表的查找过程
start:
if (current_depth == MAX_DIR_HASH_DEPTH)
return -ENOSPC;
/* Increase the depth, if required */
if (level == current_depth)
++current_depth;
// 按上述公式计算bucket数目
nbucket = dir_buckets(level);
// 计算block数目
nblock = bucket_blocks(level);
// 定位block index
bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
// 在当前bucket中遍历block
for (block = bidx; block <= (bidx + nblock - 1); block++) {
mutex_lock_op(sbi, DENTRY_OPS);
// 为index = block分配对应的data page
dentry_page = get_new_data_page(dir, block, true);
if (IS_ERR(dentry_page)) {
mutex_unlock_op(sbi, DENTRY_OPS);
return PTR_ERR(dentry_page);
}
// TODO 临时映射
// 往dentry_blk写入数据,然后对dentry_page标记dirty就能刷入外存
dentry_blk = kmap(dentry_page);
bit_pos = room_for_filename(dentry_blk, slots);
// NR_DENTRY_IN_BLOCK = 214,一个dentry block含有214个dentry slot
if (bit_pos < NR_DENTRY_IN_BLOCK)
// 定位成功,挑出循环
goto add_dentry;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
mutex_unlock_op(sbi, DENTRY_OPS);
}
/* Move to next level to find the empty slot for new dentry */
// 失败定位,继续循环走更高层
++level;
goto start;
// 定位成功
add_dentry:
// 提供inode对应的page(通用函数grab_cache_page())和对应的aops
err = init_inode_metadata(inode, dentry);
if (err)
goto fail;
wait_on_page_writeback(dentry_page);
// 初始化dentry
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
memcpy(dentry_blk->filename[bit_pos], name, namelen);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
for (i = 0; i < slots; i++)
test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
set_page_dirty(dentry_page);
// dir是parent inode
// 更新dir与之关联的信息
update_parent_metadata(dir, inode, current_depth);
/* update parent inode number before releasing dentry page */
F2FS_I(inode)->i_pino = dir->i_ino;
fail:
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
mutex_unlock_op(sbi, DENTRY_OPS);
return err;
}
This sub-flow boils down to locating and initializing a dentry through the hash scheme above, plus updating the parent inode's metadata. The dentry block arithmetic is worth checking once, as sketched below.
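This sketch recomputes the 4KB dentry block layout from the f2fs.txt numbers and the GET_DENTRY_SLOTS() macro quoted in the code above.
#include <stdio.h>

#define NR_DENTRY_IN_BLOCK 214
#define SIZE_OF_DIR_ENTRY  11   /* hash + ino + len + type */
#define F2FS_SLOT_LEN      8    /* one filename slot */
#define BITMAP_BYTES       27   /* ceil(214 / 8) */
#define RESERVED_BYTES     3

#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) / F2FS_SLOT_LEN)

int main(void)
{
	int total = BITMAP_BYTES + RESERVED_BYTES +
		    NR_DENTRY_IN_BLOCK * SIZE_OF_DIR_ENTRY +
		    NR_DENTRY_IN_BLOCK * F2FS_SLOT_LEN;

	printf("dentry block = %d bytes\n", total);              /* 4096 */
	printf("\"a.txt\" (5 chars) needs %d slot(s)\n",
	       GET_DENTRY_SLOTS(5));                             /* 1 */
	printf("a 20-char name needs %d slots\n",
	       GET_DENTRY_SLOTS(20));                            /* 3 */
	return 0;
}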
Write operation
Callback registration
At the VFS level, the f_op callbacks F2FS uses are the generic ones provided by the framework:
const struct file_operations f2fs_file_operations = {
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.splice_write = generic_file_splice_write,
};
That is, everything from do_sync_write() down to generic_file_buffered_write() is the fully generic path. Things only become filesystem-specific when a page is actually written, i.e. when generic_file_buffered_write() calls a_ops->write_begin():
const struct address_space_operations f2fs_dblock_aops = {
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = nobh_write_end, // 这个也是通用的流程
};
Write main flow
f2fs_write_begin() prepares the page for the write and hands it back through pagep.
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
int err = 0;
/* for nobh_write_end */
*fsdata = NULL;
// GC相关,后续章节再讨论
f2fs_balance_fs(sbi);
// 初始化要写的page(Find or create a page at the given pagecache position)
// 这里保证存在于page cache中
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
mutex_lock_op(sbi, DATA_NEW);
// 初始化dnode,用于找到物理地址(data_blkaddr)
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, 0);
if (err) {
mutex_unlock_op(sbi, DATA_NEW);
f2fs_put_page(page, 1);
return err;
}
if (dn.data_blkaddr == NULL_ADDR) {
err = reserve_new_block(&dn);
if (err) {
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, DATA_NEW);
f2fs_put_page(page, 1);
return err;
}
}
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, DATA_NEW);
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
unsigned start = pos & (PAGE_CACHE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
return 0;
}
// append操作
if (dn.data_blkaddr == NEW_ADDR) {
// 填0
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
// overwrite操作
} else {
// 读出旧的内容
err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
if (err) {
f2fs_put_page(page, 1);
return err;
}
}
SetPageUptodate(page);
clear_cold_data(page);
return 0;
}
f2fs_write_begin() is called by the VFS; once the VFS has the page, it copies the buf passed to write(fd, buf, size) into it. After the copy, the VFS calls write_end() to finish the write, whose main job is to unlock the page locked earlier. How (pos, len) maps onto a page is worth spelling out once, as below.
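A standalone sketch, assuming 4KB pages, of how the (pos, len) pair handed to f2fs_write_begin() maps onto a page: index picks the page in the page cache, and start/end bound the bytes the VFS will copy in afterwards.
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1 << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long pos = 10000;   /* e.g. write(fd, buf, 500) at offset 10000 */
	unsigned int len = 500;

	unsigned long index = pos >> PAGE_CACHE_SHIFT;          /* page #2 */
	unsigned int start  = pos & (PAGE_CACHE_SIZE - 1);      /* 1808 */
	unsigned int end    = start + len;                      /* 2308 */

	printf("page index %lu, bytes [%u, %u) within the page\n",
	       index, start, end);
	return 0;
}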
Writeback main flow
The default write policy for regular files is writeback; the VFS exposes hooks such as writepage() for each filesystem to implement its own writeback. As mentioned above, F2FS uses f2fs_write_data_page() and f2fs_write_data_pages() for this.
The writeback thread's call path:
bdi_writeback_thread → wb_do_writeback → wb_writeback → __writeback_single_inode → do_writepages → a_ops->writepages
writepages() is usually just the multi-page analogue of writepage(), so the single-page path is picked apart below.
static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
unsigned offset;
int err = 0;
if (page->index < end_index)
goto out;
/*
* If the offset is out-of-range of file size,
* this page does not have to be written to disk.
*/
// 找到要写的地方
offset = i_size & (PAGE_CACHE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset) {
if (S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
}
goto unlock_out;
}
// [offset, SIZE)先清零
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
out:
// 如果正在做recovery,跳过
if (sbi->por_doing)
goto redirty_out;
// writeback controller的特性
// for_reclaim在shrink_page_list()中被置为1
// 也就是说内存吃紧且高频数据可能被跳过
if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
goto redirty_out;
mutex_lock_op(sbi, DATA_WRITE);
if (S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
}
// 实际的写操作
err = do_write_data_page(page);
// 有问题就下次再来
if (err && err != -ENOENT) {
wbc->pages_skipped++;
set_page_dirty(page);
}
mutex_unlock_op(sbi, DATA_WRITE);
if (wbc->for_reclaim)
f2fs_submit_bio(sbi, DATA, true);
if (err == -ENOENT)
goto unlock_out;
clear_cold_data(page);
unlock_page(page);
if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
f2fs_balance_fs(sbi);
return 0;
unlock_out:
unlock_page(page);
return (err == -ENOENT) ? 0 : err;
redirty_out:
wbc->pages_skipped++;
set_page_dirty(page);
return AOP_WRITEPAGE_ACTIVATE;
}
Block addressing
The write path above depends on addressing, and addressing depends on how the index is organized, so start with f2fs.txt:
Index Structure
---------------
The key data structure to manage the data locations is a "node". Similar to
traditional file structures, F2FS has three types of node: inode, direct node,
indirect node. F2FS assigns 4KB to an inode block which contains 929 data block
indices, two direct node pointers, two indirect node pointers, and one double
indirect node pointer as described below. One direct node block contains 1018
data blocks, and one indirect node block contains also 1018 node blocks. Thus,
one inode block (i.e., a file) covers:
4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
Inode block (4KB)
|- data (923)
|- direct node (2)
| `- data (1018)
|- indirect node (2)
| `- direct node (1018)
| `- data (1018)
`- double indirect node (1)
`- indirect node (1018)
`- direct node (1018)
`- data (1018)
Note that, all the node blocks are mapped by NAT which means the location of
each node is translated by the NAT table. In the consideration of the wandering
tree problem, F2FS is able to cut off the propagation of node updates caused by
leaf data writes.
The terminology here (Figure 2 of the paper) does not quite match f2fs.txt, but the mechanism is the same.
The design philosophy was already savored in the paper-reading article and is not repeated here. The point is that F2FS's index structure resembles that of traditional filesystems: a single file can reach roughly 4TB; addressing starts from the inode, with direct / indirect / double-indirect paths, so the deepest path is 4 levels; and all of these node blocks are mapped through the NAT. Note that the 923 data pointers stored inside the inode hold block addresses directly, while the other 5 node slots record only node ids; this is the key to cutting off the wandering tree problem. The capacity figure is recomputed below.
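The capacity claim is easy to recompute; this sketch just redoes the arithmetic from the numbers above (923 pointers in the inode, 1018 per node block, 4KB data blocks).
#include <stdio.h>

int main(void)
{
	unsigned long long direct_in_inode = 923;
	unsigned long long per_node = 1018;
	unsigned long long blocks;

	blocks = direct_in_inode
	       + 2 * per_node                       /* two direct node blocks */
	       + 2 * per_node * per_node            /* two indirect node blocks */
	       + per_node * per_node * per_node;    /* one double-indirect node */

	printf("%llu blocks, %.2f TB\n", blocks,
	       blocks * 4096.0 / (1ULL << 40));     /* ~3.94 TB */
	return 0;
}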
Back to the code. The flow above used struct dnode_of_data, the data structure that drives addressing:
/*
* this structure is used as one of function parameters.
* all the information are dedicated to a given direct node block determined
* by the data offset in a file.
*/
struct dnode_of_data {
struct inode *inode; /* vfs inode pointer */
struct page *inode_page; /* its inode page, NULL is possible */
struct page *node_page; /* cached direct node page */
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
block_t data_blkaddr; /* block address of the node block */
};
Its fields are straightforward; per the flow above, we use it to obtain data_blkaddr. The constructor is shown below; in the write path it is invoked as set_new_dnode(&dn, inode, NULL, NULL, 0).
static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
struct page *ipage, struct page *npage, nid_t nid)
{
memset(dn, 0, sizeof(*dn));
dn->inode = inode;
dn->inode_page = ipage;
dn->node_page = npage;
dn->nid = nid;
}
This is followed by a call to get_dnode_of_data(&dn, index, 0):
/*
* Caller should call f2fs_put_dnode(dn).
*/
int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct page *npage[4];
struct page *parent;
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
int level, i;
int err = 0;
// 核心block寻址,callee传回offset[]。具体见下方
level = get_node_path(index, offset, noffset);
nids[0] = dn->inode->i_ino;
// 途径get_node_page -> read_node_page -> f2fs_readpage
// 读流程后面再说吧
// 总之读到了inode page
npage[0] = get_node_page(sbi, nids[0]);
if (IS_ERR(npage[0]))
return PTR_ERR(npage[0]);
// 就是inode page
parent = npage[0];
// 使用page和offset计算出nid
// 返回 (struct f2fs_node*) page_address(parent)->i.i_nid[offset[0] - NODE_DIR1_BLOCK]
// NOTE:
// f2fs_node可以表现3种形式:inode, direct, and indirect types。这里是inode(->i)
// i_nid[5]存放direct(2) + indirect(2) + d_indirect(2)的node id
//
// 但这里为什么如此确定offset[0] >= NODE_DIR1_BLOCK?
// 看了后期的代码是需要判断的(if level>0),这里早期实现确实是个bug
nids[1] = get_nid(parent, offset[0], true);
dn->inode_page = npage[0];
dn->inode_page_locked = true;
/* get indirect or direct nodes */
// 总之这里就按层级不断读页
// 一些实现细节:
// - 如果遇到没有nid的block,那就分配
// - page会尝试预读(ro == true)
//
// NOTE:
// 这里如果是direct pointer是不会遍历的,因为此时level=0
for (i = 1; i <= level; i++) {
bool done = false;
if (!nids[i] && !ro) {
mutex_lock_op(sbi, NODE_NEW);
/* alloc new node */
if (!alloc_nid(sbi, &(nids[i]))) {
mutex_unlock_op(sbi, NODE_NEW);
err = -ENOSPC;
goto release_pages;
}
dn->nid = nids[i];
// 从page cache获取page,如果没有对应page,允许读页
npage[i] = new_node_page(dn, noffset[i]);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
mutex_unlock_op(sbi, NODE_NEW);
err = PTR_ERR(npage[i]);
goto release_pages;
}
// 更新parent->i.i_nid[] (if i==1)或者parent->in.nid[] (if i != 1)
set_nid(parent, offset[i - 1], nids[i], i == 1);
alloc_nid_done(sbi, nids[i]);
mutex_unlock_op(sbi, NODE_NEW);
done = true;
} else if (ro && i == level && level > 1) {
npage[i] = get_node_page_ra(parent, offset[i - 1]);
if (IS_ERR(npage[i])) {
err = PTR_ERR(npage[i]);
goto release_pages;
}
done = true;
}
if (i == 1) {
dn->inode_page_locked = false;
unlock_page(parent);
} else {
f2fs_put_page(parent, 1);
}
if (!done) {
npage[i] = get_node_page(sbi, nids[i]);
if (IS_ERR(npage[i])) {
err = PTR_ERR(npage[i]);
f2fs_put_page(npage[0], 0);
goto release_out;
}
}
if (i < level) {
parent = npage[i];
nids[i + 1] = get_nid(parent, offset[i], false);
}
}
// NOTE:
// 这是会写入到summary entry的nid,也就是parent node
// 比如direct pointer(level==0)的情况,这里就填入ino
dn->nid = nids[level];
// 块内偏移
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
// 最终目标,知道第几层,哪个page以及offset后,对着数据结构布局解析就能得到
dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
return 0;
release_pages:
f2fs_put_page(parent, 1);
if (i > 1)
f2fs_put_page(npage[0], 0);
release_out:
dn->inode_page = NULL;
dn->node_page = NULL;
return err;
}
/*
* The maximum depth is four.
* Offset[0] will have raw inode offset.
*/
// 老实说我看不下去这些过程,按照f2fs.txt理解即可
static int get_node_path(long block, int offset[4], unsigned int noffset[4])
{
// 923,Address Pointers in an Inode,见前面Index Structure说明
const long direct_index = ADDRS_PER_INODE;
// 1018,Address Pointers in a Direct Block
const long direct_blks = ADDRS_PER_BLOCK;
// 1018,Address Pointers in a Direct Block
const long dptrs_per_blk = NIDS_PER_BLOCK;
const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
int n = 0;
int level = 0;
noffset[0] = 0;
// NOTE: block为文件内的page index
// 小于923,即论文中的direct pointers or inline data,第0层就足够了
// NOTE2: 初版的F2FS似乎没有inline data
if (block < direct_index) {
offset[n++] = block;
level = 0;
goto got;
}
block -= direct_index;
// 第0级扣去923后不超第1级寻址范围
if (block < direct_blks) {
// [0]指向Single-indirect
// NODE_DIR1_BLOCK = (ADDRS_PER_INODE + 1)
offset[n++] = NODE_DIR1_BLOCK;
// offset block的个数?
noffset[n] = 1;
// 块内偏移量
offset[n++] = block;
level = 1;
goto got;
}
block -= direct_blks;
if (block < direct_blks) {
offset[n++] = NODE_DIR2_BLOCK;
noffset[n] = 2;
offset[n++] = block;
level = 1;
goto got;
}
block -= direct_blks;
if (block < indirect_blks) {
offset[n++] = NODE_IND1_BLOCK;
noffset[n] = 3;
offset[n++] = block / direct_blks;
noffset[n] = 4 + offset[n - 1];
offset[n++] = block % direct_blks;
level = 2;
goto got;
}
block -= indirect_blks;
if (block < indirect_blks) {
offset[n++] = NODE_IND2_BLOCK;
noffset[n] = 4 + dptrs_per_blk;
offset[n++] = block / direct_blks;
noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
offset[n++] = block % direct_blks;
level = 2;
goto got;
}
block -= indirect_blks;
if (block < dindirect_blks) {
offset[n++] = NODE_DIND_BLOCK;
noffset[n] = 5 + (dptrs_per_blk * 2);
offset[n++] = block / indirect_blks;
noffset[n] = 6 + (dptrs_per_blk * 2) +
offset[n - 1] * (dptrs_per_blk + 1);
offset[n++] = (block / direct_blks) % dptrs_per_blk;
noffset[n] = 7 + (dptrs_per_blk * 2) +
offset[n - 2] * (dptrs_per_blk + 1) +
offset[n - 1];
offset[n++] = block % direct_blks;
level = 3;
goto got;
} else {
BUG();
}
got:
return level;
}
In short, addressing is a matter of computing offsets through the multi-level indirect blocks; a standalone rerun of the level calculation follows.
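A standalone rerun of the boundary logic in get_node_path(), collapsing each pair of direct/indirect node slots into one range check and skipping the offset bookkeeping; it only reports which level of the index tree resolves a given file block index.
#include <stdio.h>

#define ADDRS_PER_INODE 923
#define ADDRS_PER_BLOCK 1018
#define NIDS_PER_BLOCK  1018

static int node_level(long block)
{
	const long direct   = ADDRS_PER_BLOCK;
	const long indirect = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;

	if (block < ADDRS_PER_INODE)
		return 0;                /* pointer held directly in the inode */
	block -= ADDRS_PER_INODE;
	if (block < 2 * direct)
		return 1;                /* via one of the two direct node blocks */
	block -= 2 * direct;
	if (block < 2 * indirect)
		return 2;                /* via one of the two indirect node blocks */
	return 3;                        /* via the double-indirect node */
}

int main(void)
{
	long samples[] = { 0, 922, 923, 2958, 2959, 2075606, 2075607 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("block %7ld -> level %d\n",
		       samples[i], node_level(samples[i]));
	return 0;
}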
The actual write
As analyzed above, F2FS's writeback ultimately goes through do_write_data_page():
int do_write_data_page(struct page *page)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t old_blk_addr, new_blk_addr;
struct dnode_of_data dn;
int err = 0;
// 寻址过程,得到old_blk_addr
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
if (err)
return err;
old_blk_addr = dn.data_blkaddr;
/* This page is already truncated */
if (old_blk_addr == NULL_ADDR)
goto out_writepage;
set_page_writeback(page);
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
// SSR-style写操作
// SSR指的是Slack Space Recycling
// 含义见f2fs/segment.h,reuses obsolete space without cleaning operations
//
// 也就是说这里是in-place update
//
// 需要的条件need_inplace_update()如论文描述,
// free section数目(由free_i维护)少于一定阈值时满足
if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
need_inplace_update(inode)) {
rewrite_data_page(F2FS_SB(inode->i_sb), page,
old_blk_addr);
// 而这里是LFS-style的写操作
} else {
write_data_page(inode, page, &dn,
old_blk_addr, &new_blk_addr);
update_extent_cache(new_blk_addr, &dn);
F2FS_I(inode)->data_version =
le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
}
out_writepage:
f2fs_put_dnode(&dn);
return err;
}
As the code shows, the actual write is split into in-place writes and append writes. In this snapshot the append write is effectively switched off for the time being (probably because the early implementation was not yet complete), but that does not affect the analysis, and both behaviors are examined below. A standalone model of the decision predicate comes first.
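A standalone model of that decision, under the condition stated in the comment above (prefer in-place update when free sections drop below a reserve threshold, except for cold data); the struct and field names here are illustrative, not the kernel's.
#include <stdbool.h>
#include <stdio.h>

struct fs_state {
	unsigned int free_sections;
	unsigned int reserved_sections;  /* overprovision/reserve threshold */
	bool cold_data;                  /* cold data still goes the LFS way */
};

static bool need_inplace_update(const struct fs_state *s)
{
	return s->free_sections <= s->reserved_sections && !s->cold_data;
}

int main(void)
{
	struct fs_state tight = { .free_sections = 5,   .reserved_sections = 6 };
	struct fs_state roomy = { .free_sections = 500, .reserved_sections = 6 };

	printf("tight: %s\n", need_inplace_update(&tight) ? "in-place (SSR)" : "append (LFS)");
	printf("roomy: %s\n", need_inplace_update(&roomy) ? "in-place (SSR)" : "append (LFS)");
	return 0;
}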
In-place write
void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blk_addr)
{
submit_write_page(sbi, page, old_blk_addr, DATA);
}
static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
block_t blk_addr, enum page_type type)
{
struct block_device *bdev = sbi->sb->s_bdev;
verify_block_addr(sbi, blk_addr);
down_write(&sbi->bio_sem);
inc_page_count(sbi, F2FS_WRITEBACK);
// super block维护DATA/NODE/META类型的bio[]数组
// last_block_in_bio[]表示最后的block号
if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
// 不连续的page先提交上去
// false指的是异步提交,如果是true则是SYNC
// 在这个上下文中,内部大概是一个submit_bio()的封装
do_submit_bio(sbi, type, false);
alloc_new:
// 既然没在sbi初始化过程找到它的身影,那就是默认为NULL,先分配
// NOTE: 提交(submit_bio())后,bio也是回归到NULL。因此上面不连续的提交后也走这个分支
if (sbi->bio[type] == NULL) {
// 1. kmemcache分配f2fs私有数据结构struct bio_private,用于check point等待唤醒
// 2. 使用内核通用的bio_alloc()分配bio
sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
/*
* The end_io will be assigned at the sumbission phase.
* Until then, let bio_add_page() merge consecutive IOs as much
* as possible.
*/
}
// bio合并page
if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
PAGE_CACHE_SIZE) {
do_submit_bio(sbi, type, false);
goto alloc_new;
}
// 记录last block,尽可能合并提交
sbi->last_block_in_bio[type] = blk_addr;
up_write(&sbi->bio_sem);
}
static void do_submit_bio(struct f2fs_sb_info *sbi,
enum page_type type, bool sync)
{
int rw = sync ? WRITE_SYNC : WRITE;
enum page_type btype = type > META ? META : type;
if (type >= META_FLUSH)
rw = WRITE_FLUSH_FUA;
if (sbi->bio[btype]) {
struct bio_private *p = sbi->bio[btype]->bi_private;
p->sbi = sbi;
// 完成后的回调
// 大概是标记writeback page结束,以及处理check point细节(唤醒)
sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
// check point操作才会有这个类型
if (type == META_FLUSH) {
DECLARE_COMPLETION_ONSTACK(wait);
p->is_sync = true;
p->wait = &wait;
submit_bio(rw, sbi->bio[btype]);
wait_for_completion(&wait);
// 其它类型就是一个submit_bio()过程
} else {
p->is_sync = false;
submit_bio(rw, sbi->bio[btype]);
}
// 提交完后对应bio就置空
sbi->bio[btype] = NULL;
}
}
Append write
In the append write, new_blkaddr is chosen by the callee rather than the caller, so the caller only learns the new append address after the call returns.
void write_data_page(struct inode *inode, struct page *page,
struct dnode_of_data *dn, block_t old_blkaddr,
block_t *new_blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_summary sum;
struct node_info ni;
BUG_ON(old_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
// 初始化summary entry
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
do_write_page(sbi, page, old_blkaddr,
new_blkaddr, &sum, DATA);
}
static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
unsigned int ofs_in_node, unsigned char version)
{
sum->nid = cpu_to_le32(nid);
sum->ofs_in_node = cpu_to_le16(ofs_in_node);
sum->version = version;
}
static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, enum page_type p_type)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
unsigned int old_cursegno;
int type;
// 返回诸如CURSEG_HOT_DATA等类型
type = __get_segment_type(page, p_type);
// 通过SM_I(sbi)->curseg_array + type得到
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
// #define NEXT_FREE_BLKADDR(sbi, curseg) \
// (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
// next_blkoff决定新的地址
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
old_cursegno = curseg->segno;
/*
* __add_sum_entry should be resided under the curseg_mutex
* because, this function updates a summary entry in the
* current summary block.
*/
// curseg->sum_blk + next_blkoff*sizeof(f2fs_summary)后面memcpy拼接sum
__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
mutex_lock(&sit_i->sentry_lock);
// 简单来说,如果是LFS-style的话,更新next就是简单+1
// SSR-style感兴趣自己看吧
__refresh_next_blkoff(sbi, curseg);
// alloc_type: 枚举值,LFS或者SSR
sbi->block_count[curseg->alloc_type]++;
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
*/
// 见SIT update子流程,主要维护对应segment entry的bitmap
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
// 子函数判断curseg->next_blkoff < sbi->blocks_per_seg
// 在1个segment内分配是有极限的。要想超越极限,那就用2个semgnet
if (!__has_curseg_space(sbi, type))
sit_i->s_ops->allocate_segment(sbi, type, false);
locate_dirty_segment(sbi, old_cursegno);
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
mutex_unlock(&sit_i->sentry_lock);
if (p_type == NODE)
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
/* writeout dirty page into bdev */
// 殊途同归,这个过程在In-place write已经分析过
submit_write_page(sbi, page, *new_blkaddr, p_type);
mutex_unlock(&curseg->curseg_mutex);
}
SIT update
static void refresh_sit_entry(struct f2fs_sb_info *sbi,
block_t old_blkaddr, block_t new_blkaddr)
{
update_sit_entry(sbi, new_blkaddr, 1);
// GET_SEGNO一般展开为GET_L2R_SEGNO(FREE_I(sbi), GET_SEGNO_FROM_SEG0(sbi, blk_addr))
//
// #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
// (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
// #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
// ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
//
// /* V: Logical segment # in volume, R: Relative segment # in main area */
// #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
//
// 最终结果
// ((((blk_addr) - SM_I(sbi)->seg0_blkaddr) >> sbi->log_blocks_per_seg)
// - FREE_I(sbi)->start_segno)
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1);
}
update_sit_entry() is analyzed further down; first look at what GET_SEGNO(sbi, blk_addr) does. It ultimately evaluates to:
((((blk_addr) - SM_I(sbi)->seg0_blkaddr) >> sbi->log_blocks_per_seg) - FREE_I(sbi)->start_segno)
which raises the question of what seg0_blkaddr and start_segno actually are.
From the initialization above we know that seg0_blkaddr is the start block address of segment 0 (sm_info reads it from segment0_blkaddr), and segment0_blkaddr is computed during mkfs.f2fs as:
super_block.segment0_blkaddr =
cpu_to_le32(zone_align_start_offset / blk_size_bytes); // 换算成block单位
zone_align_start_offset =
(f2fs_params.start_sector * DEFAULT_SECTOR_SIZE + // 这里默认是0
F2FS_SUPER_OFFSET * F2FS_BLKSIZE + // 这里也是0
sizeof(struct f2fs_super_block) * 2 + // 算上2个super block
zone_size_bytes - 1) / zone_size_bytes * zone_size_bytes - // 对zone_size_bytes上取整
f2fs_params.start_sector * DEFAULT_SECTOR_SIZE; // 这里是0
start_segno, in turn, is what free_i obtains from GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); for completeness, main_blkaddr is initialized by mkfs.f2fs as:
// 按照layout图去看就是main area接着在ssa后面
super_block.main_blkaddr = cpu_to_le32(
le32_to_cpu(super_block.ssa_blkaddr) +
(le32_to_cpu(super_block.segment_count_ssa) *
f2fs_params.blks_per_seg));
// ssa在nat后面,如此类推
super_block.ssa_blkaddr = cpu_to_le32(
le32_to_cpu(super_block.nat_blkaddr) +
le32_to_cpu(super_block.segment_count_nat) *
f2fs_params.blks_per_seg);
super_block.nat_blkaddr = cpu_to_le32(
le32_to_cpu(super_block.sit_blkaddr) +
(le32_to_cpu(super_block.segment_count_sit) *
f2fs_params.blks_per_seg));
Clearly segment0_blkaddr is 512, since the offset is aligned to the zone size (2MB) and expressed in block units (4KB). I will not work out main_blkaddr by hand; gdb shows 5120, which just depends on how many segments each area occupies (segment_count_*).
In short, seg0_blkaddr is the zone-aligned start address right after the super blocks, which should normally coincide with the start of the checkpoint area, and start_segno is the start of the main area converted to segment units. Plugging in these concrete values:
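A standalone sketch of the GET_SEGNO() arithmetic, assuming the default 512 blocks per segment (log_blocks_per_seg = 9) plus the seg0_blkaddr = 512 and main_blkaddr = 5120 figures above.
#include <stdio.h>

int main(void)
{
	unsigned int seg0_blkaddr = 512;        /* 2MB zone alignment / 4KB blocks */
	unsigned int log_blocks_per_seg = 9;    /* 512 blocks per 2MB segment */
	unsigned int main_blkaddr = 5120;       /* from the gdb session above */

	unsigned int start_segno =
		(main_blkaddr - seg0_blkaddr) >> log_blocks_per_seg;   /* 9 */

	unsigned int blk_addr = 6000;           /* some block inside the main area */
	unsigned int segno =
		((blk_addr - seg0_blkaddr) >> log_blocks_per_seg) - start_segno;

	printf("start_segno = %u, block %u lives in main-area segment %u\n",
	       start_segno, blk_addr, segno);   /* segment 1 */
	return 0;
}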
With that detour out of the way, here is how the SIT actually gets updated.
static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
{
struct seg_entry *se;
unsigned int segno, offset;
long int new_vblocks;
// 换算单位 block->segment
segno = GET_SEGNO(sbi, blkaddr);
// 返回&sit_i->sentries[segno]
// sentries初始化见前面build_sit_info()
// 总之要semgnet entry是可以随机访问的
se = get_seg_entry(sbi, segno);
// TODO valid_blocks
new_vblocks = se->valid_blocks + del;
// segment内block offset
offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
// 初始化se valid_blocks
se->valid_blocks = new_vblocks;
se->mtime = get_mtime(sbi);
// 因为是刚刚改动的,所以整个segment中是max
SIT_I(sbi)->max_mtime = se->mtime;
/* Update valid block bitmap */
// del用于更新valid map
if (del > 0) {
// 要删除的才set bit为1,返回set bit前的值
if (f2fs_set_bit(offset, se->cur_valid_map))
BUG();
} else {
if (!f2fs_clear_bit(offset, se->cur_valid_map))
BUG();
}
// 如果和check point对比发现有差异
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
// 那就维护新的总数,+1或者-1
// NOTE: 在GC过程中,SSR-style直接使用该值作为cost判断依据
se->ckpt_valid_blocks += del;
// 维护sit_i的dirty_sentries_bitmap和dirty_sentries
__mark_sit_entry_dirty(sbi, segno);
/* update total number of valid blocks to be written in ckpt area */
// 这个值会应用到后续check point
SIT_I(sbi)->written_valid_blocks += del;
// 略
if (sbi->segs_per_sec > 1)
get_sec_entry(sbi, segno)->valid_blocks += del;
}
static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
sit_i->dirty_sentries++;
}
The idea behind SIT maintenance is simple: convert the block address to a segment number and index into the flat array of segment entries, then update the bitmap inside that entry (when the old and new addresses differ, one gets a set_bit and the other a clear_bit), paving the way for the next checkpoint. A toy version of this bookkeeping follows.
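A toy version of the bookkeeping update_sit_entry() performs per segment, with a valid-block bitmap plus a counter; the new address gets a set bit (del = +1) and the overwritten old address a cleared bit (del = -1). This is only a model, not the kernel structure.
#include <stdio.h>
#include <stdint.h>

#define BLOCKS_PER_SEG 512

struct seg_entry {
	uint8_t cur_valid_map[BLOCKS_PER_SEG / 8];
	unsigned short valid_blocks;
};

static void update_sit_entry(struct seg_entry *se, unsigned int off, int del)
{
	uint8_t mask = 1u << (off & 7);

	if (del > 0)
		se->cur_valid_map[off >> 3] |= mask;    /* block became valid */
	else
		se->cur_valid_map[off >> 3] &= ~mask;   /* block was invalidated */
	se->valid_blocks += del;
}

int main(void)
{
	struct seg_entry se = { 0 };

	update_sit_entry(&se, 10, +1);  /* block written at offset 10 */
	update_sit_entry(&se, 99, +1);  /* another block at offset 99 */
	update_sit_entry(&se, 10, -1);  /* offset 10 overwritten elsewhere */
	printf("valid blocks in this segment: %u\n",
	       (unsigned)se.valid_blocks);             /* 1 */
	return 0;
}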
Segment allocation
Because the append write keeps appending, it eventually hits the case where the current segment is full and a new one has to be brought in:
static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
/*
* flush out current segment and replace it with new segment
* This function should be returned with success, otherwise BUG
*/
static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
int type, bool force)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int ofs_unit;
if (force) {
new_curseg(sbi, type, true);
goto out;
}
ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
// TODO next_segno
// 涉及dirty seglist,需要了解Delete operation或者GC
curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
// 如果有next segment,那就直接复用
if (curseg->next_segno != NULL_SEGNO)
change_curseg(sbi, type, false);
// 没有的话那就看type决定策略。一般来说,LFS-style就是新增一个segment
else if (type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
out:
sbi->segment_count[curseg->alloc_type]++;
}
/*
* This function always allocates a used segment (from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int new_segno = curseg->next_segno;
struct f2fs_summary_block *sum_node;
struct page *sum_page;
write_sum_page(sbi, curseg->sum_blk,
GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
mutex_lock(&dirty_i->seglist_lock);
__remove_dirty_segment(sbi, new_segno, PRE);
__remove_dirty_segment(sbi, new_segno, DIRTY);
mutex_unlock(&dirty_i->seglist_lock);
reset_curseg(sbi, type, 1);
curseg->alloc_type = SSR;
__next_free_blkoff(sbi, curseg, 0);
if (reuse) {
sum_page = get_sum_page(sbi, new_segno);
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
f2fs_put_page(sum_page, 1);
}
}
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
*/
static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno = curseg->segno;
int dir = ALLOC_LEFT;
// #define GET_SUM_BLOCK(sbi, segno) ((sbi->sm_info->ssa_blkaddr) + segno)
// write_sum_page() copies the in-memory curseg summary block into its
// page-cache page at GET_SUM_BLOCK(sbi, segno), i.e.
// memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE),
// so the old segment's summary gets written back to the SSA area later
write_sum_page(sbi, curseg->sum_blk,
GET_SUM_BLOCK(sbi, curseg->segno));
// DATA分配方向,HOT往左,WARM/COLD往右
if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
if (test_opt(sbi, NOHEAP))
dir = ALLOC_RIGHT;
get_new_segment(sbi, &segno, new_sec, dir);
// 得到新的segno
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
curseg->alloc_type = LFS;
}
/*
* Find a new segment from the free segments bitmap to right order
* This function should be returned with success, otherwise BUG
*/
static void get_new_segment(struct f2fs_sb_info *sbi,
unsigned int *newseg, bool new_sec, int dir)
{
struct free_segmap_info *free_i = FREE_I(sbi);
// (一些无关紧要的)NOTE:
// 虽然segment和section是同一单位
// 但是section count一般只统计main area
// 详见前面mkfs文章的dump信息
//
// 得到section总数目
unsigned int total_secs = sbi->total_sections;
unsigned int segno, secno, zoneno;
// 换算到zone
unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
// 当前上下文的newseg是旧的curseg->segno
unsigned int hint = *newseg / sbi->segs_per_sec;
unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
unsigned int left_start = hint;
bool init = true;
int go_left = 0;
int i;
write_lock(&free_i->segmap_lock);
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
// NOTE:
// 关于find_next_zero_bit()的签名
// 第二个参数是bitmap size,第三个参数是开始查找的起始位置
segno = find_next_zero_bit(free_i->free_segmap,
TOTAL_SEGS(sbi), *newseg + 1);
if (segno < TOTAL_SEGS(sbi))
goto got_it;
}
find_other_zone:
// 查找下一个可用section
secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
// 没找到?
if (secno >= total_secs) {
// 循环左移查找
if (dir == ALLOC_RIGHT) {
secno = find_next_zero_bit(free_i->free_secmap,
total_secs, 0);
BUG_ON(secno >= total_secs);
} else {
go_left = 1;
left_start = hint - 1;
}
}
if (go_left == 0)
goto skip_left;
// 右边不行再往左找
while (test_bit(left_start, free_i->free_secmap)) {
if (left_start > 0) {
left_start--;
continue;
}
left_start = find_next_zero_bit(free_i->free_secmap,
total_secs, 0);
BUG_ON(left_start >= total_secs);
break;
}
// 定位到0-bit
secno = left_start;
skip_left:
hint = secno;
segno = secno * sbi->segs_per_sec;
zoneno = secno / sbi->secs_per_zone;
/* give up on finding another zone */
if (!init)
goto got_it;
// 默认走这里
if (sbi->secs_per_zone == 1)
goto got_it;
if (zoneno == old_zoneno)
goto got_it;
if (dir == ALLOC_LEFT) {
if (!go_left && zoneno + 1 >= total_zones)
goto got_it;
if (go_left && zoneno == 0)
goto got_it;
}
for (i = 0; i < NR_CURSEG_TYPE; i++)
if (CURSEG_I(sbi, i)->zone == zoneno)
break;
if (i < NR_CURSEG_TYPE) {
/* zone is in user, try another */
if (go_left)
hint = zoneno * sbi->secs_per_zone - 1;
else if (zoneno + 1 >= total_zones)
hint = 0;
else
hint = (zoneno + 1) * sbi->secs_per_zone;
init = false;
goto find_other_zone;
}
got_it:
/* set it as dirty segment in free segmap */
BUG_ON(test_bit(segno, free_i->free_segmap));
// 更新free segmap:setbit且free_i->free_segments--(&& free_sections--)
__set_inuse(sbi, segno);
*newseg = segno;
write_unlock(&free_i->segmap_lock);
}
// 一些基本的初始化
static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
struct summary_footer *sum_footer;
curseg->segno = curseg->next_segno;
curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
// 因为是全新的segment,所以是0
curseg->next_blkoff = 0;
// 通过链表形式找到同一log的所有segment,old seg的next_segno就是指向现在的curseg
curseg->next_segno = NULL_SEGNO;
// footer在前面mkfs聊过,区分类型和校验相关
sum_footer = &(curseg->sum_blk->footer);
memset(sum_footer, 0, sizeof(struct summary_footer));
if (IS_DATASEG(type))
SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
if (IS_NODESEG(type))
SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
__set_sit_entry_type(sbi, type, curseg->segno, modified);
}
The allocation policy is simple even though the implementation looks involved: try to reuse an existing segment first (SSR), otherwise search the free segmap for a free slot, scanning in a direction that depends on the log type
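Stripped of the zone bookkeeping, the free-segmap search boils down to a directional scan with wrap-around. Below is a toy user-space sketch under that simplification; find_free_right()/find_free_left() are hypothetical names and bear no relation to the kernel bitmap API.
#include <stdbool.h>
#include <stdio.h>
#define TOTAL_SEGS 16
/* toy free map: true = free; a stand-in for free_i->free_segmap */
static bool seg_free[TOTAL_SEGS];
/* scan to the right from hint, wrapping to 0 if nothing is found,
 * roughly what the ALLOC_RIGHT case of get_new_segment() does */
static int find_free_right(int hint)
{
	for (int i = hint; i < TOTAL_SEGS; i++)
		if (seg_free[i])
			return i;
	for (int i = 0; i < hint; i++)
		if (seg_free[i])
			return i;
	return -1;
}
/* scan to the left from hint, falling back to a scan from 0,
 * roughly the go_left path */
static int find_free_left(int hint)
{
	for (int i = hint; i >= 0; i--)
		if (seg_free[i])
			return i;
	return find_free_right(0);
}
int main(void)
{
	seg_free[3] = seg_free[12] = true;
	printf("right from 5 -> %d\n", find_free_right(5));  /* 12 */
	printf("left  from 5 -> %d\n", find_free_left(5));   /* 3  */
	return 0;
}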
Read operation
Callback registration
As with the write path, F2FS relies on the generic read machinery for f_op (not listed here), while the a_op callbacks are implemented by F2FS itself
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
.readpages = f2fs_read_data_pages,
// ...
};
static int f2fs_read_data_page(struct file *file, struct page *page)
{
// 好吧,起码get_block_t回调确实是F2FS写的
return mpage_readpage(page, get_data_block_ro);
}
The key is the get_block_t callback, get_data_block_ro()
Main flow
/*
* This function should be used by the data read flow only where it
* does not check the "create" flag that indicates block allocation.
* The reason for this special functionality is to exploit VFS readahead
* mechanism.
*/
static int get_data_block_ro(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
pgoff_t pgofs;
int err;
/* Get the page offset from the block offset(iblock) */
pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
if (check_extent_cache(inode, pgofs, bh_result))
return 0;
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
if (err)
return (err == -ENOENT) ? 0 : err;
/* It does not support data allocation */
BUG_ON(create);
if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
int i;
unsigned int end_offset;
end_offset = IS_INODE(dn.node_page) ?
ADDRS_PER_INODE :
ADDRS_PER_BLOCK;
clear_buffer_new(bh_result);
/* Give more consecutive addresses for the read ahead */
for (i = 0; i < end_offset - dn.ofs_in_node; i++)
if (((datablock_addr(dn.node_page,
dn.ofs_in_node + i))
!= (dn.data_blkaddr + i)) || maxblocks == i)
break;
map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
bh_result->b_size = (i << blkbits);
}
f2fs_put_dnode(&dn);
return 0;
}
Compared with the eventful write path, the F2FS read path leaves little to discuss: it simply resolves the block address and fills in the buffer_head. If you are interested, see my earlier write-ups on the VFS read path and the readahead mechanism. A toy model of the contiguous-mapping trick is sketched below.
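One detail worth illustrating is how get_data_block_ro() helps readahead: it counts how many on-disk addresses after the first one are physically consecutive and enlarges bh_result->b_size accordingly. A minimal user-space model of that counting loop follows; count_contiguous() is a hypothetical name, not a kernel function.
#include <stdio.h>
#include <stdint.h>
/* Given the block addresses stored in one node page, count how many of
 * them are physically consecutive starting at index `ofs`, capped by the
 * number of blocks the caller asked for (maxblocks). */
static unsigned int count_contiguous(const uint32_t *addrs, unsigned int nr_addrs,
				     unsigned int ofs, unsigned int maxblocks)
{
	uint32_t start = addrs[ofs];
	unsigned int i;
	for (i = 0; ofs + i < nr_addrs && i < maxblocks; i++)
		if (addrs[ofs + i] != start + i)
			break;
	return i;
}
int main(void)
{
	/* pretend these came from datablock_addr(dn.node_page, ...) */
	uint32_t addrs[] = { 100, 101, 102, 200, 201 };
	unsigned int blkbits = 12;                      /* 4 KB blocks */
	unsigned int n = count_contiguous(addrs, 5, 0, 16);
	/* mirrors bh_result->b_size = (i << blkbits) */
	printf("mapped %u blocks, b_size = %u bytes\n", n, n << blkbits);
	return 0;
}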
Delete operation
Garbage collection
The GC entry point in F2FS is f2fs_gc(). This function has two callers:
- f2fs_balance_fs(), which we encountered earlier
- gc_thread_func(), executed by a background kthread
Background GC
Background GC is largely independent of the main GC flow, so let's go through it briefly first.
As seen earlier in the f2fs_fill_super() initialization flow, it calls start_gc_thread()
int start_gc_thread(struct f2fs_sb_info *sbi)
{
// 一个kthread封装类
struct f2fs_gc_kthread *gc_th;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th)
return -ENOMEM;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
// kthread的任务是gc_thread_func
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
GC_THREAD_NAME);
if (IS_ERR(gc_th->f2fs_gc_task)) {
kfree(gc_th);
return -ENOMEM;
}
return 0;
}
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
long wait_ms;
// GC_THREAD_MIN_SLEEP_TIME: 10s
wait_ms = GC_THREAD_MIN_SLEEP_TIME;
do {
// freezer特性,如果系统处于休眠则等待
if (try_to_freeze())
continue;
else
// 等待wait_ms时间,这个数值会在后续动态调节
wait_event_interruptible_timeout(*wq,
kthread_should_stop(),
msecs_to_jiffies(wait_ms));
if (kthread_should_stop())
break;
// 如果没有足够free section,会强制进入主流程f2fs_gc()
// 否则什么都不干
f2fs_balance_fs(sbi);
if (!test_opt(sbi, BG_GC))
continue;
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
* 1. There are enough dirty segments.
* 2. IO subsystem is idle by checking the # of writeback pages.
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
* Note) We have to avoid triggering GCs too much frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
if (!mutex_trylock(&sbi->gc_mutex))
continue;
if (!is_idle(sbi)) {
wait_ms = increase_sleep_time(wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
// if invalid blocks exceed 40% (and free space is low), shorten the wait
// so background GC runs more often
if (has_enough_invalid_blocks(sbi))
// subtract 10s, but never drop below the 10s floor
wait_ms = decrease_sleep_time(wait_ms);
else
// add 10s, capped at the 30s ceiling
wait_ms = increase_sleep_time(wait_ms);
sbi->bg_gc++;
// 进入GC主流程
if (f2fs_gc(sbi) == GC_NONE)
// 作者的意思应该是没有发生GC,可能的情况是:
// - super block尚未完成初始化
// - victim挑选失败(比如每一个segno都成本过高)
//
// GC_THREAD_NOGC_SLEEP_TIME: 10s
wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
// 如果上一次走了NOGC的情况,这次强制降低频率
// GC_THREAD_MAX_SLEEP_TIME: 30s
wait_ms = GC_THREAD_MAX_SLEEP_TIME;
} while (!kthread_should_stop());
return 0;
}
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
{
// 没有写过的block数目
// NOTE:
// LFS写操作下,旧block不算入written
// 详见update_sit_entry()的written_valid_blocks更新
block_t invalid_user_blocks = sbi->user_block_count -
written_block_count(sbi);
/*
* Background GC is triggered with the following condition.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
// invalid user超过了40%总数,free user小于40%总数
if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
free_user_blocks(sbi) < limit_free_user_blocks(sbi))
return true;
return false;
}
static inline long decrease_sleep_time(long wait)
{
// 减去10s,但不得低于10s
wait -= GC_THREAD_MIN_SLEEP_TIME;
if (wait <= GC_THREAD_MIN_SLEEP_TIME)
wait = GC_THREAD_MIN_SLEEP_TIME;
return wait;
}
static inline long increase_sleep_time(long wait)
{
// 加上10s,但不得高于30s
wait += GC_THREAD_MIN_SLEEP_TIME;
if (wait > GC_THREAD_MAX_SLEEP_TIME)
wait = GC_THREAD_MAX_SLEEP_TIME;
return wait;
}
Background GC is therefore a task that dynamically tunes its own frequency: based on current block usage and the outcome of the previous GC round, it runs once every \([10, 30]\) seconds. A small model of the triggering condition is sketched below
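The following standalone sketch models the has_enough_invalid_blocks() condition as the annotations above interpret it (both thresholds taken as 40 % of the user blocks). The LIMIT_* constants and struct fs_state are assumptions for illustration, not the kernel's exact bookkeeping.
#include <stdio.h>
#include <stdint.h>
/* assumed thresholds, mirroring the 40% interpretation above */
#define LIMIT_INVALID_BLOCK	40
#define LIMIT_FREE_BLOCK	40
struct fs_state {
	uint64_t user_blocks;
	uint64_t written_blocks;	/* blocks currently holding valid data */
	uint64_t free_blocks;		/* blocks sitting in free segments */
};
static int should_speed_up_bg_gc(const struct fs_state *s)
{
	uint64_t invalid = s->user_blocks - s->written_blocks;
	return invalid > s->user_blocks * LIMIT_INVALID_BLOCK / 100 &&
	       s->free_blocks < s->user_blocks * LIMIT_FREE_BLOCK / 100;
}
int main(void)
{
	struct fs_state s = {
		.user_blocks = 1000000,
		.written_blocks = 500000,	/* 50 % invalid */
		.free_blocks = 300000,		/* 30 % free    */
	};
	printf("speed up background GC? %s\n",
	       should_speed_up_bg_gc(&s) ? "yes" : "no");
	return 0;
}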
Main flow
Whether foreground or background, GC goes through the same entry point, f2fs_gc(). The background case was covered above; foreground GC is hooked into several code paths via f2fs_balance_fs(), which enters the main flow whenever the number of free sections falls below the expected threshold
int f2fs_gc(struct f2fs_sb_info *sbi)
{
struct list_head ilist;
unsigned int segno, i;
int gc_type = BG_GC;
int gc_status = GC_NONE;
INIT_LIST_HEAD(&ilist);
gc_more:
// MS_ACTIVE is set by VFS once the superblock is fully set up during
// mount and cleared early in unmount, so this bails out of GC on a
// filesystem that is being torn down
if (!(sbi->sb->s_flags & MS_ACTIVE))
goto stop;
// 如果没有足够的free section
// 那就改为FG GC,默认是BG GC
if (has_not_enough_free_secs(sbi))
gc_type = FG_GC;
// victim selection过程,完成结果传入segno
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
goto stop;
// 论文提到,GC执行是以section为单位的
// 以拿到的segment为起始遍历section长度
for (i = 0; i < sbi->segs_per_sec; i++) {
/*
* do_garbage_collect will give us three gc_status:
* GC_ERROR, GC_DONE, and GC_BLOCKED.
* If GC is finished uncleanly, we have to return
* the victim to dirty segment list.
*/
// 核心函数
gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type);
if (gc_status != GC_DONE)
break;
}
if (has_not_enough_free_secs(sbi)) {
write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
if (has_not_enough_free_secs(sbi))
goto gc_more;
}
stop:
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&ilist);
return gc_status;
}
Before GC actually runs, a victim has to be selected
Victim selection
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
int gc_type, int type)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
mutex_lock(&sit_i->sentry_lock);
ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
mutex_unlock(&sit_i->sentry_lock);
return ret;
}
/* victim selection function for cleaning and SSR */
struct victim_selection {
int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
int, int, char);
};
static const struct victim_selection default_v_ops = {
.get_victim = get_victim_by_default,
};
/*
* This function is called from two pathes.
* One is garbage collection and the other is SSR segment selection.
* When it is called during GC, it just gets a victim segment
* and it does not remove it from dirty seglist.
* When it is called from SSR segment selection, it finds a segment
* which has minimum valid blocks and removes it from dirty seglist.
*/
static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
unsigned int segno;
int nsearched = 0;
// 如果空间不足,这里固定为LFS
p.alloc_mode = alloc_mode;
// 如果空间不足,这里确定:
// - gc_mode为greedy
// - ofs_unit为1
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = get_max_cost(sbi, &p);
mutex_lock(&dirty_i->seglist_lock);
if (p.alloc_mode == LFS && gc_type == FG_GC) {
// 先尝试获取BG_GC获得的victim
// min_segno应该是min_cost对应的segment
// 毕竟BG的算法是min_cost的
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
// 如果有就立刻结束
goto got_it;
}
// 否则再按照算法查找
while (1) {
unsigned long cost;
// p.offset指的是last scanned bitmap offset,因此从这里开始
// 从dirty segmap选择一个dirty segment
segno = find_next_bit(p.dirty_segmap,
TOTAL_SEGS(sbi), p.offset);
// 没找到dirty segment
// 循环会跳出
if (segno >= TOTAL_SEGS(sbi)) {
// last_victim[]: one slot per gc_mode
// reached the end of the bitmap; if the scan started from a non-zero
// last_victim, clear it and rescan once from offset 0 (wrap-around)
if (sbi->last_victim[p.gc_mode]) {
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
break;
}
// 下一个find_next_bit的开始地址
p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
// NOTE: victim_segmap默认为空,见init_victim_segmap()
// 但是挑选的dirty segment这里已经被victim segmap标记
// 更新时机见got_it部分
if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
continue;
if (gc_type == BG_GC &&
test_bit(segno, dirty_i->victim_segmap[BG_GC]))
continue;
// skip sections that are currently in use by any curseg
if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
continue;
// 见下方具体流程
// 在当前的上下文中,就是获取valid block数目作为成本
cost = get_gc_cost(sbi, segno, &p);
// 如果找到真·min_cost,那就更新对应的min_segno
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
}
// 如果成本太大,不能选
// 需要下一次循环用新的p.offset再来一遍
if (cost == get_max_cost(sbi, &p))
continue;
// 迭代是有限次数的,MAX_VICTIM_SEARCH = 20
if (nsearched++ >= MAX_VICTIM_SEARCH) {
// 最后一次找到的segno,不管是否min都记下了
// 这有什么用呢?我认为是加速GC,因为在select_policy中,
// 可以看到GC扫描开始的p.offset就是使用last_victim
sbi->last_victim[p.gc_mode] = segno;
break;
}
}
got_it:
if (p.min_segno != NULL_SEGNO) {
// 从这里可以看出最终目标就是求出p.min_segno
// 这里一直有除后再乘上的操作是因为作者假定section和segment不是统一单位
*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
if (p.alloc_mode == LFS) {
int i;
// 标记对应的victim segmap
for (i = 0; i < p.ofs_unit; i++)
set_bit(*result + i,
dirty_i->victim_segmap[gc_type]);
}
}
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
int type, struct victim_sel_policy *p)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
// LFS走这里
if (p->alloc_mode) {
// 在当前的流程中,p已经确定是LFS,因此是GREEDY
p->gc_mode = GC_GREEDY;
p->dirty_segmap = dirty_i->dirty_segmap[type];
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(gc_type);
// 这里有点特殊,用的是dirty_segmap[DIRTY]
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
p->offset = sbi->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
struct victim_sel_policy *p)
{
if (p->gc_mode == GC_GREEDY)
return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
// CB是Cost-Benefit的意思
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
return 0;
}
static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int segno;
/*
* If the gc_type is FG_GC, we can select victim segments
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
TOTAL_SEGS(sbi), 0);
if (segno < TOTAL_SEGS(sbi)) {
clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
return segno;
}
return NULL_SEGNO;
}
static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
struct victim_sel_policy *p)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
// 从select_policy()分析得出目前流程会走到这里
// 查找valid block数目最少的segno,避免迁移成本
// NOTE: 这里不用取最小,这个工作在get_victim_by_default()完成
// 实现上就是得到sentry的valid_blocks字段(非常直接)
return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
else
// 最优成本算法,考虑老化
return get_cb_cost(sbi, segno);
}
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned int secno = GET_SECNO(sbi, segno);
unsigned int start = secno * sbi->segs_per_sec;
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
unsigned char u;
unsigned int i;
// 累计mtime
// 这里mtime是每个segment加起来的
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
// 换算mtime为segment平均水平
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
// 参数1:u,块利用率(util%)的意思
// u = vblock * 100 / 512
// = vblock / 512 * 100 (vblock <= 512)
// <= 100
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
/* Handle if the system time is changed by user */
// min_mtime/max_mtime are filesystem-wide bounds kept in sit_info, not
// per-segment values, so this section's average mtime can legitimately
// fall outside them (e.g. after the system time changes); widen them here
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
sit_i->max_mtime = mtime;
// 避免除以0
if (sit_i->max_mtime != sit_i->min_mtime)
// 参数2:age,一个segment的年纪
// age = 100 - 100*(avg - min)/(max - min)
// 这个估值方式有点疑问,为什么是单调时钟而不考虑sbi设墙上时钟?
// 总之是这个算法吧,很久没任何改动就大概是年纪大(avg≈min,age≈100)
age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
sit_i->max_mtime - sit_i->min_mtime);
// 利用率部分为(100-u)/(100+u),u范围[0,100]的结果是单调递减的
// 考虑负数的优先级反转后(且要尽可能小的数值),总结:
// - 年纪越大,越容易毕业
// - 利用率越低,越容易毕业
return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
Although the walk-through is a bit tedious, it exposes the details of the two selection policies:
- With the greedy policy, a candidate's cost is simply its valid block count, an \(O(1)\) lookup that favors the segment with the lowest utilization
- With the cost-benefit policy, computing a candidate's cost takes \(O(segments\_per\_section)\) work and weighs both the section's age and its utilization
One question: with 1:1:1 units the cost-benefit computation also degrades to \(O(1)\), so would it be better to use it for every selection in this context? I doubt it. First, the constant factor of the cost-benefit computation is still noticeably higher than greedy's; second, both are merely cost heuristics, and cost-benefit only looks like the more precise estimate. A compact rendering of the two cost functions follows
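The sketch below re-derives the two cost functions quoted above in plain user-space C, for a single segment (segs_per_sec = 1). greedy_cost()/cb_cost() are illustrative names; the search loop, clamping of min/max mtime, and the victim comparison are intentionally omitted.
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
#define LOG_BLOCKS_PER_SEG 9	/* 512 blocks per 2 MB segment */
/* greedy: the cost is simply the number of valid blocks to migrate */
static unsigned int greedy_cost(unsigned int valid_blocks)
{
	return valid_blocks;
}
/* cost-benefit: combine utilization u (0..100) and age (0..100) the same
 * way get_cb_cost() does; a smaller return value means a better victim */
static unsigned int cb_cost(unsigned int valid_blocks,
			    uint64_t mtime, uint64_t min_mtime, uint64_t max_mtime)
{
	unsigned int u = (valid_blocks * 100) >> LOG_BLOCKS_PER_SEG;
	unsigned int age = 0;
	if (max_mtime != min_mtime)
		age = 100 - (unsigned int)(100 * (mtime - min_mtime) /
					   (max_mtime - min_mtime));
	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
int main(void)
{
	/* two candidates: a cold, mostly-invalid segment and a hot, nearly full one */
	printf("cold, ~10%% valid : cost %u\n", cb_cost(51, 1000, 1000, 9000));
	printf("hot, ~90%% valid  : cost %u\n", cb_cost(461, 8900, 1000, 9000));
	printf("greedy, 51 valid  : cost %u\n", greedy_cost(51));
	return 0;
}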
GC procedure
Once victim selection yields a segno, the core GC routine do_garbage_collect() can run; it covers the segments numbered \([segno, segno + segments\_per\_section)\), one segment per call
static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
struct list_head *ilist, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
int ret = GC_DONE;
/* read segment summary of victim */
// 获取关联的summary
sum_page = get_sum_page(sbi, segno);
if (IS_ERR(sum_page))
return GC_ERROR;
/*
* CP needs to lock sum_page. In this time, we don't need
* to lock this page, because this summary page is not gone anywhere.
* Also, this page is not gonna be updated before GC is done.
*/
unlock_page(sum_page);
sum = page_address(sum_page);
// 区分类型的GC操作
switch (GET_SUM_TYPE((&sum->footer))) {
case SUM_TYPE_NODE:
ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
break;
case SUM_TYPE_DATA:
ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
break;
}
stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
stat_inc_call_count(sbi->stat_info);
f2fs_put_page(sum_page, 0);
return ret;
}
/*
* This function tries to get parent node of victim data block, and identifies
* data block validity. If the block is valid, copy that with cold status and
* modify parent node.
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct list_head *ilist, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
block_t start_addr;
int err, off;
int phase = 0;
// segno起始block转为block_t单位
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
// 遍历segment,off是block个数偏移量
// 完整for一遍后才会phase递增,再执行for
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
block_t start_bidx;
/*
* It makes sure that free segments are able to write
* all the dirty node pages before CP after this CP.
* So let's check the space of dirty node pages.
*/
if (should_do_checkpoint(sbi)) {
mutex_lock(&sbi->cp_mutex);
block_operations(sbi);
err = GC_BLOCKED;
goto stop;
}
// 根据segno获取sentry
// 再根据off进行sentry->cur_valid_map的testbit
// 正常返回GC_OK (bit:1)
// NOTE:
// the bitmap is maintained by update_sit_entry() (see the SIT update
// section above): a write sets the bit of the newly written block and
// clears the bit of the stale one, so a set bit here means the block
// in the victim segment is still valid and must be migrated
err = check_valid_map(sbi, segno, off);
if (err == GC_NEXT)
continue;
if (phase == 0) {
// 阶段0,预读node page
//(nid通过block寻址获得,赋值callee见set_summary())
// NOTE: 这些预读是为下一个phase确保page已在内存中
ra_node_page(sbi, le32_to_cpu(entry->nid));
continue;
}
/* Get an inode by ino with checking validity */
// 获取dni,可进一步获取inode
err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
if (err == GC_NEXT)
continue;
if (phase == 1) {
// 阶段1,预读inode page
ra_node_page(sbi, dni.ino);
continue;
}
// start_bidx含义: start block index indicating the given node offset
start_bidx = start_bidx_of_node(nofs);
// TODO: fill_node_footer()
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 2) {
// 获得inode实例
inode = f2fs_iget_nowait(sb, dni.ino);
if (IS_ERR(inode))
continue;
// 以start_bidx + ofs_in_node为page index定位到data page
data_page = find_data_page(inode,
start_bidx + ofs_in_node);
if (IS_ERR(data_page))
goto next_iput;
f2fs_put_page(data_page, 0);
// inode加入到gc list(ilist,是一个栈上的链表)
add_gc_inode(inode, ilist);
// 这里phase == 3
} else {
// 遍历找到ino
inode = find_gc_inode(dni.ino, ilist);
if (inode) {
data_page = get_lock_data_page(inode,
start_bidx + ofs_in_node);
if (IS_ERR(data_page))
continue;
move_data_page(inode, data_page, gc_type);
stat_inc_data_blk_count(sbi, 1);
}
}
continue;
next_iput:
iput(inode);
}
if (++phase < 4)
goto next_step;
err = GC_DONE;
stop:
if (gc_type == FG_GC)
f2fs_submit_bio(sbi, DATA, true);
return err;
}
static void add_gc_inode(struct inode *inode, struct list_head *ilist)
{
struct list_head *this;
// inode_entry是个<list_head*, inode*>二元组
struct inode_entry *new_ie, *ie;
// 避免重复加入
list_for_each(this, ilist) {
ie = list_entry(this, struct inode_entry, list);
if (ie->inode == inode) {
iput(inode);
return;
}
}
repeat:
new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
if (!new_ie) {
cond_resched();
goto repeat;
}
new_ie->inode = inode;
list_add_tail(&new_ie->list, ilist);
}
static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
{
struct list_head *this;
struct inode_entry *ie;
list_for_each(this, ilist) {
ie = list_entry(this, struct inode_entry, list);
if (ie->inode->i_ino == ino)
return ie->inode;
}
return NULL;
}
static void move_data_page(struct inode *inode, struct page *page, int gc_type)
{
if (page->mapping != inode->i_mapping)
goto out;
if (inode != page->mapping->host)
goto out;
if (PageWriteback(page))
goto out;
// BG的时候没有实际移动,而是设为COLD
if (gc_type == BG_GC) {
set_page_dirty(page);
set_cold_data(page);
} else {
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
mutex_lock_op(sbi, DATA_WRITE);
if (clear_page_dirty_for_io(page) &&
S_ISDIR(inode->i_mode)) {
dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
}
set_cold_data(page);
// 写到新的位置,见Write operation章节
do_write_data_page(page);
mutex_unlock_op(sbi, DATA_WRITE);
clear_cold_data(page);
}
out:
f2fs_put_page(page, 1);
}
Once the victim section is decided, the core flow resolves each block through its address mapping and rewrites it. Since the rewrite is LFS-style (the page is marked COLD, so it is necessarily LFS), the data migrates to a new address. After the (few) valid blocks have been moved out one by one, the whole section becomes reusable again. A toy model of the phased scan is sketched below
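One structural point worth isolating is the multi-pass ("phase") scan in gc_data_segment(): early passes only issue readahead so that the final pass finds everything cached. The sketch below is a toy model with made-up arrays and stub functions; it only demonstrates the pass structure, not the dnode/inode lookups.
#include <stdio.h>
#include <stdbool.h>
#define BLOCKS 8
/* toy victim segment: which blocks are still valid, and which node page
 * each block's address lives in */
static bool valid[BLOCKS]         = { 1, 0, 1, 0, 0, 1, 0, 0 };
static int  node_of_block[BLOCKS] = { 3, 3, 3, 7, 7, 7, 9, 9 };
static void readahead_node(int nid)  { printf("  readahead node %d\n", nid); }
static void move_block(int blk)      { printf("  move block %d\n", blk); }
int main(void)
{
	/* walk the same summary entries several times; the real code uses
	 * four phases (node readahead, inode readahead, collect, move) */
	for (int phase = 0; phase < 2; phase++) {
		printf("phase %d:\n", phase);
		for (int blk = 0; blk < BLOCKS; blk++) {
			if (!valid[blk])
				continue;	/* skipped via check_valid_map() */
			if (phase == 0)
				readahead_node(node_of_block[blk]);
			else
				move_block(blk);
		}
	}
	return 0;
}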
Recovery
As the paper describes, F2FS implements roll-back recovery via checkpoints. A checkpoint can be triggered at umount() and sync(); let's look at the umount() path first
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
// 触发check point
write_checkpoint(sbi, false, true);
iput(sbi->node_inode);
iput(sbi->meta_inode);
/* destroy f2fs internal modules */
destroy_node_manager(sbi);
destroy_segment_manager(sbi);
kfree(sbi->ckpt);
sb->s_fs_info = NULL;
brelse(sbi->raw_super_buf);
kfree(sbi);
}
Taking umount() as the example, the upper layer calls f2fs_put_super(), which internally runs write_checkpoint() to trigger a checkpoint
Main flow
The outline of write_checkpoint() is in fact already given in the paper:
F2FS triggers a checkpoint procedure as follows:
(1) All dirty node and dentry blocks in the page cache are flushed;
(2) It suspends ordinary writing activities including system calls such as create, unlink and mkdir;
(3) The file system metadata, NAT, SIT and SSA, are written to their dedicated areas on the disk; and
(4) Finally, F2FS writes a checkpoint pack.
/*
* We guarantee that this checkpoint procedure should not fail.
*/
void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
if (!blocked) {
mutex_lock(&sbi->cp_mutex);
// 内部会mutex lock若干的锁
block_operations(sbi);
}
// DATA和NODE均是main area block
// META则是main area之外的区域
// NOTE: 然而META多是内存数据结构,刷下去也未必是最新的
f2fs_submit_bio(sbi, DATA, true);
f2fs_submit_bio(sbi, NODE, true);
f2fs_submit_bio(sbi, META, true);
/*
* update checkpoint pack index
* Increase the version number so that
* SIT entries and seg summaries are written at correct place
*/
// 获取check point版本号并且递增
ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
/* write cached NAT/SIT entries to NAT/SIT area */
flush_nat_entries(sbi);
flush_sit_entries(sbi);
reset_victim_segmap(sbi);
/* unlock all the fs_lock[] in do_checkpoint() */
do_checkpoint(sbi, is_umount);
unblock_operations(sbi);
mutex_unlock(&sbi->cp_mutex);
}
A minor deviation in detail is that the flushes are issued after ordinary operations have been blocked (the blocking itself is done by taking a set of mutexes)
NAT flush
Here is how the NAT flush in step (3) is done
/*
* This function is called during the checkpointing process.
*/
void flush_nat_entries(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
struct list_head *cur, *n;
struct page *page = NULL;
struct f2fs_nat_block *nat_blk = NULL;
nid_t start_nid = 0, end_nid = 0;
bool flushed;
// true表示journal数目(curseg->sum_blk->n_nats)已满
// 即数目超过(SUM_JOURNAL_SIZE - 2)/sizeof(struct nat_journal_entry)
// SUM_JOURNAL_SIZE表示一个4k block减去summary_footer(5字节)和entry(7字节*512)
//
// 如果没满,直接返回false,否则刷入到dirty_nat_entries再返回true
flushed = flush_nats_in_journal(sbi);
if (!flushed)
mutex_lock(&curseg->curseg_mutex);
/* 1) flush dirty nat caches */
// dirty_nat_entries一般是在set_node_addr()时更新
// set_node_addr()的caller有f2fs_write_node_page(),new_node_page()
// 除此以外上面的flush_nats_in_journal也会记录到dirty_nat_entries
// 这个具体流程有空再补吧
//
// 遍历dirty_nat_entries并刷出到raw_ne
list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
struct nat_entry *ne;
nid_t nid;
struct f2fs_nat_entry raw_ne;
int offset = -1;
block_t new_blkaddr;
ne = list_entry(cur, struct nat_entry, list);
nid = nat_get_nid(ne);
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
if (flushed)
goto to_nat_page;
/* if there is room for nat enries in curseg->sumpage */
// 在这个上下文中,该函数主要遍历测试i∈[0, nats_in_cursum(sum))
// if nid_in_journal(sum, i)) == val,找到就返回i
// #define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
if (offset >= 0) {
// 得到sum->nat_j.entries[offset].ne
// 这里是!flushed,raw_ne从journal得到
// 另外的来源(flused情况)见to_nat_page
raw_ne = nat_in_journal(sum, offset);
goto flush_now;
}
to_nat_page:
if (!page || (start_nid > nid || nid > end_nid)) {
// 因为这是在一个循环内部,多个nid可能对着一个page
// 直到超出范围才put掉
if (page) {
f2fs_put_page(page, 1);
page = NULL;
}
start_nid = START_NID(nid);
end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
/*
* get nat block with dirty flag, increased reference
* count, mapped and lock
*/
page = get_next_nat_page(sbi, start_nid);
// 实际的NAT block
nat_blk = page_address(page);
}
BUG_ON(!nat_blk);
// 如果是flushed,那么raw_ne是从nat_blk通过readpage得到的
raw_ne = nat_blk->entries[nid - start_nid];
flush_now:
new_blkaddr = nat_get_blkaddr(ne);
// 刷入ne三元组
raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
raw_ne.block_addr = cpu_to_le32(new_blkaddr);
raw_ne.version = nat_get_version(ne);
// 脏的要么写到NAT block,要么写到journal
if (offset < 0) {
nat_blk->entries[nid - start_nid] = raw_ne;
} else {
nat_in_journal(sum, offset) = raw_ne;
nid_in_journal(sum, offset) = cpu_to_le32(nid);
}
if (nat_get_blkaddr(ne) == NULL_ADDR) {
write_lock(&nm_i->nat_tree_lock);
__del_from_nat_cache(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
/* We can reuse this freed nid at this point */
add_free_nid(NM_I(sbi), nid);
} else {
write_lock(&nm_i->nat_tree_lock);
__clear_nat_cache_dirty(nm_i, ne);
ne->checkpointed = true;
write_unlock(&nm_i->nat_tree_lock);
}
}
if (!flushed)
mutex_unlock(&curseg->curseg_mutex);
f2fs_put_page(page, 1);
/* 2) shrink nat caches if necessary */
try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
}
The NAT flush writes dirty in-memory NAT entries back to storage. A given entry may live in the NAT area proper or only in the NAT journal, so where it gets flushed depends on whether the journal still has room; the journal is preferred in order to avoid frequent in-place writes to the NAT area. A simplified model of that decision is sketched below
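The sketch below reduces the journal-versus-NAT-block decision to its core: reuse an existing journal slot, otherwise allocate one if there is room, otherwise fall back to rewriting the NAT block. The struct, the tiny NAT_JOURNAL_ENTRIES limit, and nat_flush_one() are all assumptions for illustration; the real limit is derived from the summary block layout.
#include <stdio.h>
#include <stdint.h>
#define NAT_JOURNAL_ENTRIES 4	/* deliberately tiny for the demo */
struct nat_entry_sim { uint32_t nid; uint32_t blkaddr; };
static struct nat_entry_sim journal[NAT_JOURNAL_ENTRIES];
static int n_nats;			/* like curseg->sum_blk->n_nats */
/* try the journal first; return 0 on success, -1 if it is full and the
 * caller must rewrite the NAT block itself */
static int nat_flush_one(uint32_t nid, uint32_t blkaddr)
{
	for (int i = 0; i < n_nats; i++)
		if (journal[i].nid == nid) {
			journal[i].blkaddr = blkaddr;	/* already journaled */
			return 0;
		}
	if (n_nats < NAT_JOURNAL_ENTRIES) {
		journal[n_nats].nid = nid;
		journal[n_nats].blkaddr = blkaddr;
		n_nats++;
		return 0;
	}
	return -1;	/* journal full: write to the NAT area instead */
}
int main(void)
{
	for (uint32_t nid = 1; nid <= 6; nid++) {
		if (nat_flush_one(nid, 0x1000 + nid))
			printf("nid %u -> NAT block (journal full)\n", nid);
		else
			printf("nid %u -> NAT journal\n", nid);
	}
	return 0;
}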
SIT flush
SIT flush works the same way as NAT flush, again distinguishing the SIT area from the SIT journal
CP procedure
Once the dirty data has been flushed, the actual checkpoint write begins
static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
nid_t last_nid = 0;
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
unsigned int crc32 = 0;
void *kaddr;
int i;
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
// 求出last_nid,后面写到check point的next_free_nid
next_free_nid(sbi, &last_nid);
/*
* modify checkpoint
* version number is already updated
*/
// checkpoint_ver在NAT/SIT flush前已更新
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < 3; i++) {
ckpt->cur_node_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
ckpt->cur_node_blkoff[i] =
cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
ckpt->alloc_type[i + CURSEG_HOT_NODE] =
curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
}
for (i = 0; i < 3; i++) {
ckpt->cur_data_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
ckpt->cur_data_blkoff[i] =
cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
ckpt->alloc_type[i + CURSEG_HOT_DATA] =
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
ckpt->next_free_nid = cpu_to_le32(last_nid);
/* 2 cp + n data seg summary + orphan inode blocks */
// 这是summary在CP area占用的page/block个数
data_sum_blocks = npages_for_summary_flush(sbi);
// compact mode影响write_data_summaries()写入和restore_curseg_summaries()恢复
// 1-2个page内可以搞定,就设为compact
if (data_sum_blocks < 3)
set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
// orphan inode用于处理unlink/truncate时的一致性
// 基本思路:
// 1. 当需要删除/截断一个inode时,先把inode记录到orphan外存
// 2. 操作成功后再从orphan移除
// 3. 加载check point时会重做一遍仍在orphan里的操作
// 这样在1-2操作完成前宕机仍保持一致性
orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
/ F2FS_ORPHANS_PER_BLOCK;
ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
if (is_umount) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
} else {
clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
data_sum_blocks + orphan_blocks);
}
if (sbi->n_orphans)
set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
/* update SIT/NAT bitmap */
// __bitmap_ptr指向ckpt中对应的bitmap
// 这两个函数会把对应的sit_i->sit_bitmap和nm_i->nat_bitmap拷贝到ckpt
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
*(__le32 *)((unsigned char *)ckpt +
le32_to_cpu(ckpt->checksum_offset))
= cpu_to_le32(crc32);
// 这里的开始地址是有讲究的
// 前面讲过cp pack之间是交替存放的,间隔segment
start_blk = __start_cp_addr(sbi);
/* write out checkpoint buffer at block 0 */
cp_page = grab_meta_page(sbi, start_blk++);
kaddr = page_address(cp_page);
// 使用一个block存放ckpt
memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
// 使用orphan_blocks数目的block存放orphan
if (sbi->n_orphans) {
write_orphan_inodes(sbi, start_blk);
start_blk += orphan_blocks;
}
// compact mode和normal mode的写入方式不一样
// 使用data_sum_blocks数目的block存放data summary
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
if (is_umount) {
// umount的情况还写入node summary
write_node_summaries(sbi, start_blk);
// 固定占坑3个block
start_blk += NR_CURSEG_NODE_TYPE;
}
/* writeout checkpoint block */
// 再使用一个block存放ckpt,用于校验
cp_page = grab_meta_page(sbi, start_blk);
kaddr = page_address(cp_page);
memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
// 剩下的就是善后处理
/* wait for previous submitted node/meta pages writeback */
while (get_pages(sbi, F2FS_WRITEBACK))
congestion_wait(BLK_RW_ASYNC, HZ / 50);
filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
sbi->alloc_valid_block_count = 0;
/* Here, we only have one bio having CP pack */
if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
// 出事了就进入只读模式
sbi->sb->s_flags |= MS_RDONLY;
else
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
clear_prefree_segments(sbi);
F2FS_RESET_SB_DIRT(sbi);
}
/*
* Calculate the number of current summary pages for writing
*/
int npages_for_summary_flush(struct f2fs_sb_info *sbi)
{
int total_size_bytes = 0;
// summary entry数目
int valid_sum_count = 0;
int i, sum_space;
// 计算curseg的sum block
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
if (sbi->ckpt->alloc_type[i] == SSR)
valid_sum_count += sbi->blocks_per_seg;
else
// 直接加上curseg->next_blkoff
// 一个seg内next_blkoff前的sum entry均有效
valid_sum_count += curseg_blkoff(sbi, i);
}
// 为什么需要+1?
total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
+ sizeof(struct nat_journal) + 2
+ sizeof(struct sit_journal) + 2;
sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
// 1个page内可以搞定
if (total_size_bytes < sum_space)
return 1;
// 2个page内可以搞定
else if (total_size_bytes < 2 * sum_space)
return 2;
// 看后面分析,normal mode在3个page内必能搞定
// 就是三种data type curseg,每种各写一个page
// (sum_blk直接拷过去)
return 3;
}
static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
{
block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
// 前面分析过cp pack,交替使用,pack之间间隔segment单位
if (sbi->cur_cp_pack == 2)
start_addr += sbi->blocks_per_seg;
return start_addr;
}
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
// 区分compact mode还是normal mode
if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
write_compacted_summaries(sbi, start_blk);
else
write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
}
static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
{
struct page *page;
unsigned char *kaddr;
struct f2fs_summary *summary;
struct curseg_info *seg_i;
int written_size = 0;
int i, j;
// 这里一开始传入blkaddr位于CP area
// 可能是ckpt后面,也可能是orphan后面
page = grab_meta_page(sbi, blkaddr++);
kaddr = (unsigned char *)page_address(page);
/* Step 1: write nat cache */
// 写入HOT_DATA存放的NAT jorunal
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 2: write sit cache */
// 写入COLD_DATA存放的SIT journal
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
set_page_dirty(page);
/* Step 3: write summary entries */
// 写入summary entry
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
unsigned short blkoff;
seg_i = CURSEG_I(sbi, i);
if (sbi->ckpt->alloc_type[i] == SSR)
blkoff = sbi->blocks_per_seg;
else
// 定位到最后的valid summary entry前
blkoff = curseg_blkoff(sbi, i);
// 依次遍历即可
for (j = 0; j < blkoff; j++) {
if (!page) {
page = grab_meta_page(sbi, blkaddr++);
kaddr = (unsigned char *)page_address(page);
written_size = 0;
}
summary = (struct f2fs_summary *)(kaddr + written_size);
*summary = seg_i->sum_blk->entries[j];
written_size += SUMMARY_SIZE;
set_page_dirty(page);
// page还没满
if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
SUM_FOOTER_SIZE)
continue;
f2fs_put_page(page, 1);
page = NULL;
}
}
if (page)
f2fs_put_page(page, 1);
}
static void write_normal_summaries(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
int i, end;
// type在这里传入CURSEG_HOT_DATA
if (IS_DATASEG(type))
end = type + NR_CURSEG_DATA_TYPE;
else
end = type + NR_CURSEG_NODE_TYPE;
// 遍历DATA TYPE curseg
// 总共三种温度,各写一个page,按[HOT, WARM, COLD]顺序排放在blkaddr后面
// 如上面compact写入分析,blkaddr要么定位在ckpt后面,要么在orphan后面
for (i = type; i < end; i++) {
struct curseg_info *sum = CURSEG_I(sbi, i);
mutex_lock(&sum->curseg_mutex);
write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
mutex_unlock(&sum->curseg_mutex);
}
}
The procedure covers these key steps:
- Build the ckpt structure
- Size the data summaries and decide between compact mode and normal mode
- Copy the SIT bitmap and NAT bitmap into ckpt
- Start laying out the CP area; the start address depends on which cp pack is being written
- Write ckpt into the CP area
- Write the orphan blocks into the CP area (optional)
- Write the data summaries into the CP area
- Write the node summaries into the CP area (optional)
- Write the same ckpt again into the CP area
A few more details:
- The data summaries can be written in compact mode or normal mode
- Compact mode can squeeze the data summaries into 1 or 2 blocks
- Node summaries are written only at umount()
If compact mode is enabled, the data summaries are laid out as
[NAT-journal], [SIT-journal], [mixed-summary-entries]
otherwise the three curseg->sum_blk blocks are written out one by one, i.e.
[HOT-DATA-sum_blk], [WARM-DATA-sum_blk], [COLD-DATA-sum_blk]
For how these structures relate to one another, see the discussion of the summary block in the earlier mkfs.f2fs formatting article
Now that we know how a checkpoint pack is constructed, roll-back recovery is no great feat: just parse it back step by step. A small sketch of the pack's block budget follows
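Putting the layout list above into numbers, the sketch below reproduces how cp_pack_total_block_count comes together in do_checkpoint(): one ckpt header, the orphan blocks, the data summaries, the node summaries on umount, and a trailing ckpt copy. The F2FS_ORPHANS_PER_BLOCK value is an assumption from memory of the on-disk format; treat it as illustrative.
#include <stdio.h>
#define F2FS_ORPHANS_PER_BLOCK	1020	/* assumed value for this era of the format */
#define NR_CURSEG_NODE_TYPE	3
/* block budget of one checkpoint pack:
 * ckpt + orphans + data summaries (+ node summaries on umount) + ckpt copy */
static unsigned int cp_pack_blocks(unsigned int n_orphans,
				   unsigned int data_sum_blocks, int is_umount)
{
	unsigned int orphan_blocks =
		(n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK;
	unsigned int total = 2 + orphan_blocks + data_sum_blocks;
	if (is_umount)
		total += NR_CURSEG_NODE_TYPE;
	return total;
}
int main(void)
{
	/* a umount-time pack with no orphans and compact data summaries */
	printf("umount, compact : %u blocks\n", cp_pack_blocks(0, 1, 1));
	/* a runtime pack with a few orphans and normal-mode summaries */
	printf("runtime, normal : %u blocks\n", cp_pack_blocks(3, 3, 0));
	return 0;
}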
The remaining parts are still in the works; take a break for now.
Last update: 2023/10/19
Kokona is cute!