學習一個文件系統,需要熟悉它的數據layout,爲此就必須深入理解layout相關的數據結構。結合本人最近學習f2fs的心得,下面總結了相關的幾個最重要的數據結構。
基本概念
block: 4KB對齊且連續的物理存儲空間
segment: 2MB連續的物理存儲空間
section: 由若干連續的segment組成
zone: 由若干連續的section組成
node
node是內部用來定位的。通過下面的數據結構可以看到,f2fs裏面的node 主要就是用來記錄block的地址。相關的數據結構如下:
/*
 * One node block. The anonymous union holds exactly one of the three node
 * payloads (inode, direct node, indirect node); a node_footer follows it.
 */
struct f2fs_node {
/* can be one of three types: inode, direct, and indirect types */
union {
struct f2fs_inode i; /* payload when the node is an inode */
struct direct_node dn; /* payload when the node is a direct node */
struct indirect_node in; /* payload when the node is an indirect node */
};
struct node_footer footer; /* trailer shared by all three node types */
} __packed;
/* Direct node payload: a flat table of ADDRS_PER_BLOCK data block addresses. */
struct direct_node {
__le32 addr[ADDRS_PER_BLOCK]; /* array of data block address */
} __packed;
/*
 * Indirect node payload: a flat table of NIDS_PER_BLOCK node ids, i.e. it
 * refers to further node blocks rather than to data blocks directly.
 */
struct indirect_node {
__le32 nid[NIDS_PER_BLOCK]; /* array of node ids (not data block addresses) */
} __packed;
上面f2fs_node包含了一個union,裏面可能是f2fs_inode。它裏面的一個重要內容就是用來索引邏輯文件或者目錄裏的數據。具體結構如下。
inode
inode是用來和外部用戶交互的:它保存了與VFS交互所需的信息,包括ACL、time等相關數據。主要數據結構如下:
/*
 * Inode payload of a node block. It carries the file attributes (mode,
 * owner, size, timestamps), the file name and parent inode number, and the
 * block index of the file: i_addr[] holds data block pointers and i_nid[]
 * holds node ids of direct / indirect / double-indirect node blocks.
 */
struct f2fs_inode {
__le16 i_mode; /* file mode */
__u8 i_advise; /* file hints */
__u8 i_inline; /* file inline flags */
__le32 i_uid; /* user ID */
__le32 i_gid; /* group ID */
__le32 i_links; /* links count */
__le64 i_size; /* file size in bytes */
__le64 i_blocks; /* file size in blocks */
__le64 i_atime; /* access time */
__le64 i_ctime; /* change time */
__le64 i_mtime; /* modification time */
__le32 i_atime_nsec; /* access time in nano scale */
__le32 i_ctime_nsec; /* change time in nano scale */
__le32 i_mtime_nsec; /* modification time in nano scale */
__le32 i_generation; /* file version (for NFS) */
union {
__le32 i_current_depth; /* only for directory depth */
__le16 i_gc_failures; /*
* # of gc failures on pinned file.
* only for regular files.
*/
};
__le32 i_xattr_nid; /* nid to save xattr */
__le32 i_flags; /* file attributes */
__le32 i_pino; /* parent inode number */
__le32 i_namelen; /* file name length */
__u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */
__u8 i_dir_level; /* dentry_level for large dir */
struct f2fs_extent i_ext; /* caching a largest extent */
union {
struct { /*
* Extra inode attributes. This struct shares
* storage with the start of i_addr[] through the
* enclosing union.
* NOTE(review): presumably only valid when the
* inode is flagged as carrying extra attributes
* - confirm against fs/f2fs.
*/
__le16 i_extra_isize; /* extra inode attribute size */
__le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */
__le32 i_projid; /* project id */
__le32 i_inode_checksum;/* inode meta checksum */
__le64 i_crtime; /* creation time */
__le32 i_extra_end[0]; /* for attribute size calculation */
} __packed;
__le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
};
__le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2),
double_indirect(1) node id */
} __packed;
其中上面的i_addr 直接可以指向數據塊,如果數據塊的數量超過了DEF_ADDRS_PER_INODE,就需要使用i_nid。 i_nid 數組可以用來分別指向2個direct、2個indirect、1個double indirect的 block地址索引塊。
NAT
上面f2fs_inode數據結構是一個inode塊裏面的內容。那麼這個inode塊的地址如何確定呢?這就是f2fs_nat_entry的職責了, 每個f2fs_nat_entry 記錄了每個inode編號和其inode塊數據地址的對應關係。而專門存儲f2fs_nat_entry的block,組成了f2fs_nat_block。
/*
 * For NAT entries
 *
 * NAT_ENTRY_PER_BLOCK is the number of whole struct f2fs_nat_entry records
 * that fit into PAGE_SIZE bytes (integer division, any remainder is unused).
 */
#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
/*
 * One NAT record: associates an inode number with a block address, plus a
 * one-byte version field. Declared __packed, so no padding is inserted
 * after the leading __u8.
 */
struct f2fs_nat_entry {
__u8 version; /* latest version of cached nat entry */
__le32 ino; /* inode number */
__le32 block_addr; /* block address */
} __packed;
/* A block that stores NAT_ENTRY_PER_BLOCK consecutive NAT records. */
struct f2fs_nat_block {
struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;
可是上面問題又來了, inode number如何確定?NAT block起始地址在哪,有多少個?
f2fs dir entry
f2fs_dir_entry 回答了上面的第一個問題,它把inode number和文件名通過hash關聯起來了。同樣,也有專門存儲f2fs_dir_entry的塊,叫做f2fs_dentry_block.
/*
 * Layout constants for one directory entry block.
 *
 * NR_DENTRY_IN_BLOCK    - number of dentry slots in a block
 * SIZE_OF_DIR_ENTRY     - bytes per struct f2fs_dir_entry; matches its packed
 *                         members (4 + 4 + 2 + 1)
 * SIZE_OF_DENTRY_BITMAP - bytes needed for one bit per slot, rounded up to a
 *                         whole byte
 * SIZE_OF_RESERVED      - bytes of PAGE_SIZE left over after the bitmap, the
 *                         dentry array and the F2FS_SLOT_LEN-byte name slots
 */
#define NR_DENTRY_IN_BLOCK 214 /* the number of dentry in a block */
#define SIZE_OF_DIR_ENTRY 11 /* by byte */
#define SIZE_OF_DENTRY_BITMAP ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
BITS_PER_BYTE)
#define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
F2FS_SLOT_LEN) * \
NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
/*
 * One directory entry slot representing F2FS_SLOT_LEN-sized file name.
 * The entry itself stores only the name's hash, length and type together
 * with the inode number; the name bytes are not a member of this struct.
 */
struct f2fs_dir_entry {
__le32 hash_code; /* hash code of file name */
__le32 ino; /* inode number */
__le16 name_len; /* length of file name */
__u8 file_type; /* file type */
} __packed;
/*
 * 4KB-sized directory entry block.
 * Member order is: validity bitmap, reserved padding, the dentry array, then
 * the file name slots. dentry[] and filename[] both have NR_DENTRY_IN_BLOCK
 * rows.
 * NOTE(review): presumably dentry[i] pairs with filename[i] - confirm
 * against fs/f2fs/dir.c.
 */
struct f2fs_dentry_block {
/* validity bitmap for directory entries in each block */
__u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
__u8 reserved[SIZE_OF_RESERVED]; /* padding sized by SIZE_OF_RESERVED */
struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];
__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; /* one F2FS_SLOT_LEN-byte name slot per dentry slot */
} __packed;
這裏我之前的一個顧慮是,如果不同的file name hash到同一個hash code,豈不是出問題了?後來通過看具體實現的代碼,可以看到,實際還會加上file name的比較。這樣就可以避免碰撞了。
那麼,f2fs第一個inode節點(root inode)的inode number是怎麼確定的?又存儲在哪呢?
f2fs super block
f2fs super block數據結構回答了上面的問題,同時也記錄了NAT block的起始地址。主要的數據結構如下:
/*
 * Super block. It records the geometry (sector/block/segment/section/zone
 * sizes as log2 values or counts), the number of segments and the start
 * block address of each area (checkpoint, SIT, NAT, SSA, main), and the
 * inode numbers of the root, node and meta inodes.
 */
struct f2fs_super_block {
__le32 magic; /* Magic Number */
__le16 major_ver; /* Major Version */
__le16 minor_ver; /* Minor Version */
__le32 log_sectorsize; /* log2 sector size in bytes */
__le32 log_sectors_per_block; /* log2 # of sectors per block */
__le32 log_blocksize; /* log2 block size in bytes */
__le32 log_blocks_per_seg; /* log2 # of blocks per segment */
__le32 segs_per_sec; /* # of segments per section */
__le32 secs_per_zone; /* # of sections per zone */
__le32 checksum_offset; /* checksum offset inside super block */
__le64 block_count; /* total # of user blocks */
__le32 section_count; /* total # of sections */
__le32 segment_count; /* total # of segments */
__le32 segment_count_ckpt; /* # of segments for checkpoint */
__le32 segment_count_sit; /* # of segments for SIT */
__le32 segment_count_nat; /* # of segments for NAT */
__le32 segment_count_ssa; /* # of segments for SSA */
__le32 segment_count_main; /* # of segments for main area */
__le32 segment0_blkaddr; /* start block address of segment 0 */
__le32 cp_blkaddr; /* start block address of checkpoint */
__le32 sit_blkaddr; /* start block address of SIT */
__le32 nat_blkaddr; /* start block address of NAT */
__le32 ssa_blkaddr; /* start block address of SSA */
__le32 main_blkaddr; /* start block address of main area */
__le32 root_ino; /* root inode number */
__le32 node_ino; /* node inode number */
__le32 meta_ino; /* meta inode number */
__u8 uuid[16]; /* 128-bit uuid for volume */
__le16 volume_name[MAX_VOLUME_NAME]; /* volume name */
__le32 extension_count; /* # of extensions below */
__u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */
__le32 cp_payload; /* NOTE(review): uncommented upstream; presumably the size of extra checkpoint payload - confirm */
__u8 version[VERSION_LEN]; /* the kernel version */
__u8 init_version[VERSION_LEN]; /* the initial kernel version */
__le32 feature; /* defined features */
__u8 encryption_level; /* versioning level for encryption */
__u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
struct f2fs_device devs[MAX_DEVICES]; /* device list */
__le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
__u8 hot_ext_count; /* # of hot file extension */
__u8 reserved[314]; /* valid reserved region */
} __packed;
而super block的位置是固定的,當以f2fs格式化一個磁盤的時候,它會寫入到磁盤固定偏移的地方。
SIT
由於f2fs是LFS,每次追加寫的大小不固定,很可能小於一個segment的大小,這就需要記錄哪些block已經使用。segment info table(SIT)就是做這個事情的,裏面的valid_map記錄了有效的塊。
/*
 * Note that f2fs_sit_entry->vblocks has the following bit-field information.
 * [15:10] : allocation type such as CURSEG_XXXX_TYPE
 * [9:0] : valid block count
 *
 * SIT_VBLOCKS_SHIFT - width in bits of the valid block count field
 * SIT_VBLOCKS_MASK  - mask selecting the low SIT_VBLOCKS_SHIFT bits
 * GET_SIT_VBLOCKS() - converts vblocks to CPU byte order and returns the
 *                     valid block count
 * GET_SIT_TYPE()    - converts vblocks to CPU byte order, clears the count
 *                     bits and shifts the remainder down to get the type
 */
#define SIT_VBLOCKS_SHIFT 10
#define SIT_VBLOCKS_MASK ((1 << SIT_VBLOCKS_SHIFT) - 1)
#define GET_SIT_VBLOCKS(raw_sit) \
(le16_to_cpu((raw_sit)->vblocks) & SIT_VBLOCKS_MASK)
#define GET_SIT_TYPE(raw_sit) \
((le16_to_cpu((raw_sit)->vblocks) & ~SIT_VBLOCKS_MASK) \
>> SIT_VBLOCKS_SHIFT)
/*
 * One SIT record: the packed type/valid-count word, a bitmap of valid
 * blocks, and a modification time used when choosing segments to clean.
 */
struct f2fs_sit_entry {
__le16 vblocks; /* type and valid block count; decode with GET_SIT_TYPE / GET_SIT_VBLOCKS */
__u8 valid_map[SIT_VBLOCK_MAP_SIZE]; /* bitmap for valid blocks */
__le64 mtime; /* segment age for cleaning */
} __packed;
/* A block that stores SIT_ENTRY_PER_BLOCK consecutive SIT records. */
struct f2fs_sit_block {
struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK];
} __packed;
segment summary
f2fs 一個重要的設計特色就是避免了傳統LFS 的wandering tree問題,這個主要是通過segment summary 相關的數據結構實現的。
通過上面SIT的介紹,一次寫之後,需要更新對應的SIT。這個更新會記錄到 sit_journal_entry中:
/* One journaled SIT update: a segment number paired with a SIT record. */
struct sit_journal_entry {
__le32 segno; /* segment number the record below applies to */
struct f2fs_sit_entry se; /* the SIT record itself */
} __packed;
/*
 * SIT journal area: SIT_JOURNAL_ENTRIES journaled updates followed by
 * SIT_JOURNAL_RESERVED bytes of padding.
 */
struct sit_journal {
struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
__u8 reserved[SIT_JOURNAL_RESERVED]; /* pads the area to its fixed size */
} __packed;
如果新建一個文件或目錄,並且有寫操作,就需要更新nat 區域。同樣對這個inode的更新也會記錄到nat_journal_entry中:
/* One journaled NAT update: a node id paired with a NAT record. */
struct nat_journal_entry {
__le32 nid; /* node id the record below applies to */
struct f2fs_nat_entry ne; /* the NAT record itself */
} __packed;
/*
 * NAT journal area: NAT_JOURNAL_ENTRIES journaled updates followed by
 * NAT_JOURNAL_RESERVED bytes of padding.
 *
 * Declared __packed like the other on-disk structures here, so the compiler
 * inserts no padding and the size matches what the journal size macros
 * compute.
 */
struct nat_journal {
	struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
	__u8 reserved[NAT_JOURNAL_RESERVED];	/* pads the area to its fixed size */
} __packed;
一個寫操作,其實對NAT/SIT更新的區域很小。如果每次都直接更新這兩個區域,對SSD會導致比較大的寫放大。爲了避免這個問題,f2fs 通過segment summary把這些零星的寫攢到segment summary 區域。
/*
 * For segment summary
 *
 * One summary block contains exactly 512 summary entries, which represents
 * exactly 2MB segment by default. Not allow to change the basic units.
 *
 * NOTE: For initializing fields, you must use set_summary
 *
 * - If data page, nid represents dnode's nid
 * - If node page, nid represents the node page's nid.
 *
 * The ofs_in_node is used by only data page. It represents offset
 * from node's page's beginning to get a data block address.
 * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
 *
 * SUMMARY_SIZE and SUM_FOOTER_SIZE are hard-coded byte sizes; they must
 * stay equal to the packed sizes of struct f2fs_summary (4 + 3) and
 * struct summary_footer (1 + 4). SUM_ENTRY_SIZE is the total size of the
 * ENTRIES_IN_SUM summary entries (7 * 512 = 3584 bytes).
 */
#define ENTRIES_IN_SUM 512
#define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */
#define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */
#define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM)
/*
 * a summary entry for a 4KB-sized block in a segment
 *
 * A node id followed by a 3-byte union: either three reserved bytes, or a
 * one-byte version plus a 16-bit offset inside the parent node.
 */
struct f2fs_summary {
__le32 nid; /* parent node id */
union {
__u8 reserved[3]; /* raw view of the same 3 bytes */
struct {
__u8 version; /* node version number */
__le16 ofs_in_node; /* block index in parent node */
} __packed;
};
} __packed;
/* summary block type, node or data, is stored to the summary_footer */
#define SUM_TYPE_NODE (1)
#define SUM_TYPE_DATA (0)
/* Trailer of a summary block: the block's type tag followed by a checksum. */
struct summary_footer {
unsigned char entry_type; /* SUM_TYPE_XXX */
__le32 check_sum; /* summary checksum */
} __packed;
/*
 * Sizes of the journal area inside a summary block.
 *
 * SUM_JOURNAL_SIZE is what remains of an F2FS_BLKSIZE block after the
 * summary entries and the footer. The *_JOURNAL_ENTRIES macros give how
 * many whole journal entries fit into that space minus 2 bytes, and the
 * *_JOURNAL_RESERVED macros give the leftover bytes.
 * NOTE(review): the 2 bytes presumably account for the __le16
 * n_nats/n_sits counter at the start of struct f2fs_journal - confirm.
 */
#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
SUM_ENTRY_SIZE)
#define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
sizeof(struct nat_journal_entry))
#define NAT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
sizeof(struct nat_journal_entry))
#define SIT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\
sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
sizeof(struct sit_journal_entry))
/* Reserved area should make size of f2fs_extra_info equals to
 * that of nat_journal and sit_journal.
 * The extra 8 bytes subtracted match the __le64 kbytes_written member of
 * struct f2fs_extra_info.
 */
#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8)
/*
 * frequently updated NAT/SIT entries can be stored in the spare area in
 * summary blocks
 *
 * Journal kind selector: NAT_JOURNAL is 0 and SIT_JOURNAL is 1.
 */
enum {
NAT_JOURNAL = 0,
SIT_JOURNAL
};
/*
 * Third use of the journal spare area: a 64-bit kbytes_written value
 * followed by EXTRA_INFO_RESERVED bytes of padding.
 */
struct f2fs_extra_info {
__le64 kbytes_written; /* NOTE(review): presumably a running total of KiB written - confirm */
__u8 reserved[EXTRA_INFO_RESERVED]; /* pads to the size of the NAT/SIT journal areas */
} __packed;
/*
 * Journal part of a summary block: a 16-bit entry counter followed by a
 * spare area that is interpreted as a NAT journal, a SIT journal or extra
 * info.
 */
struct f2fs_journal {
union {
__le16 n_nats; /* counter name used with nat_j */
__le16 n_sits; /* counter name used with sit_j; same storage as n_nats */
};
/* spare area is used by NAT or SIT journals or extra info */
union {
struct nat_journal nat_j;
struct sit_journal sit_j;
struct f2fs_extra_info info;
};
} __packed;
/*
 * 4KB-sized summary block structure: the summary entries, then the journal
 * area, then the footer.
 */
struct f2fs_summary_block {
struct f2fs_summary entries[ENTRIES_IN_SUM]; /* 512 entries * 7 bytes (SUMMARY_SIZE) each, one entry per block of the segment */
struct f2fs_journal journal;
struct summary_footer footer;
} __packed;
file 相關操作
f2fs.h:
file_operations f2fs_file_operations:
fs/f2fs/file.c:
/*
 * file_operations table for f2fs files. Most hooks point at f2fs-specific
 * handlers; read_iter, splice_read and splice_write use the generic
 * (non-f2fs) helpers named below.
 */
const struct file_operations f2fs_file_operations = {
.llseek = f2fs_llseek,
.read_iter = generic_file_read_iter,
.write_iter = f2fs_file_write_iter,
.open = f2fs_file_open,
.release = f2fs_release_file,
.mmap = f2fs_file_mmap,
.flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
.unlocked_ioctl = f2fs_ioctl,
/* compat_ioctl is only wired up when the kernel is built with CONFIG_COMPAT */
#ifdef CONFIG_COMPAT
.compat_ioctl = f2fs_compat_ioctl,
#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
參考
include/linux/f2fs_fs.h
fs/f2fs/
Documentation/filesystems/f2fs.txt