f2fs系列之二: 重要的數據結構

學習一個文件系統,需要熟悉它的數據layout,爲此就必須深入理解layout相關的數據結構。結合本人最近學習f2fs的心得,下面總結了相關的幾個最重要的數據結構。

基本概念

block: 4KB對齊且連續的物理存儲空間
segment: 2M連續的物理存儲空間
section: 由若干連續的segment組成
zone: 由若干連續的section組成

node

node是內部用來定位的。通過下面的數據結構可以看到,f2fs裏面的node 主要就是用來記錄block的地址。相關的數據結構如下:

/*
 * A node. The anonymous union holds exactly one of three payloads (inode,
 * direct node or indirect node); the footer follows whichever payload is used.
 */
struct f2fs_node {
        /* can be one of three types: inode, direct, and indirect types */
        union {
                struct f2fs_inode i;            /* inode payload */
                struct direct_node dn;          /* table of data block addresses */
                struct indirect_node in;        /* table of node ids */
        };
        struct node_footer footer;      /* NOTE(review): node_footer is defined outside this excerpt */
} __packed;

/* Direct node payload: a flat table of ADDRS_PER_BLOCK data block addresses. */
struct direct_node {
        __le32 addr[ADDRS_PER_BLOCK];   /* array of data block address */
} __packed;

/*
 * Indirect node payload: a flat table of NIDS_PER_BLOCK entries.
 * NOTE(review): the field and its bound are named nid/NIDS_PER_BLOCK, so the
 * entries look like node ids rather than data block addresses - confirm.
 */
struct indirect_node {
        __le32 nid[NIDS_PER_BLOCK];     /* array of node ids */
} __packed;

上面f2fs_node包含了一個union,裏面可能是f2fs_inode。它裏面的一個重要內容就是用來索引邏輯文件或者目錄裏的數據。具體結構如下。

inode

inode是用來和外部用戶交互的:它負責和VFS交互,並保存ACL、time等相關數據信息。主要數據結構如下:

/*
 * Inode payload of a node (the `i` member of the union in struct f2fs_node).
 * Carries the file attributes, one cached extent, the in-inode data block
 * pointers (i_addr) and the node ids of further direct/indirect nodes (i_nid).
 */
struct f2fs_inode {
        __le16 i_mode;                  /* file mode */
        __u8 i_advise;                  /* file hints */
        __u8 i_inline;                  /* file inline flags */
        __le32 i_uid;                   /* user ID */
        __le32 i_gid;                   /* group ID */
        __le32 i_links;                 /* links count */
        __le64 i_size;                  /* file size in bytes */
        __le64 i_blocks;                /* file size in blocks */
        __le64 i_atime;                 /* access time */
        __le64 i_ctime;                 /* change time */
        __le64 i_mtime;                 /* modification time */
        __le32 i_atime_nsec;            /* access time in nano scale */
        __le32 i_ctime_nsec;            /* change time in nano scale */
        __le32 i_mtime_nsec;            /* modification time in nano scale */
        __le32 i_generation;            /* file version (for NFS) */
        /* One slot, interpreted by file kind: directories vs. regular files. */
        union {
                __le32 i_current_depth; /* only for directory depth */
                __le16 i_gc_failures;   /*
                                         * # of gc failures on pinned file.
                                         * only for regular files.
                                         */
        };
        __le32 i_xattr_nid;             /* nid to save xattr */
        __le32 i_flags;                 /* file attributes */
        __le32 i_pino;                  /* parent inode number */
        __le32 i_namelen;               /* file name length */
        __u8 i_name[F2FS_NAME_LEN];     /* file name for SPOR */
        __u8 i_dir_level;               /* dentry_level for large dir */

        struct f2fs_extent i_ext;       /* caching a largest extent */

        union {
                /*
                 * Extra attribute fields. Because of the enclosing union they
                 * share storage with the first entries of i_addr[].
                 * NOTE(review): presumably only meaningful when extra
                 * attributes are in use for this inode - confirm in fs/f2fs.
                 */
                struct {
                        __le16 i_extra_isize;   /* extra inode attribute size */
                        __le16 i_inline_xattr_size;     /* inline xattr size, unit: 4 bytes */
                        __le32 i_projid;        /* project id */
                        __le32 i_inode_checksum;/* inode meta checksum */
                        __le64 i_crtime;        /* creation time */
                        __le32 i_extra_end[0];  /* for attribute size calculation */
                } __packed;
                __le32 i_addr[DEF_ADDRS_PER_INODE];     /* Pointers to data blocks */
        };
        __le32 i_nid[DEF_NIDS_PER_INODE];       /* direct(2), indirect(2),
                                                double_indirect(1) node id */
} __packed;

其中上面的i_addr 直接可以指向數據塊,如果數據塊的數量超過了DEF_ADDRS_PER_INODE,就需要使用i_nid。 i_nid 數組可以用來分別指向2個direct、2個indirect、1個double indirect的 block地址索引塊。

NAT

上面f2fs_inode數據結構是一個inode塊裏面的內容。那麼這個inode塊的地址如何確定呢?這就是f2fs_nat_entry的職責了, 每個f2fs_nat_entry 記錄了每個inode編號和其inode塊數據地址的對應關係。而專門存儲f2fs_nat_entry的block,組成了f2fs_nat_block。

/*
 * For NAT entries
 *
 * NAT_ENTRY_PER_BLOCK is the number of whole f2fs_nat_entry records that fit
 * in PAGE_SIZE bytes (integer division, any remainder is unused).
 */
#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))

/*
 * One NAT record: associates an inode number with a block address and
 * carries a version byte for the entry.
 */
struct f2fs_nat_entry {
        __u8 version;           /* latest version of cached nat entry */
        __le32 ino;             /* inode number */
        __le32 block_addr;      /* block address */
} __packed;

/* A block filled with NAT records: NAT_ENTRY_PER_BLOCK f2fs_nat_entry slots. */
struct f2fs_nat_block {
        struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;

可是上面問題又來了, inode number如何確定?NAT block起始地址在哪,有多少個?

f2fs dir entry

f2fs_dir_entry 回答了上面的第一個問題,它把inode number和文件名通過hash關聯起來了。同樣,也有專門存儲f2fs_dir_entry的塊,叫做f2fs_dentry_block.

/* Sizing constants for struct f2fs_dentry_block. */
#define NR_DENTRY_IN_BLOCK      214     /* the number of dentry in a block */
#define SIZE_OF_DIR_ENTRY       11      /* by byte */
/* Bytes needed for one validity bit per dentry, rounded up to a whole byte. */
#define SIZE_OF_DENTRY_BITMAP   ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
                                        BITS_PER_BYTE)
/*
 * Bytes left over in a PAGE_SIZE block after all dentries, all file name
 * slots (F2FS_SLOT_LEN each) and the bitmap have been accounted for.
 */
#define SIZE_OF_RESERVED        (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
                                F2FS_SLOT_LEN) * \
                                NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))

/*
 * One directory entry slot representing F2FS_SLOT_LEN-sized file name.
 * Stores the hash of the name, the inode number it refers to, the name
 * length and the file type; the name bytes themselves are not in this struct.
 */
struct f2fs_dir_entry {
        __le32 hash_code;       /* hash code of file name */
        __le32 ino;             /* inode number */
        __le16 name_len;        /* length of file name */
        __u8 file_type;         /* file type */
} __packed;

/*
 * 4KB-sized directory entry block.
 * Layout: validity bitmap, reserved padding, NR_DENTRY_IN_BLOCK entries,
 * then NR_DENTRY_IN_BLOCK file name slots of F2FS_SLOT_LEN bytes each.
 * SIZE_OF_RESERVED is computed so that the parts add up to PAGE_SIZE.
 */
struct f2fs_dentry_block {
        /* validity bitmap for directory entries in each block */
        __u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
        __u8 reserved[SIZE_OF_RESERVED];
        struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];
        __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];       /* name storage, one slot per dentry index */
} __packed;

這裏我之前的一個顧慮是,如果不同的file name hash到同一個hash code,豈不是出問題了?後來通過看具體實現的代碼,可以看到,查找時實際還會加上file name的比較。這樣就可以避免碰撞帶來的問題了。

那麼,f2fs第一個inode節點(root inode)的inode number是怎麼確定的?又存儲在哪呢?

f2fs super block

f2fs super block數據結構回答了上面的問題,同時也記錄了NAT block的起始地址。主要的數據結構如下:

/*
 * Super block. Records the geometry (sector/block/segment/section/zone
 * sizes), the segment counts and start block addresses of each area
 * (checkpoint, SIT, NAT, SSA, main), the well-known inode numbers
 * (root/node/meta), and volume identification and feature data.
 */
struct f2fs_super_block {
        __le32 magic;                   /* Magic Number */
        __le16 major_ver;               /* Major Version */
        __le16 minor_ver;               /* Minor Version */
        __le32 log_sectorsize;          /* log2 sector size in bytes */
        __le32 log_sectors_per_block;   /* log2 # of sectors per block */
        __le32 log_blocksize;           /* log2 block size in bytes */
        __le32 log_blocks_per_seg;      /* log2 # of blocks per segment */
        __le32 segs_per_sec;            /* # of segments per section */
        __le32 secs_per_zone;           /* # of sections per zone */
        __le32 checksum_offset;         /* checksum offset inside super block */
        __le64 block_count;             /* total # of user blocks */
        __le32 section_count;           /* total # of sections */
        __le32 segment_count;           /* total # of segments */
        __le32 segment_count_ckpt;      /* # of segments for checkpoint */
        __le32 segment_count_sit;       /* # of segments for SIT */
        __le32 segment_count_nat;       /* # of segments for NAT */
        __le32 segment_count_ssa;       /* # of segments for SSA */
        __le32 segment_count_main;      /* # of segments for main area */
        __le32 segment0_blkaddr;        /* start block address of segment 0 */
        __le32 cp_blkaddr;              /* start block address of checkpoint */
        __le32 sit_blkaddr;             /* start block address of SIT */
        __le32 nat_blkaddr;             /* start block address of NAT */
        __le32 ssa_blkaddr;             /* start block address of SSA */
        __le32 main_blkaddr;            /* start block address of main area */
        __le32 root_ino;                /* root inode number */
        __le32 node_ino;                /* node inode number */
        __le32 meta_ino;                /* meta inode number */
        __u8 uuid[16];                  /* 128-bit uuid for volume */
        __le16 volume_name[MAX_VOLUME_NAME];    /* volume name */
        __le32 extension_count;         /* # of extensions below */
        __u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */
        __le32 cp_payload;              /* NOTE(review): meaning not documented in this excerpt */
        __u8 version[VERSION_LEN];      /* the kernel version */
        __u8 init_version[VERSION_LEN]; /* the initial kernel version */
        __le32 feature;                 /* defined features */
        __u8 encryption_level;          /* versioning level for encryption */
        __u8 encrypt_pw_salt[16];       /* Salt used for string2key algorithm */
        struct f2fs_device devs[MAX_DEVICES];   /* device list */
        __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
        __u8 hot_ext_count;             /* # of hot file extension */
        __u8 reserved[314];             /* valid reserved region */
} __packed;

而super block的位置是固定的,當以f2fs格式化一個磁盤的時候,它會寫入到磁盤固定偏移的地方。

SIT

由於f2fs是LFS,追加的寫的大小不固定,很可能小於一個segment的大小,這就需要記錄哪些block已經使用。segment info table 就是做這個事情的,裏面的valid_map記錄了有效的塊。

/*
 * Note that f2fs_sit_entry->vblocks has the following bit-field information.
 * [15:10] : allocation type such as CURSEG_XXXX_TYPE
 * [9:0] : valid block count
 */
#define SIT_VBLOCKS_SHIFT       10      /* width of the valid-block-count field */
#define SIT_VBLOCKS_MASK        ((1 << SIT_VBLOCKS_SHIFT) - 1)  /* selects bits [9:0] */
/* Valid block count (bits [9:0]) of the entry that raw_sit points to. */
#define GET_SIT_VBLOCKS(raw_sit)                                \
        (le16_to_cpu((raw_sit)->vblocks) & SIT_VBLOCKS_MASK)
/* Allocation type (bits [15:10]) of the entry that raw_sit points to. */
#define GET_SIT_TYPE(raw_sit)                                   \
        ((le16_to_cpu((raw_sit)->vblocks) & ~SIT_VBLOCKS_MASK)  \
         >> SIT_VBLOCKS_SHIFT)

/*
 * One SIT record for a segment: packed type/valid-count word, a bitmap of
 * which blocks are valid, and a modification time used for cleaning.
 */
struct f2fs_sit_entry {
        __le16 vblocks;                         /* bit-field: see GET_SIT_VBLOCKS / GET_SIT_TYPE */
        __u8 valid_map[SIT_VBLOCK_MAP_SIZE];    /* bitmap for valid blocks */
        __le64 mtime;                           /* segment age for cleaning */
} __packed;

/*
 * A block filled with SIT records.
 * NOTE(review): SIT_ENTRY_PER_BLOCK is defined outside this excerpt.
 */
struct f2fs_sit_block {
        struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK];
} __packed;

segment summary

f2fs 一個重要的設計特色就是避免了傳統LFS 的wandering tree問題,這個主要是通過segment summary 相關的數據結構實現的。

通過上面SIT的介紹,一次寫之後,需要更新對應的SIT。這個更新會記錄到 sit_journal_entry中:

/* One journaled SIT update: a segment number plus the SIT entry for it. */
struct sit_journal_entry {
        __le32 segno;                   /* segment number the entry belongs to */
        struct f2fs_sit_entry se;       /* the journaled SIT entry */
} __packed;

/*
 * SIT journal area: SIT_JOURNAL_ENTRIES records followed by
 * SIT_JOURNAL_RESERVED padding bytes (the remainder that cannot hold
 * another whole record).
 */
struct sit_journal {
        struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
        __u8 reserved[SIT_JOURNAL_RESERVED];
} __packed;

如果新建一個文件或目錄,並且有寫操作,就需要更新nat 區域。同樣對這個inode的更新也會記錄到nat_journal_entry中:

/* One journaled NAT update: a node id plus the NAT entry for it. */
struct nat_journal_entry {
        __le32 nid;                     /* node id the entry belongs to */
        struct f2fs_nat_entry ne;       /* the journaled NAT entry */
} __packed;

/*
 * NAT journal area: NAT_JOURNAL_ENTRIES records followed by
 * NAT_JOURNAL_RESERVED padding bytes (the remainder that cannot hold
 * another whole record). Declared __packed and terminated with ';' like
 * sit_journal, so the two journal layouts follow the same rules.
 */
struct nat_journal {
        struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
        __u8 reserved[NAT_JOURNAL_RESERVED];
} __packed;

一個寫操作,其實對NAT/SIT更新的區域很小。如果每次都直接更新這兩個區域,對SSD會導致比較大的寫放大。爲了避免這個問題,f2fs 通過segment summary把這些零星的寫攢到segment summary 區域。

/*
 * For segment summary
 *
 * One summary block contains exactly 512 summary entries, which represents
 * exactly 2MB segment by default. Not allow to change the basic units.
 *
 * NOTE: For initializing fields, you must use set_summary
 *
 * - If data page, nid represents dnode's nid
 * - If node page, nid represents the node page's nid.
 *
 * The ofs_in_node is used by only data page. It represents offset
 * from node's page's beginning to get a data block address.
 * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
 */
#define ENTRIES_IN_SUM          512
#define SUMMARY_SIZE            (7)     /* sizeof(struct summary) */
#define SUM_FOOTER_SIZE         (5)     /* sizeof(struct summary_footer) */
/* Bytes taken by all summary entries in one block: 7 * 512 = 3584. */
#define SUM_ENTRY_SIZE          (SUMMARY_SIZE * ENTRIES_IN_SUM)

/*
 * a summary entry for a 4KB-sized block in a segment
 *
 * 4-byte nid followed by a 3-byte union (7 bytes in total, matching
 * SUMMARY_SIZE): either raw reserved bytes or version + ofs_in_node.
 */
struct f2fs_summary {
        __le32 nid;             /* parent node id */
        union {
                __u8 reserved[3];       /* raw view of the same 3 bytes */
                struct {
                        __u8 version;           /* node version number */
                        __le16 ofs_in_node;     /* block index in parent node */
                } __packed;
        };
} __packed;

/* summary block type, node or data, is stored to the summary_footer */
#define SUM_TYPE_NODE           (1)
#define SUM_TYPE_DATA           (0)

/*
 * Trailer of a summary block: 1-byte type plus 4-byte checksum
 * (5 bytes in total, matching SUM_FOOTER_SIZE).
 */
struct summary_footer {
        unsigned char entry_type;       /* SUM_TYPE_XXX */
        __le32 check_sum;               /* summary checksum */
} __packed;

/* Bytes of a summary block left for the journal after entries and footer. */
#define SUM_JOURNAL_SIZE        (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
                                SUM_ENTRY_SIZE)
/*
 * Record counts and leftover padding for the NAT and SIT journals.
 * The "- 2" reserves the 2-byte n_nats/n_sits counter at the start of
 * struct f2fs_journal; *_ENTRIES is the quotient and *_RESERVED the
 * remainder of dividing the remaining space by the record size.
 */
#define NAT_JOURNAL_ENTRIES     ((SUM_JOURNAL_SIZE - 2) /\
                                sizeof(struct nat_journal_entry))
#define NAT_JOURNAL_RESERVED    ((SUM_JOURNAL_SIZE - 2) %\
                                sizeof(struct nat_journal_entry))
#define SIT_JOURNAL_ENTRIES     ((SUM_JOURNAL_SIZE - 2) /\
                                sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED    ((SUM_JOURNAL_SIZE - 2) %\
                                sizeof(struct sit_journal_entry))

/* Reserved area should make size of f2fs_extra_info equals to
 * that of nat_journal and sit_journal.
 *
 * "- 2" is the 2-byte journal counter in struct f2fs_journal and "- 8" is
 * the 8-byte kbytes_written field of struct f2fs_extra_info.
 */
#define EXTRA_INFO_RESERVED     (SUM_JOURNAL_SIZE - 2 - 8)

/*
 * frequently updated NAT/SIT entries can be stored in the spare area in
 * summary blocks
 *
 * Selector for which of the two journal kinds is meant.
 */
enum {
        NAT_JOURNAL = 0,        /* NAT journal */
        SIT_JOURNAL             /* SIT journal (= 1) */
};

/*
 * Third possible content of the journal spare area (see struct
 * f2fs_journal): a written-kilobytes counter padded with reserved bytes so
 * that it occupies the same space as the NAT/SIT journals.
 */
struct f2fs_extra_info {
        __le64 kbytes_written;                  /* NOTE(review): presumably total KB written - confirm */
        __u8 reserved[EXTRA_INFO_RESERVED];     /* padding up to the journal size */
} __packed;

/*
 * Journal part of a summary block: a 2-byte record counter followed by the
 * spare area, which is viewed as a NAT journal, a SIT journal or extra info.
 */
struct f2fs_journal {
        /* number of valid records; name depends on which journal is stored */
        union {
                __le16 n_nats;
                __le16 n_sits;
        };
        /* spare area is used by NAT or SIT journals or extra info */
        union {
                struct nat_journal nat_j;
                struct sit_journal sit_j;
                struct f2fs_extra_info info;
        };
} __packed;

/*
 * 4KB-sized summary block structure
 *
 * Layout: ENTRIES_IN_SUM summary entries, then the journal, then the footer.
 * SUM_JOURNAL_SIZE is derived as F2FS_BLKSIZE minus the entries and the
 * footer, so the three parts together fill one block.
 */
struct f2fs_summary_block {
        struct f2fs_summary entries[ENTRIES_IN_SUM]; /* 512 entries * 7 bytes each (SUMMARY_SIZE), one per block of the segment */
        struct f2fs_journal journal;    /* NAT/SIT journal or extra info */
        struct summary_footer footer;   /* block type and checksum */
} __packed;

file 相關操作

f2fs.h:
file_operations f2fs_file_operations:

fs/f2fs/file.c:

/*
 * File operation table for f2fs files: maps each file_operations hook to
 * its handler. Handlers named f2fs_* are f2fs-specific; the generic_* and
 * iter_* ones are not defined by f2fs (judging by their names).
 */
const struct file_operations f2fs_file_operations = {
        .llseek         = f2fs_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = f2fs_file_write_iter,
        .open           = f2fs_file_open,
        .release        = f2fs_release_file,
        .mmap           = f2fs_file_mmap,
        .flush          = f2fs_file_flush,
        .fsync          = f2fs_sync_file,
        .fallocate      = f2fs_fallocate,
        .unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
        /* only built when CONFIG_COMPAT is defined */
        .compat_ioctl   = f2fs_compat_ioctl,
#endif
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
};

參考

include/linux/f2fs_fs.h
fs/f2fs/
Documentation/filesystems/f2fs.txt

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章