Dave Chinner <david@xxxxxxxxxxxxx> 于2024年7月1日周一 10:16写道: > > On Wed, Jun 19, 2024 at 06:06:37PM +0800, Junchao Sun wrote: > > By reordering the elements in the xfs_inode structure, we can > > reduce the padding needed on an x86_64 system by 8 bytes. > > > > Furthermore, it also enables denser packing of xfs_inode > > structures within slab pages. In the Debian 6.8.12-amd64, > > before applying the patch, the size of xfs_inode is 1000 bytes, > > > > Please use pahole to show where the holes are in the current TOT > > structure, not reference a distro kernel build that has a largely > > unknown config. Ok. > > > allowing 32 xfs_inode structures to be allocated from an > > order-3 slab. After applying the patch, the size of > > xfs_inode is reduced to 992 bytes, allowing 33 xfs_inode > > structures to be allocated from an order-3 slab. > > > > This improvement is also observed in the mainline kernel > > with the same config. > > > > Signed-off-by: Junchao Sun <sunjunchao2870@xxxxxxxxx> > > --- > > fs/xfs/xfs_inode.h | 12 ++++++------ > > 1 file changed, 6 insertions(+), 6 deletions(-) > > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > > index 292b90b5f2ac..fedac2792a38 100644 > > --- a/fs/xfs/xfs_inode.h > > +++ b/fs/xfs/xfs_inode.h > > @@ -37,12 +37,6 @@ typedef struct xfs_inode { > > struct xfs_ifork i_df; /* data fork */ > > struct xfs_ifork i_af; /* attribute fork */ > > > > - /* Transaction and locking information. */ > > - struct xfs_inode_log_item *i_itemp; /* logging information */ > > - struct rw_semaphore i_lock; /* inode lock */ > > - atomic_t i_pincount; /* inode pin count */ > > - struct llist_node i_gclist; /* deferred inactivation list */ > > There's lots of 4 byte holes in the structure due to stuff like > xfs_ifork and xfs_imap being 4 byte aligned structures. > > This only addresses a coupel fo them, and in doing so destroys the > attempt at creating locality of access to the inode structure. > > > - > > /* > > * Bitsets of inode metadata that have been checked and/or are sick. > > * Callers must hold i_flags_lock before accessing this field. > > @@ -88,6 +82,12 @@ typedef struct xfs_inode { > > /* VFS inode */ > > struct inode i_vnode; /* embedded VFS inode */ > > > > + /* Transaction and locking information. */ > > + struct xfs_inode_log_item *i_itemp; /* logging information */ > > + struct rw_semaphore i_lock; /* inode lock */ > > + struct llist_node i_gclist; /* deferred inactivation list */ > > + atomic_t i_pincount; /* inode pin count */ > > > > This separates the items commonly accessed together in the core XFS > > code and so should be located near to each other (on the same > > cachelines if possible). It places them on the side of the VFS inode > > (at least 700 bytes further down the structure) and places them on > > the same cacheline as IO completion marshalling structures. These > > shouldn't be on the same cacheline as IO completion variables as > > they run concurrently and independently and so need to be separated. > > > > really, if we are going to optimise the layout of the xfs_inode, > > let's just do it properly the first time. See the patch below, it > > reduces the struct xfs_inode to 960 bytes without changing much in > > it's layout at all. Ok, I see. Really appreciate for your detailed explanation and suggestions! > > -Dave. > -- > Dave Chinner > david@xxxxxxxxxxxxx > > xfs: repack the xfs_inode to reduce space. > > From: Dave Chinner <dchinner@xxxxxxxxxx> > > pahole reports several 4 byte holes in the xfs_inode with a size of > 1000 bytes. We can reduce that by packing holes better without > affecting the data access patterns to the inode structure. > > Some of these holes are a result of 4 byte aligned tail structures > being padded out to 16 bytes in the xfs_inode. These structures are > already tightly packed and 8 byte aligned, so if we are careful with > the layout we can add __attribute(packed) to them so their tail > padding gets. This allows us to add a 4 byte variable into the hole > that they would have left with 8 byte alignment padding. > > This reduces the struct xfs_inode to 960 bytes, a 4% reduction from > the original 1000 bytes, and it largely doesn't change access > patterns or data cache footprint as the alignment of all the > critical structures is completely unchanged. > > pahole output now reports: > > struct xfs_inode { > struct xfs_mount * i_mount; /* 0 8 */ > struct xfs_dquot * i_udquot; /* 8 8 */ > struct xfs_dquot * i_gdquot; /* 16 8 */ > struct xfs_dquot * i_pdquot; /* 24 8 */ > xfs_ino_t i_ino; /* 32 8 */ > struct xfs_imap i_imap; /* 40 12 */ > spinlock_t i_flags_lock; /* 52 4 */ > unsigned long i_flags; /* 56 8 */ > /* --- cacheline 1 boundary (64 bytes) --- */ > struct xfs_ifork * i_cowfp; /* 64 8 */ > struct xfs_ifork i_df; /* 72 44 */ > xfs_extlen_t i_extsize; /* 116 4 */ > struct xfs_ifork i_af; /* 120 44 */ > /* --- cacheline 2 boundary (128 bytes) was 36 bytes ago --- */ > union { > xfs_extlen_t i_cowextsize; /* 164 4 */ > uint16_t i_flushiter; /* 164 2 */ > }; /* 164 4 */ > union { > xfs_extlen_t i_cowextsize; /* 0 4 */ > uint16_t i_flushiter; /* 0 2 */ > }; > > struct xfs_inode_log_item * i_itemp; /* 168 8 */ > struct rw_semaphore i_lock; /* 176 40 */ > /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ > struct llist_node i_gclist; /* 216 8 */ > atomic_t i_pincount; /* 224 4 */ > uint16_t i_checked; /* 228 2 */ > uint16_t i_sick; /* 230 2 */ > uint64_t i_delayed_blks; /* 232 8 */ > xfs_fsize_t i_disk_size; /* 240 8 */ > xfs_rfsblock_t i_nblocks; /* 248 8 */ > /* --- cacheline 4 boundary (256 bytes) --- */ > prid_t i_projid; /* 256 4 */ > uint8_t i_forkoff; /* 260 1 */ > > /* XXX 1 byte hole, try to pack */ > > uint16_t i_diflags; /* 262 2 */ > uint64_t i_diflags2; /* 264 8 */ > struct timespec64 i_crtime; /* 272 16 */ > xfs_agino_t i_next_unlinked; /* 288 4 */ > xfs_agino_t i_prev_unlinked; /* 292 4 */ > struct inode i_vnode; /* 296 608 */ > /* --- cacheline 14 boundary (896 bytes) was 8 bytes ago --- */ > struct work_struct i_ioend_work; /* 904 32 */ > struct list_head i_ioend_list; /* 936 16 */ > spinlock_t i_ioend_lock; /* 952 4 */ > > /* size: 960, cachelines: 15, members: 33 */ > /* sum members: 955, holes: 1, sum holes: 1 */ > /* padding: 4 */ > }; > > We have a 1 byte hole in the middle, and 4 bytes of tail padding. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_inode_buf.h | 2 +- > fs/xfs/libxfs/xfs_inode_fork.h | 2 +- > fs/xfs/xfs_inode.h | 21 +++++++++++---------- > 3 files changed, 13 insertions(+), 12 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h > index 585ed5a110af..28760973d809 100644 > --- a/fs/xfs/libxfs/xfs_inode_buf.h > +++ b/fs/xfs/libxfs/xfs_inode_buf.h > @@ -17,7 +17,7 @@ struct xfs_imap { > xfs_daddr_t im_blkno; /* starting BB of inode chunk */ > unsigned short im_len; /* length in BBs of inode chunk */ > unsigned short im_boffset; /* inode offset in block in bytes */ > -}; > +} __attribute__((packed)); > > int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp, > struct xfs_imap *imap, struct xfs_buf **bpp); > diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h > index 2373d12fd474..63780b3542c6 100644 > --- a/fs/xfs/libxfs/xfs_inode_fork.h > +++ b/fs/xfs/libxfs/xfs_inode_fork.h > @@ -23,7 +23,7 @@ struct xfs_ifork { > short if_broot_bytes; /* bytes allocated for root */ > int8_t if_format; /* format of this fork */ > uint8_t if_needextents; /* extents have not been read */ > -}; > +} __attribute__((packed)); > > /* > * Worst-case increase in the fork extent count when we're adding a single > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 292b90b5f2ac..bbc73fd56fa2 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -32,16 +32,25 @@ typedef struct xfs_inode { > xfs_ino_t i_ino; /* inode number (agno/agino)*/ > struct xfs_imap i_imap; /* location for xfs_imap() */ > > + spinlock_t i_flags_lock; /* inode i_flags lock */ > + unsigned long i_flags; /* see defined flags below */ > + > /* Extent information. */ > struct xfs_ifork *i_cowfp; /* copy on write extents */ > struct xfs_ifork i_df; /* data fork */ > + xfs_extlen_t i_extsize; /* basic/minimum extent size */ > struct xfs_ifork i_af; /* attribute fork */ > + /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ > + union { > + xfs_extlen_t i_cowextsize; /* basic cow extent size */ > + uint16_t i_flushiter; /* incremented on flush */ > + }; > > /* Transaction and locking information. */ > struct xfs_inode_log_item *i_itemp; /* logging information */ > struct rw_semaphore i_lock; /* inode lock */ > - atomic_t i_pincount; /* inode pin count */ > struct llist_node i_gclist; /* deferred inactivation list */ > + atomic_t i_pincount; /* inode pin count */ > > /* > * Bitsets of inode metadata that have been checked and/or are sick. > @@ -50,19 +59,11 @@ typedef struct xfs_inode { > uint16_t i_checked; > uint16_t i_sick; > > - spinlock_t i_flags_lock; /* inode i_flags lock */ > /* Miscellaneous state. */ > - unsigned long i_flags; /* see defined flags below */ > uint64_t i_delayed_blks; /* count of delay alloc blks */ > xfs_fsize_t i_disk_size; /* number of bytes in file */ > xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ > prid_t i_projid; /* owner's project id */ > - xfs_extlen_t i_extsize; /* basic/minimum extent size */ > - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ > - union { > - xfs_extlen_t i_cowextsize; /* basic cow extent size */ > - uint16_t i_flushiter; /* incremented on flush */ > - }; > uint8_t i_forkoff; /* attr fork offset >> 3 */ > uint16_t i_diflags; /* XFS_DIFLAG_... */ > uint64_t i_diflags2; /* XFS_DIFLAG2_... */ > @@ -89,9 +90,9 @@ typedef struct xfs_inode { > struct inode i_vnode; /* embedded VFS inode */ > > /* pending io completions */ > - spinlock_t i_ioend_lock; > struct work_struct i_ioend_work; > struct list_head i_ioend_list; > + spinlock_t i_ioend_lock; > } xfs_inode_t; > > static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) -- Junchao Sun <sunjunchao2870@xxxxxxxxx>