Jens, do you think the following patch is still okay for 3.6.36? It fixes the massive performance regression due to the full barriers for discard and also makes the barrier removal in 2.6.37 a lot simpler. --- From: Christoph Hellwig <hch@xxxxxx> Subject: [PATCH] block: do not send discards as barriers There is no reason to send discards as barriers. The rationale for that is easy: The current barriers do two things, flushing caches and provide ordering. There is no reason for a cache flush before a discard because the discard doesn't care for ordering vs writes to other regions than the one it discards, and there is no reason for a cache flush afterwards as a discard doesn't store data. The ordering semantics aren't used currently because all discards are done synchronously and thus we order explicitly. Even more until very late in the 2.6.35 cycle we didn't send DISCARD_BARRIER requests as real hardbarrier but as an elevator only barrier which doesn't provide the above semantics, and the switch to real barriers causes masssive performance regressions especially for the swap code. Signed-off-by: Christoph Hellwig <hch@xxxxxx> Index: linux-2.6/block/blk-lib.c =================================================================== --- linux-2.6.orig/block/blk-lib.c 2010-08-23 15:16:37.656033352 +0200 +++ linux-2.6/block/blk-lib.c 2010-08-23 15:16:45.913012470 +0200 @@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_de { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = flags & BLKDEV_IFL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; + int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; struct bio *bio; int ret = 0; @@ -65,7 +64,7 @@ int blkdev_issue_discard(struct block_de if (flags & BLKDEV_IFL_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; - type |= DISCARD_SECURE; + type |= REQ_SECURE; } while (nr_sects && !ret) { @@ -162,12 +161,6 @@ int blkdev_issue_zeroout(struct block_de bb.wait = &wait; bb.end_io = NULL; - if (flags & BLKDEV_IFL_BARRIER) { - /* issue async barrier before the data */ - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); - if (ret) - return ret; - } submit: ret = 0; while (nr_sects != 0) { @@ -199,13 +192,6 @@ submit: issued++; submit_bio(WRITE, bio); } - /* - * When all data bios are in flight. Send final barrier if requeted. - */ - if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, - flags & BLKDEV_IFL_WAIT); - if (flags & BLKDEV_IFL_WAIT) /* Wait for bios in-flight */ Index: linux-2.6/fs/btrfs/extent-tree.c =================================================================== --- linux-2.6.orig/fs/btrfs/extent-tree.c 2010-08-23 15:16:37.677004089 +0200 +++ linux-2.6/fs/btrfs/extent-tree.c 2010-08-23 15:16:45.918004368 +0200 @@ -1696,7 +1696,7 @@ static void btrfs_issue_discard(struct b u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, Index: linux-2.6/fs/gfs2/rgrp.c =================================================================== --- linux-2.6.orig/fs/gfs2/rgrp.c 2010-08-23 15:16:37.684004298 +0200 +++ linux-2.6/fs/gfs2/rgrp.c 2010-08-23 15:16:45.927004298 +0200 @@ -854,8 +854,7 @@ static void gfs2_rgrp_send_discards(stru if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); if (rv) goto fail; nr_sects = 0; @@ -870,7 +869,7 @@ start_new_extent: } if (nr_sects) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); if (rv) goto fail; } Index: linux-2.6/fs/nilfs2/the_nilfs.c =================================================================== --- linux-2.6.orig/fs/nilfs2/the_nilfs.c 2010-08-23 15:16:37.692003949 +0200 +++ linux-2.6/fs/nilfs2/the_nilfs.c 2010-08-23 15:16:45.928014774 +0200 @@ -775,8 +775,7 @@ int nilfs_discard_segments(struct the_ni start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); if (ret < 0) return ret; nblocks = 0; @@ -787,7 +786,7 @@ int nilfs_discard_segments(struct the_ni start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); return ret; } Index: linux-2.6/include/linux/blkdev.h =================================================================== --- linux-2.6.orig/include/linux/blkdev.h 2010-08-23 15:16:37.700013168 +0200 +++ linux-2.6/include/linux/blkdev.h 2010-08-23 15:16:45.931003739 +0200 @@ -921,11 +921,9 @@ static inline struct request *blk_map_qu } enum{ BLKDEV_WAIT, /* wait for completion */ - BLKDEV_BARRIER, /* issue request with barrier */ BLKDEV_SECURE, /* secure discard */ }; #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) -#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) #define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE) extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, unsigned long); @@ -939,7 +937,7 @@ static inline int sb_issue_discard(struc block <<= (sb->s_blocksize_bits - 9); nr_blocks <<= (sb->s_blocksize_bits - 9); return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h 2010-08-23 15:16:37.709012819 +0200 +++ linux-2.6/include/linux/fs.h 2010-08-23 15:16:45.938005835 +0200 @@ -159,14 +159,6 @@ struct inodes_stat_t { #define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ REQ_HARDBARRIER) -/* - * These aren't really reads or writes, they pass down information about - * parts of device that are now unused by the file system. - */ -#define DISCARD_NOBARRIER (WRITE | REQ_DISCARD) -#define DISCARD_BARRIER (WRITE | REQ_DISCARD | REQ_HARDBARRIER) -#define DISCARD_SECURE (DISCARD_NOBARRIER | REQ_SECURE) - #define SEL_IN 1 #define SEL_OUT 2 #define SEL_EX 4 Index: linux-2.6/mm/swapfile.c =================================================================== --- linux-2.6.orig/mm/swapfile.c 2010-08-23 15:16:37.724015124 +0200 +++ linux-2.6/mm/swapfile.c 2010-08-23 15:16:45.948004996 +0200 @@ -142,7 +142,7 @@ static int discard_swap(struct swap_info if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); if (err) return err; cond_resched(); @@ -154,7 +154,7 @@ static int discard_swap(struct swap_info err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL, - BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); + BLKDEV_IFL_WAIT); if (err) break; @@ -193,8 +193,7 @@ static void discard_swap_cluster(struct start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | - BLKDEV_IFL_BARRIER)) + nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) break; } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html