These are generic routines with support for SHMFS (TMPFS).

Signed-off-by: Radosław Smogura <mail@xxxxxxxxxx>
---
 include/linux/defrag-pagecache.h |   62 +++++
 include/linux/fs.h               |   23 ++
 mm/Makefile                      |    1 +
 mm/defrag-pagecache.c            |  489 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 575 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/defrag-pagecache.h
 create mode 100644 mm/defrag-pagecache.c
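
Notes, not part of the commit message:

Below is a minimal usage sketch showing how an in-kernel caller might drive
the new interface. It is illustrative only; the function name
defrag_first_chunk and the way the struct file is obtained are invented for
the example, and a real caller would choose offsets based on access patterns.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/defrag-pagecache.h>

/* Ask for the first huge-page-sized chunk of an open tmpfs file to be
 * migrated into one compound page. */
static int defrag_first_chunk(struct file *filp)
{
	/* Offset and size are given in PAGE_SIZE units and must be
	 * huge-page aligned; defragPageCache() checks this. */
	const unsigned long chunk = 1UL << (PMD_SHIFT - PAGE_SHIFT);
	const struct defrag_pagecache_ctl ctl = {
		.fillPages = 1,		/* read in pages that are missing...  */
		.requireFillPages = 1,	/* ...and fail if that read fails     */
		.force = 0,		/* don't drain LRU pagevecs           */
	};

	return defragPageCache(filp, 0, chunk, &ctl);
}

A filesystem opts in by setting the new ->defragpage address_space operation;
defrag_generic_shm is exported for that purpose. The shmem side is not part
of this patch, so the fragment below only shows the expected shape of the
hook-up inside shmem's address_space_operations initializer in mm/shmem.c:

#ifdef CONFIG_HUGEPAGECACHE
	.defragpage	= defrag_generic_shm,
#endif
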
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
new file mode 100644
index 0000000..46793de
--- /dev/null
+++ b/include/linux/defrag-pagecache.h
@@ -0,0 +1,62 @@
+/*
+ * linux/include/linux/defrag-pagecache.h
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+
+#ifndef DEFRAG_PAGECACHE_H
+#define DEFRAG_PAGECACHE_H
+#include <linux/fs.h>
+
+/* XXX Split this file into two parts, public and protected - see comments
+ * below. The protected part will contain declarations of generic and helper
+ * methods for file system developers, the public part just general
+ * structures and controls.
+ */
+struct file;
+struct inode;
+struct defrag_pagecache_ctl;
+struct address_space;
+
+typedef struct page *defrag_generic_get_page(
+	const struct defrag_pagecache_ctl *ctl, struct inode *inode,
+	pgoff_t pageIndex);
+
+/** Passes additional information and controls to page defragmentation. */
+struct defrag_pagecache_ctl {
+	/** If set, defragmentation will try to fill the page cache. */
+	char fillPages:1;
+
+	/** If filling a page fails, defragmentation will fail too. Setting
+	 * this requires {@link #fillPages} to be set as well.
+	 */
+	char requireFillPages:1;
+
+	/** If set, defragmentation will force progress in many aspects; this
+	 * may cause the operation to run longer, but with a greater
+	 * probability of success. */
+	char force:1;
+};
+
+/** Defragments the page cache of the specified file and migrates it to huge pages.
+ *
+ * @param f
+ * @param offset
+ * @param size
+ * @return
+ */
+extern int defragPageCache(struct file *f, unsigned long offset,
+	unsigned long size, const struct defrag_pagecache_ctl *defragCtl);
+
+/** Tries to fix up huge page mappings by walking through the given
+ * Transparent Huge Page. */
+extern int thpFixMappings(struct page *hugePage);
+
+extern int defrag_generic_shm(struct file *file, struct address_space *mapping,
+		loff_t pos,
+		struct page **pagep,
+		const struct defrag_pagecache_ctl *ctl);
+#endif	/* DEFRAG_PAGECACHE_H */
+
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 386da09..bfd9122 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -11,6 +11,10 @@
 #include <linux/blk_types.h>
 #include <linux/types.h>
 
+#ifdef CONFIG_HUGEPAGECACHE
+#include <linux/defrag-pagecache.h>
+#endif
+
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
  * the file limit at runtime and only root can increase the per-process
@@ -602,6 +606,25 @@ struct address_space_operations {
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 
+#ifdef CONFIG_HUGEPAGECACHE
+	/** Used to defrag (migrate) pages at position {@code pos}
+	 * into huge pages. Having this non-{@code NULL} indicates that the
+	 * address space, generally, supports huge pages (a transparent
+	 * huge page may be established).
+	 * <br/>
+	 * It's like migrate pages, but different :)
+	 *
+	 * @param pagep on success will be set to the established huge page
+	 *
+	 * @returns TODO What to return?
+	 *          {@code 0} on success, a value less than {@code 0} on error
+	 */
+	int (*defragpage) (struct file *, struct address_space *mapping,
+			loff_t pos,
+			struct page **pagep,
+			const struct defrag_pagecache_ctl *ctl);
+#endif
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..75389c8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_HUGEPAGECACHE) += defrag-pagecache.o
\ No newline at end of file
diff --git a/mm/defrag-pagecache.c b/mm/defrag-pagecache.c
new file mode 100644
index 0000000..5a14fe8
--- /dev/null
+++ b/mm/defrag-pagecache.c
@@ -0,0 +1,489 @@
+/*
+ * linux/mm/defrag-pagecache.c
+ *
+ * Defragments pagecache into compound pages
+ *
+ * (c) 2011 Radosław Smogura
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <asm/pgtable.h>
+#include <linux/migrate.h>
+#include <linux/defrag-pagecache.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rmap.h>
+#include <linux/page-flags.h>
+#include <linux/shmem_fs.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+/*#include <linux/pgtable_helper.h>*/
+
+struct migration_private {
+	loff_t startIndex;
+	pgoff_t nextIndex;
+	pgoff_t pagesToMigrateCount;
+
+	struct page *hugePage;
+	struct inode *inode;
+
+	const struct defrag_pagecache_ctl *defragCtl;
+
+	int stop;
+	int result;
+	int stoppedCompoundFound;
+
+	/** Callback method used to obtain the next page. */
+	defrag_generic_get_page *getNextPage;
+};
+
+static const struct defrag_pagecache_ctl defaultDefragCtl = {
+	.fillPages = 0,
+	.requireFillPages = 0,
+	.force = 0
+};
+
+#define HUGEPAGE_ALLOC_GFP (GFP_HIGHUSER | __GFP_COMP \
+	| __GFP_REPEAT | __GFP_NOWARN | __GFP_WAIT)
+
+static int defrageOneHugePage(struct file *file, loff_t offset,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *defragCtl,
+	defrag_generic_get_page *getPage);
+
+int defragPageCache(struct file *f, unsigned long offset, unsigned long size,
+	const struct defrag_pagecache_ctl *defragCtl)
+{
+	/* Calculate requested huge page order.
+	 * XXX Is the calculation below multiplatform?
+	 */
+	const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+	const int chunkSize = 1 << hugePageOrder;
+	unsigned long offsetIdx = offset;
+	unsigned long chunksToProceed;
+
+	struct inode *inode = f->f_path.dentry->d_inode;
+
+	const struct address_space_operations *aops =
+		inode->i_mapping->a_ops;
+
+	/* TODO: Use hugepage state or something better instead of a
+	 * hardcoded value. */
+	if ((offset != ((offset >> hugePageOrder) << hugePageOrder) ||
+		size != ((size >> hugePageOrder) << hugePageOrder))
+		/* && (size != (1 << hugePageOrder))*/) {
+		/* Start and length must be huge page "aligned". */
+		return -EINVAL;
+	}
+
+	offsetIdx = offset;
+	chunksToProceed = size >> hugePageOrder;
+	for (; chunksToProceed; chunksToProceed--, offsetIdx += chunkSize) {
+		struct page *pagep;
+		int result = aops->defragpage(f, inode->i_mapping, offsetIdx,
+			&pagep,
+			defragCtl);
+		if (result)
+			return result;
+	}
+
+	return 0;
+}
+
+/** Callback for getting a page for tmpfs.
+ * Tmpfs uses the {@link shmem_read_mapping_page_gfp} function to read a
+ * page from the page cache.
+ */
+ */ +struct page *shmem_defrag_get_page(const struct defrag_pagecache_ctl *ctl, + struct inode *inode, pgoff_t pageIndex) +{ + + return shmem_read_mapping_page_gfp( + inode->i_mapping, pageIndex, + mapping_gfp_mask(inode->i_mapping)); +} + +static void defrag_generic_mig_result(struct page *oldPage, + struct page *newPage, struct migration_ctl *ctl, int result) +{ + struct migration_private *prv = + (struct migration_private *) ctl->privateData; + + if (!result) { + /* Update index only on success; on fail, index will be used to + * clean up. */ + prv->nextIndex++; + + if (!PageTail(newPage)) + putback_lru_page(newPage); + else + put_page(newPage); + } else { + prv->stop = 1; + } + + /* XXX No isolated zone status update! */ + putback_lru_page(oldPage); + put_page(oldPage); +/* + unlock_page(oldPage); +*/ + + prv->result = result; +} + +static struct page *defrag_generic_mig_page_new(struct page *oldPage, + struct migration_ctl *ctl) +{ + struct migration_private *prv = + (struct migration_private *) ctl->privateData; + + return prv->hugePage + prv->nextIndex; +} + +static struct page *defrag_generic_mig_page_next(struct migration_ctl *ctl, + page_mode *mode) +{ + struct migration_private *prv = + (struct migration_private *) ctl->privateData; + const struct defrag_pagecache_ctl *defragCtl; + + /** Hold current page cache page, we are going to migrate. */ + struct page *filePage; + + struct inode *inode; + + pgoff_t pageIndex; + + if (!(prv->nextIndex < prv->pagesToMigrateCount)) + return NULL; + + if (prv->result || prv->stop) + return NULL; + + inode = prv->inode; + pageIndex = prv->startIndex + prv->nextIndex; + defragCtl = prv->defragCtl; + +repeat_find: + filePage = find_lock_page(inode->i_mapping, pageIndex); + + if (filePage) + if (PageUptodate(filePage)) + goto skip_fill_pages; + + /* Try to upread page, if this was intention of caller, + * we don't need to check if page is writeback, migrate pages do it. */ + if (!defragCtl->fillPages) { + prv->result = 0; + prv->stop = 1; + return NULL; + } + + filePage = prv->getNextPage(prv->defragCtl, inode, pageIndex); + + if (IS_ERR(filePage)) { + prv->result = PTR_ERR(filePage); + prv->stop = 1; + return NULL; + } + + lock_page(filePage); + /* Validate page */ + if (!filePage->mapping + || filePage->index != pageIndex + || !PageUptodate(filePage)) { + unlock_page(filePage); + goto repeat_find; + } + +skip_fill_pages: + if (/* ??? !defragCtl->fillPages && */ PageCompound(filePage)) { + /* Heare I think about giving support that in page + * cache may exists huge page but not uptodate whole. + * + * Currently this idea is suspended, due to many + * complications. + */ + prv->stoppedCompoundFound = 1; + goto out_unlock_and_stop; + } + + /* Prepare page for isolation, check if it can be isolated. */ + if (!PageLRU(filePage)) { + if (defragCtl->force) { + /* Isolation requires page in LRU, we may need to drain + * it if not present. */ + lru_add_drain(); + if (!PageLRU(filePage)) { + lru_add_drain_all(); + if (!PageLRU(filePage)) { + prv->result = -EBUSY; + goto out_unlock_and_stop; + } + } + } else { + prv->result = -EBUSY; + goto out_unlock_and_stop; + } + } + + /* Isolate pages. 
+	if (isolate_lru_page(filePage)) {
+		prv->result = -EBUSY;
+		goto putback_page_and_stop;
+	}
+
+	*mode = PAGE_LOCKED;
+	return filePage;
+
+putback_page_and_stop:
+	putback_lru_page(filePage);
+
+out_unlock_and_stop:
+	unlock_page(filePage);
+	put_page(filePage);
+
+	return NULL;
+
+}
+
+int defrag_generic_shm(struct file *file, struct address_space *mapping,
+	loff_t pos,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *ctl)
+{
+	return defrageOneHugePage(file, pos, pagep, ctl, shmem_defrag_get_page);
+}
+EXPORT_SYMBOL(defrag_generic_shm);
+
+int defrag_generic_pagecache(struct file *file,
+	struct address_space *mapping,
+	loff_t pos,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *ctl)
+{
+	/* We do not support generic page cache defragmentation, yet. */
+	BUG();
+	return 0;
+}
+
+/** Internal method for defragmenting one chunk of the page cache.
+ *
+ * <br/>
+ * This is in some
+ * way common logic to operate on the page cache. It's highly probable that
+ * this method will be exposed as "generic" to add support for transparent
+ * huge pages for the page cache.
+ */
+static int defrageOneHugePage(struct file *file, loff_t offset,
+	struct page **pagep,
+	const struct defrag_pagecache_ctl *defragCtl,
+	defrag_generic_get_page *getPage)
+{
+	const int hugePageOrder = (PMD_SHIFT - PAGE_SHIFT);
+
+	/** Huge page we migrate to. */
+	struct page *hugePage;
+
+	/** Private migration data. */
+	struct migration_private migrationPrv;
+
+	struct migration_ctl migration_ctl;
+
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	const int size = 1 << hugePageOrder;
+
+	/** Helpers */
+	pgoff_t i;
+
+	/* Over here we do callback-based migration. */
+	/* READ.
+	 *
+	 * This code is in the development stage, and the following problems
+	 * must be resolved:
+	 * - the page is read from the page cache, but the lock is dropped; in the
+	 *   meantime the page may no longer be up to date or may be removed from
+	 *   the page cache. This will be resolved by changing the migrate function.
+	 */
+	/* Allocate one huge page. */
+	hugePage = alloc_pages(HUGEPAGE_ALLOC_GFP, hugePageOrder);
+	if (!hugePage)
+		return -ENOMEM;
+
+	migrationPrv.nextIndex = 0;
+	migrationPrv.pagesToMigrateCount = size;
+	migrationPrv.hugePage = hugePage;
+	migrationPrv.stop = 0;
+	migrationPrv.result = 0;
+	migrationPrv.stoppedCompoundFound = 0;
+	migrationPrv.getNextPage = getPage;
+	migrationPrv.startIndex = offset;
+	migrationPrv.inode = inode;
+	migrationPrv.defragCtl =
+		(const struct defrag_pagecache_ctl *) defragCtl;
+	/* Elevate page counts */
+	for (i = 1; i < size; i++) {
+		struct page *p = hugePage + i;
+		/* Elevate page counters. */
+		get_page(p);
+	}
+
+	migration_ctl.getNextPage = defrag_generic_mig_page_next;
+	migration_ctl.getNewPage = defrag_generic_mig_page_new;
+	migration_ctl.notifyResult = defrag_generic_mig_result;
+	migration_ctl.privateData = (unsigned long) &migrationPrv;
+
+	/* Acquire the compound lock. */
+	compound_lock(hugePage);
+
+	/* Migrate pages. Currently page migration will automatically put back
+	 * pages, and may fail and repeat; we need an array of pages, to match
+	 * each subpage. This behaviour isn't good.
+	 */
+	migrate_pages_cb(&migration_ctl, true,
+		MIGRATE_SYNC | MIGRATE_SRC_GETTED);
+	if (migrationPrv.nextIndex < migrationPrv.pagesToMigrateCount) {
+		/* XXX Simulate various bugs, at least do it hardcoded. */
+		/* XXX Everything here is a BUG, because splitting still
+		 * needs to be coded.
+		 */
+		if (migrationPrv.stoppedCompoundFound) {
+			/* If any page has been migrated it's a BUG. */
+			BUG_ON(migrationPrv.nextIndex);
+			goto compound_unlock_end;
+		}
+		/* Not all pages have been migrated; split the target page. */
+		/* Downgrading counts of tail pages may cause a deadlock. */
+		VM_BUG_ON(1);
+	} else {
+		goto compound_unlock_end;
+	}
+
+compound_unlock_end:
+	compound_unlock(hugePage);
+/*
+	put_page(hugePage);
+*/
+
+	/* All file pages are unlocked, and should be freed. The huge page
+	 * should be on the unevictable list.
+	 */
+	return migrationPrv.result;
+}
+
+static int thpFixMappingsRmapWalk(struct page *page, struct vm_area_struct *vma,
+	unsigned long addr, void *prvData)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+
+	int i;
+
+/*
+	printk(KERN_INFO "Starting address is %lx", addr);
+*/
+	if (vma->vm_flags & VM_NONLINEAR || (addr & ~HPAGE_PMD_MASK)) {
+		/* Skip nonlinear VMAs, and not aligned addresses. */
+		return SWAP_AGAIN;
+	}
+
+	/* We will set the pmd only if all tail pages meet the following
+	 * requirements:
+	 * - all pages are up to date
+	 * - all pages have the same protection bits
+	 * - ???
+	 */
+	pgd = pgd_offset(vma->vm_mm, addr);
+	if (!pgd_present(*pgd))
+		return SWAP_AGAIN;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return SWAP_AGAIN;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return SWAP_AGAIN;
+
+	pte = (pte_t *) pmd;
+	if (pte_huge(*pte))
+		return SWAP_AGAIN;
+
+
+	/*printk(KERN_INFO "Checking head flags"); */
+	pte = pte_offset_map(pmd, addr);
+	if (!pte_present(*pte)) {
+		/* printk(KERN_INFO "Pte not present."); */
+		pte_unmap(pte);
+		return SWAP_AGAIN;
+	}
+
+	for (i = 1; i < HPAGE_PMD_NR; i++) {
+		struct page *tail_page;
+		int i1, i2;
+
+		addr += PAGE_SIZE;
+
+		pte = pte_offset_map(pmd, addr);
+		if (!pte_present(*pte)) {
+			/*
+			 * printk(KERN_INFO "No %d pte returning.", i);
+			 */
+			pte_unmap(pte);
+			return SWAP_AGAIN;
+		}
+
+		tail_page = pte_page(*pte);
+		if (!tail_page) {
+			/* printk(KERN_INFO "Page +%d not present.", i); */
+			goto unmap_out;
+		}
+
+		/* We check the index; however, we do not allow nonlinear
+		 * mappings :) */
+		/* smp_mb(); */
+		i1 = tail_page->mapping == page->mapping;
+		i2 = tail_page->index == (page->index + i);
+		if (i1 && i2) {
+			/*
+			printk(KERN_INFO "Page +%d present, mappings and"
+				" indices ok", i);
+			*/
+		} else {
+			printk(KERN_INFO "Page +%d has good mapping %d, and"
+				" good index %d (%lu, %lu).",
+				i,
+				i1,
+				i2,
+				tail_page->index,
+				page->index);
+			goto unmap_out;
+		}
+		pte_unmap(pte);
+	}
+	pmd_clear(pmd);
+	_pmd = pmd_mkhuge(pmd_modify(*pmd, vma->vm_page_prot));
+
+	set_pmd_at(vma->vm_mm, addr, pmd, _pmd);
+	/* Everything is ok. */
+
+	/* TODO Do not flush all :) */
+	flush_tlb_mm(vma->vm_mm);
+	printk(KERN_INFO "Replaced by pmd");
+	return SWAP_AGAIN;
+unmap_out:
+	pte_unmap(pte);
+
+	return SWAP_AGAIN;
+}
+
+int thpFixMappings(struct page *hugePage)
+{
+	BUG_ON(PageAnon(hugePage));
+	/* lock_page(hugePage); */
+	BUG_ON(!PageTransHuge(hugePage));
+	rmap_walk(hugePage, thpFixMappingsRmapWalk, NULL);
+	/* unlock_page(hugePage); */
+
+	return 0;
+}
-- 
1.7.3.4