Michal Hocko wrote:
> > > > That _somebody_ might release oom_lock without invoking the OOM killer (e.g.
> > > > doing a !__GFP_FS allocation), which means that we have reached the OOM condition
> > > > and nobody is actually handling the OOM on our behalf. __GFP_RETRY_HARD becomes
> > > > as weak as __GFP_NORETRY. I think this is a regression.
> > >
> > > I really fail to see your point. We are talking about a gfp flag which
> > > tells the allocator to retry as much as is feasible. Getting through
> > > all the reclaim attempts two times without any progress sounds like a
> > > fair criterion. Well, we could try $NUM times but that wouldn't make too
> > > much difference to what you are writing above. Whether somebody
> > > has been killed or not is not really that important IMHO.
> >
> > If all the reclaim attempts made no progress the first time, they are
> > unlikely to make progress the second time unless the OOM killer kills
> > something. Thus, doing all the reclaim attempts twice without any progress
> > and without killing somebody sounds almost equivalent to doing all the
> > reclaim attempts only once.
>
> Yes, that is possible. You might have a GFP_NOFS-only load where nothing
> really invokes the OOM killer. Does that actually matter, though? The
> semantic of the flag is to retry hard while the page allocator believes
> it can make forward progress. But not forever. We never know whether
> progress is possible at all. We have certain heuristics for when to give
> up, try to invoke the OOM killer and try again hoping things have changed.
> This is not much different, except we declare that there is no hope of
> getting to the OOM point again without being able to succeed. Are you
> suggesting a more precise heuristic? Or do you claim that we do not need
> a flag which would put a middle ground between __GFP_NORETRY and
> __GFP_NOFAIL, which are on the extreme sides?

Well, maybe we can get rid of __GFP_RETRY (or reserve __GFP_RETRY for huge
pages only). Many __GFP_RETRY users are ready to fall back to vmalloc(). We
are not sure whether such __GFP_RETRY users want to retry with OOM-killing
somebody (we don't have a __GFP_MAY_OOM_KILL flag which explicitly asks for
"retry with OOM-killing somebody"). If __GFP_RETRY means nothing but "try
once more",

        void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

        if (!n)
                n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

will emulate it.
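To make that concrete, here is a minimal sketch (not taken from any of the
callers quoted below; the helper name example_kvzalloc is made up) of what a
fallback-capable caller could look like if __GFP_RETRY only meant "try once
more" and never relied on the OOM killer:

        #include <linux/slab.h>
        #include <linux/vmalloc.h>

        /*
         * Illustration only: emulate "retry once without OOM-killing anybody"
         * with two __GFP_NORETRY attempts, then fall back to vzalloc() the
         * way the vhost/btrfs/net callers quoted below already do.
         */
        static void *example_kvzalloc(unsigned long size)
        {
                void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

                if (!p)
                        p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
                if (!p)
                        p = vzalloc(size);
                return p;
        }

The current __GFP_REPEAT users look like this: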
----- arch/powerpc/include/asm/book3s/64/pgalloc.h -----

static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
{
#ifdef CONFIG_PPC_64K_PAGES
        return (pgd_t *)__get_free_page(PGALLOC_GFP);
#else
        struct page *page;
        page = alloc_pages(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO | __GFP_REPEAT, 4);
        if (!page)
                return NULL;
        return (pgd_t *) page_address(page);
#endif
}

----- arch/powerpc/kvm/book3s_64_mmu_hv.c -----

        kvm->arch.hpt_cma_alloc = 0;
        page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
        if (page) {
                hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
                memset((void *)hpt, 0, (1ul << order));
                kvm->arch.hpt_cma_alloc = 1;
        }

        /* Lastly try successively smaller sizes from the page allocator */
        /* Only do this if userspace didn't specify a size via ioctl */
        while (!hpt && order > 18 && !htab_orderp) {
                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
                                       __GFP_NOWARN, order - PAGE_SHIFT);
                if (!hpt)
                        --order;
        }

        if (!hpt)
                return -ENOMEM;

----- drivers/vhost/vhost.c -----

static void *vhost_kvzalloc(unsigned long size)
{
        void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

        if (!n)
                n = vzalloc(size);
        return n;
}

----- drivers/vhost/scsi.c -----

        vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!vs) {
                vs = vzalloc(sizeof(*vs));
                if (!vs)
                        goto err_vs;
        }

----- drivers/vhost/net.c -----

        n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!n) {
                n = vmalloc(sizeof *n);
                if (!n)
                        return -ENOMEM;
        }

----- drivers/block/xen-blkfront.c -----

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;

----- drivers/mmc/host/wbsd.c -----

        /*
         * We need to allocate a special buffer in
         * order for ISA to be able to DMA to it.
         */
        host->dma_buffer = kmalloc(65536,
                GFP_NOIO | GFP_DMA | __GFP_REPEAT | __GFP_NOWARN);
        if (!host->dma_buffer)
                goto free;

----- drivers/target/target_core_transport.c -----

        se_sess->sess_cmd_map = kzalloc(tag_num * tag_size,
                                GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!se_sess->sess_cmd_map) {
                se_sess->sess_cmd_map = vzalloc(tag_num * tag_size);
                if (!se_sess->sess_cmd_map) {
                        pr_err("Unable to allocate se_sess->sess_cmd_map\n");
                        return -ENOMEM;
                }
        }

----- drivers/s390/char/vmcp.c -----

        if (mutex_lock_interruptible(&session->mutex)) {
                kfree(cmd);
                return -ERESTARTSYS;
        }
        if (!session->response)
                session->response = (char *)__get_free_pages(GFP_KERNEL
                                                | __GFP_REPEAT | GFP_DMA,
                                                get_order(session->bufsize));
        if (!session->response) {
                mutex_unlock(&session->mutex);
                kfree(cmd);
                return -ENOMEM;
        }

----- fs/btrfs/raid56.c -----

        table_size = sizeof(*table) + sizeof(*h) * num_entries;
        table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!table) {
                table = vzalloc(table_size);
                if (!table)
                        return -ENOMEM;
        }

----- fs/btrfs/check-integrity.c -----

        state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!state) {
                state = vzalloc(sizeof(*state));
                if (!state) {
                        printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
                        return -1;
                }
        }

----- mm/sparse-vmemmap.c -----

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
        /* If the main allocator is up use that, fallback to bootmem. */
        if (slab_is_available()) {
                struct page *page;

                if (node_state(node, N_HIGH_MEMORY))
                        page = alloc_pages_node(
                                node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
                                get_order(size));
                else
                        page = alloc_pages(
                                GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
                                get_order(size));
                if (page)
                        return page_address(page);
                return NULL;
        } else
                return __earlyonly_bootmem_alloc(node, size, size,
                                __pa(MAX_DMA_ADDRESS));
}

----- mm/hugetlb.c -----

static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
        struct page *page;

        page = __alloc_pages_node(nid,
                htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                __GFP_REPEAT|__GFP_NOWARN,
                huge_page_order(h));
        if (page) {
                prep_new_huge_page(h, page, nid);
        }

        return page;
}

static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr, int nid)
{
        int order = huge_page_order(h);
        gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
        unsigned int cpuset_mems_cookie;

----- net/core/skbuff.c -----

        gfp_head = gfp_mask;
        if (gfp_head & __GFP_DIRECT_RECLAIM)
                gfp_head |= __GFP_REPEAT;

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_head);
        if (!skb)
                return NULL;

----- net/core/dev.c -----

        rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!rx) {
                rx = vzalloc(sz);
                if (!rx)
                        return -ENOMEM;
        }
        dev->_rx = rx;

        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!tx) {
                tx = vzalloc(sz);
                if (!tx)
                        return -ENOMEM;
        }
        dev->_tx = tx;

        p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!p)
                p = vzalloc(alloc_size);
        if (!p)
                return NULL;

----- net/sched/sch_fq.c -----

static void *fq_alloc_node(size_t sz, int node)
{
        void *ptr;

        ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
        if (!ptr)
                ptr = vmalloc_node(sz, node);
        return ptr;
}