Re: [PATCH RFC 7/9] mm/gup: Decrement head page once for group of subpages


On 12/8/20 7:34 PM, Jason Gunthorpe wrote:
>> @@ -274,6 +291,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>>  				 bool make_dirty)
>>  {
>>  	unsigned long index;
>> +	int refs = 1;
>>  
>>  	/*
>>  	 * TODO: this can be optimized for huge pages: if a series of pages is
>> @@ -286,8 +304,9 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>>  		return;
>>  	}
>>  
>> -	for (index = 0; index < npages; index++) {
>> +	for (index = 0; index < npages; index += refs) {
>>  		struct page *page = compound_head(pages[index]);
>> +
> 
> I think this is really hard to read, it should end up as some:
> 
> for_each_compond_head(page_list, page_list_len, &head, &ntails) {
>        		if (!PageDirty(head))
> 			set_page_dirty_lock(head, ntails);
> 		unpin_user_page(head, ntails);
> }
> 
> And maybe you open code that iteration, but that basic idea to find a
> compound_head and ntails should be computational work performed.
> 
> No reason not to fix set_page_dirty_lock() too while you are here.
> 

The whack of atomics you referred to earlier ends up, I suppose, in
account_page_dirtied(). See the partial diff at the end.

I was looking at the latter part, renaming all the filesystems that supply
set_page_dirty()... But now my concern is whether it's really safe to
assume that the filesystems which supply it ... can indeed dirty
@ntails pages in one go. Functionally, fixing set_page_dirty_lock() means we no
longer call set_page_dirty(head) @ntails times as happens today; we would call it
only once, with ntails as an argument.

Perhaps the safest thing to do is to keep iterating over
@ntails and calling ->set_page_dirty(page), and instead introduce
a set_page_range_dirty() which individual filesystems can separately
supply, giving ->set_page_range_dirty() precedence over
->set_page_dirty() ?
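
To make that a bit more concrete, here is a rough sketch of the
set_page_range_dirty() that the _lock variant in the diff below would call.
The ->set_page_range_dirty aop is hypothetical, nothing any filesystem
supplies today, so take this as an illustration of the idea rather than a
proposed implementation:

/*
 * Sketch only: dirty @ntails pages starting at @page.  Filesystems that
 * supply a (hypothetical) ->set_page_range_dirty() get the whole range in
 * a single call; everyone else keeps today's per-page ->set_page_dirty()
 * behaviour via the fallback loop.
 */
static int set_page_range_dirty(struct page *page, unsigned int ntails)
{
	struct address_space *mapping = page_mapping(page);
	unsigned int i;
	int ret = 0;

	if (mapping && mapping->a_ops->set_page_range_dirty)
		return mapping->a_ops->set_page_range_dirty(page, ntails);

	/* Fallback: the same per-page dirtying we do today */
	for (i = 0; i < ntails; i++)
		ret |= set_page_dirty(page + i);

	return ret;
}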

	Joao

--------------------->8------------------------------

diff --git a/mm/gup.c b/mm/gup.c
index 41ab3d48e1bb..5f8a0f16ab62 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -295,7 +295,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                 * next writeback cycle. This is harmless.
                 */
                if (!PageDirty(head))
-                       set_page_dirty_lock(head);
+                       set_page_range_dirty_lock(head, ntails);
                put_compound_head(head, ntails, FOLL_PIN);
        }
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 088729ea80b2..4642d037f657 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2417,7 +2417,8 @@ int __set_page_dirty_no_writeback(struct page *page, unsigned int ntails)
  *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         unsigned int ntails)
 {
        struct inode *inode = mapping->host;

@@ -2425,17 +2426,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)

        if (mapping_can_writeback(mapping)) {
                struct bdi_writeback *wb;
+               int nr = ntails + 1;

                inode_attach_wb(inode, page);
                wb = inode_to_wb(inode);

-               __inc_lruvec_page_state(page, NR_FILE_DIRTY);
-               __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
-               __inc_node_page_state(page, NR_DIRTIED);
-               inc_wb_stat(wb, WB_RECLAIMABLE);
-               inc_wb_stat(wb, WB_DIRTIED);
-               task_io_account_write(PAGE_SIZE);
-               current->nr_dirtied++;
+               mod_lruvec_page_state(page, NR_FILE_DIRTY, nr);
+               mod_zone_page_state(page_zone(page), NR_ZONE_WRITE_PENDING, nr);
+               mod_node_page_state(page_pgdat(page), NR_DIRTIED, nr);
+               __add_wb_stat(wb, WB_RECLAIMABLE, nr);
+               __add_wb_stat(wb, WB_DIRTIED, nr);
+               task_io_account_write(nr * PAGE_SIZE);
+               current->nr_dirtied += nr;
                this_cpu_inc(bdp_ratelimits);

                mem_cgroup_track_foreign_dirty(page, wb);
@@ -2485,7 +2487,7 @@ int __set_page_dirty_nobuffers(struct page *page, unsigned int ntails)
                xa_lock_irqsave(&mapping->i_pages, flags);
                BUG_ON(page_mapping(page) != mapping);
                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
+               account_page_dirtied(page, mapping, ntails);
                __xa_set_mark(&mapping->i_pages, page_index(page),
                                   PAGECACHE_TAG_DIRTY);
                xa_unlock_irqrestore(&mapping->i_pages, flags);
@@ -2624,6 +2626,27 @@ int set_page_dirty_lock(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty_lock);

+/*
+ * set_page_range_dirty() is racy if the caller has no reference against
+ * page->mapping->host, and if the page is unlocked.  This is because another
+ * CPU could truncate the page off the mapping and then free the mapping.
+ *
+ * Usually, the page _is_ locked, or the caller is a user-space process which
+ * holds a reference on the inode by having an open file.
+ *
+ * In other cases, the page should be locked before running set_page_range_dirty().
+ */
+int set_page_range_dirty_lock(struct page *page, unsigned int ntails)
+{
+       int ret;
+
+       lock_page(page);
+       ret = set_page_range_dirty(page, ntails);
+       unlock_page(page);
+       return ret;
+}
+EXPORT_SYMBOL(set_page_range_dirty_lock);
+



