Re: [PATCH v3 05/17] NFS: Don't discard readdir results

David Wysochanski <dwysocha@xxxxxxxxxx> · Fri, 6 Nov 2020 13:00:56 -0500



On Fri, Nov 6, 2020 at 10:05 AM Trond Myklebust <trondmy@xxxxxxxxxxxxxxx> wrote:
>
> On Fri, 2020-11-06 at 08:30 -0500, David Wysochanski wrote:
> > On Wed, Nov 4, 2020 at 11:27 AM <trondmy@xxxxxxxxx> wrote:
> > >
> > > From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> > >
> > > If a readdir call returns more data than we can fit into one page
> > > cache page, then allocate a new one for that data rather than
> > > discarding the data.
> > >
> > > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> > > ---
> > >  fs/nfs/dir.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
> > >  1 file changed, 42 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > > index 842f69120a01..f7248145c333 100644
> > > --- a/fs/nfs/dir.c
> > > +++ b/fs/nfs/dir.c
> > > @@ -320,6 +320,26 @@ static void nfs_readdir_page_set_eof(struct
> > > page *page)
> > >         kunmap_atomic(array);
> > >  }
> > >
> > > +static void nfs_readdir_page_unlock_and_put(struct page *page)
> > > +{
> > > +       unlock_page(page);
> > > +       put_page(page);
> > > +}
> > > +
> > > +static struct page *nfs_readdir_page_get_next(struct address_space
> > > *mapping,
> > > +                                             pgoff_t index, u64
> > > cookie)
> > > +{
> > > +       struct page *page;
> > > +
> > > +       page = nfs_readdir_page_get_locked(mapping, index, cookie);
> > > +       if (page) {
> > > +               if (nfs_readdir_page_last_cookie(page) == cookie)
> > > +                       return page;
> > > +               nfs_readdir_page_unlock_and_put(page);
> > > +       }
> > > +       return NULL;
> > > +}
> > > +
> > >  static inline
> > >  int is_32bit_api(void)
> > >  {
> > > @@ -637,13 +657,15 @@ void nfs_prime_dcache(struct dentry *parent,
> > > struct nfs_entry *entry,
> > >  }
> > >
> > >  /* Perform conversion from xdr to cache array */
> > > -static
> > > -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct
> > > nfs_entry *entry,
> > > -                               struct page **xdr_pages, struct
> > > page *page, unsigned int buflen)
> > > +static int nfs_readdir_page_filler(struct nfs_readdir_descriptor
> > > *desc,
> > > +                                  struct nfs_entry *entry,
> > > +                                  struct page **xdr_pages,
> > > +                                  struct page *fillme, unsigned
> > > int buflen)
> > >  {
> > > +       struct address_space *mapping = desc->file->f_mapping;
> > >         struct xdr_stream stream;
> > >         struct xdr_buf buf;
> > > -       struct page *scratch;
> > > +       struct page *scratch, *new, *page = fillme;
> > >         int status;
> > >
> > >         scratch = alloc_page(GFP_KERNEL);
> > > @@ -666,6 +688,19 @@ int
> > > nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct
> > > nfs_entry *en
> > >                                         desc->dir_verifier);
> > >
> > >                 status = nfs_readdir_add_to_array(entry, page);
> > > +               if (status != -ENOSPC)
> > > +                       continue;
> > > +
> > > +               if (page->mapping != mapping)
> > > +                       break;
> > > +               new = nfs_readdir_page_get_next(mapping, page-
> > > >index + 1,
> > > +                                               entry-
> > > >prev_cookie);
> > > +               if (!new)
> > > +                       break;
> > > +               if (page != fillme)
> > > +                       nfs_readdir_page_unlock_and_put(page);
> > > +               page = new;
> > > +               status = nfs_readdir_add_to_array(entry, page);
> > >         } while (!status && !entry->eof);
> > >
> > >         switch (status) {
> > > @@ -681,6 +716,9 @@ int
> > > nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct
> > > nfs_entry *en
> > >                 break;
> > >         }
> > >
> > > +       if (page != fillme)
> > > +               nfs_readdir_page_unlock_and_put(page);
> > > +
> > >         put_page(scratch);
> > >         return status;
> > >  }
> > > --
> > > 2.28.0
> > >
> >
> > It doesn't look like this handles uncached_readdir.  Were you
> > planning
> > on addressing that somehow, or should we think about something like
> > this to move dtsize up as a parameter to nfs_readdir_xdr_to_array(),
> > and force uncached_readdir() to 1 page:
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index b6c3501e8f61..ca30e2dbb9c3 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -791,13 +791,12 @@ static struct page
> > **nfs_readdir_alloc_pages(size_t npages)
> >
> >  static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor
> > *desc,
> >                                     struct page *page, __be32
> > *verf_arg,
> > -                                   __be32 *verf_res)
> > +                                   __be32 *verf_res, size_t dtsize)
> >  {
> >         struct page **pages;
> >         struct nfs_entry *entry;
> >         size_t array_size;
> >         struct inode *inode = file_inode(desc->file);
> > -       size_t dtsize = NFS_SERVER(inode)->dtsize;
> >         int status = -ENOMEM;
> >
> >         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> > @@ -879,13 +878,15 @@ static int find_and_lock_cache_page(struct
> > nfs_readdir_descriptor *desc)
> >         struct nfs_inode *nfsi = NFS_I(inode);
> >         __be32 verf[NFS_DIR_VERIFIER_SIZE];
> >         int res;
> > +       size_t dtsize = NFS_SERVER(inode)->dtsize;
> >
> >         desc->page = nfs_readdir_page_get_cached(desc);
> >         if (!desc->page)
> >                 return -ENOMEM;
> >         if (nfs_readdir_page_needs_filling(desc->page)) {
> >                 res = nfs_readdir_xdr_to_array(desc, desc->page,
> > -                                              nfsi->cookieverf,
> > verf);
> > +                                              nfsi->cookieverf,
> > verf,
> > +                                              dtsize);
> >                 if (res < 0) {
> >                         nfs_readdir_page_unlock_and_put_cached(desc);
> >                         if (res == -EBADCOOKIE || res == -ENOTSYNC) {
> > @@ -995,7 +996,8 @@ static int uncached_readdir(struct
> > nfs_readdir_descriptor *desc)
> >         desc->duped = 0;
> >
> >         nfs_readdir_page_init_array(page, desc->dir_cookie);
> > -       status = nfs_readdir_xdr_to_array(desc, page, desc->verf,
> > verf);
> > +       status = nfs_readdir_xdr_to_array(desc, page, desc->verf,
> > verf,
> > +                                         PAGE_SIZE);
> >         if (status < 0)
> >                 goto out_release;
> >
>
> Actually for uncached readdir, I was thinking we might want to convert
> nfs_readdir_xdr_to_array() and nfs_readdir_page_filler() to take an
> array of pages + buffer size.
> IOW: convert uncached_readdir() to allocate an array of pages, and pass
> in a 'struct page **' + a buffer length.
>
Yes I agree and this looks more like the right way to fix it rather than
this single page approach.


> I don't like the idea of passing in a dtsize because that restricts the
> size of the READDIR RPC request buffer instead of restricting the
> number of entries the server returns. For any given buffer size, that
> number of entries fluctuates wildly depending on the filenames in that
> directory and their differing lengths, whereas your page can take a
> fixed number of entries irrespective of the filename lengths (in fact
> it can always take 127 entries on an x86_64).
>
> It is true that the number of entries that nfs_do_filldir() can handle
> also depends on the filename length, but we don't have any information
> in the filesystem about the buffer size that was passed in to the
> getdents() system call of how much space remains in that buffer. All
> that information is hidden in the opaque 'struct dir_context'. So for
> that reason, we can't use that information to set a dtsize either.
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@xxxxxxxxxxxxxxx
>
>