Re: exofs/ore: allocation of _ore_get_io_state()

Idan Kedar <idank@xxxxxxxxxx> · Thu, 24 May 2012 14:23:48 +0300

On Thu, May 24, 2012 at 12:00 AM, Boaz Harrosh <bharrosh@xxxxxxxxxxx> wrote:
> On 04/16/2012 05:53 PM, Idan Kedar wrote:
>
>> _ore_get_io_state is supposed to allocate a struct ore_io_state, which is
>> variable length. Setting aside the uglification argument which gave birth
>> to the patch "pnfs-obj: Uglify objio_segment allocation for the sake of
>> the principle :-(", the above function checks if the memory it needs to
>> allocate is bigger than a page, and if so, separates it into two
>> allocations. why is this done?
>>
>> from include/linux/slab.h:
>>> /*
>>> * The largest kmalloc size supported by the slab allocators is
>>> * 32 megabyte (2^25) or the maximum allocatable page order if that is
>>> * less than 32 MB.
>>> *
>>> * WARNING: Its not easy to increase this value since the allocators have
>>> * to do various tricks to work around compiler limitations in order to
>>> * ensure proper constant folding.
>>> */
>>> #define KMALLOC_SHIFT_HIGH      ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
>>>                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
>>>
>>> #define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
>>> #define KMALLOC_MAX_ORDER       (KMALLOC_SHIFT_HIGH - PAGE_SHIFT)
>>
>> If kmalloc can allocate 32MB, why check one page, like so?
>>
>> from fs/exofs/ore.c:
>>> struct __alloc_all_io_state {
>>>      struct ore_io_state ios;
>>>      struct ore_per_dev_state per_dev[numdevs];
>>>      union {
>>>              struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>>>              struct page *pages[num_par_pages];
>>>      };
>>> } *_aios;
>>>
>>> if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
>>>      _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
>>>      if (unlikely(!_aios)) {
>>>              EXOFS_DBGMSG("Failed kzalloc bytes=%zd\n",
>>>                         sizeof(*_aios));
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>      pages = num_par_pages ? _aios->pages : NULL;
>>>      sgilist = sgs_per_dev ? _aios->sglist : NULL;
>>>      ios = &_aios->ios;
>>> } else {
>>>      struct __alloc_small_io_state {
>>>              struct ore_io_state ios;
>>>              struct ore_per_dev_state per_dev[numdevs];
>>>      } *_aio_small;
>>>      union __extra_part {
>>>              struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>>>              struct page *pages[num_par_pages];
>>>      } *extra_part;
>>>              _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
>>>      if (unlikely(!_aio_small)) {
>>>              EXOFS_DBGMSG("Failed alloc first part bytes=%zd\n",
>>>                         sizeof(*_aio_small));
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>      extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
>>>      if (unlikely(!extra_part)) {
>>>              EXOFS_DBGMSG("Failed alloc second part bytes=%zd\n",
>>>                         sizeof(*extra_part));
>>>              kfree(_aio_small);
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>              pages = num_par_pages ? extra_part->pages : NULL;
>>>      sgilist = sgs_per_dev ? extra_part->sglist : NULL;
>>>      /* In this case the per_dev[0].sgilist holds the pointer to
>>>       * be freed
>>>       */
>>>      ios = &_aio_small->ios;
>>>      ios->extra_part_alloc = true;
>>> }
>>
>> Is there any point to check if the memory is greater than 32MB?
>>
>>
>
>
> In theory it can allocate 32MB, in slab. I'm not sure about slob and slub.
>
> But in practice contiguous physical pages allocation tends to fail very
> fast on a system that was up a couple of hours. So we avoid it as plage.
>
> Past testing with tables bigger than PAGE_SIZE on the IO path gave
> catastrophic results. (Again once the system is up for a while and
> had a chance to fragment physical address space)

What allocation sizes (of struct __alloc_all_io_state) are we talking
about? how many devices per I/O did you encounter?

>
> The all Kernel point of the use of sg-lists is so not to allocate
> contiguous physical pages and to not have to use virtual-memory.
>
> This is done all over the Kernel. MAX_BIO_SIZE max-sg-table ...

Why not use virtual memory? Is this limitation imposed by the OSD
initiator or by some other layer in the OSD stack?

>
> (BTW I saw this mail by chance. If you direct it to me I see it
>  for sure)
>
> Cheers
> Boaz

-- 
Idan Kedar
Tonian
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html