[PATCH] mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Because of the maxnode bug, there is no way to bind or migrate pages to
the last node of a multi-node NUMA system unless you lie about maxnode
when making the mbind, set_mempolicy or migrate_pages syscall.

The man pages for those syscalls describe maxnode as the number of bits
in the node bitmap ("bit mask of nodes containing up to maxnode bits").
Thus if maxnode is n we expect an n-bit bitmap, which means that the
mask of valid bits is ((1 << n) - 1). The decrement in get_nodes()
leads to the mask being ((1 << (n - 1)) - 1) instead.

The three syscalls use a common helper, get_nodes(), and the first thing
this helper does is decrement maxnode by 1, which leads to using only
n-1 bits of the provided mask of nodes (see get_bitmap(), a helper
function of get_nodes()).

This leads to two bugs: either the last node in the provided bitmap is
not used by any of the three syscalls, or the syscalls error out and
return EINVAL if the only bit set in the bitmap was the last bit in the
mask of nodes (which is ignored because of the bug, and an empty mask
of nodes is an invalid argument).

I am surprised this bug was never caught ... it has been in the kernel
since forever.

People can use the following function to detect if the kernel has the
bug:

/*
 * Probe the running kernel for the maxnode off-by-one.
 *
 * Asks for MPOL_BIND with a mask whose only set bit is bit 0 and a
 * maxnode of 1. Returns true when that request fails with EINVAL,
 * false otherwise. The calling thread's policy is reset to
 * MPOL_DEFAULT before returning, whatever the probe's outcome.
 */
bool kernel_has_maxnodes_bug(void)
{
    unsigned long first_node_only = 1;
    bool buggy = false;

    if (set_mempolicy(MPOL_BIND, &first_node_only, 1) != 0) {
        /* errno must be read before the next syscall can overwrite it. */
        buggy = (errno == EINVAL);
    }

    set_mempolicy(MPOL_DEFAULT, NULL, 0);
    return buggy;
}

You can test this with any of the three programs below:

gcc mbind.c -o mbind -lnuma
gcc set_mempolicy.c -o set_mempolicy -lnuma
gcc migrate_pages.c -o migrate_pages -lnuma

The first argument is maxnode, the second argument is the bit index to
set in the mask of nodes (0 sets the first bit, 1 the second bit, ...).

./mbind 2 1 & sleep 2 && numastat -n -p `pidof mbind` && fg
./set_mempolicy 2 1 & sleep 2 && numastat -n -p `pidof set_mempolicy` && fg
./migrate_pages 2 1 & sleep 2 && numastat -n -p `pidof migrate_pages` && fg

mbind.c %< ----------------------------------------------------------

/*
 * Map `size` bytes of private, anonymous, readable and writable memory.
 *
 * Returns the start of the mapping, or NULL when mmap() fails.
 */
void *anon_mem(size_t size)
{
    void *mapping = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANON, -1, 0);

    if (mapping == MAP_FAILED) {
        return NULL;
    }
    return mapping;
}

/*
 * Round v up to the nearest multiple of m.
 *
 * A value that is already a multiple of m is returned unchanged
 * (the previous formula bumped it to the next multiple). m == 0
 * disables rounding and returns v as is.
 */
unsigned long mround(unsigned long v, unsigned long m)
{
    unsigned long rem;

    if (m == 0) {
        return v;
    }

    rem = v % m;
    return rem ? v + (m - rem) : v;
}

/*
 * Set bit b in a bitmap made of unsigned long words.
 *
 * _bitmap must point to a suitably aligned unsigned long array large
 * enough to contain bit b. The bit is addressed per word (word
 * b / bits-per-long, position b % bits-per-long) instead of per byte,
 * so the same bit is selected on little- and big-endian hosts.
 */
void bitmap_set(void *_bitmap, unsigned long b)
{
    const unsigned long bits_per_word = 8 * sizeof(unsigned long);
    unsigned long *words = _bitmap;

    words[b / bits_per_word] |= 1UL << (b % bits_per_word);
}

/* Parse s as a plain decimal unsigned long; 0 on success, -1 on bad input. */
static int parse_ulong(const char *s, unsigned long *out)
{
    char *end;

    /* strtoul() alone would accept signs and leading blanks. */
    if (*s < '0' || *s > '9') {
        return -1;
    }
    errno = 0;
    *out = strtoul(s, &end, 10);
    return (errno != 0 || *end != '\0') ? -1 : 0;
}

/*
 * mbind test: bind an anonymous mapping to a single node, then touch it.
 *
 * Usage: mbind maxnodes node
 *   maxnodes - value passed as the maxnode argument of mbind()
 *   node     - index of the one bit to set in the node mask
 *
 * After writing to the mapping the program blocks in getchar() so the
 * memory placement can be inspected from outside. Returns 0 on success
 * and -1 on bad arguments, allocation failure or mbind() failure.
 */
int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argc < 3) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    if (parse_ulong(argv[1], &maxnode) || parse_ulong(argv[2], &node)) {
        printf("invalid argument: %s maxnodes node\n", argv[0]);
        return -1;
    }

    // Mask size in bytes: maxnode bits, rounded to whole unsigned longs
    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    // Refuse a bit index that would be written outside the mask
    if (node / 8 >= bytes) {
        printf("node %lu does not fit in the mask for maxnodes %lu\n",
               node, maxnode);
        return -1;
    }

    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = mbind(mem, NPAGES << 12, MPOL_BIND,
                nodemask, maxnode, 0);
    if (res) {
        printf("mbind(mem, NPAGES << 12, MPOL_BIND, "
               "nodemask, %lu, 0) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Write one byte every 4096 bytes to break away from the zero page
    for (size_t page = 0; page < NPAGES; page++) {
        mem[page << 12] = (int8_t)(page + 1);
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

set_mempolicy %< ----------------------------------------------------

/*
 * Create a private anonymous read/write mapping of `size` bytes.
 *
 * Returns the mapped address on success and NULL if mmap() reports
 * MAP_FAILED.
 */
void *anon_mem(size_t size)
{
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_PRIVATE | MAP_ANON;
    void *addr = mmap(NULL, size, prot, flags, -1, 0);

    if (addr != MAP_FAILED) {
        return addr;
    }
    return NULL;
}

/*
 * Round v up to the nearest multiple of m.
 *
 * A value that is already a multiple of m is returned unchanged
 * (the previous formula bumped it to the next multiple). m == 0
 * disables rounding and returns v as is.
 */
unsigned long mround(unsigned long v, unsigned long m)
{
    unsigned long rem;

    if (m == 0) {
        return v;
    }

    rem = v % m;
    return rem ? v + (m - rem) : v;
}

/*
 * Set bit b in a bitmap made of unsigned long words.
 *
 * _bitmap must point to a suitably aligned unsigned long array large
 * enough to contain bit b. The bit is addressed per word (word
 * b / bits-per-long, position b % bits-per-long) instead of per byte,
 * so the same bit is selected on little- and big-endian hosts.
 */
void bitmap_set(void *_bitmap, unsigned long b)
{
    const unsigned long bits_per_word = 8 * sizeof(unsigned long);
    unsigned long *words = _bitmap;

    words[b / bits_per_word] |= 1UL << (b % bits_per_word);
}

/* Parse s as a plain decimal unsigned long; 0 on success, -1 on bad input. */
static int parse_ulong(const char *s, unsigned long *out)
{
    char *end;

    /* strtoul() alone would accept signs and leading blanks. */
    if (*s < '0' || *s > '9') {
        return -1;
    }
    errno = 0;
    *out = strtoul(s, &end, 10);
    return (errno != 0 || *end != '\0') ? -1 : 0;
}

/*
 * set_mempolicy test: set an MPOL_BIND policy for a single node, then
 * touch an anonymous mapping.
 *
 * Usage: set_mempolicy maxnodes node
 *   maxnodes - value passed as the maxnode argument of set_mempolicy()
 *   node     - index of the one bit to set in the node mask
 *
 * The policy is first set to a mask with only bit 0 set and a maxnode
 * of 2, then replaced by the requested mask. After writing to the
 * mapping the program blocks in getchar() so the memory placement can
 * be inspected from outside. Returns 0 on success and -1 on bad
 * arguments, allocation failure or set_mempolicy() failure.
 */
int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, node0_mask;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argc < 3) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    if (parse_ulong(argv[1], &maxnode) || parse_ulong(argv[2], &node)) {
        printf("invalid argument: %s maxnodes node\n", argv[0]);
        return -1;
    }

    // Mask size in bytes: maxnode bits, rounded to whole unsigned longs
    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    // Refuse a bit index that would be written outside the mask
    if (node / 8 >= bytes) {
        printf("node %lu does not fit in the mask for maxnodes %lu\n",
               node, maxnode);
        return -1;
    }

    // bind memory to node 0 ...
    node0_mask = 1;
    // The mask must be passed by address, not by value
    res = set_mempolicy(MPOL_BIND, &node0_mask, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, 2) "
               "failed with %d\n", errno);
        return -1;
    }

    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = set_mempolicy(MPOL_BIND, nodemask, maxnode);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, nodemask, %lu) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    // Write one byte every 4096 bytes to break away from the zero page
    for (size_t page = 0; page < NPAGES; page++) {
        mem[page << 12] = (int8_t)(page + 1);
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

migrate_pages %< ----------------------------------------------------

/*
 * Obtain `size` bytes of anonymous memory through a private,
 * readable and writable mmap() mapping.
 *
 * Returns the mapping, or NULL in place of MAP_FAILED.
 */
void *anon_mem(size_t size)
{
    void *region;

    region = mmap(NULL, size,
                  PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANON,
                  -1, 0);
    if (region == MAP_FAILED) {
        region = NULL;
    }
    return region;
}

/*
 * Round v up to the nearest multiple of m.
 *
 * A value that is already a multiple of m is returned unchanged
 * (the previous formula bumped it to the next multiple). m == 0
 * disables rounding and returns v as is.
 */
unsigned long mround(unsigned long v, unsigned long m)
{
    unsigned long rem;

    if (m == 0) {
        return v;
    }

    rem = v % m;
    return rem ? v + (m - rem) : v;
}

/*
 * Set bit b in a bitmap made of unsigned long words.
 *
 * _bitmap must point to a suitably aligned unsigned long array large
 * enough to contain bit b. The bit is addressed per word (word
 * b / bits-per-long, position b % bits-per-long) instead of per byte,
 * so the same bit is selected on little- and big-endian hosts.
 */
void bitmap_set(void *_bitmap, unsigned long b)
{
    const unsigned long bits_per_word = 8 * sizeof(unsigned long);
    unsigned long *words = _bitmap;

    words[b / bits_per_word] |= 1UL << (b % bits_per_word);
}

/* Parse s as a plain decimal unsigned long; 0 on success, -1 on bad input. */
static int parse_ulong(const char *s, unsigned long *out)
{
    char *end;

    /* strtoul() alone would accept signs and leading blanks. */
    if (*s < '0' || *s > '9') {
        return -1;
    }
    errno = 0;
    *out = strtoul(s, &end, 10);
    return (errno != 0 || *end != '\0') ? -1 : 0;
}

/*
 * migrate_pages test: populate memory under a policy whose mask has
 * only bit 0 set, then ask the kernel to migrate this process's pages
 * from node 0 to the requested node.
 *
 * Usage: migrate_pages maxnodes node
 *   maxnodes - value passed as the maxnode argument of migrate_pages()
 *   node     - index of the one bit to set in the destination mask
 *
 * After the migration the program blocks in getchar() so the memory
 * placement can be inspected from outside. Returns 0 on success and -1
 * on bad arguments, allocation failure, syscall failure, or when some
 * pages could not be moved.
 */
int main(int argc, char *argv[])
{
    unsigned long *old_nodes, *new_nodes, maxnode, node, node0_mask;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argc < 3) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    if (parse_ulong(argv[1], &maxnode) || parse_ulong(argv[2], &node)) {
        printf("invalid argument: %s maxnodes node\n", argv[0]);
        return -1;
    }

    // Mask size in bytes: maxnode bits, rounded to whole unsigned longs
    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    // Refuse a bit index that would be written outside the mask; this
    // also rejects an empty mask, which could not hold bit 0 either
    if (node / 8 >= bytes) {
        printf("node %lu does not fit in the mask for maxnodes %lu\n",
               node, maxnode);
        return -1;
    }

    // bind memory to node 0 ...
    node0_mask = 1;
    res = set_mempolicy(MPOL_BIND, &node0_mask, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, 2) "
               "failed with %d\n", errno);
        return -1;
    }

    old_nodes = calloc(bytes, 1);
    new_nodes = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !new_nodes || !old_nodes) {
        return -1;
    }

    // Write one byte every 4096 bytes to break away from the zero page
    for (size_t page = 0; page < NPAGES; page++) {
        mem[page << 12] = (int8_t)(page + 1);
    }

    // Try to migrate memory from node 0 to node
    bitmap_set(old_nodes, 0);
    bitmap_set(new_nodes, node);
    res = migrate_pages(getpid(), maxnode,
                        old_nodes, new_nodes);
    if (res < 0) {
        printf("migrate_pages(pid, %lu, old_nodes, "
               "new_nodes) failed with %d\n",
               maxnode, errno);
        return -1;
    }
    if (res > 0) {
        // A positive return is a count of unmoved pages, not an errno failure
        printf("migrate_pages(pid, %lu, old_nodes, "
               "new_nodes) could not move %ld pages\n",
               maxnode, res);
        return -1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx>
To: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
To: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: stable@xxxxxxxxxxxxxxx
---
 mm/mempolicy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aec756ae5637..658e5366d266 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1434,7 +1434,6 @@ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
-	--maxnode;
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
-- 
2.45.2.1089.g2a221341d9-goog






[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux