Avoid running find_rsb_root by storing last recovered rsb address for each node. Makes dlm recovery much faster for FS with large number of files. Signed-off-by: Yevheniy Demchenko <zheka@xxxxxx> --- Current dlm recovery uses small (4096 bytes) buffer to communicate between dlm_copy_master_names and dlm_directory_recovery. This leads to running find_rsb_root N*32/4096 times, where N - number of locks to recover and 32 - DLM_RESNAME_MAXLEN+1. find_rsb_root itself takes N*c to complete, where c is some constant. Eventually, dlm recovery time is proportional to N*N. For an ocfs2 fs with one directory consisting of 300000 small files every mount on other node takes more than 2.5 minutes and umount more than 5 minutes on a fairly modern HW with 10Gb interconnect. During dlm recovery FS is not available on any node. This patch makes mounts and umounts on non-locking-master nodes to take less than a 2 seconds. It is not limited to ocfs2 and might make dlm recovery faster in general (i.e. for gfs2). Test case: 2 node RHCS cluster, OCFS2 with cman cluster stack. /sys/kernel/config/dlm/cluster/{lkbtbl_size,dirtbl_size,rsbtbl_size} = 16384 on both nodes On node 1: #mkfs.ocfs2 --fs-features=backup-super,sparse,inline-data,extended-slotmap,indexed-dirs,refcount,xattr,usrquota,grpquota,unwritten /dev/vg1/test1 #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #mkdir /mnt/temp/test1 #for i in $(seq 1 300000) ; do dd if=/dev/urandom bs=4096 count=1 of=/mnt/temp/test1/$i ; done #umount /mnt/temp #-----leave dlm and destroy locks #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #time (ls -l /mnt/temp/test1 | wc -l ) #-------create 300000 RR locks on node 1 On node 2: #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #--- dlm recovery starts and takes a looooong time if dlm is not patched #umount /mnt/temp #----- even looooooonger, FS is not available on any node while recovery is running After patching, both operations on node2 take less than a 2 seconds. 
For now, the patch tries to detect inconsistencies and reverts to the previous behaviour if there are any.
*/ + log_error(ls, "copy_master_names: rsb cache failed 1: node %d: cached rsb %1.31s, needed rsb %1.31s;", nodeid, + ls->ls_recover_last_rsb[index]->res_name, inbuf); + r = find_rsb_root(ls, inbuf, inlen); + } + } else { + /* Left for safety reasons, we should never get here */ + r = find_rsb_root(ls, inbuf, inlen); + log_error(ls, "copy_master_names: rsb cache failed 2: ,searching for %1.31s, node %d", inbuf, nodeid); + } if (!r) { inbuf[inlen - 1] = '\0'; log_error(ls, "copy_master_names from %d start %d %s", @@ -421,6 +448,7 @@ offset += sizeof(__be16); memcpy(outbuf + offset, r->res_name, r->res_length); offset += r->res_length; + ls->ls_recover_last_rsb[index] = r; } /* diff -uNr vanilla/fs/dlm/dlm_internal.h v1.0/fs/dlm/dlm_internal.h --- vanilla/fs/dlm/dlm_internal.h 2011-09-29 15:32:00.000000000 +0200 +++ v1.0/fs/dlm/dlm_internal.h 2011-12-22 23:51:00.000000000 +0100 @@ -526,6 +526,7 @@ int ls_recover_list_count; wait_queue_head_t ls_wait_general; struct mutex ls_clear_proc_locks; + struct dlm_rsb **ls_recover_last_rsb; struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ diff -uNr vanilla/fs/dlm/member.c v1.0/fs/dlm/member.c --- vanilla/fs/dlm/member.c 2011-09-29 15:29:00.000000000 +0200 +++ v1.0/fs/dlm/member.c 2011-12-23 19:55:00.000000000 +0100 @@ -128,6 +128,9 @@ kfree(ls->ls_node_array); ls->ls_node_array = NULL; + + kfree(ls->ls_recover_last_rsb); + ls->ls_recover_last_rsb = NULL; list_for_each_entry(memb, &ls->ls_nodes, list) { if (memb->weight) @@ -146,6 +149,11 @@ array = kmalloc(sizeof(int) * total, GFP_NOFS); if (!array) return; + + ls->ls_recover_last_rsb = kcalloc(ls->ls_num_nodes+1, sizeof(struct dlm_rsb *), GFP_NOFS); + + if (!ls->ls_recover_last_rsb) + return; list_for_each_entry(memb, &ls->ls_nodes, list) { if (!all_zero && !memb->weight) -- Ing. Yevheniy Demchenko Senior Linux Administrator UVT s.r.o. 
-- Linux-cluster mailing list Linux-cluster@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/linux-cluster