Avoid running find_rsb_root by storing last recovered rsb address for each node. Makes dlm recovery much faster for FS with large number of files. Signed-off-by: Yevheniy Demchenko <zheka@xxxxxx> --- Current dlm recovery uses small (4096 bytes) buffer to communicate between dlm_copy_master_names and dlm_directory_recovery. This leads to running find_rsb_root N*32/4096 times, where N - number of locks to recover and 32 - DLM_RESNAME_MAXLEN+1. find_rsb_root itself takes N*c to complete, where c is some constant. Eventually, dlm recovery time is proportional to N*N. For an ocfs2 fs with one directory consisting of 300000 small files every mount on other node takes more than 2.5 minutes and umount more than 5 minutes on a fairly modern HW with 10Gb interconnect. During dlm recovery FS is not available on any node. This patch makes mounts and umounts on non-locking-master nodes to take less than a 2 seconds. It is not limited to ocfs2 and might make dlm recovery faster in general (i.e. for gfs2). Test case: 2 node RHCS cluster, OCFS2 with cman cluster stack. /sys/kernel/config/dlm/cluster/{lkbtbl_size,dirtbl_size,rsbtbl_size} = 16384 on both nodes On node 1: #mkfs.ocfs2 --fs-features=backup-super,sparse,inline-data,extended-slotmap,indexed-dirs,refcount,xattr,usrquota,grpquota,unwritten /dev/vg1/test1 #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #mkdir /mnt/temp/test1 #for i in $(seq 1 300000) ; do dd if=/dev/urandom bs=4096 count=1 of=/mnt/temp/test1/$i ; done #umount /mnt/temp #-----leave dlm and destroy locks #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #time (ls -l /mnt/temp/test1 | wc -l ) #-------create 300000 RR locks on node 1 On node 2: #mount /dev/vg1/test1 /mnt/temp -o noatime,nodiratime #--- dlm recovery starts and takes a looooong time if dlm is not patched #umount /mnt/temp #----- even looooooonger, FS is not available on any node while recovery is running After patching, both operations on node2 take less than a 2 seconds. 
For now, the patch tries to detect inconsistencies and reverts to the previous behaviour if there are any.
*/ + log_error(ls, "copy_master_names: rsb cache failed 1: node %d: cached rsb %1.31s, needed rsb %1.31s;", nodeid, + ls->ls_recover_last_rsb[index]->res_name, inbuf); + r = find_rsb_root(ls, inbuf, inlen); + } + } else { + /* Left for safety reasons, we should never get here */ + r = find_rsb_root(ls, inbuf, inlen); + log_error(ls, "copy_master_names: rsb cache failed 2: ,searching for %1.31s, node %d", inbuf, nodeid); + } if (!r) { inbuf[inlen - 1] = '\0'; log_error(ls, "copy_master_names from %d start %d %s", @@ -421,6 +448,7 @@ offset += sizeof(__be16); memcpy(outbuf + offset, r->res_name, r->res_length); offset += r->res_length; + ls->ls_recover_last_rsb[index] = r; } /* diff -uNr vanilla/fs/dlm/dlm_internal.h v1.0/fs/dlm/dlm_internal.h --- vanilla/fs/dlm/dlm_internal.h 2011-09-29 15:32:00.000000000 +0200 +++ v1.0/fs/dlm/dlm_internal.h 2011-12-22 23:51:00.000000000 +0100 @@ -526,6 +526,7 @@ int ls_recover_list_count; wait_queue_head_t ls_wait_general; struct mutex ls_clear_proc_locks; + struct dlm_rsb **ls_recover_last_rsb; struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ diff -uNr vanilla/fs/dlm/member.c v1.0/fs/dlm/member.c --- vanilla/fs/dlm/member.c 2011-09-29 15:29:00.000000000 +0200 +++ v1.0/fs/dlm/member.c 2011-12-23 19:55:00.000000000 +0100 @@ -128,6 +128,9 @@ kfree(ls->ls_node_array); ls->ls_node_array = NULL; + + kfree(ls->ls_recover_last_rsb); + ls->ls_recover_last_rsb = NULL; list_for_each_entry(memb, &ls->ls_nodes, list) { if (memb->weight) @@ -146,6 +149,11 @@ array = kmalloc(sizeof(int) * total, GFP_NOFS); if (!array) return; + + ls->ls_recover_last_rsb = kcalloc(ls->ls_num_nodes+1, sizeof(struct dlm_rsb *), GFP_NOFS); + + if (!ls->ls_recover_last_rsb) + return; list_for_each_entry(memb, &ls->ls_nodes, list) { if (!all_zero && !memb->weight) -- Ing. Yevheniy Demchenko Senior Linux Administrator UVT s.r.o. 
-- Linux-cluster mailing list Linux-cluster@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/linux-cluster