Re: volume not working after yum update - gluster 3.6.3

On Mon, 2015-08-10 at 22:22 +0530, Atin Mukherjee wrote:
[snip]
> strace output claims the command exited successfully. Are you sure ls got hung?

Not sure, but this one definitely hung. 'mkdir("test", 0777' was the last line of output, and it's been stuck here for about 7 minutes now:

[root@voicemail1b-1 14391.broken]# strace mkdir test
execve("/usr/bin/mkdir", ["mkdir", "test"], [/* 27 vars */]) = 0
brk(0)                                  = 0x8db000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a89000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=31874, ...}) = 0
mmap(NULL, 31874, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f3468a81000
close(3)                                = 0
open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=147120, ...}) = 0
mmap(NULL, 2246784, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468644000
mprotect(0x7f3468665000, 2097152, PROT_NONE) = 0
mmap(0x7f3468865000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x21000) = 0x7f3468865000
mmap(0x7f3468867000, 6272, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3468867000
close(3)                                = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\34\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2107760, ...}) = 0
mmap(NULL, 3932736, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468283000
mprotect(0x7f3468439000, 2097152, PROT_NONE) = 0
mmap(0x7f3468639000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b6000) = 0x7f3468639000
mmap(0x7f346863f000, 16960, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f346863f000
close(3)                                = 0
open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360\25\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=398272, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a80000
mmap(NULL, 2490888, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3468022000
mprotect(0x7f3468081000, 2097152, PROT_NONE) = 0
mmap(0x7f3468281000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x5f000) = 0x7f3468281000
close(3)                                = 0
open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000/\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=153184, ...}) = 0
mmap(NULL, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467dfd000
mprotect(0x7f3467e21000, 2093056, PROT_NONE) = 0
mmap(0x7f3468020000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x7f3468020000
close(3)                                = 0
open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=19512, ...}) = 0
mmap(NULL, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3467bf9000
mprotect(0x7f3467bfc000, 2093056, PROT_NONE) = 0
mmap(0x7f3467dfb000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7f3467dfb000
close(3)                                = 0
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240l\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=141616, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7f000
mmap(NULL, 2208864, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f34679dd000
mprotect(0x7f34679f3000, 2097152, PROT_NONE) = 0
mmap(0x7f3467bf3000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x16000) = 0x7f3467bf3000
mmap(0x7f3467bf5000, 13408, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3467bf5000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7e000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3468a7c000
arch_prctl(ARCH_SET_FS, 0x7f3468a7c800) = 0
mprotect(0x7f3468639000, 16384, PROT_READ) = 0
mprotect(0x7f3467bf3000, 4096, PROT_READ) = 0
mprotect(0x7f3467dfb000, 4096, PROT_READ) = 0
mprotect(0x7f3468020000, 4096, PROT_READ) = 0
mprotect(0x7f3468281000, 4096, PROT_READ) = 0
mprotect(0x7f3468865000, 4096, PROT_READ) = 0
mprotect(0x611000, 4096, PROT_READ)     = 0
mprotect(0x7f3468a8a000, 4096, PROT_READ) = 0
munmap(0x7f3468a81000, 31874)           = 0
set_tid_address(0x7f3468a7cad0)         = 24942
set_robust_list(0x7f3468a7cae0, 24)     = 0
rt_sigaction(SIGRTMIN, {0x7f34679e3780, [], SA_RESTORER|SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x7f34679e3810, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x7f34679ec130}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
statfs("/sys/fs/selinux", {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
stat("/sys/fs/selinux", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0
brk(0)                                  = 0x8db000
brk(0x8fc000)                           = 0x8fc000
mkdir("test", 0777 

>> > Then ... do I need to run something on one of the bricks while strace is running?
>> >
>> > Cheers,
>> > Kingsley.
>> >
>> >
>> > > >
>> > > > [root@gluster1b-1 ~]# gluster volume heal callrec info
>> > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
>> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
>> > > > /recordings/834723/14391 - Possibly undergoing heal
>> > > >
>> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
>> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
>> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
>> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
>> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
>> > > > Number of entries: 7
>> > > >
>> > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
>> > > > Number of entries: 0
>> > > >
>> > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
>> > > > <gfid:e280b40c-d8b7-43c5-9da7-4737054d7a7f>
>> > > > <gfid:164f888f-2049-49e6-ad26-c758ee091863>
>> > > > <gfid:650efeca-b45c-413b-acc3-f0a5853ccebd>
>> > > > <gfid:b1fbda4a-732f-4f5d-b5a1-8355d786073e>
>> > > > /recordings/834723/14391 - Possibly undergoing heal
>> > > >
>> > > > <gfid:edb74524-b4b7-4190-85e7-4aad002f6e7c>
>> > > > <gfid:9b8b8446-1e27-4113-93c2-6727b1f457eb>
>> > > > Number of entries: 7
>> > > >
>> > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
>> > > > Number of entries: 0
>> > > >
>> > > >
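As an aside, the gfid-only entries above can be mapped back to real paths on a brick via the .glusterfs directory. A rough sketch I could run on one of the bricks (assuming the usual .glusterfs/<xx>/<yy>/<gfid> hard-link layout for regular files; for directories that entry is a symlink, so readlink would show the parent instead):

  BRICK=/data/brick/callrec
  GFID=164f888f-2049-49e6-ad26-c758ee091863
  # print every path on the brick that shares an inode with the gfid hard link
  find "$BRICK" -path "$BRICK/.glusterfs" -prune -o \
       -samefile "$BRICK/.glusterfs/${GFID:0:2}/${GFID:2:2}/$GFID" -print
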
>> > > > If I query each brick directly for the number of files/directories
>> > > > within that, I get 1731 on gluster1a-1 and gluster2a-1, but 1737 on
>> > > > the other two, using this command:
>> > > >
>> > > > # find /data/brick/callrec/recordings/834723/14391 -print | wc -l
>> > > >
>> > > > Cheers,
>> > > > Kingsley.
>> > > >
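Regarding the per-brick counts above (1731 vs 1737), I can also pull the listing from each brick and diff them to see exactly which entries differ. A quick sketch, assuming root ssh access from here to each brick host:

  for h in gluster1a-1 gluster1b-1 gluster2a-1 gluster2b-1; do
      ssh "$h" 'cd /data/brick/callrec/recordings/834723/14391 && find . | sort' \
          > "/tmp/$h.list"
  done
  diff /tmp/gluster1a-1.list /tmp/gluster1b-1.list    # repeat for the other pairs
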
>> > > > On Mon, 2015-08-10 at 11:05 +0100, Kingsley wrote:
>> > > > > Sorry for the blind panic - restarting the volume seems to have
>> > > > > fixed it.
>> > > > >
>> > > > > But then my next question - why is this necessary? Surely it
>> > > > > undermines the whole point of a high availability system?
>> > > > >
>> > > > > Cheers,
>> > > > > Kingsley.
>> > > > >
>> > > > > On Mon, 2015-08-10 at 10:53 +0100, Kingsley wrote:
>> > > > > > Hi,
>> > > > > >
>> > > > > > We have a 4-way replicated volume using gluster 3.6.3 on CentOS 7.
>> > > > > >
>> > > > > > Over the weekend I did a yum update on each of the bricks in turn,
>> > > > > > but now when clients (using fuse mounts) try to access the volume,
>> > > > > > it hangs. Gluster itself wasn't updated (we've disabled that repo
>> > > > > > so that we keep to 3.6.3 for now).
>> > > > > > This was what I did:
>> > > > > >
>> > > > > >       * on first brick, "yum update"
>> > > > > >       * reboot brick
>> > > > > >       * watch "gluster volume status" on another brick and wait
>> > > > > >         for it to say all 4 bricks are online before proceeding
>> > > > > >         to update the next brick
>> > > > > >
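For what it's worth, that "wait until all 4 bricks are online" step was done by eye; a rough scripted equivalent would be something like the loop below (and in hindsight it would probably have been worth waiting for the heal queue to drain as well before moving on):

  # poll "gluster volume status" until all 4 brick processes report online
  until [ "$(gluster volume status callrec \
             | awk '/^Brick/ && $(NF-1) == "Y"' | wc -l)" -eq 4 ]
  do
      sleep 10
  done
  gluster volume heal callrec info   # ideally wait for 0 entries on every brick too
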
>> > > > > > I was expecting the clients might pause 30 seconds while they
>> > > > > > notice a brick is offline, but then recover.
>> > > > > >
>> > > > > > I've tried re-mounting clients, but that hasn't helped.
>> > > > > >
>> > > > > > I can't see much data in any of the log files.
>> > > > > >
>> > > > > > I've tried "gluster volume heal callrec" but it doesn't seem to
>> > > > > > have helped.
>> > > > > >
>> > > > > > What shall I do next?
>> > > > > >
>> > > > > > I've pasted some stuff below in case any of it helps.
>> > > > > >
>> > > > > > Cheers,
>> > > > > > Kingsley.
>> > > > > >
>> > > > > > [root@gluster1b-1 ~]# gluster volume info callrec
>> > > > > >
>> > > > > > Volume Name: callrec
>> > > > > > Type: Replicate
>> > > > > > Volume ID: a39830b7-eddb-4061-b381-39411274131a
>> > > > > > Status: Started
>> > > > > > Number of Bricks: 1 x 4 = 4
>> > > > > > Transport-type: tcp
>> > > > > > Bricks:
>> > > > > > Brick1: gluster1a-1:/data/brick/callrec
>> > > > > > Brick2: gluster1b-1:/data/brick/callrec
>> > > > > > Brick3: gluster2a-1:/data/brick/callrec
>> > > > > > Brick4: gluster2b-1:/data/brick/callrec
>> > > > > > Options Reconfigured:
>> > > > > > performance.flush-behind: off
>> > > > > > [root@gluster1b-1 ~]#
>> > > > > >
>> > > > > >
>> > > > > > [root@gluster1b-1 ~]# gluster volume status callrec
>> > > > > > Status of volume: callrec
>> > > > > > Gluster process                                  Port    Online  Pid
>> > > > > > ------------------------------------------------------------------------------
>> > > > > > Brick gluster1a-1:/data/brick/callrec            49153   Y       6803
>> > > > > > Brick gluster1b-1:/data/brick/callrec            49153   Y       2614
>> > > > > > Brick gluster2a-1:/data/brick/callrec            49153   Y       2645
>> > > > > > Brick gluster2b-1:/data/brick/callrec            49153   Y       4325
>> > > > > > NFS Server on localhost                          2049    Y       2769
>> > > > > > Self-heal Daemon on localhost                    N/A     Y       2789
>> > > > > > NFS Server on gluster2a-1                        2049    Y       2857
>> > > > > > Self-heal Daemon on gluster2a-1                  N/A     Y       2814
>> > > > > > NFS Server on 88.151.41.100                      2049    Y       6833
>> > > > > > Self-heal Daemon on 88.151.41.100                N/A     Y       6824
>> > > > > > NFS Server on gluster2b-1                        2049    Y       4428
>> > > > > > Self-heal Daemon on gluster2b-1                  N/A     Y       4387
>> > > > > >
>> > > > > > Task Status of Volume callrec
>> > > > > > ------------------------------------------------------------------------------
>> > > > > > There are no active volume tasks
>> > > > > >
>> > > > > > [root@gluster1b-1 ~]#
>> > > > > >
>> > > > > >
>> > > > > > [root@gluster1b-1 ~]# gluster volume heal callrec info
>> > > > > > Brick gluster1a-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > /to_process - Possibly undergoing heal
>> > > > > >
>> > > > > > Number of entries: 1
>> > > > > >
>> > > > > > Brick gluster1b-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > Number of entries: 0
>> > > > > >
>> > > > > > Brick gluster2a-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > /to_process - Possibly undergoing heal
>> > > > > >
>> > > > > > Number of entries: 1
>> > > > > >
>> > > > > > Brick gluster2b-1.dns99.co.uk:/data/brick/callrec/
>> > > > > > Number of entries: 0
>> > > > > >
>> > > > > > [root@gluster1b-1 ~]#
>> > > > > >
>> > > > > >


_______________________________________________
Gluster-users mailing list
Gluster-users@xxxxxxxxxxx
http://www.gluster.org/mailman/listinfo/gluster-users
