Hello,

I have been playing with the replicate translator on GlusterFS 3.1.4, with high availability in mind. It works fine: if I kill one server, the client uses the other one, and vice versa. Self-healing works fine too, which is amazing.

However, things turn bad when a server dies while a client holds a lock on a file. After such an event the file can no longer be written, even once all servers are back online; I have to delete it to clear the situation.

Is this a feature missing from the replication protocol? Is it just missing from the implementation? Or is it a bug?

Here is the client volume file:

volume silo1
  type protocol/client
  option transport-type tcp
  option transport.address-family inet
  option remote-host silo.example.net
  option remote-subvolume wd1lock
end-volume

volume hangar1
  type protocol/client
  option transport-type tcp
  option transport.address-family inet
  option remote-host hangar.example.net
  option remote-subvolume wd1lock
end-volume

volume replicate
  type cluster/replicate
  subvolumes silo1 hangar1
  option data-change-log on
  option metadata-change-log on
  option entry-change-log on
end-volume

On the server side:

volume wd1
  type storage/posix
  option directory /export/wd1a
end-volume

volume wd1lock
  type features/locks
  subvolumes wd1
end-volume

volume silo
  type protocol/server
  option transport-type tcp
  option transport.address-family inet
  subvolumes wd1lock
  option auth.addr.wd1lock.allow * # Allow access to "brick" volume
  option client-volume-filename /usr/pkg/etc/glusterfs/afr.vol
end-volume

-- 
Emmanuel Dreyfus
manu@xxxxxxxxxx
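
P.S. For anyone wanting to reproduce: the features/locks translator provides POSIX fcntl-style record locks, so a client holding a lock can be simulated with any fcntl locking program. This is only a sketch against a local file (the temporary path is a stand-in for a file on the mounted replicate volume); the failure itself of course needs one of the servers killed while the lock is held.

    import fcntl
    import os
    import tempfile

    # Hypothetical stand-in for a file on the mounted replicate volume.
    path = os.path.join(tempfile.mkdtemp(), "locked.dat")
    fd = os.open(path, os.O_CREAT | os.O_RDWR, 0o644)

    # Take an exclusive POSIX record lock, as the glusterfs client does
    # through the features/locks translator on each replica.
    fcntl.lockf(fd, fcntl.LOCK_EX)
    os.write(fd, b"payload")

    # Killing one server at this point is what leaves the file
    # unwritable for me, even after both servers are back.

    fcntl.lockf(fd, fcntl.LOCK_UN)
    os.close(fd)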