I would think that the link count should be sufficient. I've attached a Python module that we've used to do some brick-level indexing and cleanup on our system. You could use it to generate a list of the .glusterfs links for the remaining good files on the disk. Then you could check your candidates for removal against that list before removing them.
On Aug 21, 2014 12:05 PM, "Branden Timm" <btimm@xxxxxxxxxxxxxxx> wrote:
We have a distributed volume, and had a rather large (> 50TB) folder that was no longer needed. Naively, we removed the folder from the brick instead of through the Gluster client.
You can probably see where this is going. We didn’t actually reclaim the space because of the hard links to .glusterfs, and now we need to figure out how to clean up.
Is it sufficient to simply check whether a file under .glusterfs has fewer than 2 hard links, something like:
find .glusterfs -type f -links -2 -exec rm {} \;
Or do we have to do something else? Any help is much appreciated.
—Branden
_______________________________________________
Gluster-users mailing list
Gluster-users@xxxxxxxxxxx
http://supercolony.gluster.org/mailman/listinfo/gluster-users
import os
import os.path
import subprocess
from hashlib import md5
from socket import gethostname
from datetime import datetime


class File(object):
    """A regular file on a brick, with lazily computed metadata.

    The constructor resolves *path* with ``os.path.realpath`` and stats it
    once.  Extended attributes, the MD5 digest and the null-byte flags are
    computed on first access and cached on the instance.

    Args:
        path: Path to the file on the local filesystem.
        brick: Optional ``Brick`` the file belongs to; needed for
            ``relpath`` and ``gfidlinkpath``.

    Raises:
        OSError: If the file cannot be stat'ed.
    """

    # Number of bytes read per iteration when scanning file contents.
    _CHUNK_SIZE = 512 * 2 ** 10

    def __init__(self, path, brick=None):
        self.path = os.path.realpath(path)
        self.stat = os.stat(self.path)
        self.brick = brick
        # Cache of attribute name -> raw value (None when absent/empty).
        self.xattrs = dict()
        self._md5 = None
        self._contains_nulls = None
        self._all_nulls = None

    def read_xattr(self, attrname):
        """Return the raw value of extended attribute *attrname*, or None.

        The value is fetched by running ``getfattr`` and cached in
        ``self.xattrs``; later calls for the same name do not spawn a
        process.  None is returned when ``getfattr`` prints nothing
        (attribute missing, unreadable, or empty).
        """
        if attrname in self.xattrs:
            return self.xattrs[attrname]
        # Use -n (exact name) rather than -m (regular expression): a pattern
        # such as "trusted.gfid" would also match any other attribute whose
        # name contains it and concatenate all of their values.
        p = subprocess.Popen(
            ['getfattr', '--only-values', '-n', attrname, self.path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, _stderr = p.communicate()
        # Truthiness works for both str (Python 2) and bytes (Python 3).
        value = stdout if stdout else None
        self.xattrs[attrname] = value
        return value

    def is_sparse(self):
        """Return True if the apparent size exceeds the allocated space."""
        return self.stat.st_size > self.stat.st_blocks * 512

    def is_linkfile(self):
        """Return True if the ``trusted.glusterfs.dht.linkto`` xattr is set."""
        return self.read_xattr('trusted.glusterfs.dht.linkto') is not None

    @property
    def is_null(self):
        """True if the file is non-empty and consists solely of null bytes."""
        if self._all_nulls is None:
            self.__fastallnullcheck()
        return self._all_nulls

    def __fastallnullcheck(self):
        """Decide whether the file holds only null bytes, stopping early.

        The scan stops at the first chunk containing a non-null byte.
        ``_contains_nulls`` is only set when the answer is definitive: a null
        byte was actually seen, or the file is empty.  Otherwise it stays
        unset so that ``contains_nulls`` performs a full read later instead
        of returning a partial result.
        """
        if self.size == 0:
            self._contains_nulls = False
            self._all_nulls = False
            return
        all_nulls = True
        saw_null = False
        with open(self.path, "rb") as f:
            for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                nulls = chunk.count(b'\0')
                if nulls:
                    saw_null = True
                if nulls != len(chunk):
                    all_nulls = False
                    break
        if saw_null:
            self._contains_nulls = True
        # If nothing could be read there were no nulls at all, so the file
        # is not "all nulls" even though no non-null byte was found.
        self._all_nulls = all_nulls and saw_null

    def __readfile(self):
        """Read the whole file once; cache MD5 digest and null-byte flags."""
        h = md5()
        saw_null = False
        saw_data = False
        if self.size > 0:
            with open(self.path, "rb") as f:
                for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                    h.update(chunk)
                    nulls = chunk.count(b'\0')
                    if nulls:
                        saw_null = True
                    if nulls != len(chunk):
                        saw_data = True
        self._contains_nulls = saw_null
        self._all_nulls = saw_null and not saw_data
        self._md5 = h.hexdigest()

    @property
    def contains_nulls(self):
        """True if at least one null byte occurs anywhere in the file."""
        if self._contains_nulls is None:
            self.__readfile()
        return self._contains_nulls

    @property
    def gfid(self):
        """The ``trusted.gfid`` xattr as a lowercase hex string, or None."""
        gfid_raw = self.read_xattr("trusted.gfid")
        if gfid_raw is None:
            return None
        return "".join("%02x" % octet for octet in bytearray(gfid_raw))

    @property
    def gfidlinkpath(self):
        """Path of this file's link under the brick's ``.glusterfs`` tree.

        Returns None when the file has no gfid or no brick was given.
        """
        gfid = self.gfid
        if gfid is None or self.brick is None:
            return None
        dashed = "%s-%s-%s-%s-%s" % (
            gfid[0:8], gfid[8:12], gfid[12:16], gfid[16:20], gfid[20:32])
        return os.path.join(
            self.brick.path, ".glusterfs", gfid[0:2], gfid[2:4], dashed)

    @property
    def linkcount(self):
        """Number of hard links to the file (``st_nlink``)."""
        return self.stat.st_nlink

    @property
    def size(self):
        """Apparent file size in bytes (``st_size``)."""
        return self.stat.st_size

    @property
    def spaceallocated(self):
        """Allocated space in bytes, computed as ``st_blocks * 512``."""
        return self.stat.st_blocks * 512

    @property
    def relpath(self):
        """Path relative to the brick root, or the full path without a brick."""
        if self.brick is None:
            return self.path
        # os.path.relpath stays correct when the brick root is "/" itself,
        # where slicing off len(brick.path) + 1 characters would drop one
        # character too many.
        return os.path.relpath(self.path, self.brick.path)

    @property
    def mtime(self):
        """Modification time as returned by ``os.stat`` (``st_mtime``)."""
        return self.stat.st_mtime

    @property
    def md5sum(self):
        """Hex MD5 digest of the file contents."""
        if self._md5 is None:
            self.__readfile()
        return self._md5

    def __str__(self):
        """Return a one-line summary of the file.

        Fields: gfid, md5, size, link count, a three-character flag string
        (``S`` sparse, ``L`` link file, ``N`` all nulls / ``n`` contains
        nulls, ``-`` when a flag is not set), mtime and path.
        """
        # One full read fills the digest and both null flags; skip it when a
        # previous call already did the work.
        if self._md5 is None:
            self.__readfile()
        sparsechar = 'S' if self.is_sparse() else '-'
        linkchar = 'L' if self.is_linkfile() else '-'
        if self.is_null:
            nullchar = 'N'
        elif self.contains_nulls:
            nullchar = 'n'
        else:
            nullchar = '-'
        mtime = datetime.fromtimestamp(self.mtime)
        return "%s %s %12d %3d %s %s %s" % (
            self.gfid, self.md5sum, self.size, self.linkcount,
            sparsechar + linkchar + nullchar, mtime, self.path)


class Brick(object):
    """A brick directory; iterating it yields a ``File`` per regular file.

    Args:
        brickpath: Path to the brick root; resolved with
            ``os.path.realpath``.
    """

    def __init__(self, brickpath):
        self._brickpath = os.path.realpath(brickpath)

    @property
    def node(self):
        """Local host name up to the first dot."""
        return gethostname().split(".")[0]

    @property
    def device(self):
        """Third element of the brick path split on ``os.sep``.

        For ``/mnt/sdb/brick`` this is ``sdb``.
        NOTE(review): presumably bricks live under ``/<mount>/<device>/...``;
        shallower paths raise IndexError -- confirm against deployments.
        """
        return self.path.split(os.sep)[2]

    @property
    def path(self):
        """Resolved absolute path of the brick root."""
        return self._brickpath

    def __str__(self):
        return "%s\t%s" % (self.node, self.path)

    def __iter__(self):
        """Walk the brick and yield a ``File`` for every non-symlink entry."""
        for root, _dirs, names in os.walk(self.path):
            for name in names:
                full = os.path.join(root, name)
                if not os.path.islink(full):
                    yield File(full, brick=self)
_______________________________________________ Gluster-users mailing list Gluster-users@xxxxxxxxxxx http://supercolony.gluster.org/mailman/listinfo/gluster-users