Re: Removing orphaned hard links under .glusterfs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I would think that the link count should be sufficient.  I've attached a Python module that we've used to do some brick-level indexing and cleanup on our system.  You could use it to generate a list of the .glusterfs links for the remaining good files on the disk.  Then you could check your candidates for removal against that list before removing them.

On Aug 21, 2014 12:05 PM, "Branden Timm" <btimm@xxxxxxxxxxxxxxx> wrote:
We have a distributed volume, and had a rather large (> 50TB) folder that was no longer needed.  Naively, we removed the folder from the brick instead of through the Gluster client.

You can probably see where this is going.  We didn’t actually reclaim the space because of the hard links to .glusterfs, and now we need to figure out how to clean up.

Is it sufficient to simply check whether a file under .glusterfs has fewer than 2 hard links, something like:

find .glusterfs -type f -links -2 -exec rm {} \;

Or do we have to do something else?  Any help is much appreciated.

—Branden

_______________________________________________
Gluster-users mailing list
Gluster-users@xxxxxxxxxxx
http://supercolony.gluster.org/mailman/listinfo/gluster-users
import os
import os.path
import subprocess
from hashlib import md5
from socket import gethostname
from datetime import datetime

class File:
    """A file on a Gluster brick with lazily computed, cached metadata.

    The path is resolved with ``os.path.realpath`` and stat'ed once at
    construction time.  Content-derived values (MD5 digest, null-byte
    flags) and extended attributes are computed on first access and then
    cached on the instance.

    Attributes:
        path:   resolved absolute path of the file.
        stat:   ``os.stat`` result captured when the object was created.
        brick:  optional object exposing a ``path`` attribute (the brick
                root); used by ``relpath`` and ``gfidlinkpath``.
        xattrs: cache of extended attribute values, keyed by name; a
                value of ``None`` means the attribute had no value.
    """

    # Number of bytes read per chunk when scanning file contents (512 KiB).
    _CHUNK_SIZE = 512 * 2 ** 10

    def __init__(self, path, brick=None):
        """Resolve *path*, stat it, and initialise the empty caches.

        Raises:
            OSError: if the resolved path cannot be stat'ed.
        """
        self.path = os.path.realpath(path)
        self.stat = os.stat(self.path)
        self.brick = brick
        self.xattrs = dict()
        self._md5 = None
        self._contains_nulls = None
        self._all_nulls = None

    def read_xattr(self, attrname):
        """Return the raw value of extended attribute *attrname*, or None.

        The value is fetched by running the external ``getfattr`` command
        and is cached in ``self.xattrs``.  ``None`` is returned (and
        cached) when the command produces no output, e.g. because the
        attribute is not set.

        Raises:
            OSError: if the ``getfattr`` executable cannot be started.
        """
        if attrname in self.xattrs:
            return self.xattrs[attrname]

        # '-n' looks up exactly this attribute.  The '-m' option takes a
        # regular expression, so a name such as 'trusted.gfid' could also
        # match other attributes and concatenate their values.
        proc = subprocess.Popen(
            ['getfattr', '--only-values', '-n', attrname, self.path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, _stderr = proc.communicate()

        # Truthiness works for both str (Python 2) and bytes (Python 3);
        # comparing against '' never matches bytes.
        value = stdout if stdout else None

        self.xattrs[attrname] = value
        return value

    def is_sparse(self):
        """Return True when the file size exceeds its allocated space."""
        return self.size > self.spaceallocated

    def is_linkfile(self):
        """Return True when 'trusted.glusterfs.dht.linkto' has a value."""
        return self.read_xattr('trusted.glusterfs.dht.linkto') is not None

    @property
    def is_null(self):
        """True when the file is non-empty and consists only of NUL bytes."""
        if self._all_nulls is None:
            self.__fastallnullcheck()
        return self._all_nulls

    def __fastallnullcheck(self):
        """
        Tests to see if a file only contains nulls, exits as soon as a non
        null byte is found.  This can also be done with __readfile, though
        it will read the full file.

        Only ``_all_nulls`` is always set.  ``_contains_nulls`` is set only
        when the answer is definitive (empty file, or a NUL byte was seen);
        otherwise it is left untouched so that ``contains_nulls`` still
        performs a full read instead of returning a partial result.
        """
        if self.size == 0:
            # An empty file has no bytes at all: neither flag applies.
            self._contains_nulls = False
            self._all_nulls = False
            return

        all_nulls = True
        saw_null = False
        with open(self.path, "rb") as f:
            for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                nulls = chunk.count(b'\0')
                if nulls:
                    saw_null = True
                if nulls != len(chunk):
                    # At least one non-NUL byte: no need to read further.
                    all_nulls = False
                    break

        if saw_null:
            self._contains_nulls = True
        self._all_nulls = all_nulls

    def __readfile(self):
        """Read the whole file once, filling the MD5 and null-byte caches."""
        digest = md5()
        saw_null = False
        all_nulls = False

        if self.size > 0:
            all_nulls = True
            with open(self.path, "rb") as f:
                for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                    digest.update(chunk)
                    nulls = chunk.count(b'\0')
                    if nulls:
                        saw_null = True
                    if nulls != len(chunk):
                        all_nulls = False

        self._contains_nulls = saw_null
        self._all_nulls = all_nulls
        self._md5 = digest.hexdigest()

    @property
    def contains_nulls(self):
        """True when the file contains at least one NUL byte."""
        if self._contains_nulls is None:
            self.__readfile()
        return self._contains_nulls

    @property
    def gfid(self):
        """The 'trusted.gfid' xattr as a lowercase hex string, or None."""
        gfid_raw = self.read_xattr("trusted.gfid")
        if gfid_raw is None:
            return None
        # bytearray() yields integers on both Python 2 and Python 3.
        return "".join("%02x" % octet for octet in bytearray(gfid_raw))

    @property
    def gfidlinkpath(self):
        """Path of this file's entry under the brick's .glusterfs directory.

        The path has the form
        ``<brick>/.glusterfs/<aa>/<bb>/<8>-<4>-<4>-<4>-<12>`` where the
        pieces are slices of the hex gfid.  Returns None when the gfid is
        unavailable or no brick was supplied.
        """
        gfid = self.gfid
        if gfid is None or self.brick is None:
            return None

        dashed = "%s-%s-%s-%s-%s" % (
            gfid[0:8], gfid[8:12], gfid[12:16], gfid[16:20], gfid[20:32])
        relpath = os.path.join(".glusterfs", gfid[0:2], gfid[2:4], dashed)

        return os.path.join(self.brick.path, relpath)

    @property
    def linkcount(self):
        """Number of hard links (``st_nlink``)."""
        return self.stat.st_nlink

    @property
    def size(self):
        """File size in bytes (``st_size``)."""
        return self.stat.st_size

    @property
    def spaceallocated(self):
        """Allocated space in bytes, computed as ``st_blocks * 512``."""
        return self.stat.st_blocks * 512

    @property
    def relpath(self):
        """Path with the brick path and one separator character removed.

        Falls back to the full path when no brick was supplied.
        """
        if self.brick is not None:
            return self.path[len(self.brick.path) + 1:]
        return self.path

    @property
    def mtime(self):
        """ Return mtime, defaults to float, I'm okay with that"""
        return self.stat.st_mtime

    @property
    def md5sum(self):
        """Hex MD5 digest of the file contents."""
        if self._md5 is None:
            self.__readfile()
        return self._md5

    def __str__(self):
        """Return a one-line summary of the file.

        Fields: gfid, md5, size, link count, three flag characters
        (S = sparse, L = link file, N = all nulls / n = contains nulls),
        mtime and path.
        """
        # Ask for the digest first: it performs the single full read that
        # also fills the null-byte caches used below.
        digest = self.md5sum

        sparsechar = 'S' if self.is_sparse() else '-'
        linkchar = 'L' if self.is_linkfile() else '-'

        if self.is_null:
            nullchar = 'N'
        elif self.contains_nulls:
            nullchar = 'n'
        else:
            nullchar = '-'

        mtime = datetime.fromtimestamp(self.mtime)

        return "%s %s %12d %3d %s %s %s" % (self.gfid,digest,self.size,self.linkcount,sparsechar + linkchar + nullchar,mtime,self.path)

class Brick:
    """A brick directory on the local host that can be walked for files.

    Iterating a Brick yields a ``File`` object for every non-symlink file
    entry found beneath the brick path.
    """

    def __init__(self, brickpath):
        """Remember the resolved (``os.path.realpath``) brick path."""
        self._brickpath = os.path.realpath(brickpath)

    @property
    def node(self):
        """Local host name truncated at the first dot."""
        short_name, _, _ = gethostname().partition(".")
        return short_name

    @property
    def device(self):
        """Element at index 2 of the brick path split on ``os.sep``.

        NOTE(review): presumably the device/mount directory name for
        paths shaped like /<mountroot>/<device>/... -- confirm with the
        deployment layout.  Raises IndexError for shallower paths.
        """
        components = self.path.split(os.sep)
        return components[2]

    @property
    def path(self):
        """Resolved absolute path of the brick."""
        return self._brickpath

    def __str__(self):
        """Return the node name and brick path separated by a tab."""
        return "\t".join((self.node, self.path))

    def __iter__(self):
        """Yield a File for each non-symlink file entry under the brick."""
        for dirpath, _subdirs, names in os.walk(self.path):
            for name in names:
                candidate = os.path.join(dirpath, name)
                if os.path.islink(candidate):
                    continue
                yield File(candidate, brick=self)
_______________________________________________
Gluster-users mailing list
Gluster-users@xxxxxxxxxxx
http://supercolony.gluster.org/mailman/listinfo/gluster-users

[Index of Archives]     [Gluster Development]     [Linux Filesystems Development]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux