Hello all,

We are trying to run some tests on a cache-tier Ceph cluster, but we are
encountering serious problems which eventually leave the cluster unusable.
We are apparently doing something wrong, but we have no idea what it could
be. We'd really appreciate it if someone could point out what we should do.

We are running Ceph version:

    ceph version 0.87.1 (283c2e7cfa2457799f534744d7d549f83ea1335e)

All nodes are Ubuntu 14.04.2 LTS (GNU/Linux 3.16.0-34-generic x86_64).

Our test cluster is:

* disk-host-1: monitor, with 128 GB RAM
* disk-brick-3, disk-brick-4, disk-brick-5: each node has:
  - 32 GB RAM
  - /dev/sda and /dev/sdb: 2 TB spinning HDDs
  - /dev/sdu: 400 GB SSD
* disk-host-5: client, with 128 GB RAM

Please find the ceph.conf file and the decompiled CRUSH map at the end of
this e-mail.

This is what we do:

(1) Create a pool named "cache_pool":

      sudo ceph osd pool create cache_pool 32 32
      sudo ceph osd pool set cache_pool crush_ruleset 4

(2) Increase the PGs of the default "rbd" pool before putting anything in it:

      sudo ceph osd pool set rbd pg_num 256
      sudo ceph osd pool set rbd pgp_num 256

(3) Create the cache tier, placing the new "cache_pool" over "rbd":

      sudo ceph osd tier add rbd cache_pool
      sudo ceph osd tier cache-mode cache_pool writeback
      sudo ceph osd tier set-overlay rbd cache_pool

(4) Configure some parameters for "cache_pool":

      sudo ceph osd pool set cache_pool hit_set_type bloom
      sudo ceph osd pool set cache_pool hit_set_count 1
      sudo ceph osd pool set cache_pool hit_set_period 300
      sudo ceph osd pool set cache_pool cache_min_flush_age 300
      sudo ceph osd pool set cache_pool cache_min_evict_age 300
      sudo ceph osd pool set cache_pool target_max_bytes 0
      sudo ceph osd pool set cache_pool target_max_objects 0
      sudo ceph osd pool set cache_pool cache_target_dirty_ratio .4
      sudo ceph osd pool set cache_pool cache_target_full_ratio .8

(5) Create a 2 TB RBD image to run our tests on:

      sudo rbd create fiobench --size 2048000

(6) On the client (disk-host-5), map and mount the image:

      sudo rbd map --image fiobench    # the result is /dev/rbd0
      mkfs.xfs /dev/rbd0
      mkdir /mnt/fio
      mount /dev/rbd0 /mnt/fio

(7) Run the fio tests (http://packages.ubuntu.com/trusty/fio) on the client.
    Please find the fiobench.sh script at the end of this e-mail with all
    the details. Before taking its measurements, fio creates 64 files of
    30 GB each on the /mnt/fio filesystem (built on top of the RBD image).

Creating the files works OK, and the benchmark begins. After a while,
however, the benchmark stalls: the read and write tests complete, but the
random read tests just hang. Inspecting the cluster, we see that one OSD
in cache_pool has become full and that Ceph has marked it down.

From that point on, it is not possible to resume the benchmark, and we are
not able to get the cluster back to HEALTH_OK.

Any ideas will be very much appreciated. Thank you very much for your time
and your help.
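One more piece of context that may be relevant: a rough, back-of-the-envelope
comparison of the benchmark's working set with the capacity of the cache tier.
The sketch below only uses the hardware figures and fio parameters listed
above, and it assumes cache_pool keeps the default size of 3 replicas (we have
not changed it):

      #!/bin/sh
      # Rough sizes in GB, taken from the hardware list and fiobench.sh above.
      fio_working_set=$((64 * 30))      # 64 fio jobs x 30 GB files = 1920 GB
      cache_raw=$((3 * 400))            # 3 SSD OSDs x 400 GB       = 1200 GB raw
      cache_usable=$((cache_raw / 3))   # assuming 3 replicas       =  400 GB usable
      echo "benchmark working set: ${fio_working_set} GB"
      echo "cache tier capacity:   ~${cache_usable} GB"

So the data fio writes is roughly five times what the cache tier can hold.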
Best regards,

- Xavier Serrano
- LCAC, Laboratori de Càlcul
- Departament d'Arquitectura de Computadors
- UPC, Universitat Politècnica de Catalunya, BarcelonaTECH


The /etc/ceph/ceph.conf file is:

[global]
fsid = 726babd1-c7df-4fed-8b5f-c5a70d35c4a0
mon_initial_members = disk-host-1
mon_host = 192.168.31.65
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
filestore_xattr_use_omap = true
public_network = 192.168.31.0/24


The CRUSH map looks like this:

# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1

# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
device 6 osd.6
device 7 osd.7
device 8 osd.8

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host disk-brick-4 {
        id -2           # do not change unnecessarily
        # weight 3.980
        alg straw
        hash 0          # rjenkins1
        item osd.0 weight 1.810
        item osd.3 weight 1.810
        item osd.6 weight 0.360
}
host disk-brick-5 {
        id -3           # do not change unnecessarily
        # weight 3.980
        alg straw
        hash 0          # rjenkins1
        item osd.1 weight 1.810
        item osd.4 weight 1.810
        item osd.7 weight 0.360
}
host disk-brick-6 {
        id -4           # do not change unnecessarily
        # weight 3.980
        alg straw
        hash 0          # rjenkins1
        item osd.2 weight 1.810
        item osd.5 weight 1.810
        item osd.8 weight 0.360
}
root default {
        id -1           # do not change unnecessarily
        # weight 11.940
        alg straw
        hash 0          # rjenkins1
        item osd.0 weight 1.810
        item osd.3 weight 1.810
        item osd.1 weight 1.810
        item osd.4 weight 1.810
        item osd.2 weight 1.810
        item osd.5 weight 1.810
}
root cache {
        id -5
        alg straw
        hash 0
        item osd.6 weight 0.360
        item osd.7 weight 0.360
        item osd.8 weight 0.360
}

# rules
rule replicated_ruleset {
        ruleset 0
        type replicated
        min_size 1
        max_size 10
        step take default
        step chooseleaf firstn 0 type osd
        step emit
}
rule cache_pool {
        ruleset 4
        type replicated
        min_size 1
        max_size 10
        step take cache
        step chooseleaf firstn 0 type osd
        step emit
}
# end crush map


This is the fiobench.sh script:

#!/bin/sh

Directory=/mnt/fio
RunTime=1800
Size=30G
NumJobs=64

for Action in read write randread randwrite
do
    for BlockSize in 4k 128k 8m
    do
        OutputFile=~/fio-${Action}-${BlockSize}-${RunTime}-${Size}-${NumJobs}.txt
        fio --directory=$Directory --direct=1 \
            --rw=$Action --bs=$BlockSize --size=$Size --numjobs=$NumJobs \
            --runtime=$RunTime --group_reporting --name=testfile \
            --output=$OutputFile
    done
done
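P.S. In case it helps, these are the standard status commands we run to
inspect the cluster when the benchmark stalls (nothing beyond the usual
built-in reporting); we can send their full output if that would be useful:

      sudo ceph -s                # overall cluster status
      sudo ceph health detail     # which OSDs/PGs are behind the warnings
      sudo ceph osd tree          # OSD up/down state and CRUSH placement
      sudo ceph df                # per-pool usage
      sudo rados df               # per-pool object counts and usage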