On the RBD performance issue, you may want to look at:
http://tracker.ceph.com/issues/9192

Eric

On Tue, Oct 27, 2015 at 8:59 PM, FaHui Lin <fahui.lin@xxxxxxxxxx> wrote:
> Dear Ceph experts,
>
> I found something strange about the performance of my Ceph cluster: read-out
> is much slower than write-in.
>
> I have 3 machines running OSDs; each has 8 OSDs, one per RAID0 volume (each
> made up of 2 HDDs). The OSD journal and data are on the same device. All
> machines in the cluster have 10Gb networking.
>
> I tested both Ceph RBD and CephFS, with the client on another machine outside
> the cluster or on one of the OSD nodes (to rule out possible network issues),
> and so on. All of these gave similar results: write-in can almost reach the
> network limit, say 1200 MB/s, while read-out is only 350~450 MB/s.
>
> To try to figure this out, I did an extra test using CephFS.
>
> Version and config:
> [root@dl-disk1 ~]# ceph --version
> ceph version 0.94.3 (95cefea9fd9ab740263bf8bb4796fd864d9afe2b)
> [root@dl-disk1 ~]# cat /etc/ceph/ceph.conf
> [global]
> fsid = (hidden)
> mon_initial_members = dl-disk1, dl-disk2, dl-disk3
> mon_host = (hidden)
> auth_cluster_required = cephx
> auth_service_required = cephx
> auth_client_required = cephx
> filestore_xattr_use_omap = true
>
> OSD tree:
> # ceph osd tree
> ID  WEIGHT    TYPE NAME          UP/DOWN REWEIGHT PRIMARY-AFFINITY
> -1 258.88000 root default
> -2  87.28000     host dl-disk1
>  0  10.90999         osd.0           up  1.00000          1.00000
>  1  10.90999         osd.1           up  1.00000          1.00000
>  2  10.90999         osd.2           up  1.00000          1.00000
>  3  10.90999         osd.3           up  1.00000          1.00000
>  4  10.90999         osd.4           up  1.00000          1.00000
>  5  10.90999         osd.5           up  1.00000          1.00000
>  6  10.90999         osd.6           up  1.00000          1.00000
>  7  10.90999         osd.7           up  1.00000          1.00000
> -3  87.28000     host dl-disk2
>  8  10.90999         osd.8           up  1.00000          1.00000
>  9  10.90999         osd.9           up  1.00000          1.00000
> 10  10.90999         osd.10          up  1.00000          1.00000
> 11  10.90999         osd.11          up  1.00000          1.00000
> 12  10.90999         osd.12          up  1.00000          1.00000
> 13  10.90999         osd.13          up  1.00000          1.00000
> 14  10.90999         osd.14          up  1.00000          1.00000
> 15  10.90999         osd.15          up  1.00000          1.00000
> -4  84.31999     host dl-disk3
> 16  10.53999         osd.16          up  1.00000          1.00000
> 17  10.53999         osd.17          up  1.00000          1.00000
> 18  10.53999         osd.18          up  1.00000          1.00000
> 19  10.53999         osd.19          up  1.00000          1.00000
> 20  10.53999         osd.20          up  1.00000          1.00000
> 21  10.53999         osd.21          up  1.00000          1.00000
> 22  10.53999         osd.22          up  1.00000          1.00000
> 23  10.53999         osd.23          up  1.00000          1.00000
>
> Pools and PGs (each pool has 128 PGs):
> # ceph osd lspools
> 0 rbd,2 fs_meta,3 fs_data0,4 fs_data1,
> # ceph pg dump pools
> dumped pools in format plain
> pg_stat  objects  mip  degr  misp  unf  bytes        log    disklog
> pool 0         0    0     0     0    0            0      0        0
> pool 2        20    0     0     0    0       356958    264      264
> pool 3      3264    0     0     0    0  16106127360  14657    14657
> pool 4         0    0     0     0    0            0      0        0
>
> To simplify the problem, I made a new CRUSH rule so that the CephFS data pool
> uses OSDs on only one machine (dl-disk1 here), with size = 1:
> # ceph osd crush rule dump osd_in_dl-disk1__ruleset
> {
>     "rule_id": 1,
>     "rule_name": "osd_in_dl-disk1__ruleset",
>     "ruleset": 1,
>     "type": 1,
>     "min_size": 1,
>     "max_size": 10,
>     "steps": [
>         {
>             "op": "take",
>             "item": -2,
>             "item_name": "dl-disk1"
>         },
>         {
>             "op": "chooseleaf_firstn",
>             "num": 0,
>             "type": "osd"
>         },
>         {
>             "op": "emit"
>         }
>     ]
> }
> # ceph osd pool get fs_data0 crush_ruleset
> crush_ruleset: 1
> # ceph osd pool get fs_data0 size
> size: 1
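For anyone who wants to reproduce this kind of single-host test pool: a rule like
the one dumped above can normally be created with the stock CLI instead of editing
the CRUSH map by hand. The commands below are an untested sketch based on the
hammer-era tooling; the rule and pool names are taken from the quoted output, and
the ruleset id 1 is assumed to match what the dump shows.

# create a rule whose leaves are individual OSDs under the host bucket dl-disk1
ceph osd crush rule create-simple osd_in_dl-disk1__ruleset dl-disk1 osd
# point the CephFS data pool at that rule and keep only a single copy
ceph osd pool set fs_data0 crush_ruleset 1
ceph osd pool set fs_data0 size 1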
> Here starts the test.
>
> On a client machine, I used dd to write a 4GB file to CephFS, and watched
> dstat on the OSD node dl-disk1:
> [root@client ~]# dd of=/mnt/cephfs/4Gfile if=/dev/zero bs=4096k count=1024
> 1024+0 records in
> 1024+0 records out
> 4294967296 bytes (4.3 GB) copied, 3.69993 s, 1.2 GB/s
>
> [root@dl-disk1 ~]# dstat ...
> ----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb-----dsk/sdc-----dsk/sdd-----dsk/sde-----dsk/sdf-----dsk/sdg-----dsk/sdh-----dsk/sdi--
> usr sys idl wai hiq siq | used buff cach free | recv send | read writ : read writ : read writ : read writ : read writ : read writ : read writ : read writ
> 0 0 100 0 0 0 | 3461M 67.2M 15.1G 44.3G | 19k 20k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3461M 67.2M 15.1G 44.3G | 32k 32k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 8 18 74 0 0 0 | 3364M 67.2M 11.1G 48.4G | 391k 391k | 0 2712k : 0 1096k : 0 556k : 0 1084k : 0 1200k : 0 1196k : 0 688k : 0 1252k
> 0 0 100 0 0 0 | 3364M 67.2M 11.1G 48.4G | 82k 127k | 0 0 : 0 0 : 0 0 : 0 928k : 0 540k : 0 0 : 0 0 : 0 0
> 8 16 72 3 0 1 | 3375M 67.2M 11.8G 47.7G | 718M 2068k | 0 120M : 0 172M : 0 76M : 0 220M : 0 188M : 16k 289M : 0 53M : 0 36M
> 6 13 77 4 0 1 | 3391M 67.2M 12.3G 47.1G | 553M 1517k | 0 160M : 0 176M : 0 88M : 0 208M : 0 225M : 0 213M : 0 8208k : 0 49M
> 6 13 77 3 0 1 | 3408M 67.2M 12.9G 46.6G | 544M 1272k | 0 212M : 0 8212k : 0 36M : 0 0 : 0 37M : 0 3852k : 0 497M : 0 337M
> 0 0 99 0 0 0 | 3407M 67.3M 12.9G 46.6G | 53k 114k | 0 36M : 0 37M : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3407M 67.3M 12.9G 46.6G | 68k 110k | 0 0 : 0 0 : 0 0 : 0 36M : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 99 0 0 0 | 3407M 67.3M 12.9G 46.6G | 38k 328k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 36M : 0 0
> 0 1 99 0 0 0 | 3406M 67.3M 12.9G 46.6G | 11M 132k | 0 0 : 0 0 : 0 8224k : 0 0 : 0 0 : 0 32M : 0 0 : 0 36M
> 14 24 52 8 0 2 | 3436M 67.3M 13.8G 45.6G | 1026M 2897k | 0 100M : 0 409M : 0 164M : 0 313M : 0 253M : 0 321M : 0 84M : 0 76M
> 14 24 34 27 0 1 | 3461M 67.3M 14.7G 44.7G | 990M 2565k | 0 354M : 0 72M : 0 0 : 0 164M : 0 313M : 0 188M : 0 308M : 0 333M
> 4 9 70 16 0 0 | 3474M 67.3M 15.1G 44.3G | 269M 646k | 0 324M : 0 0 : 0 0 : 0 36M : 0 0 : 0 0 : 0 349M : 0 172M
> 0 0 99 0 0 0 | 3474M 67.3M 15.1G 44.3G | 24k 315k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 37M : 0 0
> 0 0 99 0 0 0 | 3474M 67.4M 15.1G 44.3G | 38k 102k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 36M : 0 0 : 0 36M
> 0 0 99 0 0 0 | 3473M 67.4M 15.1G 44.3G | 22k 23k | 0 0 : 0 0 : 0 36M : 0 0 : 0 36M : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3473M 67.4M 15.1G 44.3G | 39k 40k | 0 304k : 0 16k : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3472M 67.4M 15.1G 44.3G | 28k 64k | 0 64M : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3471M 67.4M 15.1G 44.3G | 31k 94k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3472M 67.4M 15.1G 44.3G | 38k 39k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
>
> The throughput is 1.2 GB/s, which reaches the 10Gb network limit.
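A side note on the write figure before the read test below: dd from /dev/zero with
buffered I/O can be flattered by the client page cache and by the fact that OSD
writes are acknowledged once journaled, so 1.2 GB/s is a best case rather than
sustained disk throughput. To compare streaming writes and reads without the CephFS
client in the path at all, a rados bench run against the same data pool might be
informative. This is an untested sketch: the pool name comes from the quoted output,
and the 60-second duration and 16 concurrent ops are arbitrary example values.

# streaming writes; keep the objects so the read phase has something to fetch
rados bench -p fs_data0 60 write -t 16 --no-cleanup
# sequential reads of the objects written above
rados bench -p fs_data0 60 seq -t 16
# remove the benchmark objects afterwards
rados -p fs_data0 cleanup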
> Then, on the client machine, I used dd to read that file back from CephFS,
> writing it to /dev/zero (or /dev/null) to rule out the local HDD's I/O:
> [root@client ~]# dd if=/mnt/cephfs/4Gfile of=/dev/zero bs=4096k count=1024
> 1024+0 records in
> 1024+0 records out
> 4294967296 bytes (4.3 GB) copied, 8.85246 s, 485 MB/s
>
> [root@dl-disk1 ~]# dstat ...
> 0 0 100 0 0 0 | 3462M 67.4M 15.1G 44.3G | 36k 36k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3462M 67.4M 15.1G 44.3G | 22k 22k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3463M 67.4M 15.1G 44.3G | 49k 49k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 1 99 0 0 0 | 3464M 67.4M 15.1G 44.3G | 282k 111M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 5 93 0 0 0 | 3466M 67.4M 15.1G 44.3G | 1171k 535M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 5 93 0 0 0 | 3467M 67.4M 15.1G 44.3G | 1124k 535M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3467M 67.4M 15.1G 44.3G | 1124k 535M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3467M 67.4M 15.1G 44.3G | 1109k 527M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 93 0 0 0 | 3471M 67.4M 15.1G 44.3G | 1044k 504M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3470M 67.4M 15.1G 44.3G | 1031k 504M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 5 93 0 0 0 | 3470M 67.4M 15.1G 44.3G | 1103k 527M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 93 0 0 0 | 3471M 67.5M 15.1G 44.3G | 1084k 504M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3470M 67.5M 15.1G 44.3G | 25k 24k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> ----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb-----dsk/sdc-----dsk/sdd-----dsk/sde-----dsk/sdf-----dsk/sdg-----dsk/sdh-----dsk/sdi--
> usr sys idl wai hiq siq | used buff cach free | recv send | read writ : read writ : read writ : read writ : read writ : read writ : read writ : read writ
> 0 0 100 0 0 0 | 3470M 67.5M 15.1G 44.3G | 43k 44k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3470M 67.5M 15.1G 44.3G | 22k 23k | 0 48k : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 35k 38k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 23k 85k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 44k 44k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 24k 25k | 0 12k : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 45k 43k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3468M 67.5M 15.1G 44.3G | 17k 18k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
>
> The throughput here was only 400~500 MB/s.
> I noticed that there was NO disk I/O during the read-out, which means all the
> objects of the file were already cached in memory on the OSD node.
> Thus, the HDDs do NOT seem to be the cause of the lower throughput.
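That pattern (OSD node nearly idle, no disk reads, a single client topping out around
500 MB/s) often points to client-side readahead and per-request latency rather than to
the OSDs: one sequential reader only keeps a limited window of requests in flight, so
round-trip time caps the throughput even when everything is served from the OSD page
cache. If that is what is happening here, the knobs below may be worth experimenting
with. Treat this as an untested sketch: the option names are the ones I recall from
the hammer-era clients, and the monitor address and sizes are only example values.

# CephFS kernel client: mount with a larger readahead window (rasize is in bytes)
mount -t ceph dl-disk1:6789:/ /mnt/cephfs \
    -o name=admin,secretfile=/etc/ceph/admin.secret,rasize=67108864

# kernel RBD: raise the block device readahead (value in KB)
echo 16384 > /sys/block/rbd0/queue/read_ahead_kb

# librbd (e.g. QEMU/KVM): readahead options in the [client] section of ceph.conf
#   rbd readahead max bytes = 4194304
#   rbd readahead disable after bytes = 0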
> I also tried the read-out using cat (in case dd does not trigger read-ahead in
> the filesystem), and got a similar result:
>
> [root@client ~]# time cat /mnt/cephfs/4Gfile > /dev/zero
>
> real    0m9.352s
> user    0m0.002s
> sys     0m4.147s
>
> [root@dl-disk1 ~]# dstat ...
> 0 0 100 0 0 0 | 3465M 67.5M 15.1G 44.3G | 23k 22k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3465M 67.5M 15.1G 44.3G | 17k 18k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3465M 67.5M 15.1G 44.3G | 37k 37k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 1 2 97 0 0 0 | 3466M 67.5M 15.1G 44.3G | 633k 280M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3467M 67.5M 15.1G 44.3G | 1057k 498M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3470M 67.5M 15.1G 44.3G | 1078k 498M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3470M 67.5M 15.1G 44.3G | 996k 486M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3469M 67.5M 15.1G 44.3G | 988k 489M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3469M 67.5M 15.1G 44.3G | 1012k 489M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3470M 67.5M 15.1G 44.3G | 1017k 497M | 0 0 : 0 8192B : 0 28k : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 2 4 94 0 0 0 | 3469M 67.5M 15.1G 44.3G | 1032k 498M | 0 0 : 0 0 : 0 0 : 0 8192B : 0 104k : 0 0 : 0 0 : 0 0
> ----total-cpu-usage---- ------memory-usage----- -net/total- --dsk/sdb-----dsk/sdc-----dsk/sdd-----dsk/sde-----dsk/sdf-----dsk/sdg-----dsk/sdh-----dsk/sdi--
> usr sys idl wai hiq siq | used buff cach free | recv send | read writ : read writ : read writ : read writ : read writ : read writ : read writ : read writ
> 2 4 94 0 0 0 | 3469M 67.5M 15.1G 44.3G | 1025k 496M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 40k : 0 80k : 0 0
> 0 1 99 0 0 0 | 3469M 67.5M 15.1G 44.3G | 127k 52M | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 120k
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 21k 21k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 66k 66k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
> 0 0 100 0 0 0 | 3469M 67.5M 15.1G 44.3G | 35k 38k | 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0 : 0 0
>
> The average throughput is 4 GB / 9.35 s = 438 MB/s. Still, this is unlikely to
> be an HDD issue.
>
> I'm sure the network can reach 10Gb in both directions (verified with iperf and
> other tests), and there is no other user process occupying bandwidth.
>
> Could you please help me find out the main reason for this issue? Thank you.
>
> Best Regards,
> FaHui

_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com