When our cluster hits a failure (e.g. a node going down or an OSD dying), our VMs pause all I/O for about 10-20 seconds. I'm curious if there is a way to fix or mitigate this.
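My best guess is that the pause lines up with the OSD failure-detection window. I believe that window is controlled by settings like the following, none of which are set in our ceph.conf, so they should still be at their defaults (defaults as I understand them; please correct me if I have these wrong):

[osd]
osd_heartbeat_interval = 6   ; default (I think): seconds between peer heartbeats
osd_heartbeat_grace = 20     ; default (I think): seconds before a peer is reported down -- roughly the stall we see

If that is the right knob, is it safe to lower osd_heartbeat_grace, or is there a better approach?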
Here is my ceph.conf:
[global]
fsid = fb991e48-c425-4f82-a70e-5ce748ae186b
mon_initial_members = mon01, mon02, mon03
mon_host = 10.20.57.10,10.20.57.11,10.20.57.12
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
public_network = 10.20.57.0/24
cluster_network = 10.20.58.0/24
filestore_xattr_use_omap = true
mon_clock_drift_allowed = .15
mon_clock_drift_warn_backoff = 30
mon_osd_down_out_interval = 30
mon_osd_report_timeout = 300
mon_osd_full_ratio = .95
mon_osd_nearfull_ratio = .85
mon_osd_allow_primary_affinity = true
osd_backfill_full_ratio = .90
osd_journal_size = 10000
osd_pool_default_size = 3
osd_pool_default_min_size = 1
osd_pool_default_pg_num = 4096
osd_pool_default_pgp_num = 4096
osd_crush_chooseleaf_type = 1
max_open_files = 131072
osd_op_threads = 10
osd_max_backfills = 1
osd_recovery_max_active = 1
osd_recovery_op_priority = 1
osd_client_op_priority = 63
[client]
rbd_cache = true
rbd_cache_writethrough_until_flush = true
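In case something is being overridden at runtime, I believe the effective values can be checked on an OSD host through the admin socket, e.g. (run on the host where osd.0 lives; the daemon id will differ per host):

ceph daemon osd.0 config show | grep -E 'heartbeat|down_out'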
And here is our osd tree:
ID WEIGHT   TYPE NAME      UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 15.91589 root default
-2  3.97897     host osd01
 0  1.98949         osd.0       up  1.00000          1.00000
 3  1.98949         osd.3       up  1.00000          1.00000
-3  3.97897     host osd02
 1  1.98949         osd.1       up  1.00000          1.00000
 4  1.98949         osd.4       up  1.00000          1.00000
-4  3.97897     host osd03
 2  1.98949         osd.2       up  1.00000          1.00000
 5  1.98949         osd.5       up  1.00000          1.00000
-5  3.97897     host osd04
 7  1.98949         osd.7       up  1.00000          1.00000
 6  1.98949         osd.6       up  1.00000          1.00000
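If it helps to reproduce, something roughly like this should show the stall (sketch only -- it assumes systemd-managed OSDs and fio installed inside the guest; osd.0 is picked arbitrarily):

# inside a VM backed by an RBD image, generate steady small writes
fio --name=stall-test --rw=randwrite --bs=4k --size=1G --runtime=120 --time_based --filename=/tmp/stall-test.dat

# on the OSD host, stop one OSD to simulate a failure
systemctl stop ceph-osd@0

# on a client or mon node, watch cluster events while timing how long fio's IOPS drop to zero
ceph -w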
Thanks ahead of time.