Hello everybody,
I think I fixed the issues after weeks of looking.
question 1: does anyone know how to prevent iptables, nftables or conntrack
from being loaded in the first place? Adding them to
/etc/modprobe.d/blacklist.local.conf does not seem to work. What is
recommended?
question 2: what systemd target can I use to run a service after all
ceph-osds have started? I tried ceph.target and ceph-osd.target, but neither
works reliably. (A sketch of what I am considering is below the questions.)
question 3: should I still try to upgrade to bluestore, or pray to the
system gods that my performance stays now that it is back after many, many
hours of troubleshooting?
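
For question 2, this is a minimal sketch of the kind of unit I have in mind
(untested; /usr/local/sbin/pin-osds.sh is a hypothetical script that would
hold the taskset lines from my rc.local further down):

# /etc/systemd/system/osd-pinning.service (sketch, not verified)
[Unit]
Description=Pin ceph-osd processes to CPU cores once the OSDs are up
After=ceph-osd.target
Wants=ceph-osd.target

[Service]
Type=oneshot
RemainAfterExit=true
# wait until at least one ceph-osd process exists before pinning
ExecStartPre=/bin/sh -c 'until pidof ceph-osd >/dev/null; do sleep 2; done'
ExecStart=/usr/local/sbin/pin-osds.sh

[Install]
WantedBy=multi-user.target

Enable with: systemctl daemon-reload && systemctl enable --now osd-pinning.service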
I made a few changes; I am going to just list them for other people that
are suffering from slow performance after upgrading their Ceph and/or OS.
Disk utilization is back to around 10% instead of 80-100%, and rados bench
is stable again.
apt-get install irqbalance nftables
# cat /etc/ceph/ceph.conf
[global]
fsid = 5f8d3724-1a51-4895-9b3e-5eb90ea49782
mon_initial_members = ceph01, ceph02, ceph03
mon_host = 192.168.35.11,192.168.35.12,192.168.35.13
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
filestore_xattr_use_omap = true
osd pool default size = 3
public network = 192.168.35.0/28
cluster network = 192.168.35.0/28
osd pool default min size = 2
osd scrub begin hour = 23
osd scrub end hour = 6
# default osd recovery max active = 3
osd recovery max active = 1
#setuser match path = /var/lib/ceph/$type/$cluster-$id
debug_default = 0
debug_lockdep = 0/0
debug_context = 0/0
debug_crush = 0/0
debug_buffer = 0/0
debug_timer = 0/0
debug_filer = 0/0
debug_objecter = 0/0
debug_rados = 0/0
debug_rbd = 0/0
debug_journaler = 0/0
debug_objectcacher = 0/0
debug_client = 0/0
debug_osd = 0/0
debug_optracker = 0/0
debug_objclass = 0/0
debug_filestore = 0/0
debug_journal = 0/0
debug_ms = 0/0
debug_monc = 0/0
debug_tp = 0/0
filestore_op_threads = 8
filestore_max_inline_xattr_size = 254
filestore_max_inline_xattrs = 6
filestore_queue_max_ops = 500
filestore_queue_committing_max_ops = 5000
filestore_merge_threshold = 40
filestore_split_multiple = 10
journal_max_write_entries = 1000
journal_queue_max_ops = 3000
journal_max_write_bytes = 1048576000
osd_mkfs_options_xfs = -f -I size=2048
osd_mount_options_xfs = noatime,largeio,nobarrier,inode64,allocsize=8M
osd_op_threads = 32
osd_journal_size = 10000
filestore_queue_max_bytes = 1048576000
filestore_queue_committing_max_bytes = 1048576000
journal_queue_max_bytes = 1048576000
filestore_max_sync_interval = 10
filestore_journal_parallel = true
[client]
rbd cache = true
#rbd cache max dirty = 0
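To check that a running OSD actually picked these values up, something like
this works over the admin socket (example for osd.0 on the local node):
# ceph daemon osd.0 config show | grep -E 'op_threads|queue_max|debug_osd'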
# cat /etc/sysctl.d/30-nic-10gbit.conf
net.ipv4.tcp_rmem = 10000000 10000000 10000000
net.ipv4.tcp_wmem = 10000000 10000000 10000000
net.ipv4.tcp_mem = 10000000 10000000 10000000
net.core.rmem_default = 524287
net.core.wmem_default = 524287
net.core.rmem_max = 524287
net.core.wmem_max = 524287
net.core.netdev_max_backlog = 300000
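The file can be applied without a reboot:
# sysctl -p /etc/sysctl.d/30-nic-10gbit.conf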
Blacklist all forms of filtering. The blacklist alone does not seem to work
though; the modules keep getting loaded, I guess auto-loaded by the kernel.
echo "blacklist ip_tables" | tee --append
/etc/modprobe.d/blacklist.local.conf
echo "blacklist iptable_filter" | tee --append
/etc/modprobe.d/blacklist.local.conf
echo "blacklist ip6_tables" | tee --append
/etc/modprobe.d/blacklist.local.conf
echo "blacklist ip6table_filter" | tee --append
/etc/modprobe.d/blacklist.local.conf
echo "blacklist nf_tables" | tee --append
/etc/modprobe.d/blacklist.local.conf
echo "blacklist nf6_tables" | tee --append
/etc/modprobe.d/blacklist.local.conf
depmod -a
update-initramfs -u -k all -v
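Something I still want to try for question 1 (not verified here): blacklist
entries only stop alias-based autoloading, while an install line replaces the
load command with /bin/false, so even explicit module load requests fail:
echo "install ip_tables /bin/false" | tee --append /etc/modprobe.d/blacklist.local.conf
echo "install nf_tables /bin/false" | tee --append /etc/modprobe.d/blacklist.local.conf
update-initramfs -u -k all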
root@ceph02:~# cat /etc/rc.local
#!/bin/bash -e
#
# rc.local
#
# This script is executed at the end of each multiuser runlevel.
# Make sure that the script will "exit 0" on success or any other
# value on error.
#
# In order to enable or disable this script just change the execution
# bits.
#
# By default this script does nothing.
# raise read-ahead, disable APM and the on-disk write cache on the data disks
for i in {a..e}; do echo 512 > /sys/block/sd$i/queue/read_ahead_kb; done
for i in {a..d}; do hdparm -q -B 255 -q -W0 /dev/sd$i; done
# keep the PCIe devices out of runtime power saving
echo 'on' > '/sys/bus/pci/devices/0000:00:01.0/power/control'
echo 'on' > '/sys/bus/pci/devices/0000:00:03.0/power/control'
echo 'on' > '/sys/bus/pci/devices/0000:00:01.0/power/control'
cpupower frequency-set --governor performance
modprobe -r iptable_filter ip_tables ip6table_filter ip6_tables nf_tables_ipv6 nf_tables_ipv4 nf_tables_bridge nf_tables
# pin each ceph-osd process to its own block of cores
array=($(pidof ceph-osd))
taskset -cp 0-5 ${array[0]}
taskset -cp 12-17 ${array[1]}
taskset -cp 6-11 ${array[2]}
taskset -cp 18-23 ${array[3]}
exit 0
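To verify afterwards that the pinning stuck:
for p in $(pidof ceph-osd); do taskset -cp $p; done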
Please also save the pastebin from my OP; there are a lot of benchmark and
test notes in there.
root@ceph02:~# rados bench -p scbench 10 write --no-cleanup
hints = 1
Maintaining 16 concurrent writes of 4194304 bytes to objects of size
4194304 for up to 10 seconds or 0 objects
Object prefix: benchmark_data_ceph02_396172
  sec Cur ops   started  finished  avg MB/s  cur MB/s  last lat(s)  avg lat(s)
    0       0         0         0         0         0            -          0
    1      16        77        61   243.956       244     0.200718   0.227978
    2      16       151       135   269.946       296     0.327927     0.2265
    3      16       215       199   265.281       256    0.0875193   0.225989
    4      16       288       272   271.951       292     0.184617   0.227921
    5      16       358       342   273.553       280     0.140823    0.22683
    6      16       426       410   273.286       272     0.118436   0.226586
    7      16       501       485   277.094       300     0.224887   0.226209
    8      16       573       557   278.452       288     0.200903   0.226424
    9      16       643       627   278.619       280     0.214474   0.227003
   10      16       711       695   277.952       272     0.259724   0.226849
Total time run: 10.146720
Total writes made: 712
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 280.682
Stddev Bandwidth: 17.7138
Max bandwidth (MB/sec): 300
Min bandwidth (MB/sec): 244
Average IOPS: 70
Stddev IOPS: 4
Max IOPS: 75
Min IOPS: 61
Average Latency(s): 0.227538
Stddev Latency(s): 0.0843661
Max latency(s): 0.48464
Min latency(s): 0.0467124
On 2020-01-06 20:44, Jelle de Jong wrote:
Hello everybody,
I have issues with very slow requests on a simple three node cluster here:
four WDC enterprise disks and an Intel Optane NVMe journal on identical
high-memory nodes, with 10GB networking.
It was all working fine with Ceph Hammer on Debian Wheezy, but I wanted
to upgrade to a supported version and test out bluestore as well. So I
upgraded to luminous on Debian Stretch and used ceph-volume to create
bluestore osds; everything went downhill from there.
I went back to filestore on all nodes but I still have slow requests and
I cannot pinpoint a good reason. I tried to debug and gathered
information to look at:
https://paste.debian.net/hidden/acc5d204/
First I thought it was the balancing that was making things slow, then I
thought it might be the LVM layer, so I recreated the nodes without LVM
by switching from ceph-volume to ceph-disk, but there was no difference,
still slow requests. Then I changed back from bluestore to filestore but
still had a very slow cluster. Then I thought it was a CPU scheduling issue
and downgraded the 5.x kernel, and CPU performance is at full speed again. I
thought maybe there was something weird with one osd and took them out
one by one, but slow requests are still showing up and client performance
from vms is really poor.
It just feels like a burst of small requests keeps blocking for a while and
then recovers again.
Many thanks for helping out looking at the URL.
If there are options which I should tune for a hdd with nvme journal
setup please share.
Jelle
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com