Deep scrubbing is a pain point for some (many?) Ceph installations.
We have recently been hit by deep scrubbing causing noticeable latency
increases to the entire cluster, but only on certain (infrequent) days.
This led me to become more interested in the distribution of pg deep scrubs.
Here's an example (small pre-production system):
       day          #                bytes
------------------------------------------
2016-07-26         55            214157421
2016-07-27        251           3228175434
2016-07-28        233           3087309278
2016-07-29        144           3657977623
2016-07-30        487           3310926110
2016-07-31        481           9278429194
2016-08-01       1139          15464866042
2016-08-02        505           9200910539
2016-08-03        244           1087745079
2016-08-04        149            575754064
2016-08-05        226           3830955473
2016-08-06        174           2902124793
2016-08-07        433           1892731097
2016-08-08        324           3779908017
2016-08-09        120           3250368596
2016-08-10         62           1298502931
2016-08-11         80           1266855188
2016-08-12         47            239441239
2016-08-13         68            306461083
2016-08-14         97           1416565438
2016-08-15          7             41944564
2016-08-16         13             64109571
2016-08-18         77          11937945588
2016-08-19        290           6446010754
2016-08-20        374           3658389385
------------------------------------------
                 6080          91438564501
Now osd_deep_scrub_interval is 3024000 (35 days). Two issues are apparent:
1/ We are not doing deep scrubbing on every day we could
2/ We are doing a lot of deep scrubbing on some days and not much on others
So we need some way to 'move' when a pg is deep scrubbed so as to even
out the load. Unfortunately the only way to do that is to pre-emptively
deep scrub some pgs before they are due to be done automatically.
We wanted to:
- take pgs from days where 'too many' pgs were being deep scrubbed
  (the sketch below shows how 'too many' is judged)
- move them to days where 'too few' (or none) were
- control the degree of parallelism so not too many were being
pre-emptively scrubbed at once
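To make 'too many' concrete: the idea is to average the total deep scrub
load over the days in the window and treat that as the per-day ideal,
then top today up toward it from the oldest over-full days. A minimal,
self-contained sketch of the ideal calculation (the two pg_stats entries
are made up for illustration; the real data is what 'ceph pg dump
--format json' returns under "pg_stats"):

from datetime import date

# hand-made pg stats, same shape as 'pg dump' output, just for the sketch
pg_stats = [
    {"last_deep_scrub_stamp": "2016-08-01 03:15:02.123456",
     "stat_sum": {"num_bytes": 15464866042}},
    {"last_deep_scrub_stamp": "2016-08-15 11:42:51.654321",
     "stat_sum": {"num_bytes": 41944564}},
]

daily_bytes = {}  # day -> bytes of pgs last deep scrubbed that day
for pg in pg_stats:
    day = pg["last_deep_scrub_stamp"][:10]
    daily_bytes[day] = daily_bytes.get(day, 0) + pg["stat_sum"]["num_bytes"]

ideal = sum(daily_bytes.values()) // len(daily_bytes)
today = date.today().strftime("%Y-%m-%d")
needed = ideal - daily_bytes.get(today, 0)  # bytes to pre-emptively scrub
print("ideal bytes/day {}, still needed today {}".format(ideal, needed))

The attached script does the same averaging, then walks the pgs
oldest-first, being careful not to drain any one day below the ideal.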
This thinking led to a slightly larger script than I had initially
envisaged (attached), but it seems to work well. Here is what the above
distribution looks like after a few days of it running:
       day          #                bytes
------------------------------------------
2016-07-26         55            214157421
2016-07-27        213           3046513946
2016-07-28        231           3041988574
2016-07-29         53           2196992034
2016-07-30        452           3019055783
2016-07-31         89           2590830041
2016-08-01        277           3034406207
2016-08-02        423           7748167775
2016-08-03        244           1105338953
2016-08-04        149            575754064
2016-08-05        226           3839278545
2016-08-06        174           2911264505
2016-08-07        433           1876704966
2016-08-08        324           3772335992
2016-08-09        120           3259895892
2016-08-10         62           1306074899
2016-08-11         80           1283566849
2016-08-12         47            247698775
2016-08-13         68            298072475
2016-08-14         97           1438790334
2016-08-15          7             41944564
2016-08-16         13             64109571
2016-08-18         77          11946334196
2016-08-19        290           6458525533
2016-08-20        374           3650686307
2016-08-21        147           3673093376
2016-08-22        308           4463993668
2016-08-23        169           3348235234
2016-08-24        415           3989389181
2016-08-25        202           3751921834
2016-08-26        261           3308940853
------------------------------------------
                 6080          91504062347
              ideal #          ideal bytes
------------------------------------------
                  196           2951743946
So it has trimmed the big clump on 2016-08-01 from 15G down to the
'idealized' ~3G (the ideal row is just the totals averaged over the 31
distinct days in the table), without touching the pgs on some earlier
days that have too little deep scrubbing scheduled (they will get fixed
up later).
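In case it's useful to anyone else, typical invocations would look like
this (assuming the script is saved as rados-deep-scrub-redist and made
executable; the flags are as defined in its argparse setup below):

./rados-deep-scrub-redist -g     # just print the histogram above
./rados-deep-scrub-redist -p     # dry run: list the pgs it would scrub today
./rados-deep-scrub-redist -n 2   # the real thing, 2 pgs at a time (e.g. daily cron)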
Cheers
Mark
#!/usr/bin/python
#
# rados-deep-scrub-redist:
#
# Big problem with deep scrubbing:
# - clumpy (i.e. some days lots but others none at all)
# - not redistributed if we change deep scrub interval
#
# We attempt to even up the distribution of pgs being deep scrubbed
# over the entire period osd_deep_scrub_interval by pre-emptively
# deep scrubbing some pgs.
#
# License: this program is licensed under GNU GPL v2 or later.
#
import argparse
import json
import sys
import time
from datetime import datetime
from datetime import date
from datetime import timedelta
from rados import Rados
cmd_timeout = 600
deep_scrub_interval = None
pg_data = {}
pg_today_list = []
deep_scrub_daily_data = {}
deep_scrub_today_data = {}
deep_scrub_ideal_count = 0
deep_scrub_ideal_bytes = 0
def deep_scrub_stamp(pg):
    # sort key for pgs: last deep scrub timestamp, oldest first
    return pg["last_deep_scrub_stamp"]
def get_pg_data(conn):
# create a dict of pgs and their scrubbing info
global pg_data
cmd = {"prefix":"pg dump", "format":"json"}
ret, buf, errs = conn.mon_command(json.dumps(cmd), '', timeout=cmd_timeout)
if ret != 0:
print("cmd {:30} failed with {:50}".format(cmd, errs))
sys.exit(1)
pg_data = json.loads(buf)["pg_stats"]
return
def get_deep_scrub_daily_data():
# create a dict of daily counts and bytes for last deep scrub operations
global deep_scrub_daily_data
format = '%Y-%m-%d'
    for pg in sorted(pg_data, key=deep_scrub_stamp):
pg_stat = pg["stat_sum"]
dt = datetime.strptime(pg["last_deep_scrub_stamp"],'%Y-%m-%d %H:%M:%S.%f')
d = datetime.strftime(dt, format)
if d not in deep_scrub_daily_data:
deep_scrub_daily_data[d] = {"count": 0, "bytes": 0}
deep_scrub_daily_data[d]["count"] += 1
deep_scrub_daily_data[d]["bytes"] += pg_stat["num_bytes"]
return
def get_deep_scrub_today_data():
    # tally the count and bytes of pgs already on today's plate
    # (due today, or already deep scrubbed today)
global deep_scrub_today_data
deep_scrub_interval_days = int(deep_scrub_interval/(3600 * 24))
dt_now = date.today()
d_now = datetime.strftime(dt_now, "%Y-%m-%d")
deep_scrub_today_data = {"count":0, "bytes":0}
for d in deep_scrub_daily_data:
dt = datetime.strptime(d, '%Y-%m-%d')
dt_next = dt + timedelta(days=int(deep_scrub_interval_days))
d_next = datetime.strftime(dt_next, "%Y-%m-%d")
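        # due today: the last deep scrub was exactly one deep scrub
        # interval ago, or the pg was already deep scrubbed today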
if d_next == d_now or d == d_now:
deep_scrub_today_data["count"] += deep_scrub_daily_data[d]["count"]
deep_scrub_today_data["bytes"] += deep_scrub_daily_data[d]["bytes"]
return
def calc_ideal_daily_metrics():
# work out ideal pg bytes and counts to deep scrub per day
global deep_scrub_ideal_count
global deep_scrub_ideal_bytes
bytes = 0
count = 0
for d in deep_scrub_daily_data:
count += deep_scrub_daily_data[d]["count"]
bytes += deep_scrub_daily_data[d]["bytes"]
    deep_scrub_ideal_count = count // len(deep_scrub_daily_data)
    deep_scrub_ideal_bytes = bytes // len(deep_scrub_daily_data)
return
def show_deep_scrub_histogram():
# show the stats for pgs last deep scrubbed by day,
# plus total and ideal metrics
count = 0
bytes = 0
print("{:>10} {:>10} {:>20}".format('day', '#', 'bytes'))
print("-" * 42)
for d in sorted(deep_scrub_daily_data):
count += deep_scrub_daily_data[d]["count"]
bytes += deep_scrub_daily_data[d]["bytes"]
print("{:10} {:10} {:20}".format(d,
deep_scrub_daily_data[d]["count"],
deep_scrub_daily_data[d]["bytes"]))
print("-" * 42)
print("{:10} {:10} {:20}".format('', count, bytes))
print(" ")
print("{:10} {:>10} {:>20}".format('', 'ideal #', ' ideal bytes'))
print("-" * 42)
print("{:>10} {:>10} {:>20}".format(' ',
deep_scrub_ideal_count,
deep_scrub_ideal_bytes))
return
def get_deep_scrub_today_pgs():
# get a list of pgs that we should deep scrub today (if any)
# decide as follows:
# look at pg bytes scheduled for today
# if < ideal bytes then start grabbing pgs from oldest day(s)
# that have > ideal bytes scheduled
# but be careful not to take too many pgs from any one day
global pg_today_list
deep_scrub_daily_increment = {}
deep_scrub_bytes = 0
deep_scrub_count = 0
format = '%Y-%m-%d'
bytes_needed = deep_scrub_ideal_bytes - deep_scrub_today_data["bytes"]
if bytes_needed > 0:
        for pg in sorted(pg_data, key=deep_scrub_stamp):
pg_stat = pg["stat_sum"]
dt = datetime.strptime(pg["last_deep_scrub_stamp"],'%Y-%m-%d %H:%M:%S.%f')
d = datetime.strftime(dt, format)
if d not in deep_scrub_daily_increment:
deep_scrub_daily_increment[d] = {"bytes": 0}
# see if we have removed enough pgs from this day
if deep_scrub_daily_data[d]["bytes"] - deep_scrub_daily_increment[d]["bytes"] <= deep_scrub_ideal_bytes:
continue
# ok, we are going to scrub this one
pg_today_list.append(pg["pgid"])
deep_scrub_daily_increment[d]["bytes"] += pg_stat["num_bytes"]
deep_scrub_bytes += pg_stat["num_bytes"]
deep_scrub_count += 1
bytes_needed -= pg_stat["num_bytes"]
# we now have sufficient
if bytes_needed <= 0:
break
print("about to deep scrub {:7} pgs {:20} bytes".format(deep_scrub_count, deep_scrub_bytes))
return
def deep_scrub_today_pgs(conn, args):
    # send today's pgs to be scrubbed, args.numpgs at a time
pg_list = []
for pg in pg_today_list:
        if args.noop:
print("(noop) deep scrub pg {:7}".format(pg))
pg_list.append(pg)
if len(pg_list) == args.numpgs:
            if not args.noop:
deep_scrub_pgs(conn, pg_list)
pg_list = []
    if not args.noop:
deep_scrub_pgs(conn, pg_list)
return
def deep_scrub_pgs(conn, pg_list):
    # initiate a deep scrub on the list of pgs
# wait until they have been completed
for pgid in pg_list:
scrub_cmd = {"prefix":"pg deep-scrub", "pgid":pgid, "format":"json"}
print("deep scrubbing pg {:7}".format(pgid))
ret, buf, errs = conn.mon_command(json.dumps(scrub_cmd), '', timeout=cmd_timeout)
if ret != 0:
print("cmd {:30} failed with {:50}".format(scrub_cmd, errs))
wait_for_deep_scrub_pgs(conn, pg_list)
return
def wait_for_deep_scrub_pgs(conn, pg_list):
# like it says...
dt_now = date.today()
d_now = datetime.strftime(dt_now, "%Y-%m-%d")
for pgid in pg_list:
waiting = True
        while waiting:
query_cmd = {"prefix":"pg", "pgid":pgid, "cmd":"query", "format":"json"}
ret, buf, errs = conn.pg_command(pgid, json.dumps(query_cmd), '', timeout=cmd_timeout)
if ret != 0:
print("query_cmd {:30} failed with {:50}".format(query_cmd, errs))
sys.exit(1)
pg_stats = json.loads(buf)["info"]["stats"]
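            # stamps are 'YYYY-MM-DD HH:MM:SS.ffffff', so a plain string
            # comparison against today's 'YYYY-MM-DD' works lexicographically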
if pg_stats["last_deep_scrub_stamp"] >= d_now:
waiting = False
print("end deep scrubbing pg {:7}".format(pgid))
else:
time.sleep(1)
return
def main():
global deep_scrub_interval
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--conf",
help="ceph config file to use",
default="/etc/ceph/ceph.conf")
parser.add_argument("-i", "--interval",
help="deep scrub interval override",
type=float, default=None)
parser.add_argument("-n", "--numpgs",
help="number of pgs to deep scrub in parallel",
type=int, default=2)
parser.add_argument("-p", "--noop",
help="no not actually scrub",
type=bool, default=False)
parser.add_argument("-g", "--histogram",
help="only show histogram of deep scrubs by day",
type=bool, default=False)
args = parser.parse_args()
conn = Rados(conffile=args.conf)
conn.connect()
# conf_get gets confused if the actual deep scrub interval is
# different from what is specified in a ceph.conf, so we can
# override it if needed.
if args.interval:
deep_scrub_interval = args.interval
else:
deep_scrub_interval = float(conn.conf_get('osd_deep_scrub_interval'))
get_pg_data(conn)
get_deep_scrub_daily_data()
get_deep_scrub_today_data()
calc_ideal_daily_metrics()
    if not args.histogram:
get_deep_scrub_today_pgs()
deep_scrub_today_pgs(conn, args)
else:
show_deep_scrub_histogram()
conn.shutdown()
return
if __name__ == '__main__':
main()