Having applied David Zafman's patch for 13937 and Kefu Chai's for 17857,
I still hit the problem described in issue 18162. This mostly fixes it.
There are still remaining known problems:
- given the config option 'osd recovery max single start' > 1, if the
same 'single start' involves retries that would require reading from
different OSDs, they may fail repeatedly, because we will, for some
reason I couldn't identify, perform all reads from the same set of OSDs.
- in some cases I couldn't narrow down, backfill will complete with
pending retries, and that will take up a backfill slot from the OSD
until the PG goes through peering.
Having said that, I think getting errors during EC backfill fixed up
rather than having the primary crash over and over is a significant improvement,
and since I don't envision having time to investigate the remaining
issues in the near future, I figured I'd post this patch anyway.
ReplicatedPG::failed_push: implement EC backfill retries
ECBackend::get_min_avail_to_read_shards: use only locations from the
missing_loc if there is any missing_loc set for the object.
ReplicatedPG::get_snapset_context: fix some misindented code.
ReplicatedPG::failed_push: create missing_loc if it's not there,
before clearing the failed location and clearing recovering.
ReplicatedPG::_clear_recovery_state: tolerate backfills_in_flight
without corresponding recovering entries without crashing: they're
pending retries.
ReplicatedPG::start_recovery_ops: if recovery ends with missing
replicas, log what they are.
ReplicatedPG::recover_replicas: do not reject retry-pending backfills.
ReplicatedPG::recover_backfill: don't trim backfill_info.objects past
pending retries. Clean up unused sets of backfill_pos. Detect
retries and be more verbose about them.
Fixes: http://tracker.ceph.com/issues/18162
Fixes: http://tracker.ceph.com/issues/13937
Fixes: http://tracker.ceph.com/issues/17857
Signed-off-by: Alexandre Oliva <oliva@xxxxxxx>
---
src/osd/ECBackend.cc | 23 +++++++--
src/osd/ReplicatedPG.cc | 114 ++++++++++++++++++++++++++++++++++++++---------
2 files changed, 108 insertions(+), 29 deletions(-)
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index e3eb1fd..27b31fe 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1450,9 +1450,6 @@ int ECBackend::get_min_avail_to_read_shards(
// Make sure we don't do redundant reads for recovery
assert(!for_recovery || !do_redundant_reads);
- map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
- get_parent()->get_missing_loc_shards().find(hoid);
-
set<int> have;
map<shard_id_t, pg_shard_t> shards;
@@ -1490,16 +1487,28 @@ int ECBackend::get_min_avail_to_read_shards(
}
}
- if (miter != get_parent()->get_missing_loc_shards().end()) {
+ bool miter_first = true;
+ for (map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
+ get_parent()->get_missing_loc_shards().find(hoid);
+ miter != get_parent()->get_missing_loc_shards().end();
+ miter++) {
+ if (miter_first) {
+ dout(20) << __func__ << hoid
+ << " has missing_loc, resetting have" << dendl;
+ miter_first = false;
+ have.clear();
+ }
+ dout(20) << __func__ << hoid
+ << " presumed available at " << miter->second
+ << dendl;
for (set<pg_shard_t>::iterator i = miter->second.begin();
i != miter->second.end();
++i) {
dout(10) << __func__ << ": checking missing_loc " << *i << dendl;
boost::optional<const pg_missing_t &> m =
get_parent()->maybe_get_shard_missing(*i);
- if (m) {
- assert(!(*m).is_missing(hoid));
- }
+ if (m && (*m).is_missing(hoid))
+ continue;
have.insert(i->shard);
shards.insert(make_pair(i->shard, *i));
}
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 4c490a4..afed733 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -9334,8 +9334,8 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
if (r < 0) {
// try _snapset
- if (!(oid.is_snapdir() && !oid_existed))
- r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
+ if (!(oid.is_snapdir() && !oid_existed))
+ r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
if (r < 0 && !can_create)
return NULL;
}
@@ -9598,9 +9598,31 @@ void ReplicatedPG::failed_push(const list<pg_shard_t> &from, const hobject_t &so
obc->drop_recovery_read(&blocked_ops);
requeue_ops(blocked_ops);
}
- recovering.erase(soid);
- for (list<pg_shard_t>::const_iterator i = from.begin(); i != from.end(); i++)
+ if (missing_loc.get_locations(soid).empty()) {
+ dout(20) << __func__ << ": " << soid
+ << " had an empty location list, reconstructing" << dendl;
+ assert(!actingbackfill.empty());
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end(); ++i) {
+ pg_shard_t peer = *i;
+ if (!peer_missing[peer].is_missing(soid)) {
+ missing_loc.add_location(soid, peer);
+ dout(20) << __func__ << ": " << soid
+ << " assumed to be available in " << peer << dendl;
+ }
+ }
+ }
+ for (list<pg_shard_t>::const_iterator i = from.begin();
+ i != from.end(); i++) {
+ dout(20) << __func__ << ": " << soid
+ << " marked as not available in " << *i
+ << dendl;
missing_loc.remove_location(soid, *i);
+ }
+ /* If we were backfilling, we will retry the push from
+ recover_backfill, and its backfills_in_flight entry will only be
+ cleared when we succeed. */
+ recovering.erase(soid);
dout(0) << "_failed_push " << soid << " from shard " << from
<< ", reps on " << missing_loc.get_locations(soid)
<< " unfound? " << missing_loc.is_unfound(soid) << dendl;
@@ -10225,7 +10247,9 @@ void ReplicatedPG::_clear_recovery_state()
last_backfill_started = hobject_t();
set<hobject_t, hobject_t::Comparator>::iterator i = backfills_in_flight.begin();
while (i != backfills_in_flight.end()) {
- assert(recovering.count(*i));
+ if(!recovering.count(*i))
+ dout(10) << __func__ << ": " << *i
+ << " is still pending backfill retry" << dendl;
backfills_in_flight.erase(i++);
}
@@ -10450,6 +10474,21 @@ bool ReplicatedPG::start_recovery_ops(
// this shouldn't happen!
// We already checked num_missing() so we must have missing replicas
osd->clog->error() << info.pgid << " recovery ending with missing replicas\n";
+ set<pg_shard_t>::const_iterator end = actingbackfill.end();
+ set<pg_shard_t>::const_iterator a = actingbackfill.begin();
+ for (; a != end; ++a) {
+ if (*a == get_primary()) continue;
+ pg_shard_t peer = *a;
+ map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
+ if (pm == peer_missing.end()) {
+ continue;
+ }
+ if (pm->second.num_missing()) {
+ dout(10) << __func__ << " osd." << peer << " has "
+ << pm->second.num_missing() << " missing: "
+ << pm->second << dendl;
+ }
+ }
return work_in_progress;
}
@@ -10740,7 +10779,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
const hobject_t soid(p->second);
if (cmp(soid, pi->second.last_backfill, get_sort_bitwise()) > 0) {
- if (!recovering.count(soid)) {
+ if (!recovering.count(soid) && !backfills_in_flight.count(soid)) {
derr << __func__ << ": object added to missing set for backfill, but "
<< "is not in recovering, error!" << dendl;
assert(0);
@@ -10908,6 +10947,29 @@ int ReplicatedPG::recover_backfill(
<< dendl;
}
+ bool trim = true;
+ bool retrying = !backfills_in_flight.empty();
+ if (retrying) {
+ /* If we had any errors, arrange for us to retain the information
+ about the range before the first failed object, and to retry
+ it. */
+ map<hobject_t,eversion_t,hobject_t::Comparator>::iterator p;
+ p = backfill_info.objects.find(*backfills_in_flight.begin());
+ if (p-- == backfill_info.objects.end() ||
+ p == backfill_info.objects.end()) {
+ last_backfill_started = *backfills_in_flight.begin();
+ trim = false;
+ dout(20) << "backfill retry: adjusting last_backfill_started to "
+ << last_backfill_started
+ << " and disabling trimming" << dendl;
+ } else {
+ last_backfill_started = MIN_HOBJ(last_backfill_started, p->first,
+ get_sort_bitwise());
+ dout(20) << "backfill retry: adjusting last_backfill_started to "
+ << last_backfill_started << dendl;
+ }
+ }
+
// update our local interval to cope with recent changes
backfill_info.begin = last_backfill_started;
update_range(&backfill_info, handle);
@@ -10918,18 +10980,19 @@ int ReplicatedPG::recover_backfill(
vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
set<hobject_t, hobject_t::BitwiseComparator> add_to_stat;
- for (set<pg_shard_t>::iterator i = backfill_targets.begin();
- i != backfill_targets.end();
- ++i) {
- peer_backfill_info[*i].trim_to(
- MAX_HOBJ(peer_info[*i].last_backfill, last_backfill_started,
- get_sort_bitwise()));
+ if (trim) {
+ dout(20) << "trimming backfill interval up to "
+ << last_backfill_started << dendl;
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ peer_backfill_info[*i].trim_to(
+ MAX_HOBJ(peer_info[*i].last_backfill, last_backfill_started,
+ get_sort_bitwise()));
+ }
+ backfill_info.trim_to(last_backfill_started);
}
- backfill_info.trim_to(last_backfill_started);
- hobject_t backfill_pos = MIN_HOBJ(backfill_info.begin,
- earliest_peer_backfill(),
- get_sort_bitwise());
while (ops < max) {
if (cmp(backfill_info.begin, earliest_peer_backfill(),
get_sort_bitwise()) <= 0 &&
@@ -10939,9 +11002,8 @@ int ReplicatedPG::recover_backfill(
backfill_info.end = hobject_t::get_max();
update_range(&backfill_info, handle);
backfill_info.trim();
+ dout(20) << "resetting and trimming backfill interval" << dendl;
}
- backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
- get_sort_bitwise());
dout(20) << " my backfill interval " << backfill_info << dendl;
@@ -10983,9 +11045,7 @@ int ReplicatedPG::recover_backfill(
// Get object within set of peers to operate on and
// the set of targets for which that object applies.
hobject_t check = earliest_peer_backfill();
-
if (cmp(check, backfill_info.begin, get_sort_bitwise()) < 0) {
-
set<pg_shard_t> check_targets;
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
i != backfill_targets.end();
@@ -11071,6 +11131,11 @@ int ReplicatedPG::recover_backfill(
to_push.push_back(
boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
(backfill_info.begin, obj_v, obc, all_push));
+ if (retrying && backfills_in_flight.count(backfill_info.begin))
+ dout(20) << " BACKFILL retrying " << backfill_info.begin
+ << " with locations "
+ << missing_loc.get_locations(backfill_info.begin)
+ << dendl;
// Count all simultaneous pushes of the same object as a single op
ops++;
} else {
@@ -11100,8 +11165,9 @@ int ReplicatedPG::recover_backfill(
}
}
}
- backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
- get_sort_bitwise());
+ hobject_t backfill_pos = MIN_HOBJ(backfill_info.begin,
+ earliest_peer_backfill(),
+ get_sort_bitwise());
for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = add_to_stat.begin();
i != add_to_stat.end();
@@ -11124,6 +11190,10 @@ int ReplicatedPG::recover_backfill(
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
for (unsigned i = 0; i < to_push.size(); ++i) {
handle.reset_tp_timeout();
+ if (retrying && backfills_in_flight.count(to_push[i].get<0>()))
+ dout(0) << "retrying backfill of " << to_push[i].get<0>()
+ << " with locations "
+ << missing_loc.get_locations(to_push[i].get<0>()) << dendl;
prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(),
to_push[i].get<2>(), to_push[i].get<3>(), h);
}