[tarantool-patches] Re: [PATCH v3 07/11] replication: rebootstrap instance on startup if it fell behind

  • From: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
  • To: Konstantin Osipov <kostja@xxxxxxxxxxxxx>
  • Date: Thu, 19 Jul 2018 13:04:39 +0300

On Thu, Jul 19, 2018 at 10:19:03AM +0300, Konstantin Osipov wrote:

* Vladimir Davydov <vdavydov.dev@xxxxxxxxx> [18/07/14 23:50]:

diff --git a/src/box/box.cc b/src/box/box.cc
index b629a4d8..baf30fce 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1797,6 +1797,9 @@ bootstrap(const struct tt_uuid *instance_uuid,
 /**
  * Recover the instance from the local directory.
  * Enter hot standby if the directory is locked.
+ * Invoke rebootstrap if the instance fell too much
+ * behind its peers in the replica set and needs
+ * to be rebootstrapped.
  */
 static void
 local_recovery(const struct tt_uuid *instance_uuid,
@@ -1832,6 +1835,12 @@ local_recovery(const struct tt_uuid *instance_uuid,
    if (wal_dir_lock >= 0) {
            box_listen();
            box_sync_replication(replication_connect_timeout, false);
+
+           struct replica *master;
+           if (replicaset_needs_rejoin(&master)) {
+                   say_info("replica is too old, initiating rejoin");
+                   return bootstrap_from_master(master);

This is say_crit() IMHO.

+bool
+replicaset_needs_rejoin(struct replica **master)
+{
+   replicaset_foreach(replica) {
+           /*
+            * Rebootstrap this instance from a master if:
+            * - the oldest vclock stored on the master is greater
+            *   than or incomparable with the instance vclock
+            *   (so that the instance can't follow the master) and
+            * - the instance is strictly behind the master (so
+            *   that we won't lose any data by rebootstrapping
+            *   this instance)
+            */
+           struct applier *applier = replica->applier;
+           if (applier != NULL &&
+               vclock_compare(&applier->remote_status.gc_vclock,
+                              &replicaset.vclock) > 0 &&
+               vclock_compare(&replicaset.vclock,
+                              &applier->remote_status.vclock) < 0) {
+                   *master = replica;
+                   return true;

I'd love to see a bit more clarity in the log about this decision
making process. Imagine this function returns 'false' because
vclocks are incomparable and then replication breaks - it would be
very hard to diagnose why this happened. You could add some
logging to this function, but this would change its contract, since
currently this function has no side effects.

Should it set the diagnostics area in case of error? Log the
error? Return an extra status code? Please feel free to choose the
option you think is best.

What about this?

: 2018-07-19 12:53:45.195 [19455] main/101/replica I> can't follow [::1]:47383: 
required {1: 8} available {1: 12}
: 2018-07-19 12:53:45.195 [19455] main/101/replica C> replica is too old, 
initiating rebootstrap
: 2018-07-19 12:53:45.195 [19455] main/101/replica I> bootstrapping replica 
from [::1]:47383

and

: 2018-07-19 12:53:46.546 [19493] main/101/replica I> can't follow [::1]:47383: 
required {1: 17, 2: 1} available {1: 20}
: 2018-07-19 12:53:46.546 [19493] main/101/replica I> can't rebootstrap from 
[::1]:47383: replica has local rows: local {1: 17, 2: 1} remote {1: 23}
: 2018-07-19 12:53:46.546 [19493] main/101/replica I> recovery start

The diff is below. Note, apart from logging I also changed the logic
of the rebootstrap trigger a bit: now we proceed to rebootstrap only
if none of the configured masters has rows needed by the instance.

I haven't pushed the patch to the branch yet. If you're okay with it,
I'll squash it into the original patch, rebase the whole series, and
then update the remote branch.

diff --git a/src/box/box.cc b/src/box/box.cc
index baf30fce..7d752c59 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1838,7 +1838,7 @@ local_recovery(const struct tt_uuid *instance_uuid,
 
                struct replica *master;
                if (replicaset_needs_rejoin(&master)) {
-                       say_info("replica is too old, initiating rejoin");
+                       say_crit("replica is too old, initiating rebootstrap");
                        return bootstrap_from_master(master);
                }
        }
diff --git a/src/box/replication.cc b/src/box/replication.cc
index d61a984f..3ea4f578 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -41,6 +41,7 @@
 #include "error.h"
 #include "relay.h"
 #include "vclock.h" /* VCLOCK_MAX */
+#include "sio.h"
 
 uint32_t instance_id = REPLICA_ID_NIL;
 struct tt_uuid INSTANCE_UUID;
@@ -628,28 +629,58 @@ error:
 bool
 replicaset_needs_rejoin(struct replica **master)
 {
+       struct replica *leader = NULL;
        replicaset_foreach(replica) {
-               /*
-                * Rebootstrap this instance from a master if:
-                * - the oldest vclock stored on the master is greater
-                *   than or incomparable with the instance vclock
-                *   (so that the instance can't follow the master) and
-                * - the instance is strictly behind the master (so
-                *   that we won't lose any data by rebootstrapping
-                *   this instance)
-                */
                struct applier *applier = replica->applier;
-               if (applier != NULL &&
-                   vclock_compare(&applier->remote_status.gc_vclock,
-                                  &replicaset.vclock) > 0 &&
-                   vclock_compare(&replicaset.vclock,
-                                  &applier->remote_status.vclock) < 0) {
-                       *master = replica;
-                       return true;
+               if (applier == NULL)
+                       continue;
+
+               const struct status *status = &applier->remote_status;
+               if (vclock_compare(&status->gc_vclock, &replicaset.vclock) < 0) 
{
+                       /*
+                        * There's at least one master that still stores
+                        * WALs needed by this instance. Proceed to local
+                        * recovery.
+                        */
+                       return false;
                }
+
+               const char *addr_str = sio_strfaddr(&applier->addr,
+                                               applier->addr_len);
+               char *local_vclock_str = vclock_to_string(&replicaset.vclock);
+               char *remote_vclock_str = vclock_to_string(&status->vclock);
+               char *gc_vclock_str = vclock_to_string(&status->gc_vclock);
+
+               say_info("can't follow %s: required %s available %s",
+                        addr_str, local_vclock_str, gc_vclock_str);
+
+               if (vclock_compare(&replicaset.vclock, &status->vclock) > 0) {
+                       /*
+                        * Replica has some rows that are not present on
+                        * the master. Don't rebootstrap as we don't want
+                        * to lose any data.
+                        */
+                       say_info("can't rebootstrap from %s: "
+                                "replica has local rows: local %s remote %s",
+                                addr_str, local_vclock_str, remote_vclock_str);
+                       goto next;
+               }
+
+               /* Prefer a master with the max vclock. */
+               if (leader == NULL ||
+                   vclock_sum(&applier->remote_status.vclock) >
+                   vclock_sum(&leader->applier->remote_status.vclock))
+                       leader = replica;
+next:
+               free(local_vclock_str);
+               free(remote_vclock_str);
+               free(gc_vclock_str);
        }
-       *master = NULL;
-       return false;
+       if (leader == NULL)
+               return false;
+
+       *master = leader;
+       return true;
 }
 
 void

Other related posts: