[tarantool-patches] [PATCH v2 3/3] replication: improve join/subscribe logging

  • From: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
  • To: kostja@xxxxxxxxxxxxx
  • Date: Fri, 15 Feb 2019 15:25:49 +0300

This patch makes logging a little bit more verbose so as to facilitate
post-mortem replication failure analysis:

 - When an instance starts up, print its uuid and vclock.
 - When a replica connects to a master, print the master's uuid to
   the replica's log.
 - Print replica uuid and address to the master's log on join/subscribe.
 - Log local and remote vclock on subscribe, both on master and on
   replica.
 - Log ids assigned to replicas both on recovery and when online.
   Also, log when a replica is deleted from the cluster.
---
 src/box/alter.cc       |  1 +
 src/box/applier.cc     | 17 +++++++++++++----
 src/box/box.cc         | 21 ++++++++++++++++++---
 src/box/replication.cc |  5 +++++
 4 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/box/alter.cc b/src/box/alter.cc
index de8372b3..0e340468 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -3017,6 +3017,7 @@ on_replace_dd_schema(struct trigger * /* trigger */, void 
*event)
                tt_uuid uu;
                tuple_field_uuid_xc(new_tuple, BOX_CLUSTER_FIELD_UUID, &uu);
                REPLICASET_UUID = uu;
+               say_info("cluster uuid %s", tt_uuid_str(&uu));
        }
 }
 
diff --git a/src/box/applier.cc b/src/box/applier.cc
index c09253be..e9addcb3 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -207,11 +207,12 @@ applier_connect(struct applier *applier)
        }
 
        if (applier->version_id != greeting.version_id) {
-               say_info("remote master is %u.%u.%u at %s",
+               say_info("remote master %s at %s running Tarantool %u.%u.%u",
+                        tt_uuid_str(&greeting.uuid),
+                        sio_strfaddr(&applier->addr, applier->addr_len),
                         version_id_major(greeting.version_id),
                         version_id_minor(greeting.version_id),
-                        version_id_patch(greeting.version_id),
-                        sio_strfaddr(&applier->addr, applier->addr_len));
+                        version_id_patch(greeting.version_id));
        }
 
        /* Save the remote instance version and UUID on connect. */
@@ -393,8 +394,11 @@ applier_subscribe(struct applier *applier)
        struct vclock remote_vclock_at_subscribe;
        struct tt_uuid cluster_id = uuid_nil;
 
+       struct vclock vclock;
+       vclock_create(&vclock);
+       vclock_copy(&vclock, &replicaset.vclock);
        xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID,
-                                &replicaset.vclock);
+                                &vclock);
        coio_write_xrow(coio, &row);
 
        /* Read SUBSCRIBE response */
@@ -429,6 +433,11 @@ applier_subscribe(struct applier *applier)
                                  tt_uuid_str(&cluster_id),
                                  tt_uuid_str(&REPLICASET_UUID));
                }
+
+               say_info("subscribed");
+               say_info("remote vclock %s local vclock %s",
+                        vclock_to_string(&remote_vclock_at_subscribe),
+                        vclock_to_string(&vclock));
        }
        /*
         * Tarantool < 1.6.7:
diff --git a/src/box/box.cc b/src/box/box.cc
index 8136cf33..058ba266 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1509,6 +1509,9 @@ box_process_join(struct ev_io *io, struct xrow_header 
*header)
        row.sync = header->sync;
        coio_write_xrow(io, &row);
 
+       say_info("joining replica %s at %s",
+                tt_uuid_str(&instance_uuid), sio_socketname(io->fd));
+
        /*
         * Initial stream: feed replica with dirty data from engines.
         */
@@ -1600,6 +1603,9 @@ box_process_subscribe(struct ev_io *io, struct 
xrow_header *header)
                          "wal_mode = 'none'");
        }
 
+       struct vclock vclock;
+       vclock_create(&vclock);
+       vclock_copy(&vclock, &replicaset.vclock);
        /*
         * Send a response to SUBSCRIBE request, tell
         * the replica how many rows we have in stock for it,
@@ -1612,9 +1618,7 @@ box_process_subscribe(struct ev_io *io, struct 
xrow_header *header)
         * the additional field.
         */
        struct xrow_header row;
-       xrow_encode_subscribe_response_xc(&row,
-                                         &REPLICASET_UUID,
-                                         &replicaset.vclock);
+       xrow_encode_subscribe_response_xc(&row, &REPLICASET_UUID, &vclock);
        /*
         * Identify the message with the replica id of this
         * instance, this is the only way for a replica to find
@@ -1626,6 +1630,11 @@ box_process_subscribe(struct ev_io *io, struct 
xrow_header *header)
        row.sync = header->sync;
        coio_write_xrow(io, &row);
 
+       say_info("subscribed replica %s at %s",
+                tt_uuid_str(&replica_uuid), sio_socketname(io->fd));
+       say_info("remote vclock %s local vclock %s",
+                vclock_to_string(&replica_clock), vclock_to_string(&vclock));
+
        /*
         * Process SUBSCRIBE request via replication relay
         * Send current recovery vector clock as a marker
@@ -1848,6 +1857,9 @@ bootstrap(const struct tt_uuid *instance_uuid,
                INSTANCE_UUID = *instance_uuid;
        else
                tt_uuid_create(&INSTANCE_UUID);
+
+       say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID));
+
        /*
         * Begin listening on the socket to enable
         * master-master replication leader election.
@@ -1905,6 +1917,8 @@ local_recovery(const struct tt_uuid *instance_uuid,
                          tt_uuid_str(&INSTANCE_UUID));
        }
 
+       say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID));
+
        struct wal_stream wal_stream;
        wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
 
@@ -1930,6 +1944,7 @@ local_recovery(const struct tt_uuid *instance_uuid,
         * not attempt to apply these rows twice.
         */
        recovery_scan(recovery, &replicaset.vclock, &gc.vclock);
+       say_info("instance vclock %s", vclock_to_string(&replicaset.vclock));
 
        if (wal_dir_lock >= 0) {
                box_listen();
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 19ad5026..e3ceb009 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -197,6 +197,9 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
                instance_id = replica_id;
        }
        replicaset.replica_by_id[replica_id] = replica;
+
+       say_info("assigned id %d to replica %s",
+                replica->id, tt_uuid_str(&replica->uuid));
 }
 
 void
@@ -214,6 +217,8 @@ replica_clear_id(struct replica *replica)
         */
        replicaset.replica_by_id[replica->id] = NULL;
        replica->id = REPLICA_ID_NIL;
+       say_info("removed replica %s", tt_uuid_str(&replica->uuid));
+
        /*
         * The replica will never resubscribe so we don't need to keep
         * WALs for it anymore. Unregister it with the garbage collector
-- 
2.11.0


Other related posts: