On Tue, Oct 02, 2018 at 01:12:49PM +0300, Olga Arkhangelskaia wrote:
Patch fixes behavior when replica tries to connect to the same master
more than once. In case when it is initial configuration we raise the
exception. If it in not initial config we print the error and disconnect
the applier.
Closes #3610
---
https://github.com/tarantool/tarantool/issues/3610 ;
https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10
diff --git a/src/box/box.h b/src/box/box.h
index 9930d4a1a..21c60e8f0 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -38,6 +38,9 @@
extern "C" {
#endif /* defined(__cplusplus) */
+/* Size of buffer for error message */
+#define MSG_SIZE 50
+
/*
* Box - data storage (spaces, indexes) and query
* processor (INSERT, UPDATE, DELETE, SELECT, Lua)
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 5755ad45e..f97f54d53 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -271,6 +271,8 @@ replica_on_applier_connect(struct replica *replica)
assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
replica->uuid = applier->uuid;
+ replica->applier_sync_state = APPLIER_CONNECTED;
+ replicaset.applier.connected++;
struct replica *orig = replica_hash_search(&replicaset.hash, replica);
if (orig != NULL && orig->applier != NULL) {
@@ -290,6 +292,8 @@ replica_on_applier_connect(struct replica *replica)
if (orig != NULL) {
/* Use existing struct replica */
+ assert(orig->applier_sync_state == APPLIER_DISCONNECTED);
+ orig->applier_sync_state = replica->applier_sync_state;
replica_set_applier(orig, applier);
replica_clear_applier(replica);
replica_delete(replica);
@@ -299,8 +303,6 @@ replica_on_applier_connect(struct replica *replica)
replica_hash_insert(&replicaset.hash, replica);
}
- replica->applier_sync_state = APPLIER_CONNECTED;
- replicaset.applier.connected++;
}
static void
@@ -427,6 +429,7 @@ replicaset_update(struct applier **appliers, int count)
auto uniq_guard = make_scoped_guard([&]{
replica_hash_foreach_safe(&uniq, replica, next) {
replica_hash_remove(&uniq, replica);
+ replica_clear_applier(replica);
replica_delete(replica);
}
});
@@ -454,6 +457,8 @@ replicaset_update(struct applier **appliers, int count)
replica->uuid = applier->uuid;
if (replica_hash_search(&uniq, replica) != NULL) {
+ replica_clear_applier(replica);
+ replica_delete(replica);
tnt_raise(ClientError, ER_CFG, "replication",
"duplicate connection to the same replica");
}
@@ -596,6 +601,9 @@ replicaset_connect(struct applier **appliers, int count,
double timeout = replication_connect_timeout;
int quorum = MIN(count, replication_connect_quorum);
+ /* Default error message in case if something will fail */
+ char err_msg [MSG_SIZE] = "failed to connect to one or more replicas";
+ char *msg = err_msg;
/* Add triggers and start simulations connection to remote peers */
for (int i = 0; i < count; i++) {
@@ -641,8 +649,17 @@ replicaset_connect(struct applier **appliers, int count,
}
/* Now all the appliers are connected, update the replica set. */
- replicaset_update(appliers, count);
- return;
+ try {
+ replicaset_update(appliers, count);
+ return;
+ } catch (Exception *e) {
+ /* If exception occurred in replicaset_update we need to rewrite
+ * error message.
+ */
+ char dupl_msg[MSG_SIZE] = "duplicate connection to the same
replica";
+ msg = dupl_msg;
+ goto error;
+ }
error:
/* Destroy appliers */
for (int i = 0; i < count; i++) {
@@ -651,8 +668,7 @@ error:
}
/* ignore original error */
- tnt_raise(ClientError, ER_CFG, "replication",
- "failed to connect to one or more replicas");
+ tnt_raise(ClientError, ER_CFG, "replication", msg);
}
bool
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 56e1bab69..5eb440837 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -161,5 +161,44 @@ _ = test_run:wait_vclock('replica_auth', vclock)
test_run:cmd("stop server replica_auth")
test_run:cmd("cleanup server replica_auth")
test_run:cmd("delete server replica_auth")
-
box.schema.user.drop('cluster')
+
+--
+-- Test case for gh-3610. Before the fix replica would fail with the
assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default,
script='replication/replica.lua'")
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+test_run:cmd('cleanup server replica')
+
+-- case when replica reconnects master with duplication in new configuration
+
+listen = box.cfg.listen
+test_run:cmd("switch replica")
+box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1}
+
+test_run:cmd("switch default")
+box.cfg{listen = ''}
+
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+box.cfg{listen = listen}
+fiber.sleep(1)
+test_run:cmd("switch replica")
+
+test_run:cmd("switch default")
+test_run:grep_log('replica', 'duplicate connection')
+
+box.schema.user.revoke('guest', 'replication')
+test_run:cmd("stop server replica")
+test_run:cmd('cleanup server replica')
+test_run:cmd("delete server replica")
diff --git a/test/replication/replica_dupl.lua
b/test/replication/replica_dupl.lua
new file mode 100644
index 000000000..29e94551d