[tarantool-patches] [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples

  • From: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
  • To: kostja@xxxxxxxxxxxxx
  • Date: Fri, 6 Apr 2018 16:15:31 +0300

When the last index of a memtx space is dropped, we need to delete all
tuples stored in the space. We do it in space_vtab::commit_alter, but
this is wrong, because this function is not rolled back and so we may
get use-after-free error if we fail to write a DDL operation to WAL.
To avoid that, let's delete tuples in index_vtab::commit_drop, which is
called after WAL write.

There's a nuance here: index_vtab::commit_drop is called if an index is
rebuilt (because essentially it is drop + create) so we must elevate the
reference counter of every tuple added to the new index during rebuild
and, respectively, drop all the references in index_vtab::abort_create,
which is called if index creation is aborted for some reason. This also
means that now we iterate over all tuples twice when a primary key is
rebuilt - first to build the new index, then to unreference all tuples
stored in the old index. This is OK as we can make the last step
asynchronous, which will also speed up the more common case of space
drop.

Closes #3289
---
 src/box/memtx_bitset.c   |  4 +--
 src/box/memtx_engine.c   | 47 ++++++++++++++++++++++++++++++++++
 src/box/memtx_engine.h   | 10 ++++++++
 src/box/memtx_hash.c     |  4 +--
 src/box/memtx_rtree.c    |  4 +--
 src/box/memtx_space.c    | 45 +++++----------------------------
 src/box/memtx_tree.c     |  4 +--
 test/box/errinj.result   | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 test/box/errinj.test.lua | 17 +++++++++++++
 9 files changed, 155 insertions(+), 46 deletions(-)

diff --git a/src/box/memtx_bitset.c b/src/box/memtx_bitset.c
index e66247ee..f67405ec 100644
--- a/src/box/memtx_bitset.c
+++ b/src/box/memtx_bitset.c
@@ -457,9 +457,9 @@ memtx_bitset_index_count(struct index *base, enum 
iterator_type type,
 static const struct index_vtab memtx_bitset_index_vtab = {
        /* .destroy = */ memtx_bitset_index_destroy,
        /* .commit_create = */ generic_index_commit_create,
-       /* .abort_create = */ generic_index_abort_create,
+       /* .abort_create = */ memtx_index_abort_create,
        /* .commit_modify = */ generic_index_commit_modify,
-       /* .commit_drop = */ generic_index_commit_drop,
+       /* .commit_drop = */ memtx_index_commit_drop,
        /* .update_def = */ generic_index_update_def,
        /* .depends_on_pk = */ generic_index_depends_on_pk,
        /* .size = */ memtx_bitset_index_size,
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index e60e34d0..f7e21a69 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -1062,3 +1062,50 @@ memtx_index_extent_reserve(int num)
        }
        return 0;
 }
+
+static void
+memtx_index_free_tuples(struct index *index)
+{
+       if (index->def->iid > 0)
+               return;
+
+       /*
+        * Tuples stored in a memtx space are referenced by the
+        * primary index so when the primary index is dropped we
+        * should delete them.
+        */
+       struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
+       if (it == NULL)
+               goto fail;
+       int rc;
+       struct tuple *tuple;
+       while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
+               tuple_unref(tuple);
+       iterator_delete(it);
+       if (rc != 0)
+               goto fail;
+
+       return;
+fail:
+       /*
+        * This function is called after WAL write so we have no
+        * other choice but panic in case of any error. The good
+        * news is memtx iterators do not fail so we should not
+        * normally get here.
+        */
+       diag_log();
+       unreachable();
+       panic("failed to drop index");
+}
+
+void
+memtx_index_abort_create(struct index *index)
+{
+       memtx_index_free_tuples(index);
+}
+
+void
+memtx_index_commit_drop(struct index *index)
+{
+       memtx_index_free_tuples(index);
+}
diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h
index b64c0ca3..e71045f0 100644
--- a/src/box/memtx_engine.h
+++ b/src/box/memtx_engine.h
@@ -42,6 +42,8 @@
 extern "C" {
 #endif /* defined(__cplusplus) */
 
+struct index;
+
 /**
  * The state of memtx recovery process.
  * There is a global state of the entire engine state of each
@@ -149,6 +151,14 @@ memtx_index_extent_free(void *ctx, void *extent);
 int
 memtx_index_extent_reserve(int num);
 
+/*
+ * The following two methods are used by all kinds of memtx indexes
+ * to delete tuples stored in the space when the primary index is
+ * destroyed.
+ */
+void memtx_index_abort_create(struct index *index);
+void memtx_index_commit_drop(struct index *index);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/src/box/memtx_hash.c b/src/box/memtx_hash.c
index c4081edc..38027a3b 100644
--- a/src/box/memtx_hash.c
+++ b/src/box/memtx_hash.c
@@ -381,9 +381,9 @@ memtx_hash_index_create_snapshot_iterator(struct index 
*base)
 static const struct index_vtab memtx_hash_index_vtab = {
        /* .destroy = */ memtx_hash_index_destroy,
        /* .commit_create = */ generic_index_commit_create,
-       /* .abort_create = */ generic_index_abort_create,
+       /* .abort_create = */ memtx_index_abort_create,
        /* .commit_modify = */ generic_index_commit_modify,
-       /* .commit_drop = */ generic_index_commit_drop,
+       /* .commit_drop = */ memtx_index_commit_drop,
        /* .update_def = */ memtx_hash_index_update_def,
        /* .depends_on_pk = */ generic_index_depends_on_pk,
        /* .size = */ memtx_hash_index_size,
diff --git a/src/box/memtx_rtree.c b/src/box/memtx_rtree.c
index ff213922..d0dceaea 100644
--- a/src/box/memtx_rtree.c
+++ b/src/box/memtx_rtree.c
@@ -288,9 +288,9 @@ memtx_rtree_index_create_iterator(struct index *base,  enum 
iterator_type type,
 static const struct index_vtab memtx_rtree_index_vtab = {
        /* .destroy = */ memtx_rtree_index_destroy,
        /* .commit_create = */ generic_index_commit_create,
-       /* .abort_create = */ generic_index_abort_create,
+       /* .abort_create = */ memtx_index_abort_create,
        /* .commit_modify = */ generic_index_commit_modify,
-       /* .commit_drop = */ generic_index_commit_drop,
+       /* .commit_drop = */ memtx_index_commit_drop,
        /* .update_def = */ generic_index_update_def,
        /* .depends_on_pk = */ generic_index_depends_on_pk,
        /* .size = */ memtx_rtree_index_size,
diff --git a/src/box/memtx_space.c b/src/box/memtx_space.c
index ec6f6db6..ebb54f05 100644
--- a/src/box/memtx_space.c
+++ b/src/box/memtx_space.c
@@ -803,41 +803,17 @@ memtx_space_build_secondary_key(struct space *old_space,
                        break;
                assert(old_tuple == NULL); /* Guaranteed by DUP_INSERT. */
                (void) old_tuple;
+               /*
+                * All tuples stored in a memtx space must be
+                * referenced by the primary index.
+                */
+               if (new_index->def->iid == 0)
+                       tuple_ref(tuple);
        }
        iterator_delete(it);
        return rc;
 }
 
-static void
-memtx_space_prune(struct space *space)
-{
-       struct index *index = space_index(space, 0);
-       if (index == NULL)
-               return;
-
-       struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
-       if (it == NULL)
-               goto fail;
-       int rc;
-       struct tuple *tuple;
-       while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
-               tuple_unref(tuple);
-       iterator_delete(it);
-       if (rc == 0)
-               return;
-fail:
-       /*
-        * This function is called from space_vtab::commit_alter()
-        * or commit_truncate(), which do not tolerate failures,
-        * so we have no other choice but panic here. Good news is
-        * memtx iterators do not fail so we should not normally
-        * get here.
-        */
-       diag_log();
-       unreachable();
-       panic("failed to prune space");
-}
-
 static int
 memtx_space_prepare_alter(struct space *old_space, struct space *new_space)
 {
@@ -857,14 +833,7 @@ memtx_space_commit_alter(struct space *old_space, struct 
space *new_space)
        struct memtx_space *new_memtx_space = (struct memtx_space *)new_space;
        bool is_empty = new_space->index_count == 0 ||
                        index_size(new_space->index[0]) == 0;
-
-       /*
-        * Delete all tuples when the last index is dropped
-        * or the space is truncated.
-        */
-       if (is_empty)
-               memtx_space_prune(old_space);
-       else
+       if (!is_empty)
                new_memtx_space->bsize = old_memtx_space->bsize;
 }
 
diff --git a/src/box/memtx_tree.c b/src/box/memtx_tree.c
index a06b590d..22177f57 100644
--- a/src/box/memtx_tree.c
+++ b/src/box/memtx_tree.c
@@ -590,9 +590,9 @@ memtx_tree_index_create_snapshot_iterator(struct index 
*base)
 static const struct index_vtab memtx_tree_index_vtab = {
        /* .destroy = */ memtx_tree_index_destroy,
        /* .commit_create = */ generic_index_commit_create,
-       /* .abort_create = */ generic_index_abort_create,
+       /* .abort_create = */ memtx_index_abort_create,
        /* .commit_modify = */ generic_index_commit_modify,
-       /* .commit_drop = */ generic_index_commit_drop,
+       /* .commit_drop = */ memtx_index_commit_drop,
        /* .update_def = */ memtx_tree_index_update_def,
        /* .depends_on_pk = */ memtx_tree_index_depends_on_pk,
        /* .size = */ memtx_tree_index_size,
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8e4d5742..269de663 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -1093,3 +1093,69 @@ s:drop()
 box.schema.user.revoke('guest', 'read,write,execute','universe')
 ---
 ...
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+---
+...
+_ = s:create_index('pk')
+---
+...
+for i = 1, 10 do s:replace{i} end
+---
+...
+errinj.set('ERRINJ_WAL_IO', true)
+---
+- ok
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+errinj.set('ERRINJ_WAL_IO', false)
+---
+- ok
+...
+for i = 1, 10 do s:replace{i + 10} end
+---
+...
+s:select()
+---
+- - [1]
+  - [2]
+  - [3]
+  - [4]
+  - [5]
+  - [6]
+  - [7]
+  - [8]
+  - [9]
+  - [10]
+  - [11]
+  - [12]
+  - [13]
+  - [14]
+  - [15]
+  - [16]
+  - [17]
+  - [18]
+  - [19]
+  - [20]
+...
+s:drop()
+---
+...
diff --git a/test/box/errinj.test.lua b/test/box/errinj.test.lua
index d97cd81f..2d4b210d 100644
--- a/test/box/errinj.test.lua
+++ b/test/box/errinj.test.lua
@@ -366,3 +366,20 @@ errinj.set("ERRINJ_IPROTO_TX_DELAY", false)
 
 s:drop()
 box.schema.user.revoke('guest', 'read,write,execute','universe')
+
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+_ = s:create_index('pk')
+for i = 1, 10 do s:replace{i} end
+errinj.set('ERRINJ_WAL_IO', true)
+s:drop()
+s:truncate()
+s:drop()
+s:truncate()
+errinj.set('ERRINJ_WAL_IO', false)
+for i = 1, 10 do s:replace{i + 10} end
+s:select()
+s:drop()
-- 
2.11.0


Other related posts: