Skip to content

Commit 149ccce

Browse files
committed
vinyl: skip vylog if it's newer than snap
When data is stored in different engines, the checkpoint process is handled this way: - wait_checkpoint memtx - wait_checkpoint vinyl - commit_checkpoint memtx - commit_checkpoint vinyl. In contrast to commit_checkpoint, which does not tolerate failures (if something goes wrong, e.g. while renaming the snapshot file, the instance simply crashes), wait_checkpoint may fail. As a part of wait_checkpoint for the vinyl engine, vy_log rotation takes place: the old vy_log is closed and a new one is created. At this moment, wait_checkpoint of the memtx engine has already created a new *inprogress* snapshot featuring a bumped vclock. While recovering from this configuration, the vclock of the latest snapshot is used as a reference. At the initial recovery stage (vinyl_engine_begin_initial_recovery), we check that the snapshot's vclock matches the vylog's one (they should be the same, since normally the vylog is rotated along with the snapshot). However, if the instance crashes before the new snapshot is committed, the directory contains the old snapshot and the new vylog (and the new .inprogress snapshot). In such a situation recovery (even in force mode) was aborted. The only way out of this dead end was for the user to manually delete the last vy_log file. Let's apply the same resolution automatically when the user runs in force_recovery mode: delete the last vy_log file and update the vclock value. If the user runs regular (non-force) recovery, let's print a verbose message explaining how to fix this situation manually. Closes tarantool#5823
1 parent a240e01 commit 149ccce

File tree

9 files changed

+166
-5
lines changed

9 files changed

+166
-5
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## bugfix/build
2+
3+
* Make recovery with the force_recovery option delete vylog files that are
4+
newer than the snapshot, so that an instance can recover after incidents during a checkpoint (gh-5823).

src/box/memtx_engine.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,7 @@ static void
759759
memtx_engine_commit_checkpoint(struct engine *engine,
760760
const struct vclock *vclock)
761761
{
762+
ERROR_INJECT_TERMINATE(ERRINJ_SNAP_COMMIT_FAIL);
762763
(void) vclock;
763764
struct memtx_engine *memtx = (struct memtx_engine *)engine;
764765

src/box/vy_log.c

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,7 +1030,6 @@ struct vy_recovery *
10301030
vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery)
10311031
{
10321032
assert(vy_log.recovery == NULL);
1033-
(void) force_recovery;
10341033

10351034
/*
10361035
* Do not fail recovery if vinyl directory does not exist,
@@ -1057,11 +1056,36 @@ vy_log_begin_recovery(const struct vclock *vclock, bool force_recovery)
10571056
/*
10581057
* Last vy_log log is newer than the last snapshot.
10591058
* This can't normally happen, as vy_log is rotated
1060-
* after snapshot is created. Looks like somebody
1059+
* in a short gap between checkpoint wait and commit.
1060+
* However, if memtx for some reason fails to commit its
1061+
* changes, instance will crash leaving .inprogress snap
1062+
* and corresponding (already rotated) vylog.
1063+
* Another and simpler reason is the case when somebody
10611064
* deleted snap file, but forgot to delete vy_log.
1065+
* So in case we are anyway in force recovery mode, let's
1066+
* try to delete last .vylog file and continue recovery process.
10621067
*/
1063-
diag_set(ClientError, ER_MISSING_SNAPSHOT);
1064-
return NULL;
1068+
if (!force_recovery) {
1069+
diag_set(ClientError, ER_MISSING_SNAPSHOT);
1070+
say_info("To bootstrap instance try to remove last "
1071+
".vylog file or run in force_recovery mode");
1072+
return NULL;
1073+
}
1074+
if (xdir_remove_file_by_vclock(&vy_log.dir,
1075+
&vy_log.last_checkpoint) != 0) {
1076+
say_info(".vylog is newer than snapshot. Failed to "
1077+
"remove it. Try to delete last .vylog "
1078+
"manually");
1079+
return NULL;
1080+
}
1081+
const struct vclock *prev_checkpoint =
1082+
vy_log_prev_checkpoint(&vy_log.last_checkpoint);
1083+
if (prev_checkpoint == NULL) {
1084+
say_info("Can't find previous vylog");
1085+
return NULL;
1086+
}
1087+
vclock_copy(&vy_log.last_checkpoint, prev_checkpoint);
1088+
assert(vclock_compare(&vy_log.last_checkpoint, vclock) == 0);
10651089
}
10661090
if (cmp < 0) {
10671091
/*

src/lib/core/errinj.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ struct errinj {
150150
_(ERRINJ_COIO_WRITE_CHUNK, ERRINJ_BOOL, {.bparam = false}) \
151151
_(ERRINJ_APPLIER_SLOW_ACK, ERRINJ_BOOL, {.bparam = false}) \
152152
_(ERRINJ_STDIN_ISATTY, ERRINJ_INT, {.iparam = -1}) \
153+
_(ERRINJ_SNAP_COMMIT_FAIL, ERRINJ_BOOL, {.bparam = false}) \
153154

154155
ENUM0(errinj_id, ERRINJ_LIST);
155156
extern struct errinj errinjs[];

test/box/errinj.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ evals
7272
- ERRINJ_REPLICA_JOIN_DELAY: false
7373
- ERRINJ_SIO_READ_MAX: -1
7474
- ERRINJ_SNAP_COMMIT_DELAY: false
75+
- ERRINJ_SNAP_COMMIT_FAIL: false
7576
- ERRINJ_SNAP_WRITE_DELAY: false
7677
- ERRINJ_SQL_NAME_NORMALIZATION: false
7778
- ERRINJ_STDIN_ISATTY: -1

test/vinyl/gh-5823-crash_snapshot.lua

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env tarantool
2+
3+
--
4+
-- mode == 0: casual bootstrap;
5+
-- mode == 1: force recovery bootstrap;
6+
-- mode == 2: casual bootstrap and fill in data.
7+
--
8+
local mode = tonumber(arg[1])
9+
box.cfg ({
10+
force_recovery = (mode == 1),
11+
})
12+
13+
if mode == 2 then
14+
local v = box.schema.space.create('test_v', {engine = 'vinyl'})
15+
v:create_index('pk')
16+
local m = box.schema.space.create('test_m')
17+
m:create_index('pk')
18+
local str = string.rep('!', 100)
19+
for i = 1,10 do v:insert{i, str} end
20+
for i = 1,10 do m:insert{i, str} end
21+
box.error.injection.set("ERRINJ_SNAP_COMMIT_FAIL", true);
22+
box.snapshot()
23+
end
24+
25+
require('console').listen(os.getenv('ADMIN'))
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
-- test-run result file version 2
2+
test_run = require('test_run').new()
3+
| ---
4+
| ...
5+
6+
-- Test is about following scenario:
7+
-- 1. There's both memtx and vinyl data;
8+
-- 2. User starts checkpoint process;
9+
-- 3. In the most unsuitable moment instance crashes;
10+
-- 4. Recovering in the casual mode does not help;
11+
-- 5. Recovering in the force recovery mode solves the problem (deletes
12+
-- redundant vylog file).
13+
--
14+
test_run:cmd("create server test with script='vinyl/gh-5823-crash_snapshot.lua'")
15+
| ---
16+
| - true
17+
| ...
18+
test_run:cmd("start server test with args='2' with crash_expected=True")
19+
| ---
20+
| - false
21+
| ...
22+
-- Can't bootstrap instance without force_recovery.
23+
--
24+
test_run:cmd("start server test with args='0' with crash_expected=True")
25+
| ---
26+
| - false
27+
| ...
28+
29+
fio = require('fio')
30+
| ---
31+
| ...
32+
fh = fio.open(fio.pathjoin(fio.cwd(), 'gh-5823-crash_snapshot.log'), {'O_RDONLY'})
33+
| ---
34+
| ...
35+
size = fh:seek(0, 'SEEK_END')
36+
| ---
37+
| ...
38+
fh:seek(-256, 'SEEK_END') ~= nil
39+
| ---
40+
| - true
41+
| ...
42+
line = fh:read(256)
43+
| ---
44+
| ...
45+
fh:close()
46+
| ---
47+
| - true
48+
| ...
49+
string.match(line, "Can\'t find snapshot") ~= nil
50+
| ---
51+
| - true
52+
| ...
53+
54+
test_run:cmd("start server test with args='1'")
55+
| ---
56+
| - true
57+
| ...
58+
test_run:cmd("switch test")
59+
| ---
60+
| - true
61+
| ...
62+
box.space.test_v:select({5})
63+
| ---
64+
| - - [5, '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!']
65+
| ...
66+
test_run:cmd("switch default")
67+
| ---
68+
| - true
69+
| ...
70+
test_run:cmd("stop server test")
71+
| ---
72+
| - true
73+
| ...
74+
test_run:cmd("cleanup server test")
75+
| ---
76+
| - true
77+
| ...
78+
test_run:cmd("delete server test")
79+
| ---
80+
| - true
81+
| ...
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
test_run = require('test_run').new()
2+
3+
-- Test is about following scenario:
4+
-- 1. There's both memtx and vinyl data;
5+
-- 2. User starts checkpoint process;
6+
-- 3. In the most unsuitable moment instance crashes;
7+
-- 4. Recovering in the casual mode does not help;
8+
-- 5. Recovering in the force recovery mode solves the problem (deletes
9+
-- redundant vylog file).
10+
--
11+
test_run:cmd("create server test with script='vinyl/gh-5823-crash_snapshot.lua'")
12+
test_run:cmd("start server test with args='2' with crash_expected=True")
13+
-- Can't bootstrap instance without force_recovery.
14+
--
15+
test_run:cmd("start server test with args='0' with crash_expected=True")
16+
test_run:grep_log('test', "Can\'t find snapshot", nil, {filename='gh-5823-crash_snapshot.log'}) ~= nil
17+
18+
test_run:cmd("start server test with args='1'")
19+
test_run:cmd("switch test")
20+
box.space.test_v:select({5})
21+
test_run:cmd("switch default")
22+
test_run:cmd("stop server test")
23+
test_run:cmd("cleanup server test")
24+
test_run:cmd("delete server test")

test/vinyl/suite.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
core = tarantool
33
description = vinyl integration tests
44
script = vinyl.lua
5-
release_disabled = errinj.test.lua errinj_ddl.test.lua errinj_gc.test.lua errinj_stat.test.lua errinj_tx.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua replica_rejoin.test.lua gh-4864-stmt-alloc-fail-compact.test.lua gh-4805-open-run-err-recovery.test.lua gh-4821-ddl-during-throttled-dump.test.lua gh-3395-read-prepared-uncommitted.test.lua
5+
release_disabled = errinj.test.lua errinj_ddl.test.lua errinj_gc.test.lua errinj_stat.test.lua errinj_tx.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua replica_rejoin.test.lua gh-4864-stmt-alloc-fail-compact.test.lua gh-4805-open-run-err-recovery.test.lua gh-4821-ddl-during-throttled-dump.test.lua gh-3395-read-prepared-uncommitted.test.lua gh-5823-skip-newer-than-snap-vylog.test.lua
66
config = suite.cfg
77
lua_libs = suite.lua stress.lua large.lua ../box/lua/txn_proxy.lua ../box/lua/utils.lua
88
use_unix_sockets = True

0 commit comments

Comments
 (0)