From c2c4b4a3736c0e9f69ecca95539f1c8a21282304 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 15:31:55 +1000 Subject: [PATCH 1/9] ctdb-tests: Reformat node_has_status() Re-indent and drop non-POSIX left-parenthesis from case labels. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 52227d19735a3305ad633672c70385f443f222f0) --- ctdb/tests/scripts/integration.bash | 94 +++++++++++++++-------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash index 30725c48e53..e2f238d93d4 100644 --- a/ctdb/tests/scripts/integration.bash +++ b/ctdb/tests/scripts/integration.bash @@ -311,53 +311,55 @@ wait_until_ready () # This function is becoming nicely overloaded. Soon it will collapse! :-) node_has_status () { - local pnn="$1" - local status="$2" - - local bits fpat mpat rpat - case "$status" in - (unhealthy) bits="?|?|?|1|*" ;; - (healthy) bits="?|?|?|0|*" ;; - (disconnected) bits="1|*" ;; - (connected) bits="0|*" ;; - (banned) bits="?|1|*" ;; - (unbanned) bits="?|0|*" ;; - (disabled) bits="?|?|1|*" ;; - (enabled) bits="?|?|0|*" ;; - (stopped) bits="?|?|?|?|1|*" ;; - (notstopped) bits="?|?|?|?|0|*" ;; - (frozen) fpat='^[[:space:]]+frozen[[:space:]]+1$' ;; - (unfrozen) fpat='^[[:space:]]+frozen[[:space:]]+0$' ;; - (recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;; - (notlmaster) rpat="^hash:.* lmaster:${pnn}\$" ;; + local pnn="$1" + local status="$2" + + local bits fpat mpat rpat + case "$status" in + unhealthy) bits="?|?|?|1|*" ;; + healthy) bits="?|?|?|0|*" ;; + disconnected) bits="1|*" ;; + connected) bits="0|*" ;; + banned) bits="?|1|*" ;; + unbanned) bits="?|0|*" ;; + disabled) bits="?|?|1|*" ;; + enabled) bits="?|?|0|*" ;; + stopped) bits="?|?|?|?|1|*" ;; + notstopped) bits="?|?|?|?|0|*" ;; + frozen) fpat='^[[:space:]]+frozen[[:space:]]+1$' ;; + unfrozen) fpat='^[[:space:]]+frozen[[:space:]]+0$' ;; + recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;; + notlmaster) rpat="^hash:.* lmaster:${pnn}\$" ;; *) - echo "node_has_status: unknown status \"$status\"" - return 1 - esac - - if [ -n "$bits" ] ; then - local out x line - - out=$($CTDB -X status 2>&1) || return 1 - - { - read x - while read line ; do - # This needs to be done in 2 steps to avoid false matches. - local line_bits="${line#|${pnn}|*|}" - [ "$line_bits" = "$line" ] && continue - [ "${line_bits#${bits}}" != "$line_bits" ] && return 0 - done - return 1 - } <<<"$out" # Yay bash! - elif [ -n "$fpat" ] ; then - $CTDB statistics -n "$pnn" | egrep -q "$fpat" - elif [ -n "$rpat" ] ; then - ! $CTDB status -n "$pnn" | egrep -q "$rpat" - else - echo 'node_has_status: unknown mode, neither $bits nor $fpat is set' - return 1 - fi + echo "node_has_status: unknown status \"$status\"" + return 1 + esac + + if [ -n "$bits" ] ; then + local out x line + + out=$($CTDB -X status 2>&1) || return 1 + + { + read x + while read line ; do + # This needs to be done in 2 steps to + # avoid false matches. + local line_bits="${line#|${pnn}|*|}" + [ "$line_bits" = "$line" ] && continue + [ "${line_bits#${bits}}" != "$line_bits" ] && \ + return 0 + done + return 1 + } <<<"$out" # Yay bash! + elif [ -n "$fpat" ] ; then + $CTDB statistics -n "$pnn" | egrep -q "$fpat" + elif [ -n "$rpat" ] ; then + ! $CTDB status -n "$pnn" | egrep -q "$rpat" + else + echo 'node_has_status: unknown mode, neither $bits nor $fpat is set' + return 1 + fi } wait_until_node_has_status () -- 2.23.0.rc1 From 7d6236e19bfa313bf5456ba9fcc36f270623c217 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 15:40:16 +1000 Subject: [PATCH 2/9] ctdb-tests: Drop unused node statuses frozen/unfrozen Silently drop unused local variable mpat. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 9b09a87326af28877301ad27bcec5bb13744e2b6) --- ctdb/tests/scripts/integration.bash | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash index e2f238d93d4..60323e33a97 100644 --- a/ctdb/tests/scripts/integration.bash +++ b/ctdb/tests/scripts/integration.bash @@ -314,7 +314,7 @@ node_has_status () local pnn="$1" local status="$2" - local bits fpat mpat rpat + local bits rpat case "$status" in unhealthy) bits="?|?|?|1|*" ;; healthy) bits="?|?|?|0|*" ;; @@ -326,8 +326,6 @@ node_has_status () enabled) bits="?|?|0|*" ;; stopped) bits="?|?|?|?|1|*" ;; notstopped) bits="?|?|?|?|0|*" ;; - frozen) fpat='^[[:space:]]+frozen[[:space:]]+1$' ;; - unfrozen) fpat='^[[:space:]]+frozen[[:space:]]+0$' ;; recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;; notlmaster) rpat="^hash:.* lmaster:${pnn}\$" ;; *) @@ -352,12 +350,10 @@ node_has_status () done return 1 } <<<"$out" # Yay bash! - elif [ -n "$fpat" ] ; then - $CTDB statistics -n "$pnn" | egrep -q "$fpat" elif [ -n "$rpat" ] ; then ! $CTDB status -n "$pnn" | egrep -q "$rpat" else - echo 'node_has_status: unknown mode, neither $bits nor $fpat is set' + echo 'node_has_status: unknown mode, neither $bits nor $rpat is set' return 1 fi } -- 2.23.0.rc1 From f922a1f5d47f9b10aba9c7e9210dbc2b9f777cf5 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 15:45:41 +1000 Subject: [PATCH 3/9] ctdb-tests: Inline handling of recovered and notlmaster statuses BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit bb59073515ee5f7886b5d9a20d7b2805857c2708) --- ctdb/tests/scripts/integration.bash | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash index 60323e33a97..d8411edc588 100644 --- a/ctdb/tests/scripts/integration.bash +++ b/ctdb/tests/scripts/integration.bash @@ -314,7 +314,7 @@ node_has_status () local pnn="$1" local status="$2" - local bits rpat + local bits case "$status" in unhealthy) bits="?|?|?|1|*" ;; healthy) bits="?|?|?|0|*" ;; @@ -326,8 +326,16 @@ node_has_status () enabled) bits="?|?|0|*" ;; stopped) bits="?|?|?|?|1|*" ;; notstopped) bits="?|?|?|?|0|*" ;; - recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;; - notlmaster) rpat="^hash:.* lmaster:${pnn}\$" ;; + recovered) + ! $CTDB status -n "$pnn" | \ + grep -Eq '^Recovery mode:RECOVERY \(1\)$' + return + ;; + notlmaster) + ! $CTDB status -n "$pnn" | \ + grep -Eq "^hash:.* lmaster:${pnn}\$" + return + ;; *) echo "node_has_status: unknown status \"$status\"" return 1 @@ -350,10 +358,8 @@ node_has_status () done return 1 } <<<"$out" # Yay bash! - elif [ -n "$rpat" ] ; then - ! $CTDB status -n "$pnn" | egrep -q "$rpat" else - echo 'node_has_status: unknown mode, neither $bits nor $rpat is set' + echo 'node_has_status: unknown mode, $bits not set' return 1 fi } -- 2.23.0.rc1 From 2d98af8f0d8e81d8fa38c9496682086496da2194 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 16:43:09 +1000 Subject: [PATCH 4/9] ctdb-tests: Handle special cases first and return All the other cases involve matching bits. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit bff1a3a548a2cace997b767d78bb824438664cb7) --- ctdb/tests/scripts/integration.bash | 59 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash index d8411edc588..3fb6f9ade5e 100644 --- a/ctdb/tests/scripts/integration.bash +++ b/ctdb/tests/scripts/integration.bash @@ -314,6 +314,19 @@ node_has_status () local pnn="$1" local status="$2" + case "$status" in + recovered) + ! $CTDB status -n "$pnn" | \ + grep -Eq '^Recovery mode:RECOVERY \(1\)$' + return + ;; + notlmaster) + ! $CTDB status -n "$pnn" | \ + grep -Eq "^hash:.* lmaster:${pnn}\$" + return + ;; + esac + local bits case "$status" in unhealthy) bits="?|?|?|1|*" ;; @@ -326,42 +339,26 @@ node_has_status () enabled) bits="?|?|0|*" ;; stopped) bits="?|?|?|?|1|*" ;; notstopped) bits="?|?|?|?|0|*" ;; - recovered) - ! $CTDB status -n "$pnn" | \ - grep -Eq '^Recovery mode:RECOVERY \(1\)$' - return - ;; - notlmaster) - ! $CTDB status -n "$pnn" | \ - grep -Eq "^hash:.* lmaster:${pnn}\$" - return - ;; *) echo "node_has_status: unknown status \"$status\"" return 1 esac - - if [ -n "$bits" ] ; then - local out x line - - out=$($CTDB -X status 2>&1) || return 1 - - { - read x - while read line ; do - # This needs to be done in 2 steps to - # avoid false matches. - local line_bits="${line#|${pnn}|*|}" - [ "$line_bits" = "$line" ] && continue - [ "${line_bits#${bits}}" != "$line_bits" ] && \ - return 0 - done - return 1 - } <<<"$out" # Yay bash! - else - echo 'node_has_status: unknown mode, $bits not set' + local out x line + + out=$($CTDB -X status 2>&1) || return 1 + + { + read x + while read line ; do + # This needs to be done in 2 steps to + # avoid false matches. + local line_bits="${line#|${pnn}|*|}" + [ "$line_bits" = "$line" ] && continue + [ "${line_bits#${bits}}" != "$line_bits" ] && \ + return 0 + done return 1 - fi + } <<<"$out" # Yay bash! } wait_until_node_has_status () -- 2.23.0.rc1 From 3e5d5358aa6cfecf91fbb4e5b60287738980b476 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 16:45:07 +1000 Subject: [PATCH 5/9] ctdb-tests: Don't retrieve the VNN map from target node for notlmaster Use the VNN map from the node running node_has_status(). This means that wait_until_node_has_status 1 notlmaster 10 0 will run "ctdb status" on node 0 and check (for up to 10 seconds) if node 1 is in the VNN map. If the LMASTER capability has been dropped on node 1 then the above will wait for the VNN map to be updated on node 0. This will happen as part of the recovery that is triggered by the change of LMASTER capability. The next command will then only be able to attach to $TESTDB after the recovery is complete thus guaranteeing a sane state for the test to continue. This stops simple/79_volatile_db_traverse.sh from going into recovery during the traverse or at some other inconvenient time. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 53daeb2f878af1634a26e05cb86d87e2faf20173) --- ctdb/tests/scripts/integration.bash | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash index 3fb6f9ade5e..a4d45fb9ac2 100644 --- a/ctdb/tests/scripts/integration.bash +++ b/ctdb/tests/scripts/integration.bash @@ -321,8 +321,7 @@ node_has_status () return ;; notlmaster) - ! $CTDB status -n "$pnn" | \ - grep -Eq "^hash:.* lmaster:${pnn}\$" + ! $CTDB status | grep -Eq "^hash:.* lmaster:${pnn}\$" return ;; esac -- 2.23.0.rc1 From ab9b37308e2776d20dcd7ef5c489555c73cc4019 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 21 Aug 2019 14:35:09 +1000 Subject: [PATCH 6/9] ctdb-recoverd: Only check for LMASTER nodes in the VNN map BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 5d655ac6f2ff82f8f1c89b06870d600a1a3c7a8a) --- ctdb/server/ctdb_recoverd.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 3e63bd1e7a5..4410c4f7be2 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2981,13 +2981,19 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, return; } - /* verify that all active nodes in the nodemap also exist in - the vnnmap. + /* + * Verify that all active lmaster nodes in the nodemap also + * exist in the vnnmap */ for (j=0; jnum; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } + if (! ctdb_node_has_capabilities(rec->caps, + ctdb->nodes[j]->pnn, + CTDB_CAP_LMASTER)) { + continue; + } if (nodemap->nodes[j].pnn == pnn) { continue; } @@ -2998,8 +3004,8 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, } } if (i == vnnmap->size) { - DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n", - nodemap->nodes[j].pnn)); + D_ERR("Active LMASTER node %u is not in the vnnmap\n", + nodemap->nodes[j].pnn); ctdb_set_culprit(rec, nodemap->nodes[j].pnn); do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap); return; -- 2.23.0.rc1 From 123499227b1529abdafb65ee713f95056a3e1a58 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 29 Jul 2019 17:22:50 +1000 Subject: [PATCH 7/9] ctdb-tests: Strengthen volatile DB traverse test Check the record count more often, from multiple nodes. Add a case with multiple records. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit ca4df06080709adf0cbebc95b0a70b4090dad5ba) --- ctdb/tests/simple/79_volatile_db_traverse.sh | 67 +++++++++++++++----- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/ctdb/tests/simple/79_volatile_db_traverse.sh b/ctdb/tests/simple/79_volatile_db_traverse.sh index af7e962f579..7f3007d5105 100755 --- a/ctdb/tests/simple/79_volatile_db_traverse.sh +++ b/ctdb/tests/simple/79_volatile_db_traverse.sh @@ -42,11 +42,56 @@ try_command_on_node 0 $CTDB writekey "$TESTDB" "foo" "bar0" echo "write foo=bar1 on node 1" try_command_on_node 1 $CTDB writekey "$TESTDB" "foo" "bar1" -echo "do traverse on node 0" -try_command_on_node -v 0 $CTDB catdb "$TESTDB" +echo -echo "do traverse on node 1" -try_command_on_node -v 1 $CTDB catdb "$TESTDB" +check_db_num_records () +{ + local node="$1" + local db="$2" + local n="$3" + + echo "Checking on node ${node} to ensure ${db} has ${n} records..." + try_command_on_node "$node" "${CTDB} catdb ${db}" + + num=$(sed -n -e 's|^Dumped \(.*\) records$|\1|p' "$outfile") + if [ "$num" = "$n" ] ; then + echo "OK: Number of records=${num}" + echo + else + echo "BAD: There were ${num} (!= ${n}) records" + cat "$outfile" + exit 1 + fi +} + +check_db_num_records 0 "$TESTDB" 1 +check_db_num_records 1 "$TESTDB" 1 + +cat < Date: Tue, 13 Aug 2019 14:45:33 +1000 Subject: [PATCH 8/9] ctdb-tests: Clear deleted record via recovery instead of vacuuming This test has been flapping because sometimes the record is not vacuumed within the expected time period, perhaps even because the check for the record can interfere with vacuuming. However, instead of waiting for vacuuming the record can be cleared by doing a recovery. This should be much more reliable. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 RN: Fix flapping CTDB tests Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Martin Schwenke Autobuild-Date(master): Wed Aug 21 13:06:57 UTC 2019 on sn-devel-184 (cherry picked from commit 71ad473ba805abe23bbe6c1a1290612e448e73f3) --- .../simple/69_recovery_resurrect_deleted.sh | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/ctdb/tests/simple/69_recovery_resurrect_deleted.sh b/ctdb/tests/simple/69_recovery_resurrect_deleted.sh index 8126c49b83c..f6c72c59f2a 100755 --- a/ctdb/tests/simple/69_recovery_resurrect_deleted.sh +++ b/ctdb/tests/simple/69_recovery_resurrect_deleted.sh @@ -54,18 +54,11 @@ database_has_zero_records () return 0 } -echo "Get vacuum interval" -try_command_on_node -v $second $CTDB getvar VacuumInterval -vacuum_interval="${out#* = }" - -echo "Wait until vacuuming deletes the record on active nodes" -# Why 4? Steps are: -# 1. Original node processes delete queue, asks lmaster to fetch -# 2. lmaster recoverd fetches -# 3. lmaster processes delete queue -# If vacuuming is just missed then need an extra interval -t=$((vacuum_interval * 4)) -wait_until "${t}/10" database_has_zero_records +echo "Trigger a recovery" +try_command_on_node "$second" $CTDB recover + +echo "Checking that database has 0 records" +database_has_zero_records echo "Continue node ${first}" try_command_on_node $first $CTDB continue -- 2.23.0.rc1 From ec4cf736c94d0e5189a6203ba01de223748948eb Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Tue, 27 Aug 2019 12:13:51 +1000 Subject: [PATCH 9/9] ctdb-recoverd: Fix typo in previous fix BUG: https://bugzilla.samba.org/show_bug.cgi?id=14085 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Tue Aug 27 15:29:11 UTC 2019 on sn-devel-184 (cherry picked from commit 8190993d99284162bd8699780248bb2edfec2673) --- ctdb/server/ctdb_recoverd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 4410c4f7be2..31e72f139ff 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2990,7 +2990,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, continue; } if (! ctdb_node_has_capabilities(rec->caps, - ctdb->nodes[j]->pnn, + nodemap->nodes[j].pnn, CTDB_CAP_LMASTER)) { continue; } -- 2.23.0.rc1