From 751957e5fd296ed4b0bc94560a2630bfbf0020e1 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 20:40:10 +1000 Subject: [PATCH 01/19] ctdb-recoverd: Add a helper variable Improves readability and simplifies subsequent changes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 82a075d4d734588a42fca7ebaf529892d1eba853) --- ctdb/server/ctdb_recoverd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 856fcbb72c8..69bff7339e1 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -553,13 +553,14 @@ static int update_flags(struct ctdb_recoverd *rec, for (j=0; jnum; j++) { struct ctdb_node_map_old *remote_nodemap=NULL; uint32_t local_flags = nodemap->nodes[j].flags; + uint32_t remote_pnn = nodemap->nodes[j].pnn; uint32_t remote_flags; int ret; if (local_flags & NODE_FLAGS_DISCONNECTED) { continue; } - if (nodemap->nodes[j].pnn == ctdb->pnn) { + if (remote_pnn == ctdb->pnn) { continue; } @@ -568,7 +569,7 @@ static int update_flags(struct ctdb_recoverd *rec, if (local_flags != remote_flags) { ret = update_flags_on_all_nodes(rec, - nodemap->nodes[j].pnn, + remote_pnn, remote_flags); if (ret != 0) { DBG_ERR( @@ -583,7 +584,7 @@ static int update_flags(struct ctdb_recoverd *rec, */ D_NOTICE("Remote node %u had flags 0x%x, " "local had 0x%x - updating local\n", - nodemap->nodes[j].pnn, + remote_pnn, remote_flags, local_flags); nodemap->nodes[j].flags = remote_flags; -- 2.30.2 From 682b0750de53a397954c33a39d411e4813068aed Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 21:28:43 +1000 Subject: [PATCH 02/19] ctdb-recoverd: Update the local node map before pushing out flags The resulting code structure looks a little weird. However, there is another condition that requires the flags to be pushed that will be inserted before the continue statement in a subsequent commit.. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 620d07871420cdbfa055c1ace75ec1ac4c32721d) --- ctdb/server/ctdb_recoverd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 69bff7339e1..d913b51d4bb 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -568,16 +568,6 @@ static int update_flags(struct ctdb_recoverd *rec, remote_flags = remote_nodemap->nodes[j].flags; if (local_flags != remote_flags) { - ret = update_flags_on_all_nodes(rec, - remote_pnn, - remote_flags); - if (ret != 0) { - DBG_ERR( - "Unable to update flags on remote nodes\n"); - talloc_free(mem_ctx); - return -1; - } - /* * Update the local copy of the flags in the * recovery daemon. @@ -588,6 +578,21 @@ static int update_flags(struct ctdb_recoverd *rec, remote_flags, local_flags); nodemap->nodes[j].flags = remote_flags; + local_flags = remote_flags; + goto push; + } + + continue; + +push: + D_NOTICE("Pushing updated flags for node %u (0x%x)\n", + remote_pnn, + local_flags); + ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags); + if (ret != 0) { + DBG_ERR("Unable to update flags on remote nodes\n"); + talloc_free(mem_ctx); + return -1; } } talloc_free(mem_ctx); -- 2.30.2 From 474073fccbf59db51bbb5d04bba04b33d6cca5a7 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 22:17:08 +1000 Subject: [PATCH 03/19] ctdb-recoverd: Push flags for a node if any remote node disagrees This will usually happen if flags on the node in question change, so keeping the code simple and pushing to all nodes won't hurt. When all nodes come up there might be differences in connected nodes, causing such "fix ups". Receiving nodes will ignore no-op pushes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 8305f6a7f132f03b0bbdb26692b7491fd3f6c24f) --- ctdb/server/ctdb_recoverd.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index d913b51d4bb..9d788505f1f 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -555,13 +555,20 @@ static int update_flags(struct ctdb_recoverd *rec, uint32_t local_flags = nodemap->nodes[j].flags; uint32_t remote_pnn = nodemap->nodes[j].pnn; uint32_t remote_flags; + unsigned int i; int ret; if (local_flags & NODE_FLAGS_DISCONNECTED) { continue; } if (remote_pnn == ctdb->pnn) { - continue; + /* + * No remote nodemap for this node since this + * is the local nodemap. However, still need + * to check this against the remote nodes and + * push it if they are out-of-date. + */ + goto compare_remotes; } remote_nodemap = remote_nodemaps[j]; @@ -582,6 +589,26 @@ static int update_flags(struct ctdb_recoverd *rec, goto push; } +compare_remotes: + for (i = 0; i < nodemap->num; i++) { + if (i == j) { + continue; + } + if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + if (nodemap->nodes[i].pnn == ctdb->pnn) { + continue; + } + + remote_nodemap = remote_nodemaps[i]; + remote_flags = remote_nodemap->nodes[j].flags; + + if (local_flags != remote_flags) { + goto push; + } + } + continue; push: -- 2.30.2 From c00be0528720a9fc6a685ab188018fc0b855c31f Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 17:28:20 +1000 Subject: [PATCH 04/19] ctdb-protocol: Add new controls to disable and enable nodes These are CTDB_CONTROL_DISABLE_NODE and CTDB_CONTROL_ENABLE_NODE. For consistency these match CTDB_CONTROL_STOP_NODE and CTDB_CONTROL_CONTINUE_NODE. It would be possible to add a single control but it would need to take data. The aim is to finally fix races in flag handling. Previous fixes have improved the situation but they have only narrowed the race window. The problem is that the recovery daemon on the master node pushes flags to nodes the same way that disable and enable are implemented. So the following sequence is still racy: 1. Node A is disabled 2. Recovery master pulls flags from all nodes including A 3. Node A is enabled 4. Recovery master notices A is disabled and pushes a flag update to all nodes including node A 5. Node A is erroneously marked disabled Node A can not tell if the MODIFY_FLAGS control is from a "ctdb disable" command or a flag update from the recovery master. The solution is to use a different mechanism for disable/enable and for a node to ignore MODIFY_FLAGS controls for their own flags. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 49dc5d8cd2d3767044ac69cbd25c8210d11cadf7) --- ctdb/protocol/protocol.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctdb/protocol/protocol.h b/ctdb/protocol/protocol.h index 35543a67cf9..f4a106f2869 100644 --- a/ctdb/protocol/protocol.h +++ b/ctdb/protocol/protocol.h @@ -376,6 +376,8 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_VACUUM_FETCH = 154, CTDB_CONTROL_DB_VACUUM = 155, CTDB_CONTROL_ECHO_DATA = 156, + CTDB_CONTROL_DISABLE_NODE = 157, + CTDB_CONTROL_ENABLE_NODE = 158, }; #define MAX_COUNT_BUCKETS 16 -- 2.30.2 From 038ea903985037a0be535d56abcd4410c93f4efe Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 12:10:12 +1000 Subject: [PATCH 05/19] ctdb-protocol: Add marshalling for controls DISABLE_NODE/ENABLE_NODE BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 6845dca87e6ffc5e449fb78d23eb9c7a22698b80) --- ctdb/protocol/protocol_api.h | 6 ++++ ctdb/protocol/protocol_client.c | 36 ++++++++++++++++++++++ ctdb/protocol/protocol_control.c | 12 ++++++++ ctdb/protocol/protocol_debug.c | 2 ++ ctdb/tests/UNIT/cunit/protocol_test_101.sh | 2 +- ctdb/tests/src/protocol_common_ctdb.c | 24 +++++++++++++++ ctdb/tests/src/protocol_ctdb_test.c | 2 +- 7 files changed, 82 insertions(+), 2 deletions(-) diff --git a/ctdb/protocol/protocol_api.h b/ctdb/protocol/protocol_api.h index bdb4bc0e2ea..b7fcc53dd68 100644 --- a/ctdb/protocol/protocol_api.h +++ b/ctdb/protocol/protocol_api.h @@ -615,6 +615,12 @@ void ctdb_req_control_echo_data(struct ctdb_req_control *request, struct ctdb_echo_data *echo_data); int ctdb_reply_control_echo_data(struct ctdb_reply_control *reply); +void ctdb_req_control_disable_node(struct ctdb_req_control *request); +int ctdb_reply_control_disable_node(struct ctdb_reply_control *reply); + +void ctdb_req_control_enable_node(struct ctdb_req_control *request); +int ctdb_reply_control_enable_node(struct ctdb_reply_control *reply); + /* From protocol/protocol_debug.c */ void ctdb_packet_print(uint8_t *buf, size_t buflen, FILE *fp); diff --git a/ctdb/protocol/protocol_client.c b/ctdb/protocol/protocol_client.c index cde544feb52..71d2f0144b3 100644 --- a/ctdb/protocol/protocol_client.c +++ b/ctdb/protocol/protocol_client.c @@ -2409,3 +2409,39 @@ int ctdb_reply_control_echo_data(struct ctdb_reply_control *reply) return reply->status; } + +/* CTDB_CONTROL_DISABLE_NODE */ + +void ctdb_req_control_disable_node(struct ctdb_req_control *request) +{ + request->opcode = CTDB_CONTROL_DISABLE_NODE; + request->pad = 0; + request->srvid = 0; + request->client_id = 0; + request->flags = 0; + + request->rdata.opcode = CTDB_CONTROL_DISABLE_NODE; +} + +int ctdb_reply_control_disable_node(struct ctdb_reply_control *reply) +{ + return ctdb_reply_control_generic(reply, CTDB_CONTROL_DISABLE_NODE); +} + +/* CTDB_CONTROL_ENABLE_NODE */ + +void ctdb_req_control_enable_node(struct ctdb_req_control *request) +{ + request->opcode = CTDB_CONTROL_ENABLE_NODE; + request->pad = 0; + request->srvid = 0; + request->client_id = 0; + request->flags = 0; + + request->rdata.opcode = CTDB_CONTROL_ENABLE_NODE; +} + +int ctdb_reply_control_enable_node(struct ctdb_reply_control *reply) +{ + return ctdb_reply_control_generic(reply, CTDB_CONTROL_ENABLE_NODE); +} diff --git a/ctdb/protocol/protocol_control.c b/ctdb/protocol/protocol_control.c index 4fd5a5a7d4d..076863278a3 100644 --- a/ctdb/protocol/protocol_control.c +++ b/ctdb/protocol/protocol_control.c @@ -419,6 +419,12 @@ static size_t ctdb_req_control_data_len(struct ctdb_req_control_data *cd) case CTDB_CONTROL_ECHO_DATA: len = ctdb_echo_data_len(cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } return len; @@ -1418,6 +1424,12 @@ static size_t ctdb_reply_control_data_len(struct ctdb_reply_control_data *cd) case CTDB_CONTROL_ECHO_DATA: len = ctdb_echo_data_len(cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } return len; diff --git a/ctdb/protocol/protocol_debug.c b/ctdb/protocol/protocol_debug.c index 56f14e32b09..2e5ed9f0ced 100644 --- a/ctdb/protocol/protocol_debug.c +++ b/ctdb/protocol/protocol_debug.c @@ -245,6 +245,8 @@ static void ctdb_opcode_print(uint32_t opcode, FILE *fp) { CTDB_CONTROL_VACUUM_FETCH, "VACUUM_FETCH" }, { CTDB_CONTROL_DB_VACUUM, "DB_VACUUM" }, { CTDB_CONTROL_ECHO_DATA, "ECHO_DATA" }, + { CTDB_CONTROL_DISABLE_NODE, "DISABLE_NODE" }, + { CTDB_CONTROL_ENABLE_NODE, "ENABLE_NODE" }, { MAP_END, "" }, }; diff --git a/ctdb/tests/UNIT/cunit/protocol_test_101.sh b/ctdb/tests/UNIT/cunit/protocol_test_101.sh index 79dfabeb801..b84f208b109 100755 --- a/ctdb/tests/UNIT/cunit/protocol_test_101.sh +++ b/ctdb/tests/UNIT/cunit/protocol_test_101.sh @@ -2,7 +2,7 @@ . "${TEST_SCRIPTS_DIR}/unit.sh" -last_control=156 +last_control=158 generate_control_output () { diff --git a/ctdb/tests/src/protocol_common_ctdb.c b/ctdb/tests/src/protocol_common_ctdb.c index f1d662f0e1d..36abbe2cd58 100644 --- a/ctdb/tests/src/protocol_common_ctdb.c +++ b/ctdb/tests/src/protocol_common_ctdb.c @@ -606,6 +606,12 @@ void fill_ctdb_req_control_data(TALLOC_CTX *mem_ctx, assert(cd->data.echo_data != NULL); fill_ctdb_echo_data(mem_ctx, cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -1004,6 +1010,12 @@ void verify_ctdb_req_control_data(struct ctdb_req_control_data *cd, case CTDB_CONTROL_ECHO_DATA: verify_ctdb_echo_data(cd->data.echo_data, cd2->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -1409,6 +1421,12 @@ void fill_ctdb_reply_control_data(TALLOC_CTX *mem_ctx, assert(cd->data.echo_data != NULL); fill_ctdb_echo_data(mem_ctx, cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -1753,6 +1771,12 @@ void verify_ctdb_reply_control_data(struct ctdb_reply_control_data *cd, case CTDB_CONTROL_ECHO_DATA: verify_ctdb_echo_data(cd->data.echo_data, cd2->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } diff --git a/ctdb/tests/src/protocol_ctdb_test.c b/ctdb/tests/src/protocol_ctdb_test.c index 6a9a8a4aef6..b359e7a1280 100644 --- a/ctdb/tests/src/protocol_ctdb_test.c +++ b/ctdb/tests/src/protocol_ctdb_test.c @@ -284,7 +284,7 @@ PROTOCOL_CTDB4_TEST(struct ctdb_req_dmaster, ctdb_req_dmaster, PROTOCOL_CTDB4_TEST(struct ctdb_reply_dmaster, ctdb_reply_dmaster, CTDB_REPLY_DMASTER); -#define NUM_CONTROLS 157 +#define NUM_CONTROLS 159 PROTOCOL_CTDB2_TEST(struct ctdb_req_control_data, ctdb_req_control_data); PROTOCOL_CTDB2_TEST(struct ctdb_reply_control_data, ctdb_reply_control_data); -- 2.30.2 From 44886b456dfbf7edf2af7f046416c7e7804913b4 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 28 Jul 2021 10:27:42 +1000 Subject: [PATCH 06/19] ctdb-daemon: Add a helper variable Simplifies a subsequent change. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit e0a7b5a9e866452b1faaed86a105492fe7b237e2) --- ctdb/server/ctdb_daemon.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index 7ebb419bc1f..4f83b27bd33 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1227,26 +1227,28 @@ failed: static void initialise_node_flags (struct ctdb_context *ctdb) { + struct ctdb_node *node = NULL; unsigned int i; /* Always found: PNN correctly set just before this is called */ for (i = 0; i < ctdb->num_nodes; i++) { - if (ctdb->pnn == ctdb->nodes[i]->pnn) { + node = ctdb->nodes[i]; + if (ctdb->pnn == node->pnn) { break; } } - ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED; + node->flags &= ~NODE_FLAGS_DISCONNECTED; /* do we start out in DISABLED mode? */ if (ctdb->start_as_disabled != 0) { D_ERR("This node is configured to start in DISABLED state\n"); - ctdb->nodes[i]->flags |= NODE_FLAGS_DISABLED; + node->flags |= NODE_FLAGS_DISABLED; } /* do we start out in STOPPED mode? */ if (ctdb->start_as_stopped != 0) { D_ERR("This node is configured to start in STOPPED state\n"); - ctdb->nodes[i]->flags |= NODE_FLAGS_STOPPED; + node->flags |= NODE_FLAGS_STOPPED; } } -- 2.30.2 From d1c45405712fcc5e928aa7433d0463ed8dc3498f Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:01:33 +1000 Subject: [PATCH 07/19] ctdb-daemon: Factor out a function to get node structure from PNN BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 1ac7bc7532b2fad791d0e53effa7c64cdc73c4eb) --- ctdb/include/ctdb_private.h | 2 ++ ctdb/server/ctdb_daemon.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 9ca87332d61..6f4111f1a18 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -565,6 +565,8 @@ int daemon_deregister_message_handler(struct ctdb_context *ctdb, void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data, void *private_data); +struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn); + int ctdb_start_daemon(struct ctdb_context *ctdb, bool interactive, bool test_mode_enabled); diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index 4f83b27bd33..3a3a2cfed70 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1225,19 +1225,40 @@ failed: return -1; } -static void initialise_node_flags (struct ctdb_context *ctdb) +struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn) { struct ctdb_node *node = NULL; unsigned int i; + if (pnn == CTDB_CURRENT_NODE) { + pnn = ctdb->pnn; + } + /* Always found: PNN correctly set just before this is called */ for (i = 0; i < ctdb->num_nodes; i++) { node = ctdb->nodes[i]; - if (ctdb->pnn == node->pnn) { - break; + if (pnn == node->pnn) { + return node; } } + return NULL; +} + +static void initialise_node_flags (struct ctdb_context *ctdb) +{ + struct ctdb_node *node = NULL; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + /* + * PNN correctly set just before this is called so always + * found but keep static analysers happy... + */ + if (node == NULL) { + DBG_ERR("Unable to find current node\n"); + return; + } + node->flags &= ~NODE_FLAGS_DISCONNECTED; /* do we start out in DISABLED mode? */ -- 2.30.2 From afe0025a4b9f1fd6b68d9a6a48de90746685165d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:02:28 +1000 Subject: [PATCH 08/19] ctdb-daemon: Start as disabled means PERMANENTLY_DISABLED DISABLED is UNHEALTHY | PERMANENTLY_DISABLED, which is not what is intended here. Luckily, it doesn't do any harm because nodes are marked unhealthy at startup anyway. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 60c1ef146538d90f97b7823459f7548ca5fa6dd3) --- ctdb/server/ctdb_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index 3a3a2cfed70..f64a0475348 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1264,7 +1264,7 @@ static void initialise_node_flags (struct ctdb_context *ctdb) /* do we start out in DISABLED mode? */ if (ctdb->start_as_disabled != 0) { D_ERR("This node is configured to start in DISABLED state\n"); - node->flags |= NODE_FLAGS_DISABLED; + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; } /* do we start out in STOPPED mode? */ if (ctdb->start_as_stopped != 0) { -- 2.30.2 From d937f47b4d2170d98fb7051f386657608b576768 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:12:59 +1000 Subject: [PATCH 09/19] ctdb_daemon: Implement controls DISABLE_NODE/ENABLE_NODE BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 15a6489c288b3adb635a728cb2049621ab1a07f7) --- ctdb/server/ctdb_control.c | 42 +++++++++++++++++++++++++++++ ctdb/tests/src/fake_ctdbd.c | 54 +++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index 95f3b175934..a9d1aa1b438 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -173,6 +173,40 @@ done: TALLOC_FREE(state); } +static int ctdb_control_disable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if (node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Disable node\n"); + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + +static int ctdb_control_enable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if (node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Enable node\n"); + node->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + /* process a control request */ @@ -828,6 +862,14 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, return ctdb_control_echo_data(ctdb, c, indata, async_reply); } + case CTDB_CONTROL_DISABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_disable_node(ctdb); + + case CTDB_CONTROL_ENABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_enable_node(ctdb); + default: DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode)); return -1; diff --git a/ctdb/tests/src/fake_ctdbd.c b/ctdb/tests/src/fake_ctdbd.c index 146b7da344c..d6552baee8d 100644 --- a/ctdb/tests/src/fake_ctdbd.c +++ b/ctdb/tests/src/fake_ctdbd.c @@ -3513,6 +3513,52 @@ static void control_check_pid_srvid(TALLOC_CTX *mem_ctx, client_send_control(req, header, &reply); } +static void control_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_req *req, + struct ctdb_req_header *header, + struct ctdb_req_control *request) +{ + struct client_state *state = tevent_req_data( + req, struct client_state); + struct ctdbd_context *ctdb = state->ctdb; + struct ctdb_reply_control reply; + + reply.rdata.opcode = request->opcode; + + DEBUG(DEBUG_INFO, ("Disabling node\n")); + ctdb->node_map->node[header->destnode].flags |= + NODE_FLAGS_PERMANENTLY_DISABLED; + + reply.status = 0; + reply.errmsg = NULL; + + client_send_control(req, header, &reply); + return; +} + +static void control_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_req *req, + struct ctdb_req_header *header, + struct ctdb_req_control *request) +{ + struct client_state *state = tevent_req_data( + req, struct client_state); + struct ctdbd_context *ctdb = state->ctdb; + struct ctdb_reply_control reply; + + reply.rdata.opcode = request->opcode; + + DEBUG(DEBUG_INFO, ("Enable node\n")); + ctdb->node_map->node[header->destnode].flags &= + ~NODE_FLAGS_PERMANENTLY_DISABLED; + + reply.status = 0; + reply.errmsg = NULL; + + client_send_control(req, header, &reply); + return; +} + static bool fake_control_failure(TALLOC_CTX *mem_ctx, struct tevent_req *req, struct ctdb_req_header *header, @@ -4205,6 +4251,14 @@ static void client_process_control(struct tevent_req *req, control_check_pid_srvid(mem_ctx, req, &header, &request); break; + case CTDB_CONTROL_DISABLE_NODE: + control_disable_node(mem_ctx, req, &header, &request); + break; + + case CTDB_CONTROL_ENABLE_NODE: + control_enable_node(mem_ctx, req, &header, &request); + break; + default: if (! (request.flags & CTDB_CTRL_FLAG_NOREPLY)) { control_error(mem_ctx, req, &header, &request); -- 2.30.2 From eba01064745a3c5c27c9de91ab87fded826bc5f4 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:32:12 +1000 Subject: [PATCH 10/19] ctdb-client: Add client code for disable/enable controls BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 6fe6a54e7f32e650be6ab36041159081dbde5165) --- ctdb/client/client_control_sync.c | 68 +++++++++++++++++++++++++++++++ ctdb/client/client_sync.h | 12 ++++++ 2 files changed, 80 insertions(+) diff --git a/ctdb/client/client_control_sync.c b/ctdb/client/client_control_sync.c index e56a2b2f18d..29e0249198c 100644 --- a/ctdb/client/client_control_sync.c +++ b/ctdb/client/client_control_sync.c @@ -2718,3 +2718,71 @@ int ctdb_ctrl_tunnel_deregister(TALLOC_CTX *mem_ctx, struct tevent_context *ev, return 0; } + +int ctdb_ctrl_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout) +{ + struct ctdb_req_control request; + struct ctdb_reply_control *reply; + int ret; + + ctdb_req_control_disable_node(&request); + ret = ctdb_client_control(mem_ctx, + ev, + client, + destnode, + timeout, + &request, + &reply); + if (ret != 0) { + D_ERR("Control DISABLE_NODE failed to node %u, ret=%d\n", + destnode, + ret); + return ret; + } + + ret = ctdb_reply_control_disable_node(reply); + if (ret != 0) { + D_ERR("Control DISABLE_NODE failed, ret=%d\n", ret); + return ret; + } + + return 0; +} + +int ctdb_ctrl_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout) +{ + struct ctdb_req_control request; + struct ctdb_reply_control *reply; + int ret; + + ctdb_req_control_enable_node(&request); + ret = ctdb_client_control(mem_ctx, + ev, + client, + destnode, + timeout, + &request, + &reply); + if (ret != 0) { + D_ERR("Control ENABLE_NODE failed to node %u, ret=%d\n", + destnode, + ret); + return ret; + } + + ret = ctdb_reply_control_enable_node(reply); + if (ret != 0) { + D_ERR("Control ENABLE_NODE failed, ret=%d\n", ret); + return ret; + } + + return 0; +} diff --git a/ctdb/client/client_sync.h b/ctdb/client/client_sync.h index b29e669fba4..25a9615098c 100644 --- a/ctdb/client/client_sync.h +++ b/ctdb/client/client_sync.h @@ -491,6 +491,18 @@ int ctdb_ctrl_tunnel_deregister(TALLOC_CTX *mem_ctx, struct tevent_context *ev, int destnode, struct timeval timeout, uint64_t tunnel_id); +int ctdb_ctrl_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout); + +int ctdb_ctrl_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout); + /* from client/client_message_sync.c */ int ctdb_message_recd_update_ip(TALLOC_CTX *mem_ctx, struct tevent_context *ev, -- 2.30.2 From 19a4181442178c010ab336144089f31722a6f5ec Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:37:19 +1000 Subject: [PATCH 11/19] ctdb-tools: Use disable and enable controls in tool Note that there a change from broadcast to a directed control here. This is OK because the recovery master will push flags if any nodes disagree with the canonical flags fetched from a node. Static function ctdb_ctrl_modflags() is no longer used to drop it. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 5914054698dab934fd4db5efb9d211b2fdc40bb9) --- ctdb/tools/ctdb.c | 57 ++++++++++------------------------------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index ba994caff2d..649108dd0a7 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -2579,40 +2579,6 @@ static void wait_for_flags(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, } } -static int ctdb_ctrl_modflags(TALLOC_CTX *mem_ctx, struct tevent_context *ev, - struct ctdb_client_context *client, - uint32_t destnode, struct timeval timeout, - uint32_t set, uint32_t clear) -{ - struct ctdb_node_map *nodemap; - struct ctdb_node_flag_change flag_change; - struct ctdb_req_control request; - uint32_t *pnn_list; - int ret, count; - - ret = ctdb_ctrl_get_nodemap(mem_ctx, ev, client, destnode, - tevent_timeval_zero(), &nodemap); - if (ret != 0) { - return ret; - } - - flag_change.pnn = destnode; - flag_change.old_flags = nodemap->node[destnode].flags; - flag_change.new_flags = flag_change.old_flags | set; - flag_change.new_flags &= ~clear; - - count = list_of_connected_nodes(nodemap, -1, mem_ctx, &pnn_list); - if (count == -1) { - return ENOMEM; - } - - ctdb_req_control_modify_flags(&request, &flag_change); - ret = ctdb_client_control_multi(mem_ctx, ev, client, pnn_list, count, - tevent_timeval_zero(), &request, - NULL, NULL); - return ret; -} - struct ipreallocate_state { int status; bool done; @@ -2694,13 +2660,13 @@ static int control_disable(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, return 0; } - ret = ctdb_ctrl_modflags(mem_ctx, ctdb->ev, ctdb->client, - ctdb->cmd_pnn, TIMEOUT(), - NODE_FLAGS_PERMANENTLY_DISABLED, 0); + ret = ctdb_ctrl_disable_node(mem_ctx, + ctdb->ev, + ctdb->client, + ctdb->cmd_pnn, + TIMEOUT()); if (ret != 0) { - fprintf(stderr, - "Failed to set DISABLED flag on node %u\n", - ctdb->cmd_pnn); + fprintf(stderr, "Failed to disable node %u\n", ctdb->cmd_pnn); return ret; } @@ -2723,12 +2689,13 @@ static int control_enable(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, return 0; } - ret = ctdb_ctrl_modflags(mem_ctx, ctdb->ev, ctdb->client, - ctdb->cmd_pnn, TIMEOUT(), - 0, NODE_FLAGS_PERMANENTLY_DISABLED); + ret = ctdb_ctrl_enable_node(mem_ctx, + ctdb->ev, + ctdb->client, + ctdb->cmd_pnn, + TIMEOUT()); if (ret != 0) { - fprintf(stderr, "Failed to reset DISABLED flag on node %u\n", - ctdb->cmd_pnn); + fprintf(stderr, "Failed to enable node %u\n", ctdb->cmd_pnn); return ret; } -- 2.30.2 From 47a00df552e0806349926b93ce89658b998bc080 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 15:13:49 +1000 Subject: [PATCH 12/19] ctdb-daemon: Correct the condition for logging unchanged flags Don't trust the old flags from the recovery master. Surrounding code will change in future comments, including the use of old-style debug macros, so just make this change clear. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit eec44e286250a6ee7b5c42d85d632bdc300a409f) --- ctdb/server/ctdb_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 5c694bde969..a49162dd9d9 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -484,7 +484,7 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) } } - if (node->flags == c->old_flags) { + if (node->flags == old_flags) { DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags)); return 0; } -- 2.30.2 From bce2a726da0965c4bd1ab7a59ef1accf930301a6 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:29:38 +1000 Subject: [PATCH 13/19] ctdb-daemon: Update logging for flag changes When flags change, promote the message to NOTICE level and switch the message to the style that is currently generated by ctdb-recoverd.c:monitor_handler(). This will allow monitor_handler() to go away in future. Drop logging when flags do not change. The recovery master now logs when it pushes flags for a node, so the lack of a corresponding "changed flags" message here indicates that no update was required. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit b6d25d079e30919457cacbfbbfd670bf88295a9c) --- ctdb/server/ctdb_monitor.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index a49162dd9d9..84d8c2ce299 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -485,11 +485,13 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) } if (node->flags == old_flags) { - DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags)); return 0; } - DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags)); + D_NOTICE("Node %u has changed flags - 0x%x -> 0x%x\n", + c->pnn, + old_flags, + node->flags); if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) { DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n", -- 2.30.2 From e9456f4fe53fb6bef74a06bf80c754cc17a55ad2 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:34:49 +1000 Subject: [PATCH 14/19] ctdb-daemon: Modernise remaining debug macro in this function BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 0132bd5a2233193256af434a37506f86ed62c075) --- ctdb/server/ctdb_monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 84d8c2ce299..ad936925a30 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -494,8 +494,8 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) node->flags); if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) { - DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n", - c->pnn)); + DBG_ERR("Node %u became healthy - force recovery for startup\n", + c->pnn); ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; } -- 2.30.2 From 5b1fca69cdeb0b6eb665b6079e57dfeaf10ec8fe Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:32:20 +1000 Subject: [PATCH 15/19] ctdb-daemon: Don't bother sending CTDB_SRVID_SET_NODE_FLAGS The code that handles this message is ctdb_recoverd.c:monitor_handler(). Although it appears to do something potentially useful, it only logs the flags changes. All changes made are to local structures - there are no actual side-effects. It used to trigger a takeover run when the DISABLED flag changed. This was dropped back in commit 662f06de9fdce7b1bc1772a4fbe43de271564917. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit e75256767fffc6a7ac0b97e58737a39c63c8b187) --- ctdb/server/ctdb_monitor.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index ad936925a30..67b99c61e8f 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -499,10 +499,5 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; } - /* tell the recovery daemon something has changed */ - c->new_flags = node->flags; - ctdb_daemon_send_message(ctdb, ctdb->pnn, - CTDB_SRVID_SET_NODE_FLAGS, indata); - return 0; } -- 2.30.2 From 3454ef18a23e2604fde5b5d0d696f8a0787daffa Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 17 Jan 2018 19:04:34 +1100 Subject: [PATCH 16/19] ctdb-recoverd: Mark CTDB_SRVID_SET_NODE_FLAGS obsolete CTDB_SRVID_SET_NODE_FLAGS is no longer sent so drop monitor_handler() and replace with srvid_not_implemented(). Mark the SRVID obsolete in its comment. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 916c5ee131dc5c7f1d9c3540147d1f915c8302ad) --- ctdb/protocol/protocol.h | 2 +- ctdb/server/ctdb_recoverd.c | 61 +++++++++---------------------------- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/ctdb/protocol/protocol.h b/ctdb/protocol/protocol.h index f4a106f2869..403d66c3972 100644 --- a/ctdb/protocol/protocol.h +++ b/ctdb/protocol/protocol.h @@ -137,7 +137,7 @@ struct ctdb_call { /* SRVID to inform clients that an IP address has been taken over */ #define CTDB_SRVID_TAKE_IP 0xF301000000000000LL -/* SRVID to inform recovery daemon of the node flags */ +/* SRVID to inform recovery daemon of the node flags - OBSOLETE */ #define CTDB_SRVID_SET_NODE_FLAGS 0xF400000000000000LL /* SRVID to inform recovery daemon to update public ip assignment */ diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 9d788505f1f..28ef9468cd4 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1781,54 +1781,21 @@ static void force_election(struct ctdb_recoverd *rec, uint32_t pnn, } - -/* - handler for when a node changes its flags -*/ -static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data) +static void srvid_not_implemented(uint64_t srvid, + TDB_DATA data, + void *private_data) { - struct ctdb_recoverd *rec = talloc_get_type( - private_data, struct ctdb_recoverd); - struct ctdb_context *ctdb = rec->ctdb; - int ret; - struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; - struct ctdb_node_map_old *nodemap=NULL; - TALLOC_CTX *tmp_ctx; - unsigned int i; - - if (data.dsize != sizeof(*c)) { - DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n")); - return; - } - - tmp_ctx = talloc_new(ctdb); - CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx); + const char *s; - ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n")); - talloc_free(tmp_ctx); - return; - } - - - for (i=0;inum;i++) { - if (nodemap->nodes[i].pnn == c->pnn) break; - } - - if (i == nodemap->num) { - DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn)); - talloc_free(tmp_ctx); - return; - } - - if (c->old_flags != c->new_flags) { - DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags)); + switch (srvid) { + case CTDB_SRVID_SET_NODE_FLAGS: + s = "CTDB_SRVID_SET_NODE_FLAGS"; + break; + default: + s = "UNKNOWN"; } - nodemap->nodes[i].flags = c->new_flags; - - talloc_free(tmp_ctx); + D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid); } /* @@ -2996,8 +2963,10 @@ static void monitor_cluster(struct ctdb_context *ctdb) /* register a message port for recovery elections */ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec); - /* when nodes are disabled/enabled */ - ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec); + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_SET_NODE_FLAGS, + srvid_not_implemented, + rec); /* when we are asked to puch out a flag change */ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec); -- 2.30.2 From 432379e958f54f913ee3358bbf4c389058a39b11 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:11:11 +1000 Subject: [PATCH 17/19] ctdb-daemon: Simplify ctdb_control_modflags() Now that there are separate disable/enable controls used by the ctdb tool this control can ignore any flag updates for the current nodes. These only come from the recovery master, which depends on being able to fetch flags for all nodes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit ae10a8a4b70e53ea3be6257d1f86f2d9a56aa62a) --- ctdb/server/ctdb_monitor.c | 47 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 67b99c61e8f..913e2721f98 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -455,34 +455,35 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) struct ctdb_node *node; uint32_t old_flags; - if (c->pnn >= ctdb->num_nodes) { - DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes)); - return -1; + /* + * Don't let other nodes override the current node's flags. + * The recovery master fetches flags from this node so there's + * no need to push them back. Doing so is racy. + */ + if (c->pnn == ctdb->pnn) { + DBG_DEBUG("Ignoring flag changes for current node\n"); + return 0; } - node = ctdb->nodes[c->pnn]; - old_flags = node->flags; - if (c->pnn != ctdb->pnn) { - c->old_flags = node->flags; + node = ctdb_find_node(ctdb, c->pnn); + if (node == NULL) { + DBG_ERR("Node %u is invalid\n", c->pnn); + return -1; } - node->flags = c->new_flags & ~NODE_FLAGS_DISCONNECTED; - node->flags |= (c->old_flags & NODE_FLAGS_DISCONNECTED); - /* we don't let other nodes modify our STOPPED status */ - if (c->pnn == ctdb->pnn) { - node->flags &= ~NODE_FLAGS_STOPPED; - if (old_flags & NODE_FLAGS_STOPPED) { - node->flags |= NODE_FLAGS_STOPPED; - } - } + /* + * Remember the old flags. We don't care what some other node + * thought the old flags were - that's irrelevant. + */ + old_flags = node->flags; - /* we don't let other nodes modify our BANNED status */ - if (c->pnn == ctdb->pnn) { - node->flags &= ~NODE_FLAGS_BANNED; - if (old_flags & NODE_FLAGS_BANNED) { - node->flags |= NODE_FLAGS_BANNED; - } - } + /* + * This node tracks nodes it is connected to, so don't let + * another node override this + */ + node->flags = + (old_flags & NODE_FLAGS_DISCONNECTED) | + (c->new_flags & ~NODE_FLAGS_DISCONNECTED); if (node->flags == old_flags) { return 0; -- 2.30.2 From ffe127ab567f051f2cb17acc0bbaefd7172883f4 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Tue, 27 Jul 2021 15:50:54 +1000 Subject: [PATCH 18/19] ctdb-daemon: Ignore flag changes for disconnected nodes If this node is not connected to a node then we shouldn't know anything about it. The state will be pushed later by the recovery master. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Signed-off-by: Amitay Isaacs (cherry picked from commit 7f697b1938efb3972f03f25546bf807d5af9a26c) --- ctdb/server/ctdb_monitor.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 913e2721f98..ab58ec485fe 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -471,6 +471,11 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) return -1; } + if (node->flags & NODE_FLAGS_DISCONNECTED) { + DBG_DEBUG("Ignoring flag changes for disconnected node\n"); + return 0; + } + /* * Remember the old flags. We don't care what some other node * thought the old flags were - that's irrelevant. -- 2.30.2 From 60869615ccf8ad5e01ab5c47977be7e80a81de42 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 17:25:32 +1000 Subject: [PATCH 19/19] ctdb-daemon: Don't mark a node as unhealthy when connecting to it Remote nodes are already initialised as UNHEALTHY when the node list is initialised at startup (ctdb_load_nodes_file() calls convert_node_map_to_list()) and when disconnected (ctdb_node_dead()). So, drop this code. RN: Fix CTDB flag/status update race conditions BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Thu Sep 9 02:38:34 UTC 2021 on sn-devel-184 (cherry picked from commit 9e7d2d9794af7251c42cb22f23ee9f86c6ea05c1) --- ctdb/server/ctdb_server.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 1470b00dba5..ec6480c5067 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -337,7 +337,6 @@ void ctdb_node_connected(struct ctdb_node *node) node->ctdb->num_connected++; node->dead_count = 0; node->flags &= ~NODE_FLAGS_DISCONNECTED; - node->flags |= NODE_FLAGS_UNHEALTHY; DEBUG(DEBUG_ERR, ("%s: connected to %s - %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); -- 2.30.2