From ec6d55f43ce2ecfc317ffb93ea111d58209d0863 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 20:40:10 +1000 Subject: [PATCH 01/19] ctdb-recoverd: Add a helper variable Improves readability and simplifies subsequent changes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 82a075d4d734588a42fca7ebaf529892d1eba853) --- ctdb/server/ctdb_recoverd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 4ba8729b50e..9e084486036 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -553,13 +553,14 @@ static int update_flags(struct ctdb_recoverd *rec, for (j=0; jnum; j++) { struct ctdb_node_map_old *remote_nodemap=NULL; uint32_t local_flags = nodemap->nodes[j].flags; + uint32_t remote_pnn = nodemap->nodes[j].pnn; uint32_t remote_flags; int ret; if (local_flags & NODE_FLAGS_DISCONNECTED) { continue; } - if (nodemap->nodes[j].pnn == ctdb->pnn) { + if (remote_pnn == ctdb->pnn) { continue; } @@ -568,7 +569,7 @@ static int update_flags(struct ctdb_recoverd *rec, if (local_flags != remote_flags) { ret = update_flags_on_all_nodes(rec, - nodemap->nodes[j].pnn, + remote_pnn, remote_flags); if (ret != 0) { DBG_ERR( @@ -583,7 +584,7 @@ static int update_flags(struct ctdb_recoverd *rec, */ D_NOTICE("Remote node %u had flags 0x%x, " "local had 0x%x - updating local\n", - nodemap->nodes[j].pnn, + remote_pnn, remote_flags, local_flags); nodemap->nodes[j].flags = remote_flags; -- 2.30.2 From 33c2adc3c87fc746542eedf8cb1c2b9fcfc5e50f Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 21:28:43 +1000 Subject: [PATCH 02/19] ctdb-recoverd: Update the local node map before pushing out flags The resulting code structure looks a little weird. However, there is another condition that requires the flags to be pushed that will be inserted before the continue statement in a subsequent commit.. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 620d07871420cdbfa055c1ace75ec1ac4c32721d) --- ctdb/server/ctdb_recoverd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 9e084486036..c6efd6e362e 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -568,16 +568,6 @@ static int update_flags(struct ctdb_recoverd *rec, remote_flags = remote_nodemap->nodes[j].flags; if (local_flags != remote_flags) { - ret = update_flags_on_all_nodes(rec, - remote_pnn, - remote_flags); - if (ret != 0) { - DBG_ERR( - "Unable to update flags on remote nodes\n"); - talloc_free(mem_ctx); - return -1; - } - /* * Update the local copy of the flags in the * recovery daemon. @@ -588,6 +578,21 @@ static int update_flags(struct ctdb_recoverd *rec, remote_flags, local_flags); nodemap->nodes[j].flags = remote_flags; + local_flags = remote_flags; + goto push; + } + + continue; + +push: + D_NOTICE("Pushing updated flags for node %u (0x%x)\n", + remote_pnn, + local_flags); + ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags); + if (ret != 0) { + DBG_ERR("Unable to update flags on remote nodes\n"); + talloc_free(mem_ctx); + return -1; } } talloc_free(mem_ctx); -- 2.30.2 From 3b548fa24affb7bdb7e826cd656a2e9e815fd445 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 11 Jul 2021 22:17:08 +1000 Subject: [PATCH 03/19] ctdb-recoverd: Push flags for a node if any remote node disagrees This will usually happen if flags on the node in question change, so keeping the code simple and pushing to all nodes won't hurt. When all nodes come up there might be differences in connected nodes, causing such "fix ups". Receiving nodes will ignore no-op pushes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 8305f6a7f132f03b0bbdb26692b7491fd3f6c24f) --- ctdb/server/ctdb_recoverd.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index c6efd6e362e..79f4cb03b0f 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -555,13 +555,20 @@ static int update_flags(struct ctdb_recoverd *rec, uint32_t local_flags = nodemap->nodes[j].flags; uint32_t remote_pnn = nodemap->nodes[j].pnn; uint32_t remote_flags; + unsigned int i; int ret; if (local_flags & NODE_FLAGS_DISCONNECTED) { continue; } if (remote_pnn == ctdb->pnn) { - continue; + /* + * No remote nodemap for this node since this + * is the local nodemap. However, still need + * to check this against the remote nodes and + * push it if they are out-of-date. + */ + goto compare_remotes; } remote_nodemap = remote_nodemaps[j]; @@ -582,6 +589,26 @@ static int update_flags(struct ctdb_recoverd *rec, goto push; } +compare_remotes: + for (i = 0; i < nodemap->num; i++) { + if (i == j) { + continue; + } + if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) { + continue; + } + if (nodemap->nodes[i].pnn == ctdb->pnn) { + continue; + } + + remote_nodemap = remote_nodemaps[i]; + remote_flags = remote_nodemap->nodes[j].flags; + + if (local_flags != remote_flags) { + goto push; + } + } + continue; push: -- 2.30.2 From 6ac027bbfde0051d477c57a283d4d00dad3f92df Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 17:28:20 +1000 Subject: [PATCH 04/19] ctdb-protocol: Add new controls to disable and enable nodes These are CTDB_CONTROL_DISABLE_NODE and CTDB_CONTROL_ENABLE_NODE. For consistency these match CTDB_CONTROL_STOP_NODE and CTDB_CONTROL_CONTINUE_NODE. It would be possible to add a single control but it would need to take data. The aim is to finally fix races in flag handling. Previous fixes have improved the situation but they have only narrowed the race window. The problem is that the recovery daemon on the master node pushes flags to nodes the same way that disable and enable are implemented. So the following sequence is still racy: 1. Node A is disabled 2. Recovery master pulls flags from all nodes including A 3. Node A is enabled 4. Recovery master notices A is disabled and pushes a flag update to all nodes including node A 5. Node A is erroneously marked disabled Node A can not tell if the MODIFY_FLAGS control is from a "ctdb disable" command or a flag update from the recovery master. The solution is to use a different mechanism for disable/enable and for a node to ignore MODIFY_FLAGS controls for their own flags. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 49dc5d8cd2d3767044ac69cbd25c8210d11cadf7) --- ctdb/protocol/protocol.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctdb/protocol/protocol.h b/ctdb/protocol/protocol.h index e4b76c6b986..9fd7f1cdecb 100644 --- a/ctdb/protocol/protocol.h +++ b/ctdb/protocol/protocol.h @@ -376,6 +376,8 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_VACUUM_FETCH = 154, CTDB_CONTROL_DB_VACUUM = 155, CTDB_CONTROL_ECHO_DATA = 156, + CTDB_CONTROL_DISABLE_NODE = 157, + CTDB_CONTROL_ENABLE_NODE = 158, }; #define MAX_COUNT_BUCKETS 16 -- 2.30.2 From 413af03b1c69c559763e7853d83e64d4a8d782b0 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 12:10:12 +1000 Subject: [PATCH 05/19] ctdb-protocol: Add marshalling for controls DISABLE_NODE/ENABLE_NODE BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 6845dca87e6ffc5e449fb78d23eb9c7a22698b80) --- ctdb/protocol/protocol_api.h | 6 ++++ ctdb/protocol/protocol_client.c | 36 ++++++++++++++++++++++ ctdb/protocol/protocol_control.c | 12 ++++++++ ctdb/protocol/protocol_debug.c | 2 ++ ctdb/tests/UNIT/cunit/protocol_test_101.sh | 2 +- ctdb/tests/src/protocol_common_ctdb.c | 24 +++++++++++++++ ctdb/tests/src/protocol_ctdb_test.c | 2 +- 7 files changed, 82 insertions(+), 2 deletions(-) diff --git a/ctdb/protocol/protocol_api.h b/ctdb/protocol/protocol_api.h index 7bbe33b22fe..499d9329c54 100644 --- a/ctdb/protocol/protocol_api.h +++ b/ctdb/protocol/protocol_api.h @@ -605,6 +605,12 @@ void ctdb_req_control_echo_data(struct ctdb_req_control *request, struct ctdb_echo_data *echo_data); int ctdb_reply_control_echo_data(struct ctdb_reply_control *reply); +void ctdb_req_control_disable_node(struct ctdb_req_control *request); +int ctdb_reply_control_disable_node(struct ctdb_reply_control *reply); + +void ctdb_req_control_enable_node(struct ctdb_req_control *request); +int ctdb_reply_control_enable_node(struct ctdb_reply_control *reply); + /* From protocol/protocol_debug.c */ void ctdb_packet_print(uint8_t *buf, size_t buflen, FILE *fp); diff --git a/ctdb/protocol/protocol_client.c b/ctdb/protocol/protocol_client.c index 6d850be86df..dcce83f02a1 100644 --- a/ctdb/protocol/protocol_client.c +++ b/ctdb/protocol/protocol_client.c @@ -2360,3 +2360,39 @@ int ctdb_reply_control_echo_data(struct ctdb_reply_control *reply) return reply->status; } + +/* CTDB_CONTROL_DISABLE_NODE */ + +void ctdb_req_control_disable_node(struct ctdb_req_control *request) +{ + request->opcode = CTDB_CONTROL_DISABLE_NODE; + request->pad = 0; + request->srvid = 0; + request->client_id = 0; + request->flags = 0; + + request->rdata.opcode = CTDB_CONTROL_DISABLE_NODE; +} + +int ctdb_reply_control_disable_node(struct ctdb_reply_control *reply) +{ + return ctdb_reply_control_generic(reply, CTDB_CONTROL_DISABLE_NODE); +} + +/* CTDB_CONTROL_ENABLE_NODE */ + +void ctdb_req_control_enable_node(struct ctdb_req_control *request) +{ + request->opcode = CTDB_CONTROL_ENABLE_NODE; + request->pad = 0; + request->srvid = 0; + request->client_id = 0; + request->flags = 0; + + request->rdata.opcode = CTDB_CONTROL_ENABLE_NODE; +} + +int ctdb_reply_control_enable_node(struct ctdb_reply_control *reply) +{ + return ctdb_reply_control_generic(reply, CTDB_CONTROL_ENABLE_NODE); +} diff --git a/ctdb/protocol/protocol_control.c b/ctdb/protocol/protocol_control.c index fb6b0219ef7..f64a1a90e10 100644 --- a/ctdb/protocol/protocol_control.c +++ b/ctdb/protocol/protocol_control.c @@ -411,6 +411,12 @@ static size_t ctdb_req_control_data_len(struct ctdb_req_control_data *cd) case CTDB_CONTROL_ECHO_DATA: len = ctdb_echo_data_len(cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } return len; @@ -1385,6 +1391,12 @@ static size_t ctdb_reply_control_data_len(struct ctdb_reply_control_data *cd) case CTDB_CONTROL_ECHO_DATA: len = ctdb_echo_data_len(cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } return len; diff --git a/ctdb/protocol/protocol_debug.c b/ctdb/protocol/protocol_debug.c index 694285515e1..d94cb548d68 100644 --- a/ctdb/protocol/protocol_debug.c +++ b/ctdb/protocol/protocol_debug.c @@ -243,6 +243,8 @@ static void ctdb_opcode_print(uint32_t opcode, FILE *fp) { CTDB_CONTROL_VACUUM_FETCH, "VACUUM_FETCH" }, { CTDB_CONTROL_DB_VACUUM, "DB_VACUUM" }, { CTDB_CONTROL_ECHO_DATA, "ECHO_DATA" }, + { CTDB_CONTROL_DISABLE_NODE, "DISABLE_NODE" }, + { CTDB_CONTROL_ENABLE_NODE, "ENABLE_NODE" }, { MAP_END, "" }, }; diff --git a/ctdb/tests/UNIT/cunit/protocol_test_101.sh b/ctdb/tests/UNIT/cunit/protocol_test_101.sh index 79dfabeb801..b84f208b109 100755 --- a/ctdb/tests/UNIT/cunit/protocol_test_101.sh +++ b/ctdb/tests/UNIT/cunit/protocol_test_101.sh @@ -2,7 +2,7 @@ . "${TEST_SCRIPTS_DIR}/unit.sh" -last_control=156 +last_control=158 generate_control_output () { diff --git a/ctdb/tests/src/protocol_common_ctdb.c b/ctdb/tests/src/protocol_common_ctdb.c index 67ec5c4d3a1..6bbddbddc1e 100644 --- a/ctdb/tests/src/protocol_common_ctdb.c +++ b/ctdb/tests/src/protocol_common_ctdb.c @@ -594,6 +594,12 @@ void fill_ctdb_req_control_data(TALLOC_CTX *mem_ctx, assert(cd->data.echo_data != NULL); fill_ctdb_echo_data(mem_ctx, cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -984,6 +990,12 @@ void verify_ctdb_req_control_data(struct ctdb_req_control_data *cd, case CTDB_CONTROL_ECHO_DATA: verify_ctdb_echo_data(cd->data.echo_data, cd2->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -1380,6 +1392,12 @@ void fill_ctdb_reply_control_data(TALLOC_CTX *mem_ctx, assert(cd->data.echo_data != NULL); fill_ctdb_echo_data(mem_ctx, cd->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } @@ -1717,6 +1735,12 @@ void verify_ctdb_reply_control_data(struct ctdb_reply_control_data *cd, case CTDB_CONTROL_ECHO_DATA: verify_ctdb_echo_data(cd->data.echo_data, cd2->data.echo_data); break; + + case CTDB_CONTROL_DISABLE_NODE: + break; + + case CTDB_CONTROL_ENABLE_NODE: + break; } } diff --git a/ctdb/tests/src/protocol_ctdb_test.c b/ctdb/tests/src/protocol_ctdb_test.c index 6a9a8a4aef6..b359e7a1280 100644 --- a/ctdb/tests/src/protocol_ctdb_test.c +++ b/ctdb/tests/src/protocol_ctdb_test.c @@ -284,7 +284,7 @@ PROTOCOL_CTDB4_TEST(struct ctdb_req_dmaster, ctdb_req_dmaster, PROTOCOL_CTDB4_TEST(struct ctdb_reply_dmaster, ctdb_reply_dmaster, CTDB_REPLY_DMASTER); -#define NUM_CONTROLS 157 +#define NUM_CONTROLS 159 PROTOCOL_CTDB2_TEST(struct ctdb_req_control_data, ctdb_req_control_data); PROTOCOL_CTDB2_TEST(struct ctdb_reply_control_data, ctdb_reply_control_data); -- 2.30.2 From dae9601a348aafbdb7743c9dfb86013e2251d579 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 28 Jul 2021 10:27:42 +1000 Subject: [PATCH 06/19] ctdb-daemon: Add a helper variable Simplifies a subsequent change. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit e0a7b5a9e866452b1faaed86a105492fe7b237e2) --- ctdb/server/ctdb_daemon.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index 57f80235e7c..3071e3cad54 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1237,26 +1237,28 @@ failed: static void initialise_node_flags (struct ctdb_context *ctdb) { + struct ctdb_node *node = NULL; unsigned int i; /* Always found: PNN correctly set just before this is called */ for (i = 0; i < ctdb->num_nodes; i++) { - if (ctdb->pnn == ctdb->nodes[i]->pnn) { + node = ctdb->nodes[i]; + if (ctdb->pnn == node->pnn) { break; } } - ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED; + node->flags &= ~NODE_FLAGS_DISCONNECTED; /* do we start out in DISABLED mode? */ if (ctdb->start_as_disabled != 0) { D_ERR("This node is configured to start in DISABLED state\n"); - ctdb->nodes[i]->flags |= NODE_FLAGS_DISABLED; + node->flags |= NODE_FLAGS_DISABLED; } /* do we start out in STOPPED mode? */ if (ctdb->start_as_stopped != 0) { D_ERR("This node is configured to start in STOPPED state\n"); - ctdb->nodes[i]->flags |= NODE_FLAGS_STOPPED; + node->flags |= NODE_FLAGS_STOPPED; } } -- 2.30.2 From 6d01d305ce5facd38ac04f654f44bf370078796e Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:01:33 +1000 Subject: [PATCH 07/19] ctdb-daemon: Factor out a function to get node structure from PNN BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 1ac7bc7532b2fad791d0e53effa7c64cdc73c4eb) --- ctdb/include/ctdb_private.h | 2 ++ ctdb/server/ctdb_daemon.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 8eb6686f953..f5e647f08a5 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -565,6 +565,8 @@ int daemon_deregister_message_handler(struct ctdb_context *ctdb, void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data, void *private_data); +struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn); + int ctdb_start_daemon(struct ctdb_context *ctdb, bool interactive, bool test_mode_enabled); diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index 3071e3cad54..e204bae73ad 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1235,19 +1235,40 @@ failed: return -1; } -static void initialise_node_flags (struct ctdb_context *ctdb) +struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn) { struct ctdb_node *node = NULL; unsigned int i; + if (pnn == CTDB_CURRENT_NODE) { + pnn = ctdb->pnn; + } + /* Always found: PNN correctly set just before this is called */ for (i = 0; i < ctdb->num_nodes; i++) { node = ctdb->nodes[i]; - if (ctdb->pnn == node->pnn) { - break; + if (pnn == node->pnn) { + return node; } } + return NULL; +} + +static void initialise_node_flags (struct ctdb_context *ctdb) +{ + struct ctdb_node *node = NULL; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + /* + * PNN correctly set just before this is called so always + * found but keep static analysers happy... + */ + if (node == NULL) { + DBG_ERR("Unable to find current node\n"); + return; + } + node->flags &= ~NODE_FLAGS_DISCONNECTED; /* do we start out in DISABLED mode? */ -- 2.30.2 From e0c961023b0e4d15f6ee4f0721392d1cc0be80e3 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:02:28 +1000 Subject: [PATCH 08/19] ctdb-daemon: Start as disabled means PERMANENTLY_DISABLED DISABLED is UNHEALTHY | PERMANENTLY_DISABLED, which is not what is intended here. Luckily, it doesn't do any harm because nodes are marked unhealthy at startup anyway. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 60c1ef146538d90f97b7823459f7548ca5fa6dd3) --- ctdb/server/ctdb_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index e204bae73ad..0896ba08f90 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -1274,7 +1274,7 @@ static void initialise_node_flags (struct ctdb_context *ctdb) /* do we start out in DISABLED mode? */ if (ctdb->start_as_disabled != 0) { D_ERR("This node is configured to start in DISABLED state\n"); - node->flags |= NODE_FLAGS_DISABLED; + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; } /* do we start out in STOPPED mode? */ if (ctdb->start_as_stopped != 0) { -- 2.30.2 From 1ffdad9160c0dda8370f084c560c49de84259589 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:12:59 +1000 Subject: [PATCH 09/19] ctdb_daemon: Implement controls DISABLE_NODE/ENABLE_NODE BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 15a6489c288b3adb635a728cb2049621ab1a07f7) --- ctdb/server/ctdb_control.c | 42 +++++++++++++++++++++++++++++ ctdb/tests/src/fake_ctdbd.c | 54 +++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index 206ea149693..131ebd43afc 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -173,6 +173,40 @@ done: TALLOC_FREE(state); } +static int ctdb_control_disable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if (node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Disable node\n"); + node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + +static int ctdb_control_enable_node(struct ctdb_context *ctdb) +{ + struct ctdb_node *node; + + node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE); + if (node == NULL) { + /* Can't happen */ + DBG_ERR("Unable to find current node\n"); + return -1; + } + + D_ERR("Enable node\n"); + node->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED; + + return 0; +} + /* process a control request */ @@ -827,6 +861,14 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, return ctdb_control_echo_data(ctdb, c, indata, async_reply); } + case CTDB_CONTROL_DISABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_disable_node(ctdb); + + case CTDB_CONTROL_ENABLE_NODE: + CHECK_CONTROL_DATA_SIZE(0); + return ctdb_control_enable_node(ctdb); + default: DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode)); return -1; diff --git a/ctdb/tests/src/fake_ctdbd.c b/ctdb/tests/src/fake_ctdbd.c index 146b7da344c..d6552baee8d 100644 --- a/ctdb/tests/src/fake_ctdbd.c +++ b/ctdb/tests/src/fake_ctdbd.c @@ -3513,6 +3513,52 @@ static void control_check_pid_srvid(TALLOC_CTX *mem_ctx, client_send_control(req, header, &reply); } +static void control_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_req *req, + struct ctdb_req_header *header, + struct ctdb_req_control *request) +{ + struct client_state *state = tevent_req_data( + req, struct client_state); + struct ctdbd_context *ctdb = state->ctdb; + struct ctdb_reply_control reply; + + reply.rdata.opcode = request->opcode; + + DEBUG(DEBUG_INFO, ("Disabling node\n")); + ctdb->node_map->node[header->destnode].flags |= + NODE_FLAGS_PERMANENTLY_DISABLED; + + reply.status = 0; + reply.errmsg = NULL; + + client_send_control(req, header, &reply); + return; +} + +static void control_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_req *req, + struct ctdb_req_header *header, + struct ctdb_req_control *request) +{ + struct client_state *state = tevent_req_data( + req, struct client_state); + struct ctdbd_context *ctdb = state->ctdb; + struct ctdb_reply_control reply; + + reply.rdata.opcode = request->opcode; + + DEBUG(DEBUG_INFO, ("Enable node\n")); + ctdb->node_map->node[header->destnode].flags &= + ~NODE_FLAGS_PERMANENTLY_DISABLED; + + reply.status = 0; + reply.errmsg = NULL; + + client_send_control(req, header, &reply); + return; +} + static bool fake_control_failure(TALLOC_CTX *mem_ctx, struct tevent_req *req, struct ctdb_req_header *header, @@ -4205,6 +4251,14 @@ static void client_process_control(struct tevent_req *req, control_check_pid_srvid(mem_ctx, req, &header, &request); break; + case CTDB_CONTROL_DISABLE_NODE: + control_disable_node(mem_ctx, req, &header, &request); + break; + + case CTDB_CONTROL_ENABLE_NODE: + control_enable_node(mem_ctx, req, &header, &request); + break; + default: if (! (request.flags & CTDB_CTRL_FLAG_NOREPLY)) { control_error(mem_ctx, req, &header, &request); -- 2.30.2 From 73a4f5fc35b8ecff67d4c8556dbb06040946a15f Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:32:12 +1000 Subject: [PATCH 10/19] ctdb-client: Add client code for disable/enable controls BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 6fe6a54e7f32e650be6ab36041159081dbde5165) --- ctdb/client/client_control_sync.c | 68 +++++++++++++++++++++++++++++++ ctdb/client/client_sync.h | 12 ++++++ 2 files changed, 80 insertions(+) diff --git a/ctdb/client/client_control_sync.c b/ctdb/client/client_control_sync.c index b9a25ce2b2c..e9f97dd0f30 100644 --- a/ctdb/client/client_control_sync.c +++ b/ctdb/client/client_control_sync.c @@ -2660,3 +2660,71 @@ int ctdb_ctrl_tunnel_deregister(TALLOC_CTX *mem_ctx, struct tevent_context *ev, return 0; } + +int ctdb_ctrl_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout) +{ + struct ctdb_req_control request; + struct ctdb_reply_control *reply; + int ret; + + ctdb_req_control_disable_node(&request); + ret = ctdb_client_control(mem_ctx, + ev, + client, + destnode, + timeout, + &request, + &reply); + if (ret != 0) { + D_ERR("Control DISABLE_NODE failed to node %u, ret=%d\n", + destnode, + ret); + return ret; + } + + ret = ctdb_reply_control_disable_node(reply); + if (ret != 0) { + D_ERR("Control DISABLE_NODE failed, ret=%d\n", ret); + return ret; + } + + return 0; +} + +int ctdb_ctrl_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout) +{ + struct ctdb_req_control request; + struct ctdb_reply_control *reply; + int ret; + + ctdb_req_control_enable_node(&request); + ret = ctdb_client_control(mem_ctx, + ev, + client, + destnode, + timeout, + &request, + &reply); + if (ret != 0) { + D_ERR("Control ENABLE_NODE failed to node %u, ret=%d\n", + destnode, + ret); + return ret; + } + + ret = ctdb_reply_control_enable_node(reply); + if (ret != 0) { + D_ERR("Control ENABLE_NODE failed, ret=%d\n", ret); + return ret; + } + + return 0; +} diff --git a/ctdb/client/client_sync.h b/ctdb/client/client_sync.h index dc8b67395e3..b8f5d905857 100644 --- a/ctdb/client/client_sync.h +++ b/ctdb/client/client_sync.h @@ -482,6 +482,18 @@ int ctdb_ctrl_tunnel_deregister(TALLOC_CTX *mem_ctx, struct tevent_context *ev, int destnode, struct timeval timeout, uint64_t tunnel_id); +int ctdb_ctrl_disable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout); + +int ctdb_ctrl_enable_node(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct ctdb_client_context *client, + int destnode, + struct timeval timeout); + /* from client/client_message_sync.c */ int ctdb_message_recd_update_ip(TALLOC_CTX *mem_ctx, struct tevent_context *ev, -- 2.30.2 From e5f5f55e2d5cf231470d75ab191c4c2049d4c2fa Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 14:37:19 +1000 Subject: [PATCH 11/19] ctdb-tools: Use disable and enable controls in tool Note that there a change from broadcast to a directed control here. This is OK because the recovery master will push flags if any nodes disagree with the canonical flags fetched from a node. Static function ctdb_ctrl_modflags() is no longer used to drop it. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 5914054698dab934fd4db5efb9d211b2fdc40bb9) --- ctdb/tools/ctdb.c | 57 ++++++++++------------------------------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index 598ab4ff4b7..8370e11f8e6 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -2573,40 +2573,6 @@ static void wait_for_flags(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, } } -static int ctdb_ctrl_modflags(TALLOC_CTX *mem_ctx, struct tevent_context *ev, - struct ctdb_client_context *client, - uint32_t destnode, struct timeval timeout, - uint32_t set, uint32_t clear) -{ - struct ctdb_node_map *nodemap; - struct ctdb_node_flag_change flag_change; - struct ctdb_req_control request; - uint32_t *pnn_list; - int ret, count; - - ret = ctdb_ctrl_get_nodemap(mem_ctx, ev, client, destnode, - tevent_timeval_zero(), &nodemap); - if (ret != 0) { - return ret; - } - - flag_change.pnn = destnode; - flag_change.old_flags = nodemap->node[destnode].flags; - flag_change.new_flags = flag_change.old_flags | set; - flag_change.new_flags &= ~clear; - - count = list_of_connected_nodes(nodemap, -1, mem_ctx, &pnn_list); - if (count == -1) { - return ENOMEM; - } - - ctdb_req_control_modify_flags(&request, &flag_change); - ret = ctdb_client_control_multi(mem_ctx, ev, client, pnn_list, count, - tevent_timeval_zero(), &request, - NULL, NULL); - return ret; -} - struct ipreallocate_state { int status; bool done; @@ -2688,13 +2654,13 @@ static int control_disable(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, return 0; } - ret = ctdb_ctrl_modflags(mem_ctx, ctdb->ev, ctdb->client, - ctdb->cmd_pnn, TIMEOUT(), - NODE_FLAGS_PERMANENTLY_DISABLED, 0); + ret = ctdb_ctrl_disable_node(mem_ctx, + ctdb->ev, + ctdb->client, + ctdb->cmd_pnn, + TIMEOUT()); if (ret != 0) { - fprintf(stderr, - "Failed to set DISABLED flag on node %u\n", - ctdb->cmd_pnn); + fprintf(stderr, "Failed to disable node %u\n", ctdb->cmd_pnn); return ret; } @@ -2717,12 +2683,13 @@ static int control_enable(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, return 0; } - ret = ctdb_ctrl_modflags(mem_ctx, ctdb->ev, ctdb->client, - ctdb->cmd_pnn, TIMEOUT(), - 0, NODE_FLAGS_PERMANENTLY_DISABLED); + ret = ctdb_ctrl_enable_node(mem_ctx, + ctdb->ev, + ctdb->client, + ctdb->cmd_pnn, + TIMEOUT()); if (ret != 0) { - fprintf(stderr, "Failed to reset DISABLED flag on node %u\n", - ctdb->cmd_pnn); + fprintf(stderr, "Failed to enable node %u\n", ctdb->cmd_pnn); return ret; } -- 2.30.2 From d2968f37af0472b8f883b70575ed9f0d72aa4b20 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 15:13:49 +1000 Subject: [PATCH 12/19] ctdb-daemon: Correct the condition for logging unchanged flags Don't trust the old flags from the recovery master. Surrounding code will change in future comments, including the use of old-style debug macros, so just make this change clear. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit eec44e286250a6ee7b5c42d85d632bdc300a409f) --- ctdb/server/ctdb_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 5c694bde969..a49162dd9d9 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -484,7 +484,7 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) } } - if (node->flags == c->old_flags) { + if (node->flags == old_flags) { DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags)); return 0; } -- 2.30.2 From 8633f7626aa5424e3de21e929af89ce8a384d73d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:29:38 +1000 Subject: [PATCH 13/19] ctdb-daemon: Update logging for flag changes When flags change, promote the message to NOTICE level and switch the message to the style that is currently generated by ctdb-recoverd.c:monitor_handler(). This will allow monitor_handler() to go away in future. Drop logging when flags do not change. The recovery master now logs when it pushes flags for a node, so the lack of a corresponding "changed flags" message here indicates that no update was required. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit b6d25d079e30919457cacbfbbfd670bf88295a9c) --- ctdb/server/ctdb_monitor.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index a49162dd9d9..84d8c2ce299 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -485,11 +485,13 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) } if (node->flags == old_flags) { - DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags)); return 0; } - DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags)); + D_NOTICE("Node %u has changed flags - 0x%x -> 0x%x\n", + c->pnn, + old_flags, + node->flags); if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) { DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n", -- 2.30.2 From d2094baf5710ca36521dfb51c2ae75f223f5db98 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:34:49 +1000 Subject: [PATCH 14/19] ctdb-daemon: Modernise remaining debug macro in this function BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 0132bd5a2233193256af434a37506f86ed62c075) --- ctdb/server/ctdb_monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 84d8c2ce299..ad936925a30 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -494,8 +494,8 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) node->flags); if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) { - DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n", - c->pnn)); + DBG_ERR("Node %u became healthy - force recovery for startup\n", + c->pnn); ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; } -- 2.30.2 From e4dd64d59ebb754c3aaa552da85399555f1fd36d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:32:20 +1000 Subject: [PATCH 15/19] ctdb-daemon: Don't bother sending CTDB_SRVID_SET_NODE_FLAGS The code that handles this message is ctdb_recoverd.c:monitor_handler(). Although it appears to do something potentially useful, it only logs the flags changes. All changes made are to local structures - there are no actual side-effects. It used to trigger a takeover run when the DISABLED flag changed. This was dropped back in commit 662f06de9fdce7b1bc1772a4fbe43de271564917. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit e75256767fffc6a7ac0b97e58737a39c63c8b187) --- ctdb/server/ctdb_monitor.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index ad936925a30..67b99c61e8f 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -499,10 +499,5 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; } - /* tell the recovery daemon something has changed */ - c->new_flags = node->flags; - ctdb_daemon_send_message(ctdb, ctdb->pnn, - CTDB_SRVID_SET_NODE_FLAGS, indata); - return 0; } -- 2.30.2 From 946edd2cd26d1c9d98f11c0fc552d90188b4ecaa Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 17 Jan 2018 19:04:34 +1100 Subject: [PATCH 16/19] ctdb-recoverd: Mark CTDB_SRVID_SET_NODE_FLAGS obsolete CTDB_SRVID_SET_NODE_FLAGS is no longer sent so drop monitor_handler() and replace with srvid_not_implemented(). Mark the SRVID obsolete in its comment. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 916c5ee131dc5c7f1d9c3540147d1f915c8302ad) --- ctdb/protocol/protocol.h | 2 +- ctdb/server/ctdb_recoverd.c | 61 +++++++++---------------------------- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/ctdb/protocol/protocol.h b/ctdb/protocol/protocol.h index 9fd7f1cdecb..5f788f6f2a8 100644 --- a/ctdb/protocol/protocol.h +++ b/ctdb/protocol/protocol.h @@ -137,7 +137,7 @@ struct ctdb_call { /* SRVID to inform clients that an IP address has been taken over */ #define CTDB_SRVID_TAKE_IP 0xF301000000000000LL -/* SRVID to inform recovery daemon of the node flags */ +/* SRVID to inform recovery daemon of the node flags - OBSOLETE */ #define CTDB_SRVID_SET_NODE_FLAGS 0xF400000000000000LL /* SRVID to inform recovery daemon to update public ip assignment */ diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 79f4cb03b0f..dfa6d0d089b 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1781,54 +1781,21 @@ static void force_election(struct ctdb_recoverd *rec, uint32_t pnn, } - -/* - handler for when a node changes its flags -*/ -static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data) +static void srvid_not_implemented(uint64_t srvid, + TDB_DATA data, + void *private_data) { - struct ctdb_recoverd *rec = talloc_get_type( - private_data, struct ctdb_recoverd); - struct ctdb_context *ctdb = rec->ctdb; - int ret; - struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; - struct ctdb_node_map_old *nodemap=NULL; - TALLOC_CTX *tmp_ctx; - unsigned int i; - - if (data.dsize != sizeof(*c)) { - DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n")); - return; - } - - tmp_ctx = talloc_new(ctdb); - CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx); + const char *s; - ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n")); - talloc_free(tmp_ctx); - return; - } - - - for (i=0;inum;i++) { - if (nodemap->nodes[i].pnn == c->pnn) break; - } - - if (i == nodemap->num) { - DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn)); - talloc_free(tmp_ctx); - return; - } - - if (c->old_flags != c->new_flags) { - DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags)); + switch (srvid) { + case CTDB_SRVID_SET_NODE_FLAGS: + s = "CTDB_SRVID_SET_NODE_FLAGS"; + break; + default: + s = "UNKNOWN"; } - nodemap->nodes[i].flags = c->new_flags; - - talloc_free(tmp_ctx); + D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid); } /* @@ -2998,8 +2965,10 @@ static void monitor_cluster(struct ctdb_context *ctdb) /* register a message port for recovery elections */ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec); - /* when nodes are disabled/enabled */ - ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec); + ctdb_client_set_message_handler(ctdb, + CTDB_SRVID_SET_NODE_FLAGS, + srvid_not_implemented, + rec); /* when we are asked to puch out a flag change */ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec); -- 2.30.2 From 6083e8605d382f80fd7d9034eae371cdb02473c1 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 8 Jul 2021 11:11:11 +1000 Subject: [PATCH 17/19] ctdb-daemon: Simplify ctdb_control_modflags() Now that there are separate disable/enable controls used by the ctdb tool this control can ignore any flag updates for the current nodes. These only come from the recovery master, which depends on being able to fetch flags for all nodes. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit ae10a8a4b70e53ea3be6257d1f86f2d9a56aa62a) --- ctdb/server/ctdb_monitor.c | 47 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 67b99c61e8f..913e2721f98 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -455,34 +455,35 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) struct ctdb_node *node; uint32_t old_flags; - if (c->pnn >= ctdb->num_nodes) { - DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes)); - return -1; + /* + * Don't let other nodes override the current node's flags. + * The recovery master fetches flags from this node so there's + * no need to push them back. Doing so is racy. + */ + if (c->pnn == ctdb->pnn) { + DBG_DEBUG("Ignoring flag changes for current node\n"); + return 0; } - node = ctdb->nodes[c->pnn]; - old_flags = node->flags; - if (c->pnn != ctdb->pnn) { - c->old_flags = node->flags; + node = ctdb_find_node(ctdb, c->pnn); + if (node == NULL) { + DBG_ERR("Node %u is invalid\n", c->pnn); + return -1; } - node->flags = c->new_flags & ~NODE_FLAGS_DISCONNECTED; - node->flags |= (c->old_flags & NODE_FLAGS_DISCONNECTED); - /* we don't let other nodes modify our STOPPED status */ - if (c->pnn == ctdb->pnn) { - node->flags &= ~NODE_FLAGS_STOPPED; - if (old_flags & NODE_FLAGS_STOPPED) { - node->flags |= NODE_FLAGS_STOPPED; - } - } + /* + * Remember the old flags. We don't care what some other node + * thought the old flags were - that's irrelevant. + */ + old_flags = node->flags; - /* we don't let other nodes modify our BANNED status */ - if (c->pnn == ctdb->pnn) { - node->flags &= ~NODE_FLAGS_BANNED; - if (old_flags & NODE_FLAGS_BANNED) { - node->flags |= NODE_FLAGS_BANNED; - } - } + /* + * This node tracks nodes it is connected to, so don't let + * another node override this + */ + node->flags = + (old_flags & NODE_FLAGS_DISCONNECTED) | + (c->new_flags & ~NODE_FLAGS_DISCONNECTED); if (node->flags == old_flags) { return 0; -- 2.30.2 From 4f8819ca99e77d7ef71c4c128c5a83ddf18b5f5d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Tue, 27 Jul 2021 15:50:54 +1000 Subject: [PATCH 18/19] ctdb-daemon: Ignore flag changes for disconnected nodes If this node is not connected to a node then we shouldn't know anything about it. The state will be pushed later by the recovery master. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Signed-off-by: Amitay Isaacs (cherry picked from commit 7f697b1938efb3972f03f25546bf807d5af9a26c) --- ctdb/server/ctdb_monitor.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index 913e2721f98..ab58ec485fe 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -471,6 +471,11 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) return -1; } + if (node->flags & NODE_FLAGS_DISCONNECTED) { + DBG_DEBUG("Ignoring flag changes for disconnected node\n"); + return 0; + } + /* * Remember the old flags. We don't care what some other node * thought the old flags were - that's irrelevant. -- 2.30.2 From 1f0d0b1af08e05a48bcad7c80f6c86f70ea2defe Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 9 Jul 2021 17:25:32 +1000 Subject: [PATCH 19/19] ctdb-daemon: Don't mark a node as unhealthy when connecting to it Remote nodes are already initialised as UNHEALTHY when the node list is initialised at startup (ctdb_load_nodes_file() calls convert_node_map_to_list()) and when disconnected (ctdb_node_dead()). So, drop this code. RN: Fix CTDB flag/status update race conditions BUG: https://bugzilla.samba.org/show_bug.cgi?id=14784 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Thu Sep 9 02:38:34 UTC 2021 on sn-devel-184 (cherry picked from commit 9e7d2d9794af7251c42cb22f23ee9f86c6ea05c1) --- ctdb/server/ctdb_server.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 1470b00dba5..ec6480c5067 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -337,7 +337,6 @@ void ctdb_node_connected(struct ctdb_node *node) node->ctdb->num_connected++; node->dead_count = 0; node->flags &= ~NODE_FLAGS_DISCONNECTED; - node->flags |= NODE_FLAGS_UNHEALTHY; DEBUG(DEBUG_ERR, ("%s: connected to %s - %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); -- 2.30.2