From ef719b97b007971192203283e39dfb760cd7a9fb Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:00:13 +1000 Subject: [PATCH 1/6] ctdb-recovery: Assign banning credits if database fails to freeze https://bugzilla.samba.org/show_bug.cgi?id=12857 Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit c9d9f56bffe1e19665dba8e0cf899399d3d9fb72) --- ctdb/server/ctdb_recovery_helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c index 0222aa0..474b900 100644 --- a/ctdb/server/ctdb_recovery_helper.c +++ b/ctdb/server/ctdb_recovery_helper.c @@ -1627,6 +1627,7 @@ static void recover_db_freeze_done(struct tevent_req *subreq) if (ret2 != 0) { LOG("control FREEZE_DB failed for db %s on node %u," " ret=%d\n", state->db_name, pnn, ret2); + state->ban_credits[pnn] += 1; } else { LOG("control FREEZE_DB failed for db %s, ret=%d\n", state->db_name, ret); -- 2.9.4 From 6d58fe96a622db2b3dc7c39d0bd721aae65ce666 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:49:02 +1000 Subject: [PATCH 2/6] ctdb-recovery: Setting up of recmode should be idempotent BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 If the recovery mode is already set to the expected value, there is nothing to do. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit f2771fcbf438e8b06321752c7203f01bbe33b573) --- ctdb/server/ctdb_recover.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 6bed61c..ada31af 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -856,6 +856,12 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, struct set_recmode_state *state; struct ctdb_cluster_mutex_handle *h; + if (recmode == ctdb->recovery_mode) { + D_INFO("Recovery mode already set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE"); + return 0; + } + /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ -- 2.9.4 From 3640d9e07d02b958a7d33aab58afb420c4b0dd71 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:52:32 +1000 Subject: [PATCH 3/6] ctdb-recovery: Simplify logging of recovery mode setting BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit d74dadd7f26a9e8c48ba92468d7d0c4a7aa5a8e5) --- ctdb/server/ctdb_recover.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index ada31af..e35ba73 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -862,6 +862,9 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, return 0; } + D_NOTICE("Recovery mode set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE"); + /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ @@ -874,11 +877,6 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, } } - if (recmode != ctdb->recovery_mode) { - DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n", - recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE")); - } - if (recmode != CTDB_RECOVERY_NORMAL || ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { ctdb->recovery_mode = recmode; -- 2.9.4 From b3ed692ea9aafb04ebf994ca4bdfd3d18edf273e Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:09:32 +1000 Subject: [PATCH 4/6] ctdb-recovery: Finish processing for recovery mode ACTIVE first BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 This simplifies the code and avoids complicated conditions. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit f8200153b21f5b19c9a1d57be3e05e739d9fafcd) --- ctdb/server/ctdb_recover.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index e35ba73..813a1ad 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -868,18 +868,12 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ - if (recmode == CTDB_RECOVERY_NORMAL) { - talloc_free(ctdb->release_ips_ctx); - ctdb->release_ips_ctx = NULL; - } else { + if (recmode == CTDB_RECOVERY_ACTIVE) { if (ctdb_deferred_drop_all_ips(ctdb) != 0) { - DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n")); + D_ERR("Failed to set up deferred drop all ips\n"); } - } - if (recmode != CTDB_RECOVERY_NORMAL || - ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { - ctdb->recovery_mode = recmode; + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; return 0; } @@ -888,6 +882,8 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, * Therefore, what follows is special handling when setting * recovery mode back to normal */ + TALLOC_FREE(ctdb->release_ips_ctx); + for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) { if (ctdb_db->generation != ctdb->vnn_map->generation) { DEBUG(DEBUG_ERR, -- 2.9.4 From 5092b013a420e17a007289a2b060e28e6f95d7f3 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 17:45:20 +1000 Subject: [PATCH 5/6] ctdb-recovery: Get recmode unconditionally in the main_loop BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 This can be used later in the main_loop to avoid the local ip check. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit 2fd2ccd4c8617cfa7374d7a5ee3d1cc61c4fa4ad) --- ctdb/server/ctdb_recoverd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 9ea0f61..5c40895 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2608,6 +2608,13 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, return; } + ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, &ctdb->recovery_mode); + if (ret != 0) { + D_ERR("Failed to read recmode from local node\n"); + return; + } + /* if the local daemon is STOPPED or BANNED, we verify that the databases are also frozen and that the recmode is set to active. */ @@ -2620,10 +2627,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, */ rec->priority_time = timeval_current(); - ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n")); - } if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n")); -- 2.9.4 From 1fb2e4fd486596066b1ff7ea011165fdf728282d Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 16:15:47 +1000 Subject: [PATCH 6/6] ctdb-recovery: Do not run local ip verification when in recovery BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 If we drop public IPs because CTDB is in recovery for too long, then avoid spamming logs "Trigger takeoverrun" every second. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit ea46699b27ef8d4ac7b5dd07035465cb3df09ea4) --- ctdb/server/ctdb_recoverd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 5c40895..d9cc4a2 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2670,9 +2670,11 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, return; } - /* Check if an IP takeover run is needed and trigger one if - * necessary */ - verify_local_ip_allocation(ctdb, rec, pnn, nodemap); + if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { + /* Check if an IP takeover run is needed and trigger one if + * necessary */ + verify_local_ip_allocation(ctdb, rec, pnn, nodemap); + } /* if we are not the recmaster then we do not need to check if recovery is needed -- 2.9.4