From 5582e7534d7e539b3cf53162ede81d5dbdaf2499 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:00:13 +1000 Subject: [PATCH 1/6] ctdb-recovery: Assign banning credits if database fails to freeze https://bugzilla.samba.org/show_bug.cgi?id=12857 Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit c9d9f56bffe1e19665dba8e0cf899399d3d9fb72) --- ctdb/server/ctdb_recovery_helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c index d54c290..ba2f0a9 100644 --- a/ctdb/server/ctdb_recovery_helper.c +++ b/ctdb/server/ctdb_recovery_helper.c @@ -1647,6 +1647,7 @@ static void recover_db_freeze_done(struct tevent_req *subreq) if (ret2 != 0) { LOG("control FREEZE_DB failed for db %s on node %u," " ret=%d\n", state->db_name, pnn, ret2); + state->ban_credits[pnn] += 1; } else { LOG("control FREEZE_DB failed for db %s, ret=%d\n", state->db_name, ret); -- 2.9.4 From 2c64b3ce660114dcff82f6d6abf31e1f41ca3e42 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:49:02 +1000 Subject: [PATCH 2/6] ctdb-recovery: Setting up of recmode should be idempotent BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 If the recovery mode is already set to the expected value, there is nothing to do. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit f2771fcbf438e8b06321752c7203f01bbe33b573) --- ctdb/server/ctdb_recover.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 6bed61c..410117a 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -856,6 +856,13 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, struct set_recmode_state *state; struct ctdb_cluster_mutex_handle *h; + if (recmode == ctdb->recovery_mode) { + DEBUG(DEBUG_INFO, + ("Recovery mode already set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE")); + return 0; + } + /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ -- 2.9.4 From 5295500c579d99cd5c34fccb30301ea7aaff49cc Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:52:32 +1000 Subject: [PATCH 3/6] ctdb-recovery: Simplify logging of recovery mode setting BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit d74dadd7f26a9e8c48ba92468d7d0c4a7aa5a8e5) --- ctdb/server/ctdb_recover.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 410117a..08eb5c9 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -863,6 +863,10 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, return 0; } + DEBUG(DEBUG_NOTICE, + ("Recovery mode set to %s\n", + recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE")); + /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ @@ -875,11 +879,6 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, } } - if (recmode != ctdb->recovery_mode) { - DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n", - recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE")); - } - if (recmode != CTDB_RECOVERY_NORMAL || ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { ctdb->recovery_mode = recmode; -- 2.9.4 From 7126b65bf676190d9d5f5ba9e929aa6e7dcf970d Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 14:09:32 +1000 Subject: [PATCH 4/6] ctdb-recovery: Finish processing for recovery mode ACTIVE first BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 This simplifies the code and avoids complicated conditions. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit f8200153b21f5b19c9a1d57be3e05e739d9fafcd) --- ctdb/server/ctdb_recover.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 08eb5c9..ea8a0b3 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -870,18 +870,13 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, /* if we enter recovery but stay in recovery for too long we will eventually drop all our ip addresses */ - if (recmode == CTDB_RECOVERY_NORMAL) { - talloc_free(ctdb->release_ips_ctx); - ctdb->release_ips_ctx = NULL; - } else { + if (recmode == CTDB_RECOVERY_ACTIVE) { if (ctdb_deferred_drop_all_ips(ctdb) != 0) { - DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n")); + DEBUG(DEBUG_ERR, + ("Failed to set up deferred drop all ips\n")); } - } - if (recmode != CTDB_RECOVERY_NORMAL || - ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { - ctdb->recovery_mode = recmode; + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; return 0; } @@ -890,6 +885,8 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, * Therefore, what follows is special handling when setting * recovery mode back to normal */ + TALLOC_FREE(ctdb->release_ips_ctx); + for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) { if (ctdb_db->generation != ctdb->vnn_map->generation) { DEBUG(DEBUG_ERR, -- 2.9.4 From f11d0209b235d3de7e52a0b954a64c3265a3417d Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 17:45:20 +1000 Subject: [PATCH 5/6] ctdb-recovery: Get recmode unconditionally in the main_loop BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 This can be used later in the main_loop to avoid the local ip check. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit 2fd2ccd4c8617cfa7374d7a5ee3d1cc61c4fa4ad) --- ctdb/server/ctdb_recoverd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index d233817..c7f31e3 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2596,6 +2596,13 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, return; } + ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, &ctdb->recovery_mode); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Failed to read recmode from local node\n")); + return; + } + /* if the local daemon is STOPPED or BANNED, we verify that the databases are also frozen and that the recmode is set to active. */ @@ -2608,10 +2615,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, */ rec->priority_time = timeval_current(); - ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n")); - } if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n")); -- 2.9.4 From f741c6dd7b8b66c5aa66d69918499b27dc131f38 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Thu, 22 Jun 2017 16:15:47 +1000 Subject: [PATCH 6/6] ctdb-recovery: Do not run local ip verification when in recovery BUG: https://bugzilla.samba.org/show_bug.cgi?id=12857 If we drop public IPs because CTDB is in recovery for too long, then avoid spamming logs "Trigger takeoverrun" every second. Signed-off-by: Amitay Isaacs Reviewed-by: Martin Schwenke (cherry picked from commit ea46699b27ef8d4ac7b5dd07035465cb3df09ea4) --- ctdb/server/ctdb_recoverd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index c7f31e3..f947ee0 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2658,9 +2658,11 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, return; } - /* Check if an IP takeover run is needed and trigger one if - * necessary */ - verify_local_ip_allocation(ctdb, rec, pnn, nodemap); + if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) { + /* Check if an IP takeover run is needed and trigger one if + * necessary */ + verify_local_ip_allocation(ctdb, rec, pnn, nodemap); + } /* if we are not the recmaster then we do not need to check if recovery is needed -- 2.9.4