From f694940f06ceee6aa4ce0696f4bbc9e1f05c235f Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 18 Aug 2016 12:57:33 +1000 Subject: [PATCH] ctdb-ipalloc: Fix cumulative takeover timeout Commit c40fc62642ff5ac49b75e9af49c299e33dbc9073 runs the IP allocation algorithm after calculating the timeout offset. If the algorithm takes a long time then there may be no attempt to release or take over IPs. Instead, reset the timeout just before the RELEASE_IP stage if an early jump to IPREALLOCATED was not taken. BUG: https://bugzilla.samba.org/show_bug.cgi?id=12161 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Thu Aug 18 12:36:37 CEST 2016 on sn-devel-144 (cherry picked from commit 626dcc9e493e2ac4fd502f75c7cb4d29f686f017) --- ctdb/server/ctdb_takeover.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index 6d182de..ea377f8 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -1541,16 +1541,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem return -1; } - /* Each of the later stages (RELEASE_IP, TAKEOVER_IP, - * IPREALLOCATED) notionally has a timeout of TakeoverTimeout - * seconds. However, RELEASE_IP can take longer due to TCP - * connection killing, so sometimes needs more time. - * Therefore, use a cumulative timeout of TakeoverTimeout * 3 - * seconds across all 3 stages. No explicit expiry checks are - * needed before each stage because tevent is smart enough to - * fire the timeouts even if they are in the past. Initialise - * this here to cope with early jumps to IPREALLOCATED. */ - timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout,0); + /* Default timeout for early jump to IPREALLOCATED. See below + * for explanation of 3 times... */ + timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0); /* * ip failover is completely disabled, just send out the @@ -1624,6 +1617,20 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */ + /* Each of the following stages (RELEASE_IP, TAKEOVER_IP, + * IPREALLOCATED) notionally has a timeout of TakeoverTimeout + * seconds. However, RELEASE_IP can take longer due to TCP + * connection killing, so sometimes needs more time. + * Therefore, use a cumulative timeout of TakeoverTimeout * 3 + * seconds across all 3 stages. No explicit expiry checks are + * needed before each stage because tevent is smart enough to + * fire the timeouts even if they are in the past. Initialise + * this here so it explicitly covers the stages we're + * interested in but, in particular, not the time taken by the + * ipalloc(). + */ + timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0); + /* Send a RELEASE_IP to all nodes that should not be hosting * each IP. For each IP, all but one of these will be * redundant. However, the redundant ones are used to tell -- 2.8.1