From 9d96c73f5244d5bbf10eb8f8439b18a26bce4ed4 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 16:28:28 +1100 Subject: [PATCH 1/6] ctdb-recoverd: Free cluster mutex handler on failure to take lock If nested events occur while the file descriptor handler is still active then chaos can ensue. For example, if a node is banned and the lock is explicitly cancelled (e.g. due to election loss) then double-talloc-free()s abound. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 621658cbed5d91d7096fc208bac2ff93a1880e7d) --- ctdb/server/ctdb_recoverd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 673c99c3d34..20b6e44e7ae 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -897,6 +897,16 @@ static void take_reclock_handler(char status, struct ctdb_recovery_lock_handle *s = (struct ctdb_recovery_lock_handle *) private_data; + s->locked = (status == '0') ; + + /* + * If unsuccessful then ensure the process has exited and that + * the file descriptor event handler has been cancelled + */ + if (! s->locked) { + TALLOC_FREE(s->h); + } + switch (status) { case '0': s->latency = latency; @@ -912,7 +922,6 @@ static void take_reclock_handler(char status, } s->done = true; - s->locked = (status == '0') ; } static bool ctdb_recovery_lock(struct ctdb_recoverd *rec); -- 2.20.1 From 484d55714db1a687b91406a52b01083e8710d0a1 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 16:36:13 +1100 Subject: [PATCH 2/6] ctdb-recoverd: Clean up logging on failure to take recovery lock Add an explicit case for a timeout and clean up the other messages. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 7e4aae6943291c3144c8a3ff97537e8d4c7dc7c9) --- ctdb/server/ctdb_recoverd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 20b6e44e7ae..ed055bdcdfe 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -913,12 +913,15 @@ static void take_reclock_handler(char status, break; case '1': - DEBUG(DEBUG_ERR, - ("Unable to take recovery lock - contention\n")); + D_ERR("Unable to take recovery lock - contention\n"); + break; + + case '2': + D_ERR("Unable to take recovery lock - timeout\n"); break; default: - DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n")); + D_ERR("Unable to take recover lock - unknown error\n"); } s->done = true; -- 2.20.1 From 5d51ee162e76b543b2aebd950005e590dbac1365 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 10 Jan 2019 13:24:34 +1100 Subject: [PATCH 3/6] ctdb-recoverd: Make recoverd context available in recovery lock handle BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit c0fb62ed3954fc6e8667480aba92003fc270f257) --- ctdb/server/ctdb_recoverd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index ed055bdcdfe..560e632d028 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -888,6 +888,7 @@ struct ctdb_recovery_lock_handle { bool locked; double latency; struct ctdb_cluster_mutex_handle *h; + struct ctdb_recoverd *rec; }; static void take_reclock_handler(char status, @@ -955,6 +956,8 @@ static bool ctdb_recovery_lock(struct ctdb_recoverd *rec) return false; }; + s->rec = rec; + h = ctdb_cluster_mutex(s, ctdb, ctdb->recovery_lock, -- 2.20.1 From 5346bcad893931d8242ab2789a8b2671f252b02a Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 10 Jan 2019 14:01:57 +1100 Subject: [PATCH 4/6] ctdb-recoverd: Ban node on unknown error when taking recovery lock We really shouldn't see unknown errors. They probably represent a misconfigured recovery lock or similar. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 45a77d65b2e39b4af94da4ab99575f4ee08a7ebd) --- ctdb/server/ctdb_recoverd.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 560e632d028..47ccda15a1a 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -923,6 +923,17 @@ static void take_reclock_handler(char status, default: D_ERR("Unable to take recover lock - unknown error\n"); + + { + struct ctdb_recoverd *rec = s->rec; + struct ctdb_context *ctdb = rec->ctdb; + uint32_t pnn = ctdb_get_pnn(ctdb); + + D_ERR("Banning this node\n"); + ctdb_ban_node(rec, + pnn, + ctdb->tunable.recovery_ban_period); + } } s->done = true; -- 2.20.1 From ce1a244e9f89122d2eaa54549583c70b73d82023 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 22 Feb 2019 15:09:33 +1100 Subject: [PATCH 5/6] ctdb-recoverd: Time out attempt to take recovery lock after 120s Currently this will wait forever. It really needs a timeout in case the cluster filesystem (or other lock mechanism) is completely wedged. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs (cherry picked from commit 13a1a4808935290dceb219daccd7aac3fda4e184) --- ctdb/server/ctdb_recoverd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 47ccda15a1a..469fd21b463 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -972,7 +972,7 @@ static bool ctdb_recovery_lock(struct ctdb_recoverd *rec) h = ctdb_cluster_mutex(s, ctdb, ctdb->recovery_lock, - 0, + 120, take_reclock_handler, s, lost_reclock_handler, -- 2.20.1 From 20ddcf56504dc0efd3c331cba8cf710c87d7982a Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 21 Jan 2019 12:16:43 +1100 Subject: [PATCH 6/6] ctdb-cluster-mutex: Separate out command and file handling This code is difficult to read and there really is no common code between the 2 cases. For example, there is no need to split a filename into words. Separating each of the 2 cases into its own function makes the logic much easier to understand. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13800 Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs Autobuild-User(master): Amitay Isaacs Autobuild-Date(master): Mon Feb 25 03:40:16 CET 2019 on sn-devel-144 (cherry picked from commit c93430fe8fe530a55b9a04cf6cc660c3d420e333) (cherry picked from commit d5131afc533102ed5adfb147bf1a316e51810729) --- ctdb/server/ctdb_cluster_mutex.c | 113 +++++++++++++++++++------------ 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/ctdb/server/ctdb_cluster_mutex.c b/ctdb/server/ctdb_cluster_mutex.c index 330d5fd1d90..2e3cb8112ad 100644 --- a/ctdb/server/ctdb_cluster_mutex.c +++ b/ctdb/server/ctdb_cluster_mutex.c @@ -118,72 +118,101 @@ static void cluster_mutex_handler(struct tevent_context *ev, static char cluster_mutex_helper[PATH_MAX+1] = ""; -static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx, - const char *argstring, char ***argv) +static bool cluster_mutex_helper_args_file(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) { - int nargs, i, ret, n; - bool is_command = false; + bool ok; char **args = NULL; - char *strv = NULL; - char *t = NULL; - if (argstring != NULL && argstring[0] == '!') { - /* This is actually a full command */ - is_command = true; - t = discard_const(&argstring[1]); - } else { - is_command = false; - t = discard_const(argstring); + ok = ctdb_set_helper("cluster mutex helper", + cluster_mutex_helper, + sizeof(cluster_mutex_helper), + "CTDB_CLUSTER_MUTEX_HELPER", + CTDB_HELPER_BINDIR, + "ctdb_mutex_fcntl_helper"); + if (! ok) { + DBG_ERR("ctdb exiting with error: " + "Unable to set cluster mutex helper\n"); + exit(1); } - ret = strv_split(mem_ctx, &strv, t, " \t"); - if (ret != 0) { - DEBUG(DEBUG_ERR, - ("Unable to parse mutex helper string \"%s\" (%s)\n", - argstring, strerror(ret))); + + /* Array includes default helper, file and NULL */ + args = talloc_array(mem_ctx, char *, 3); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); return false; } - n = strv_count(strv); - args = talloc_array(mem_ctx, char *, n + (is_command ? 1 : 2)); + args[0] = cluster_mutex_helper; - if (args == NULL) { - DEBUG(DEBUG_ERR,(__location__ " out of memory\n")); + args[1] = talloc_strdup(args, argstring); + if (args[1] == NULL) { + DBG_ERR("Memory allocation error\n"); return false; } - nargs = 0; - - if (! is_command) { - if (!ctdb_set_helper("cluster mutex helper", - cluster_mutex_helper, - sizeof(cluster_mutex_helper), - "CTDB_CLUSTER_MUTEX_HELPER", - CTDB_HELPER_BINDIR, - "ctdb_mutex_fcntl_helper")) { - DEBUG(DEBUG_ERR,("ctdb exiting with error: %s\n", - __location__ - " Unable to set cluster mutex helper\n")); - exit(1); - } + args[2] = NULL; + + *argv = args; + return true; +} - args[nargs++] = cluster_mutex_helper; +static bool cluster_mutex_helper_args_cmd(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + int i, ret, n; + char **args = NULL; + char *strv = NULL; + char *t = NULL; + + ret = strv_split(mem_ctx, &strv, argstring, " \t"); + if (ret != 0) { + D_ERR("Unable to parse mutex helper command \"%s\" (%s)\n", + argstring, + strerror(ret)); + return false; } + n = strv_count(strv); + + /* Extra slot for NULL */ + args = talloc_array(mem_ctx, char *, n + 1); + if (args == NULL) { + DBG_ERR("Memory allocation error\n"); + return false; + } + + talloc_steal(args, strv); t = NULL; - for (i = 0; i < n; i++) { - /* Don't copy, just keep cmd_args around */ + for (i = 0 ; i < n; i++) { t = strv_next(strv, t); - args[nargs++] = t; + args[i] = t; } - /* Make sure last argument is NULL */ - args[nargs] = NULL; + args[n] = NULL; *argv = args; return true; } +static bool cluster_mutex_helper_args(TALLOC_CTX *mem_ctx, + const char *argstring, + char ***argv) +{ + bool ok; + + if (argstring != NULL && argstring[0] == '!') { + ok = cluster_mutex_helper_args_cmd(mem_ctx, &argstring[1], argv); + } else { + ok = cluster_mutex_helper_args_file(mem_ctx, argstring, argv); + } + + return ok; +} + struct ctdb_cluster_mutex_handle * ctdb_cluster_mutex(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, -- 2.20.1