From 5428312aa78fd8dbe6248c9052f5d3a8ae3a01bb Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:26:19 +0100 Subject: [PATCH 1/9] ctdb-daemon: ensure restart() callback is called in half-connected state If NODE_FLAGS_DISCONNECTED is set the node can be in half-connected state. With this change we ensure to restart the transport for this case. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit 6a4fa0785fc83561939fa41617d526eb96c1af89) --- ctdb/server/ctdb_server.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 0d5451d62a8..0bbd3751b51 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -301,6 +301,12 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) */ void ctdb_node_dead(struct ctdb_node *node) { + if (node->ctdb->methods == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + return; + } + + node->ctdb->methods->restart(node); if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, @@ -315,13 +321,6 @@ void ctdb_node_dead(struct ctdb_node *node) DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); ctdb_daemon_cancel_controls(node->ctdb, node); - - if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); - return; - } - - node->ctdb->methods->restart(node); } /* -- 2.24.1 From edba51fad9915165e49f4874f141394ce04a5462 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 1 Mar 2020 16:40:41 +1100 Subject: [PATCH 2/9] ctdb-daemon: more logical whitespace, debug modernisation BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 
Signed-off-by: Martin Schwenke Reviewed-by: Ralph Boehme (cherry picked from commit 15762a34559599cf908e30651a2d4c11560068ed) --- ctdb/server/ctdb_server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 0bbd3751b51..1470b00dba5 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -302,11 +302,11 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) void ctdb_node_dead(struct ctdb_node *node) { if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + DBG_ERR("Can not restart transport while shutting down\n"); return; } - node->ctdb->methods->restart(node); + if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, -- 2.24.1 From 3a4a55222afcd832c477648a18ff7056d946dcc4 Mon Sep 17 00:00:00 2001 From: Noel Power Date: Sat, 29 Feb 2020 15:49:28 +0000 Subject: [PATCH 3/9] ctdb-tcp: move free of inbound queue to TCP restart Since commit 77deaadca8e8dbc3c92ea16893099c72f6dc874e, a nodeA which had previously accepted a connection from nodeB (where nodeB dies e.g. as a result of fencing) when nodeB attempts to connect again after restarting is always rejected with ctdb_listen_event: Incoming queue active, rejecting connection from w.x.y.z messages. Consolidate dead node handling in the TCP restart handling. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Noel Power Reviewed-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit 0ff1b78fc2f0491f9e11131d0040bdaba8873770) --- ctdb/tcp/tcp_init.c | 2 +- ctdb/tcp/tcp_io.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 559ad8691d0..dbf6c4b9bcf 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->transport_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - + TALLOC_FREE(tnode->in_queue); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c index df9ca02b413..bcb18fbf300 100644 --- a/ctdb/tcp/tcp_io.c +++ b/ctdb/tcp/tcp_io.c @@ -75,7 +75,6 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) return; failed: - TALLOC_FREE(tnode->in_queue); node->ctdb->upcalls->node_dead(node); TALLOC_FREE(data); -- 2.24.1 From f4080e9874f47540e912cba56d04266ec9e64948 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:13:12 +0100 Subject: [PATCH 4/9] ctdb-tcp: always call node_dead() upcall in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() is called when we receive data on the outgoing connection. This can happen when we get an EOF on the connection because the other side has closed. In this case data will be NULL. It would also be called if we received data from the peer. In this case data will not be NULL. The latter case is a fatal error though and we already call ctdb_tcp_stop_connection() for this case as well, which means even though the node is not fully connected anymore, by not calling the node_dead() upcall NODE_FLAGS_DISCONNECTED will not be set. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit b83ef98c7466b2a81968555de83fb977bb6ca9f0) --- ctdb/tcp/tcp_connect.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 559442f14bf..ea98e6126a6 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -65,9 +65,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); - if (data == NULL) { - node->ctdb->upcalls->node_dead(node); - } + node->ctdb->upcalls->node_dead(node); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.24.1 From 8c24a722a27faaea7d14f48f9f20966eccd7e516 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:28:20 +0100 Subject: [PATCH 5/9] ctdb-tcp: Remove redundant restart in ctdb_tcp_tnode_cb() The node dead upcall has already restarted the outgoing connection. There's no need to repeat it. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Signed-off-by: Martin Schwenke (cherry picked from commit ea37ecdcd5960311f54a7a5510b88a654da23daa) --- ctdb/tcp/tcp_connect.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index ea98e6126a6..805fba5c27a 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -62,15 +62,9 @@ void ctdb_tcp_stop_connection(struct ctdb_node *node) void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) { struct ctdb_node *node = talloc_get_type(private_data, struct ctdb_node); - struct ctdb_tcp_node *tnode = talloc_get_type( - node->transport_data, struct ctdb_tcp_node); node->ctdb->upcalls->node_dead(node); - ctdb_tcp_stop_connection(node); - tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, - timeval_current_ofs(3, 0), - ctdb_tcp_node_connect, node); TALLOC_FREE(data); } -- 2.24.1 From 0e969f6a96608bb26968acc71d3b459d03e1f4a1 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:36:00 +0100 Subject: [PATCH 6/9] ctdb-tcp: rename ctdb_tcp_stop_connection() to ctdb_tcp_stop_outgoing() No change in behaviour. This makes the code self-documenting. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke (cherry picked from commit 1e2a967ff41cc29c3a0d7f61a46937c68fdb90ba) --- ctdb/tcp/ctdb_tcp.h | 2 +- ctdb/tcp/tcp_connect.c | 12 ++++++------ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index daabad74297..095056e8544 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); -void ctdb_tcp_stop_connection(struct ctdb_node *node); +void ctdb_tcp_stop_outgoing(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 805fba5c27a..757c41dc703 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -38,9 +38,9 @@ #include "ctdb_tcp.h" /* - stop any connecting (established or pending) to a node + stop any outgoing connection (established or pending) to a node */ -void ctdb_tcp_stop_connection(struct ctdb_node *node) +void ctdb_tcp_stop_outgoing(struct ctdb_node *node) { struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); @@ -89,7 +89,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); if (ret != 0 || error != 0) { - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), ctdb_tcp_node_connect, node); @@ -126,7 +126,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, node->name); if (tnode->out_queue == NULL) { DBG_ERR("Failed to set up outgoing queue\n"); - ctdb_tcp_stop_connection(node); + 
ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), @@ -166,7 +166,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); sock_out = node->address; @@ -250,7 +250,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, return; failed: - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index dbf6c4b9bcf..c6acb166807 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); TALLOC_FREE(tnode->in_queue); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From dfb23700e26cef0fcecab562241665dbeb9f4399 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 11:54:51 +0100 Subject: [PATCH 7/9] ctdb-tcp: add ctdb_tcp_stop_incoming() No change in behaviour. This makes the code self-documenting. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Signed-off-by: Martin Schwenke (cherry picked from commit 2c73dbafba50b28e72a8ec7b4382fae42fca6d17) --- ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 10 ++++++++++ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 095056e8544..cb8d66fa5dc 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -49,6 +49,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); void ctdb_tcp_stop_outgoing(struct ctdb_node *node); +void ctdb_tcp_stop_incoming(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 757c41dc703..4732a8d612b 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,16 @@ void ctdb_tcp_stop_outgoing(struct ctdb_node *node) } } +/* + stop incoming connection to a node + */ +void ctdb_tcp_stop_incoming(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->transport_data, struct ctdb_tcp_node); + + TALLOC_FREE(tnode->in_queue); +} /* called when a complete packet has come in - should not happen on this socket diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index c6acb166807..97ebe1d887a 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->transport_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - TALLOC_FREE(tnode->in_queue); + ctdb_tcp_stop_incoming(node); ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.24.1 From 5908fd88d580c3b4430268326cc8e2f30339d10c Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 
Mar 2020 15:59:32 +1100 Subject: [PATCH 8/9] ctdb-tcp: Factor out function ctdb_tcp_start_outgoing() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke (cherry picked from commit 3c8747fe29486a4f95308b335a5e3ec1807f62cb) --- ctdb/tcp/tcp_connect.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 4732a8d612b..6576f3362cd 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -162,11 +162,8 @@ static void ctdb_node_connect_write(struct tevent_context *ev, /* called when we should try and establish a tcp connection to a node */ -void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, - struct timeval t, void *private_data) +static void ctdb_tcp_start_outgoing(struct ctdb_node *node) { - struct ctdb_node *node = talloc_get_type(private_data, - struct ctdb_node); struct ctdb_tcp_node *tnode = talloc_get_type(node->transport_data, struct ctdb_tcp_node); struct ctdb_context *ctdb = node->ctdb; @@ -176,8 +173,6 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_outgoing(node); - sock_out = node->address; tnode->out_fd = socket(sock_out.sa.sa_family, SOCK_STREAM, IPPROTO_TCP); @@ -268,6 +263,18 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, node); } +void ctdb_tcp_node_connect(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); + + ctdb_tcp_stop_outgoing(node); + ctdb_tcp_start_outgoing(node); +} + /* called when we get contacted by another node currently makes no attempt to check if the connection is really from a ctdb -- 2.24.1 From 9e838e9943b54dfa98f05e7fed64e25029e42734 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 Mar 2020 
16:11:23 +1100 Subject: [PATCH 9/9] ctdb-tcp: Do not stop outbound connection in ctdb_tcp_node_connect() The only place the outgoing connection needs to be stopped is when there is a timeout when waiting for the connection to become writable. Add a new function ctdb_tcp_node_connect_timeout() to handle this case. All of the other cases are attempts to establish a new outgoing connection (initial attempt, retry after an error or disconnect, ...) so drop stopping the connection in those cases. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke Autobuild-User(master): Martin Schwenke Autobuild-Date(master): Thu Mar 12 05:29:20 UTC 2020 on sn-devel-184 (cherry picked from commit 319c93f0c6a949545229b616dfbd4f51baf11171) --- ctdb/tcp/tcp_connect.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 6576f3362cd..f7703b77f0d 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -159,6 +159,11 @@ static void ctdb_node_connect_write(struct tevent_context *ev, } +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data); + /* called when we should try and establish a tcp connection to a node */ @@ -249,7 +254,7 @@ static void ctdb_tcp_start_outgoing(struct ctdb_node *node) tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), - ctdb_tcp_node_connect, + ctdb_tcp_node_connect_timeout, node); return; @@ -271,6 +276,17 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct ctdb_node *node = talloc_get_type_abort(private_data, struct ctdb_node); + ctdb_tcp_start_outgoing(node); +} + +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); 
+ ctdb_tcp_stop_outgoing(node); ctdb_tcp_start_outgoing(node); } -- 2.24.1