From 0e06abb6aae50d7e7c32f66ec85808de0ec046de Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:26:19 +0100 Subject: [PATCH 01/10] ctdb-daemon: ensure restart() callback is called in half-connected state If NODE_FLAGS_DISCONNECTED is set the node can be in half-connected state. With this change we ensure to restart the transport for this case. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke --- ctdb/server/ctdb_server.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 0d5451d62a8..0bbd3751b51 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -301,6 +301,12 @@ done: */ void ctdb_node_dead(struct ctdb_node *node) { + if (node->ctdb->methods == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + return; + } + + node->ctdb->methods->restart(node); if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, @@ -315,13 +321,6 @@ void ctdb_node_dead(struct ctdb_node *node) DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); ctdb_daemon_cancel_controls(node->ctdb, node); - - if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); - return; - } - - node->ctdb->methods->restart(node); } /* -- 2.25.1 From 60aa7ee998818aa4e1bfb9dd14a489d35462bade Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 1 Mar 2020 16:40:41 +1100 Subject: [PATCH 02/10] ctdb-daemon: more logical whitespace, debug modernisation BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Martin Schwenke Reviewed-by: Ralph Boehme --- ctdb/server/ctdb_server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 0bbd3751b51..1470b00dba5 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -302,11 +302,11 @@ done: void ctdb_node_dead(struct ctdb_node *node) { if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + DBG_ERR("Can not restart transport while shutting down\n"); return; } - node->ctdb->methods->restart(node); + if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, -- 2.25.1 From adf1027ccc73a6d9537c4d6d21d7376b47d0a84f Mon Sep 17 00:00:00 2001 From: Noel Power Date: Sat, 29 Feb 2020 15:49:28 +0000 Subject: [PATCH 03/10] ctdb-tcp: move free of inbound queue to TCP restart Since commit 77deaadca8e8dbc3c92ea16893099c72f6dc874e, a nodeA which had previously accepted a connection from nodeB (where nodeB dies e.g. as as result of fencing) when nodeB attempts to connect again after restarting is always rejected with ctdb_listen_event: Incoming queue active, rejecting connection from w.x.y.z messages. Consolidate dead node handling in the TCP restart handling. BUG: https://attachments.samba.org/attachment.cgi?id=15826 Signed-off-by: Noel Power Reviewed-by: Ralph Boehme Reviewed-by: Martin Schwenke --- ctdb/tcp/tcp_init.c | 2 +- ctdb/tcp/tcp_io.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 559ad8691d0..dbf6c4b9bcf 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->transport_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - + TALLOC_FREE(tnode->in_queue); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c index df9ca02b413..bcb18fbf300 100644 --- a/ctdb/tcp/tcp_io.c +++ b/ctdb/tcp/tcp_io.c @@ -75,7 +75,6 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) return; failed: - TALLOC_FREE(tnode->in_queue); node->ctdb->upcalls->node_dead(node); TALLOC_FREE(data); -- 2.25.1 From 0b62b01e7a9ee324186cde8527cc284006c9d185 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:13:12 +0100 Subject: [PATCH 04/10] ctdb-tcp: always call node_dead() upcall in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() is called when we receive data on the outgoing connection. This can happen when we get an EOF on the connection because the other side as closed. In this case data will be NULL. It would also be called if we received data from the peer. In this case data will not be NULL. The latter case is a fatal error though and we already call ctdb_tcp_stop_connection() for this case as well, which means even though the node is not fully connected anymore, by not calling the node_dead() upcall NODE_FLAGS_DISCONNECTED will not be set. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke --- ctdb/tcp/tcp_connect.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 559442f14bf..ea98e6126a6 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -65,9 +65,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); - if (data == NULL) { - node->ctdb->upcalls->node_dead(node); - } + node->ctdb->upcalls->node_dead(node); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.25.1 From 866aad5a1d393c371de4fc8ae78b0580916e541f Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:28:20 +0100 Subject: [PATCH 05/10] ctdb-tcp: remove redundant call to ctdb_tcp_stop_connection() This is now reliably handled by the node_dead() upcall. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke --- ctdb/tcp/tcp_connect.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index ea98e6126a6..25efed2ec0b 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -67,7 +67,6 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); - ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); -- 2.25.1 From 123d5c0c2162dc012420b981bf31a445781e01dc Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Wed, 11 Mar 2020 11:42:01 +1100 Subject: [PATCH 06/10] SQ: Remove redundant restart in ctdb_tcp_tnode_cb() The node dead upcall has already restarted the outgoing connection. There's no need to repeat it. NOTE: tnode->connect_te wil be orphaned without this fixup if ctdb_tcp_node_connect() drops its (usually) unnecessary call to ctdb_tcp_stop_connection(). Signed-off-by: Martin Schwenke --- ctdb/tcp/tcp_connect.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 25efed2ec0b..805fba5c27a 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -62,14 +62,9 @@ void ctdb_tcp_stop_connection(struct ctdb_node *node) void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) { struct ctdb_node *node = talloc_get_type(private_data, struct ctdb_node); - struct ctdb_tcp_node *tnode = talloc_get_type( - node->transport_data, struct ctdb_tcp_node); node->ctdb->upcalls->node_dead(node); - tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, - timeval_current_ofs(3, 0), - ctdb_tcp_node_connect, node); TALLOC_FREE(data); } -- 2.25.1 From cf5883c410591816d6d21fdba6695450055cae11 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:36:00 +0100 Subject: [PATCH 07/10] ctdb-tcp: rename ctdb_tcp_stop_connection() to ctdb_tcp_stop_outgoing() No change in behaviour. This makes the code self-documenting. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Reviewed-by: Martin Schwenke --- ctdb/tcp/ctdb_tcp.h | 2 +- ctdb/tcp/tcp_connect.c | 12 ++++++------ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index daabad74297..095056e8544 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); -void ctdb_tcp_stop_connection(struct ctdb_node *node); +void ctdb_tcp_stop_outgoing(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 805fba5c27a..757c41dc703 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -38,9 +38,9 @@ #include "ctdb_tcp.h" /* - stop any connecting (established or pending) to a node + stop any outgoing connection (established or pending) to a node */ -void ctdb_tcp_stop_connection(struct ctdb_node *node) +void ctdb_tcp_stop_outgoing(struct ctdb_node *node) { struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); @@ -89,7 +89,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); if (ret != 0 || error != 0) { - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), ctdb_tcp_node_connect, node); @@ -126,7 +126,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, node->name); if (tnode->out_queue == NULL) { DBG_ERR("Failed to set up outgoing queue\n"); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), @@ -166,7 +166,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); sock_out = node->address; @@ -250,7 +250,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, return; failed: - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index dbf6c4b9bcf..c6acb166807 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); TALLOC_FREE(tnode->in_queue); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.25.1 From 190d137f1d5b50c4dc3239baaa5a8e7f90319fbb Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 11:54:51 +0100 Subject: [PATCH 08/10] ctdb-tcp: add ctdb_tcp_stop_incoming() No change in behaviour. This makes the code self-documenting. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme Signed-off-by: Martin Schwenke --- ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 10 ++++++++++ ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 095056e8544..cb8d66fa5dc 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -49,6 +49,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); void ctdb_tcp_stop_outgoing(struct ctdb_node *node); +void ctdb_tcp_stop_incoming(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 757c41dc703..4732a8d612b 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,16 @@ void ctdb_tcp_stop_outgoing(struct ctdb_node *node) } } +/* + stop incoming connection to a node + */ +void ctdb_tcp_stop_incoming(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->transport_data, struct ctdb_tcp_node); + + TALLOC_FREE(tnode->in_queue); +} /* called when a complete packet has come in - should not happen on this socket diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index c6acb166807..97ebe1d887a 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -121,7 +121,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) node->transport_data, struct ctdb_tcp_node); DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - TALLOC_FREE(tnode->in_queue); + ctdb_tcp_stop_incoming(node); ctdb_tcp_stop_outgoing(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.25.1 From 7338d6b2b50b07b7fc8b8e8bcb10cca252e40f14 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 Mar 2020 15:59:32 +1100 Subject: [PATCH 09/10] ctdb-tcp: Factor out function ctdb_tcp_start_outgoing() Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke --- ctdb/tcp/tcp_connect.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 4732a8d612b..6576f3362cd 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -162,11 +162,8 @@ static void ctdb_node_connect_write(struct tevent_context *ev, /* called when we should try and establish a tcp connection to a node */ -void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, - struct timeval t, void *private_data) +static void ctdb_tcp_start_outgoing(struct ctdb_node *node) { - struct ctdb_node *node = talloc_get_type(private_data, - struct ctdb_node); struct ctdb_tcp_node *tnode = talloc_get_type(node->transport_data, struct ctdb_tcp_node); struct ctdb_context *ctdb = node->ctdb; @@ -176,8 +173,6 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_outgoing(node); - sock_out = node->address; tnode->out_fd = socket(sock_out.sa.sa_family, SOCK_STREAM, IPPROTO_TCP); @@ -268,6 +263,18 @@ failed: node); } +void ctdb_tcp_node_connect(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); + + ctdb_tcp_stop_outgoing(node); + ctdb_tcp_start_outgoing(node); +} + /* called when we get contacted by another node currently makes no attempt to check if the connection is really from a ctdb -- 2.25.1 From 886440ae0bbd7b0da3a55a0a8ef17b0b24a395ed Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 6 Mar 2020 16:11:23 +1100 Subject: [PATCH 10/10] ctdb-tcp: Do not stop outbound connection in ctdb_tcp_node_connect() The only place the outgoing connection needs to be stopped is when there is a timeout when waiting for the connection to become writable. Add a new function ctdb_tcp_node_connect_timeout() to handle this case. All of the other cases are attempts to establish a new outgoing connection (initial attempt, retry after an error or disconnect, ...) so drop stopping the connection in those cases. Signed-off-by: Amitay Isaacs Signed-off-by: Martin Schwenke --- ctdb/tcp/tcp_connect.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 6576f3362cd..f7703b77f0d 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -159,6 +159,11 @@ static void ctdb_node_connect_write(struct tevent_context *ev, } +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data); + /* called when we should try and establish a tcp connection to a node */ @@ -249,7 +254,7 @@ static void ctdb_tcp_start_outgoing(struct ctdb_node *node) tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), - ctdb_tcp_node_connect, + ctdb_tcp_node_connect_timeout, node); return; @@ -271,6 +276,17 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct ctdb_node *node = talloc_get_type_abort(private_data, struct ctdb_node); + ctdb_tcp_start_outgoing(node); +} + +static void ctdb_tcp_node_connect_timeout(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, + void *private_data) +{ + struct ctdb_node *node = talloc_get_type_abort(private_data, + struct ctdb_node); + ctdb_tcp_stop_outgoing(node); ctdb_tcp_start_outgoing(node); } -- 2.25.1