From 858729ee6dd00aa92cd2cafd14510f6eeabdf5e4 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:36:00 +0100 Subject: [PATCH 1/9] ctdb: rename ctdb_tcp_stop_connection() to ctdb_tcp_stop_outgoing_connection() No change in behavour, just a function rename that prepares for adding a new function ctdb_tcp_stop_connection() that will also tear down any incoming connection. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/ctdb_tcp.h | 2 +- ctdb/tcp/tcp_connect.c | 14 +++++++------- ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index daabad74297..5e11dab1156 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); -void ctdb_tcp_stop_connection(struct ctdb_node *node); +void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 559442f14bf..da1b1df3b93 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -38,9 +38,9 @@ #include "ctdb_tcp.h" /* - stop any connecting (established or pending) to a node + stop any outgoing connection (established or pending) to a node */ -void ctdb_tcp_stop_connection(struct ctdb_node *node) +void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) { struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); @@ -69,7 +69,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); } - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); @@ -97,7 +97,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); if (ret != 0 || error != 0) { - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), ctdb_tcp_node_connect, node); @@ -134,7 +134,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, node->name); if (tnode->out_queue == NULL) { DBG_ERR("Failed to set up outgoing queue\n"); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), @@ -174,7 +174,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); sock_out = node->address; @@ -258,7 +258,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, return; failed: - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 559ad8691d0..967eb3ee494 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From 94dc96863646b1cb019272c0d7fdcbcf5fc6c033 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:38:28 +0100 Subject: [PATCH 2/9] ctdb: add ctdb_tcp_stop_connection() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 5e11dab1156..6e0a9af9ab2 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,6 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); +void ctdb_tcp_stop_connection(struct ctdb_node *node); void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index da1b1df3b93..7da5f6f6870 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,17 @@ void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) } } +/* + stop both incoming and outgoing connection (established or pending) to a node + */ +void ctdb_tcp_stop_connection(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->transport_data, struct ctdb_tcp_node); + + ctdb_tcp_stop_outgoing_connection(node); + TALLOC_FREE(tnode->in_queue); +} /* called when a complete packet has come in - should not happen on this socket -- 2.24.1 From 485cb66b537f5c2a4d3cc303c14821130d885544 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 11:54:51 +0100 Subject: [PATCH 3/9] ctdb: add ctdb_tcp_stop_incoming_connection() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 6e0a9af9ab2..b3363de087f 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -50,6 +50,7 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); void ctdb_tcp_stop_connection(struct ctdb_node *node); void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); +void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 7da5f6f6870..7093aa10feb 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,17 @@ void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) } } +/* + stop incoming connection to a node + */ +void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->transport_data, struct ctdb_tcp_node); + + TALLOC_FREE(tnode->in_queue); +} + /* stop both incoming and outgoing connection (established or pending) to a node */ -- 2.24.1 From 2c6657fa35b633c384107b59c875b241b1d66b35 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 11:55:12 +0100 Subject: [PATCH 4/9] ctdb: use ctdb_tcp_stop_incoming_connection() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_connect.c | 5 +---- ctdb/tcp/tcp_io.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 7093aa10feb..a538ad45590 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -70,11 +70,8 @@ void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node) */ void ctdb_tcp_stop_connection(struct ctdb_node *node) { - struct ctdb_tcp_node *tnode = talloc_get_type( - node->transport_data, struct ctdb_tcp_node); - ctdb_tcp_stop_outgoing_connection(node); - TALLOC_FREE(tnode->in_queue); + ctdb_tcp_stop_incoming_connection(node); } /* diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c index df9ca02b413..78fbbf4522b 100644 --- a/ctdb/tcp/tcp_io.c +++ b/ctdb/tcp/tcp_io.c @@ -75,7 +75,7 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) return; failed: - TALLOC_FREE(tnode->in_queue); + ctdb_tcp_stop_incoming_connection(node); node->ctdb->upcalls->node_dead(node); TALLOC_FREE(data); -- 2.24.1 From 70a568c0eb6b3809bf73c2a9b71d8f4b235058eb Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:39:07 +0100 Subject: [PATCH 5/9] ctdb: use ctdb_tcp_stop_connection() in ctdb_tcp_restart() This fixes a regression introduced by commit d0baad257e511280ff3e5c7372c38c43df841070 as part of the fixes for bug 14175. The scenario that triggers this is: - hard power off of a node A - all other nodes in the cluster fail to free struct ctdb_tcp_node.in_queue - restart node A and start ctdb - node A connect to other nodes but the other nodes reject the incoming connection with Feb 21 13:47:13 somenode ctdbd[302424]: ctdb_listen_event: Incoming queue active, rejecting connection from SOMEIP struct ctdb_tcp_node.in_queue is only ever freed in the fd readable handler ctdb_tcp_read_cb(), but this gets never called as the TCP stacks on the nodes doesn't notice the connection is dead. ctdb sets SO_KEEPALIVE on the socket, but the default timeout for tcp_keepalive_time is 2 hours. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 967eb3ee494..559ad8691d0 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - ctdb_tcp_stop_outgoing_connection(node); + ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From bc1d690e4a30b72e4e23fd09d3d563c6425d949b Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:40:44 +0100 Subject: [PATCH 6/9] ctdb: use ctdb_tcp_stop_connection() in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() get called when we receive data on the outgoing connection. Intead of only tearing down the outgoing connection, we better tear down outgoing *and* incoming connection to the node. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index a538ad45590..904c625ccfe 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -88,7 +88,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); } - ctdb_tcp_stop_outgoing_connection(node); + ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); -- 2.24.1 From f1d7b879b8ff7e17cf4b1bc4cc005788a7106218 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:13:12 +0100 Subject: [PATCH 7/9] ctdb: always call node_dead_() upcall in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() is called when we receive data on the outgoing connection. This can happen when we get an EOF on the connection because the other side as closed. In this case data will be NULL. It would also be called if we received data from the peer. In this case data will not be NULL. The latter case is a fatal error though and we already call ctdb_tcp_stop_connection() for this case as well, which means even though the node is not fully connected anymore, by not calling the node_dead() upcall NODE_FLAGS_DISCONNECTED will not be set. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_connect.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 904c625ccfe..89e93119cbb 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -84,9 +84,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); - if (data == NULL) { - node->ctdb->upcalls->node_dead(node); - } + node->ctdb->upcalls->node_dead(node); ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, -- 2.24.1 From c6dfdc26b9f56d8b3357b86999f810d2fbd6ed6f Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:26:19 +0100 Subject: [PATCH 8/9] ctdb: ctdb_node_dead: ensure restart() callback is called in half-connected state If NODE_FLAGS_DISCONNECTED is set the node can be in half-connected state. With this change we ensure to restart the transport for this case. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/server/ctdb_server.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c index 0d5451d62a8..0bbd3751b51 100644 --- a/ctdb/server/ctdb_server.c +++ b/ctdb/server/ctdb_server.c @@ -301,6 +301,12 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) */ void ctdb_node_dead(struct ctdb_node *node) { + if (node->ctdb->methods == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); + return; + } + + node->ctdb->methods->restart(node); if (node->flags & NODE_FLAGS_DISCONNECTED) { DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", node->ctdb->name, node->name, @@ -315,13 +321,6 @@ void ctdb_node_dead(struct ctdb_node *node) DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", node->ctdb->name, node->name, node->ctdb->num_connected)); ctdb_daemon_cancel_controls(node->ctdb, node); - - if (node->ctdb->methods == NULL) { - DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); - return; - } - - node->ctdb->methods->restart(node); } /* -- 2.24.1 From b06f4cd07d06818f618a253dd689b7eee12c18a7 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Sat, 29 Feb 2020 12:28:20 +0100 Subject: [PATCH 9/9] ctdb: remove redundant calls to ctdb_tcp_stop_*connection() This is now reliably handled by the node_dead() upcall. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_connect.c | 1 - ctdb/tcp/tcp_io.c | 1 - 2 files changed, 2 deletions(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 89e93119cbb..d2af589a03d 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -86,7 +86,6 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); - ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c index 78fbbf4522b..bcb18fbf300 100644 --- a/ctdb/tcp/tcp_io.c +++ b/ctdb/tcp/tcp_io.c @@ -75,7 +75,6 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) return; failed: - ctdb_tcp_stop_incoming_connection(node); node->ctdb->upcalls->node_dead(node); TALLOC_FREE(data); -- 2.24.1