From 0f433da363fd85d2d2faeeb17cca745c7b57755c Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:36:00 +0100 Subject: [PATCH 1/4] ctdb: rename ctdb_tcp_stop_connection() to ctdb_tcp_stop_outgoing_connection() No change in behaviour, just a function rename that prepares for adding a new function ctdb_tcp_stop_connection() that will also tear down any incoming connection. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/ctdb_tcp.h | 2 +- ctdb/tcp/tcp_connect.c | 14 +++++++------- ctdb/tcp/tcp_init.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index daabad74297..5e11dab1156 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); -void ctdb_tcp_stop_connection(struct ctdb_node *node); +void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 559442f14bf..da1b1df3b93 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -38,9 +38,9 @@ #include "ctdb_tcp.h" /* - stop any connecting (established or pending) to a node + stop any outgoing connection (established or pending) to a node */ -void ctdb_tcp_stop_connection(struct ctdb_node *node) +void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) { struct ctdb_tcp_node *tnode = talloc_get_type( node->transport_data, struct ctdb_tcp_node); @@ -69,7 +69,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); } - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, 
timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); @@ -97,7 +97,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); if (ret != 0 || error != 0) { - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), ctdb_tcp_node_connect, node); @@ -134,7 +134,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, node->name); if (tnode->out_queue == NULL) { DBG_ERR("Failed to set up outgoing queue\n"); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), @@ -174,7 +174,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, ctdb_sock_addr sock_out; int ret; - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); sock_out = node->address; @@ -258,7 +258,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, return; failed: - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, timeval_current_ofs(1, 0), diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 559ad8691d0..967eb3ee494 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - ctdb_tcp_stop_connection(node); + ctdb_tcp_stop_outgoing_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From 934e7f79a182568620ea88f480cb2e42b58c6f87 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:38:28 +0100 Subject: [PATCH 2/4] ctdb: add ctdb_tcp_stop_connection() BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- 
ctdb/tcp/ctdb_tcp.h | 1 + ctdb/tcp/tcp_connect.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h index 5e11dab1156..6e0a9af9ab2 100644 --- a/ctdb/tcp/ctdb_tcp.h +++ b/ctdb/tcp/ctdb_tcp.h @@ -48,6 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *private_data); void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); +void ctdb_tcp_stop_connection(struct ctdb_node *node); void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); #define CTDB_TCP_ALIGNMENT 8 diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index da1b1df3b93..7da5f6f6870 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -54,6 +54,17 @@ void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) } } +/* + stop both incoming and outgoing connection (established or pending) to a node + */ +void ctdb_tcp_stop_connection(struct ctdb_node *node) +{ + struct ctdb_tcp_node *tnode = talloc_get_type( + node->transport_data, struct ctdb_tcp_node); + + ctdb_tcp_stop_outgoing_connection(node); + TALLOC_FREE(tnode->in_queue); +} /* called when a complete packet has come in - should not happen on this socket -- 2.24.1 From 7eb82b9b9c5ef5ef5291901271ffeed15fde1b8a Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:39:07 +0100 Subject: [PATCH 3/4] ctdb: use ctdb_tcp_stop_connection() in ctdb_tcp_restart() This fixes a regression introduced by commit d0baad257e511280ff3e5c7372c38c43df841070 as part of the fixes for bug 14175. 
The scenario that triggers this is: - hard power off of a node A - all other nodes in the cluster fail to free struct ctdb_tcp_node.in_queue - restart node A and start ctdb - node A connects to other nodes but the other nodes reject the incoming connection with Feb 21 13:47:13 somenode ctdbd[302424]: ctdb_listen_event: Incoming queue active, rejecting connection from SOMEIP struct ctdb_tcp_node.in_queue is only ever freed in the fd readable handler ctdb_tcp_read_cb(), but this never gets called as the TCP stacks on the nodes don't notice the connection is dead. ctdb sets SO_KEEPALIVE on the socket, but the default timeout for tcp_keepalive_time is 2 hours. BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c index 967eb3ee494..559ad8691d0 100644 --- a/ctdb/tcp/tcp_init.c +++ b/ctdb/tcp/tcp_init.c @@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); - ctdb_tcp_stop_outgoing_connection(node); + ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_zero(), -- 2.24.1 From c18d11b225976bdfc8781aa0dc9c39b612d91085 Mon Sep 17 00:00:00 2001 From: Ralph Boehme Date: Fri, 28 Feb 2020 11:40:44 +0100 Subject: [PATCH 4/4] ctdb: use ctdb_tcp_stop_connection() in ctdb_tcp_tnode_cb() ctdb_tcp_tnode_cb() gets called when we receive data on the outgoing connection. Instead of only tearing down the outgoing connection, we'd better tear down the outgoing *and* incoming connections to the node. 
BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 Signed-off-by: Ralph Boehme --- ctdb/tcp/tcp_connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c index 7da5f6f6870..f1fc745cbab 100644 --- a/ctdb/tcp/tcp_connect.c +++ b/ctdb/tcp/tcp_connect.c @@ -80,7 +80,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) node->ctdb->upcalls->node_dead(node); } - ctdb_tcp_stop_outgoing_connection(node); + ctdb_tcp_stop_connection(node); tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, timeval_current_ofs(3, 0), ctdb_tcp_node_connect, node); -- 2.24.1