The Samba-Bugzilla – Attachment 15828 Details for
Bug 14295
Starting ctdb node that was powered off hard before results in recovery loop
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
Patch for master v4
bug14295-master-v4.patch (text/plain), 15.53 KB, created by
Ralph Böhme
on 2020-02-29 11:40:40 UTC
(
hide
)
Description:
Patch for master v4
Filename:
MIME Type:
Creator:
Ralph Böhme
Created:
2020-02-29 11:40:40 UTC
Size:
15.53 KB
patch
obsolete
>From 858729ee6dd00aa92cd2cafd14510f6eeabdf5e4 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Fri, 28 Feb 2020 11:36:00 +0100 >Subject: [PATCH 1/9] ctdb: rename ctdb_tcp_stop_connection() to > ctdb_tcp_stop_outgoing_connection() > >No change in behavour, just a function rename that prepares for adding a new >function ctdb_tcp_stop_connection() that will also tear down any incoming >connection. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/ctdb_tcp.h | 2 +- > ctdb/tcp/tcp_connect.c | 14 +++++++------- > ctdb/tcp/tcp_init.c | 2 +- > 3 files changed, 9 insertions(+), 9 deletions(-) > >diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h >index daabad74297..5e11dab1156 100644 >--- a/ctdb/tcp/ctdb_tcp.h >+++ b/ctdb/tcp/ctdb_tcp.h >@@ -48,7 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, > struct timeval t, void *private_data); > void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); > void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); >-void ctdb_tcp_stop_connection(struct ctdb_node *node); >+void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); > > #define CTDB_TCP_ALIGNMENT 8 > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index 559442f14bf..da1b1df3b93 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -38,9 +38,9 @@ > #include "ctdb_tcp.h" > > /* >- stop any connecting (established or pending) to a node >+ stop any outgoing connection (established or pending) to a node > */ >-void ctdb_tcp_stop_connection(struct ctdb_node *node) >+void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) > { > struct ctdb_tcp_node *tnode = talloc_get_type( > node->transport_data, struct ctdb_tcp_node); >@@ -69,7 +69,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) > node->ctdb->upcalls->node_dead(node); > } > >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, > timeval_current_ofs(3, 0), > ctdb_tcp_node_connect, node); >@@ -97,7 +97,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, > > ret = getsockopt(tnode->out_fd, SOL_SOCKET, SO_ERROR, &error, &len); > if (ret != 0 || error != 0) { >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > tnode->connect_te = tevent_add_timer(ctdb->ev, tnode, > timeval_current_ofs(1, 0), > ctdb_tcp_node_connect, node); >@@ -134,7 +134,7 @@ static void ctdb_node_connect_write(struct tevent_context *ev, > node->name); > if (tnode->out_queue == NULL) { > DBG_ERR("Failed to set up outgoing queue\n"); >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > tnode->connect_te = tevent_add_timer(ctdb->ev, > tnode, > timeval_current_ofs(1, 0), >@@ -174,7 +174,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, > ctdb_sock_addr sock_out; > int ret; > >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > > sock_out = node->address; > >@@ -258,7 +258,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, > return; > > failed: >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > tnode->connect_te = tevent_add_timer(ctdb->ev, > tnode, > timeval_current_ofs(1, 0), >diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c >index 559ad8691d0..967eb3ee494 100644 >--- a/ctdb/tcp/tcp_init.c >+++ b/ctdb/tcp/tcp_init.c >@@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) > > DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); > >- ctdb_tcp_stop_connection(node); >+ ctdb_tcp_stop_outgoing_connection(node); > > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, > timeval_zero(), >-- >2.24.1 > > >From 94dc96863646b1cb019272c0d7fdcbcf5fc6c033 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Fri, 28 Feb 2020 11:38:28 +0100 >Subject: [PATCH 2/9] ctdb: add ctdb_tcp_stop_connection() > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/ctdb_tcp.h | 1 + > ctdb/tcp/tcp_connect.c | 11 +++++++++++ > 2 files changed, 12 insertions(+) > >diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h >index 5e11dab1156..6e0a9af9ab2 100644 >--- a/ctdb/tcp/ctdb_tcp.h >+++ b/ctdb/tcp/ctdb_tcp.h >@@ -48,6 +48,7 @@ void ctdb_tcp_node_connect(struct tevent_context *ev, struct tevent_timer *te, > struct timeval t, void *private_data); > void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); > void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); >+void ctdb_tcp_stop_connection(struct ctdb_node *node); > void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); > > #define CTDB_TCP_ALIGNMENT 8 >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index da1b1df3b93..7da5f6f6870 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -54,6 +54,17 @@ void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) > } > } > >+/* >+ stop both incoming and outgoing connection (established or pending) to a node >+ */ >+void ctdb_tcp_stop_connection(struct ctdb_node *node) >+{ >+ struct ctdb_tcp_node *tnode = talloc_get_type( >+ node->transport_data, struct ctdb_tcp_node); >+ >+ ctdb_tcp_stop_outgoing_connection(node); >+ TALLOC_FREE(tnode->in_queue); >+} > > /* > called when a complete packet has come in - should not happen on this socket >-- >2.24.1 > > >From 485cb66b537f5c2a4d3cc303c14821130d885544 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Sat, 29 Feb 2020 11:54:51 +0100 >Subject: [PATCH 3/9] ctdb: add ctdb_tcp_stop_incoming_connection() > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/ctdb_tcp.h | 1 + > ctdb/tcp/tcp_connect.c | 11 +++++++++++ > 2 files changed, 12 insertions(+) > >diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h >index 6e0a9af9ab2..b3363de087f 100644 >--- a/ctdb/tcp/ctdb_tcp.h >+++ b/ctdb/tcp/ctdb_tcp.h >@@ -50,6 +50,7 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args); > void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data); > void ctdb_tcp_stop_connection(struct ctdb_node *node); > void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node); >+void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node); > > #define CTDB_TCP_ALIGNMENT 8 > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index 7da5f6f6870..7093aa10feb 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -54,6 +54,17 @@ void ctdb_tcp_stop_outgoing_connection(struct ctdb_node *node) > } > } > >+/* >+ stop incoming connection to a node >+ */ >+void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node) >+{ >+ struct ctdb_tcp_node *tnode = talloc_get_type( >+ node->transport_data, struct ctdb_tcp_node); >+ >+ TALLOC_FREE(tnode->in_queue); >+} >+ > /* > stop both incoming and outgoing connection (established or pending) to a node > */ >-- >2.24.1 > > >From 2c6657fa35b633c384107b59c875b241b1d66b35 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Sat, 29 Feb 2020 11:55:12 +0100 >Subject: [PATCH 4/9] ctdb: use ctdb_tcp_stop_incoming_connection() > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/tcp_connect.c | 5 +---- > ctdb/tcp/tcp_io.c | 2 +- > 2 files changed, 2 insertions(+), 5 deletions(-) > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index 7093aa10feb..a538ad45590 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -70,11 +70,8 @@ void ctdb_tcp_stop_incoming_connection(struct ctdb_node *node) > */ > void ctdb_tcp_stop_connection(struct ctdb_node *node) > { >- struct ctdb_tcp_node *tnode = talloc_get_type( >- node->transport_data, struct ctdb_tcp_node); >- > ctdb_tcp_stop_outgoing_connection(node); >- TALLOC_FREE(tnode->in_queue); >+ ctdb_tcp_stop_incoming_connection(node); > } > > /* >diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c >index df9ca02b413..78fbbf4522b 100644 >--- a/ctdb/tcp/tcp_io.c >+++ b/ctdb/tcp/tcp_io.c >@@ -75,7 +75,7 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) > return; > > failed: >- TALLOC_FREE(tnode->in_queue); >+ ctdb_tcp_stop_incoming_connection(node); > node->ctdb->upcalls->node_dead(node); > > TALLOC_FREE(data); >-- >2.24.1 > > >From 70a568c0eb6b3809bf73c2a9b71d8f4b235058eb Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Fri, 28 Feb 2020 11:39:07 +0100 >Subject: [PATCH 5/9] ctdb: use ctdb_tcp_stop_connection() in > ctdb_tcp_restart() > >This fixes a regression introduced by commit >d0baad257e511280ff3e5c7372c38c43df841070 as part of the fixes for bug 14175. > >The scenario that triggers this is: > >- hard power off of a node A > >- all other nodes in the cluster fail to free > struct ctdb_tcp_node.in_queue > >- restart node A and start ctdb > >- node A connect to other nodes but the other nodes > reject the incoming connection with > > Feb 21 13:47:13 somenode ctdbd[302424]: ctdb_listen_event: > Incoming queue active, rejecting connection from SOMEIP > >struct ctdb_tcp_node.in_queue is only ever freed in the fd readable handler >ctdb_tcp_read_cb(), but this gets never called as the TCP stacks on the nodes >doesn't notice the connection is dead. ctdb sets SO_KEEPALIVE on the socket, but >the default timeout for tcp_keepalive_time is 2 hours. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/tcp_init.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > >diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c >index 967eb3ee494..559ad8691d0 100644 >--- a/ctdb/tcp/tcp_init.c >+++ b/ctdb/tcp/tcp_init.c >@@ -122,7 +122,7 @@ static void ctdb_tcp_restart(struct ctdb_node *node) > > DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn)); > >- ctdb_tcp_stop_outgoing_connection(node); >+ ctdb_tcp_stop_connection(node); > > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, > timeval_zero(), >-- >2.24.1 > > >From bc1d690e4a30b72e4e23fd09d3d563c6425d949b Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Fri, 28 Feb 2020 11:40:44 +0100 >Subject: [PATCH 6/9] ctdb: use ctdb_tcp_stop_connection() in > ctdb_tcp_tnode_cb() > >ctdb_tcp_tnode_cb() get called when we receive data on the outgoing >connection. Intead of only tearing down the outgoing connection, we better tear >down outgoing *and* incoming connection to the node. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/tcp_connect.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index a538ad45590..904c625ccfe 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -88,7 +88,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) > node->ctdb->upcalls->node_dead(node); > } > >- ctdb_tcp_stop_outgoing_connection(node); >+ ctdb_tcp_stop_connection(node); > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, > timeval_current_ofs(3, 0), > ctdb_tcp_node_connect, node); >-- >2.24.1 > > >From f1d7b879b8ff7e17cf4b1bc4cc005788a7106218 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Sat, 29 Feb 2020 12:13:12 +0100 >Subject: [PATCH 7/9] ctdb: always call node_dead_() upcall in > ctdb_tcp_tnode_cb() > >ctdb_tcp_tnode_cb() is called when we receive data on the outgoing connection. > >This can happen when we get an EOF on the connection because the other side as >closed. In this case data will be NULL. > >It would also be called if we received data from the peer. In this case data >will not be NULL. > >The latter case is a fatal error though and we already call >ctdb_tcp_stop_connection() for this case as well, which means even though the >node is not fully connected anymore, by not calling the node_dead() upcall >NODE_FLAGS_DISCONNECTED will not be set. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/tcp_connect.c | 4 +--- > 1 file changed, 1 insertion(+), 3 deletions(-) > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index 904c625ccfe..89e93119cbb 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -84,9 +84,7 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) > struct ctdb_tcp_node *tnode = talloc_get_type( > node->transport_data, struct ctdb_tcp_node); > >- if (data == NULL) { >- node->ctdb->upcalls->node_dead(node); >- } >+ node->ctdb->upcalls->node_dead(node); > > ctdb_tcp_stop_connection(node); > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, >-- >2.24.1 > > >From c6dfdc26b9f56d8b3357b86999f810d2fbd6ed6f Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Sat, 29 Feb 2020 12:26:19 +0100 >Subject: [PATCH 8/9] ctdb: ctdb_node_dead: ensure restart() callback is called > in half-connected state > >If NODE_FLAGS_DISCONNECTED is set the node can be in half-connected state. With >this change we ensure to restart the transport for this case. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/server/ctdb_server.c | 13 ++++++------- > 1 file changed, 6 insertions(+), 7 deletions(-) > >diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c >index 0d5451d62a8..0bbd3751b51 100644 >--- a/ctdb/server/ctdb_server.c >+++ b/ctdb/server/ctdb_server.c >@@ -301,6 +301,12 @@ void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr) > */ > void ctdb_node_dead(struct ctdb_node *node) > { >+ if (node->ctdb->methods == NULL) { >+ DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); >+ return; >+ } >+ >+ node->ctdb->methods->restart(node); > if (node->flags & NODE_FLAGS_DISCONNECTED) { > DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", > node->ctdb->name, node->name, >@@ -315,13 +321,6 @@ void ctdb_node_dead(struct ctdb_node *node) > DEBUG(DEBUG_ERR,("%s: node %s is dead: %u connected\n", > node->ctdb->name, node->name, node->ctdb->num_connected)); > ctdb_daemon_cancel_controls(node->ctdb, node); >- >- if (node->ctdb->methods == NULL) { >- DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n")); >- return; >- } >- >- node->ctdb->methods->restart(node); > } > > /* >-- >2.24.1 > > >From b06f4cd07d06818f618a253dd689b7eee12c18a7 Mon Sep 17 00:00:00 2001 >From: Ralph Boehme <slow@samba.org> >Date: Sat, 29 Feb 2020 12:28:20 +0100 >Subject: [PATCH 9/9] ctdb: remove redundant calls to > ctdb_tcp_stop_*connection() > >This is now reliably handled by the node_dead() upcall. > >BUG: https://bugzilla.samba.org/show_bug.cgi?id=14295 > >Signed-off-by: Ralph Boehme <slow@samba.org> >--- > ctdb/tcp/tcp_connect.c | 1 - > ctdb/tcp/tcp_io.c | 1 - > 2 files changed, 2 deletions(-) > >diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c >index 89e93119cbb..d2af589a03d 100644 >--- a/ctdb/tcp/tcp_connect.c >+++ b/ctdb/tcp/tcp_connect.c >@@ -86,7 +86,6 @@ void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data) > > node->ctdb->upcalls->node_dead(node); > >- ctdb_tcp_stop_connection(node); > tnode->connect_te = tevent_add_timer(node->ctdb->ev, tnode, > timeval_current_ofs(3, 0), > ctdb_tcp_node_connect, node); >diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c >index 78fbbf4522b..bcb18fbf300 100644 >--- a/ctdb/tcp/tcp_io.c >+++ b/ctdb/tcp/tcp_io.c >@@ -75,7 +75,6 @@ void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args) > return; > > failed: >- ctdb_tcp_stop_incoming_connection(node); > node->ctdb->upcalls->node_dead(node); > > TALLOC_FREE(data); >-- >2.24.1 >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 14295
:
15823
|
15826
|
15827
|
15828
|
15829
|
15830
|
15855
|
15859
|
15860
|
15862
|
15869