The Samba-Bugzilla – Attachment 12170 Details for
Bug 11941
CTDB does not ban misbehaving nodes during recovery
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
Patch for v4-4 branch
v4-4.patches (text/plain), 13.79 KB, created by
Amitay Isaacs
on 2016-06-09 04:13:08 UTC
(
hide
)
Description:
Patch for v4-4 branch
Filename:
MIME Type:
Creator:
Amitay Isaacs
Created:
2016-06-09 04:13:08 UTC
Size:
13.79 KB
patch
obsolete
>From 01b32af0d6ccad7bccd515d82f39833d84aa1231 Mon Sep 17 00:00:00 2001 >From: Amitay Isaacs <amitay@gmail.com> >Date: Thu, 17 Mar 2016 17:16:09 +1100 >Subject: [PATCH 1/3] ctdb-protocol: Add srvid for assigning banning credits > >Signed-off-by: Amitay Isaacs <amitay@gmail.com> >Reviewed-by: Martin Schwenke <martin@meltin.net> >(cherry picked from commit fc63eae80b7b521598560b970a4ce10a0838a3ce) >--- > ctdb/protocol/protocol.h | 5 ++++- > ctdb/protocol/protocol_message.c | 12 ++++++++++++ > 2 files changed, 16 insertions(+), 1 deletion(-) > >diff --git a/ctdb/protocol/protocol.h b/ctdb/protocol/protocol.h >index 798c928..aae6347 100644 >--- a/ctdb/protocol/protocol.h >+++ b/ctdb/protocol/protocol.h >@@ -124,6 +124,9 @@ struct ctdb_call { > /* SRVID prefix used during recovery for pulling and pushing databases */ > #define CTDB_SRVID_RECOVERY 0xF001000000000000LL > >+/* SRVID to assign of banning credits */ >+#define CTDB_SRVID_BANNING 0xF002000000000000LL >+ > /* SRVID to inform of election data */ > #define CTDB_SRVID_ELECTION 0xF100000000000000LL > >@@ -993,7 +996,7 @@ union ctdb_message_data { > uint32_t db_id; > /* SRVID_MEM_DUMP, SRVID_TAKEOVER_RUN */ > struct ctdb_srvid_message *msg; >- /* SRVID_REBALANCE_NODE */ >+ /* SRVID_BANNING, SRVID_REBALANCE_NODE */ > uint32_t pnn; > /* SRVID_DISABLE_TAKEOVER_RUNS, SRVID_DISABLE_RECOVERIES */ > struct ctdb_disable_message *disable; >diff --git a/ctdb/protocol/protocol_message.c b/ctdb/protocol/protocol_message.c >index 696367e..615a49f 100644 >--- a/ctdb/protocol/protocol_message.c >+++ b/ctdb/protocol/protocol_message.c >@@ -40,6 +40,10 @@ static size_t ctdb_message_data_len(union ctdb_message_data *mdata, > size_t len = 0; > > switch (srvid) { >+ case CTDB_SRVID_BANNING: >+ len = ctdb_uint32_len(mdata->pnn); >+ break; >+ > case CTDB_SRVID_ELECTION: > len = ctdb_election_message_len(mdata->election); > break; >@@ -114,6 +118,10 @@ static void ctdb_message_data_push(union ctdb_message_data *mdata, > uint64_t srvid, 
uint8_t *buf) > { > switch (srvid) { >+ case CTDB_SRVID_BANNING: >+ ctdb_uint32_push(mdata->pnn, buf); >+ break; >+ > case CTDB_SRVID_ELECTION: > ctdb_election_message_push(mdata->election, buf); > break; >@@ -189,6 +197,10 @@ static int ctdb_message_data_pull(uint8_t *buf, size_t buflen, > int ret = 0; > > switch (srvid) { >+ case CTDB_SRVID_BANNING: >+ ret = ctdb_uint32_pull(buf, buflen, mem_ctx, &mdata->pnn); >+ break; >+ > case CTDB_SRVID_ELECTION: > ret = ctdb_election_message_pull(buf, buflen, mem_ctx, > &mdata->election); >-- >2.5.5 > > >From 85d48ee047651fea799a91a02b4a6a3fead3142a Mon Sep 17 00:00:00 2001 >From: Amitay Isaacs <amitay@gmail.com> >Date: Thu, 17 Mar 2016 17:26:30 +1100 >Subject: [PATCH 2/3] ctdb-recoverd: Add message handler to assigning banning > credits > >This will be called from recovery helper to assign banning credits to >misbehaving node. > >Signed-off-by: Amitay Isaacs <amitay@gmail.com> >Reviewed-by: Martin Schwenke <martin@meltin.net> >(cherry picked from commit ae366fb932e9d42fbde5aa48f04d70e15dc36888) >--- > ctdb/server/ctdb_recoverd.c | 28 ++++++++++++++++++++++++++++ > 1 file changed, 28 insertions(+) > >diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c >index e42433d..c0a06b6 100644 >--- a/ctdb/server/ctdb_recoverd.c >+++ b/ctdb/server/ctdb_recoverd.c >@@ -2660,6 +2660,30 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, > srvid_requests_reply(ctdb, &current, result); > } > >+/* >+ * handler for assigning banning credits >+ */ >+static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data) >+{ >+ struct ctdb_recoverd *rec = talloc_get_type( >+ private_data, struct ctdb_recoverd); >+ uint32_t ban_pnn; >+ >+ /* Ignore if we are not recmaster */ >+ if (rec->ctdb->pnn != rec->recmaster) { >+ return; >+ } >+ >+ if (data.dsize != sizeof(uint32_t)) { >+ DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n", >+ data.dsize)); >+ return; >+ } >+ >+ ban_pnn = *(uint32_t *)data.dptr; 
>+ >+ ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num); >+} > > /* > handler for recovery master elections >@@ -3888,6 +3912,10 @@ static void monitor_cluster(struct ctdb_context *ctdb) > /* register a message port for sending memory dumps */ > ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec); > >+ /* when a node is assigned banning credits */ >+ ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING, >+ banning_handler, rec); >+ > /* register a message port for recovery elections */ > ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec); > >-- >2.5.5 > > >From 324bdbc9904259b79299861e913087df70c175cc Mon Sep 17 00:00:00 2001 >From: Amitay Isaacs <amitay@gmail.com> >Date: Tue, 15 Mar 2016 15:08:24 +1100 >Subject: [PATCH 3/3] ctdb-recovery-helper: Add banning to parallel recovery > >If one or more nodes are misbehaving during recovery, keep track of >failures as ban_credits. If the node with the highest ban_credits exceeds >5 ban credits, then tell recovery daemon to assign banning credits. > >This will ban only a single node at a time in case of recovery failure. 
> >Signed-off-by: Amitay Isaacs <amitay@gmail.com> >Reviewed-by: Martin Schwenke <martin@meltin.net> > >Autobuild-User(master): Martin Schwenke <martins@samba.org> >Autobuild-Date(master): Fri Mar 25 06:57:32 CET 2016 on sn-devel-144 > >(cherry picked from commit c51b8c22349bde6a3280c51ac147cab5ea27b5a6) >--- > ctdb/server/ctdb_recovery_helper.c | 95 ++++++++++++++++++++++++++++++++++---- > 1 file changed, 87 insertions(+), 8 deletions(-) > >diff --git a/ctdb/server/ctdb_recovery_helper.c b/ctdb/server/ctdb_recovery_helper.c >index d1ec1e7..24aa42f 100644 >--- a/ctdb/server/ctdb_recovery_helper.c >+++ b/ctdb/server/ctdb_recovery_helper.c >@@ -1218,6 +1218,7 @@ struct collect_highseqnum_db_state { > uint32_t *pnn_list; > int count; > uint32_t *caps; >+ uint32_t *ban_credits; > uint32_t db_id; > struct recdb_context *recdb; > uint32_t max_pnn; >@@ -1231,7 +1232,8 @@ static struct tevent_req *collect_highseqnum_db_send( > struct tevent_context *ev, > struct ctdb_client_context *client, > uint32_t *pnn_list, int count, uint32_t *caps, >- uint32_t db_id, struct recdb_context *recdb) >+ uint32_t *ban_credits, uint32_t db_id, >+ struct recdb_context *recdb) > { > struct tevent_req *req, *subreq; > struct collect_highseqnum_db_state *state; >@@ -1248,6 +1250,7 @@ static struct tevent_req *collect_highseqnum_db_send( > state->pnn_list = pnn_list; > state->count = count; > state->caps = caps; >+ state->ban_credits = ban_credits; > state->db_id = db_id; > state->recdb = recdb; > >@@ -1332,12 +1335,15 @@ static void collect_highseqnum_db_pulldb_done(struct tevent_req *subreq) > { > struct tevent_req *req = tevent_req_callback_data( > subreq, struct tevent_req); >+ struct collect_highseqnum_db_state *state = tevent_req_data( >+ req, struct collect_highseqnum_db_state); > int ret; > bool status; > > status = pull_database_recv(subreq, &ret); > TALLOC_FREE(subreq); > if (! 
status) { >+ state->ban_credits[state->max_pnn] += 1; > tevent_req_error(req, ret); > return; > } >@@ -1360,6 +1366,7 @@ struct collect_all_db_state { > uint32_t *pnn_list; > int count; > uint32_t *caps; >+ uint32_t *ban_credits; > uint32_t db_id; > struct recdb_context *recdb; > struct ctdb_pulldb pulldb; >@@ -1373,7 +1380,8 @@ static struct tevent_req *collect_all_db_send( > struct tevent_context *ev, > struct ctdb_client_context *client, > uint32_t *pnn_list, int count, uint32_t *caps, >- uint32_t db_id, struct recdb_context *recdb) >+ uint32_t *ban_credits, uint32_t db_id, >+ struct recdb_context *recdb) > { > struct tevent_req *req, *subreq; > struct collect_all_db_state *state; >@@ -1418,6 +1426,8 @@ static void collect_all_db_pulldb_done(struct tevent_req *subreq) > status = pull_database_recv(subreq, &ret); > TALLOC_FREE(subreq); > if (! status) { >+ pnn = state->pnn_list[state->index]; >+ state->ban_credits[pnn] += 1; > tevent_req_error(req, ret); > return; > } >@@ -1463,6 +1473,7 @@ struct recover_db_state { > uint32_t *pnn_list; > int count; > uint32_t *caps; >+ uint32_t *ban_credits; > uint32_t db_id; > bool persistent; > >@@ -1489,6 +1500,7 @@ static struct tevent_req *recover_db_send(TALLOC_CTX *mem_ctx, > struct ctdb_tunable_list *tun_list, > uint32_t *pnn_list, int count, > uint32_t *caps, >+ uint32_t *ban_credits, > uint32_t generation, > uint32_t db_id, bool persistent) > { >@@ -1507,6 +1519,7 @@ static struct tevent_req *recover_db_send(TALLOC_CTX *mem_ctx, > state->pnn_list = pnn_list; > state->count = count; > state->caps = caps; >+ state->ban_credits = ban_credits; > state->db_id = db_id; > state->persistent = persistent; > >@@ -1692,12 +1705,14 @@ static void recover_db_transaction_started(struct tevent_req *subreq) > subreq = collect_highseqnum_db_send( > state, state->ev, state->client, > state->pnn_list, state->count, state->caps, >- state->db_id, state->recdb); >+ state->ban_credits, state->db_id, >+ state->recdb); > } else { > subreq = 
collect_all_db_send( > state, state->ev, state->client, > state->pnn_list, state->count, state->caps, >- state->db_id, state->recdb); >+ state->ban_credits, state->db_id, >+ state->recdb); > } > if (tevent_req_nomem(subreq, req)) { > return; >@@ -1912,6 +1927,7 @@ struct db_recovery_one_state { > uint32_t *pnn_list; > int count; > uint32_t *caps; >+ uint32_t *ban_credits; > uint32_t generation; > uint32_t db_id; > bool persistent; >@@ -1927,6 +1943,7 @@ static struct tevent_req *db_recovery_send(TALLOC_CTX *mem_ctx, > struct ctdb_tunable_list *tun_list, > uint32_t *pnn_list, int count, > uint32_t *caps, >+ uint32_t *ban_credits, > uint32_t generation) > { > struct tevent_req *req, *subreq; >@@ -1963,13 +1980,14 @@ static struct tevent_req *db_recovery_send(TALLOC_CTX *mem_ctx, > substate->pnn_list = pnn_list; > substate->count = count; > substate->caps = caps; >+ substate->ban_credits = ban_credits; > substate->generation = generation; > substate->db_id = dbmap->dbs[i].db_id; > substate->persistent = dbmap->dbs[i].flags & > CTDB_DB_FLAGS_PERSISTENT; > > subreq = recover_db_send(state, ev, client, tun_list, >- pnn_list, count, caps, >+ pnn_list, count, caps, ban_credits, > generation, substate->db_id, > substate->persistent); > if (tevent_req_nomem(subreq, req)) { >@@ -2005,7 +2023,7 @@ static void db_recovery_one_done(struct tevent_req *subreq) > subreq = recover_db_send(state, state->ev, substate->client, > substate->tun_list, > substate->pnn_list, substate->count, >- substate->caps, >+ substate->caps, substate->ban_credits, > substate->generation, substate->db_id, > substate->persistent); > if (tevent_req_nomem(subreq, req)) { >@@ -2074,6 +2092,7 @@ struct recovery_state { > uint32_t destnode; > struct ctdb_node_map *nodemap; > uint32_t *caps; >+ uint32_t *ban_credits; > struct ctdb_tunable_list *tun_list; > struct ctdb_vnn_map *vnnmap; > struct ctdb_dbid_map *dbmap; >@@ -2088,6 +2107,7 @@ static void recovery_active_done(struct tevent_req *subreq); > static void 
recovery_start_recovery_done(struct tevent_req *subreq); > static void recovery_vnnmap_update_done(struct tevent_req *subreq); > static void recovery_db_recovery_done(struct tevent_req *subreq); >+static void recovery_failed_done(struct tevent_req *subreq); > static void recovery_normal_done(struct tevent_req *subreq); > static void recovery_end_recovery_done(struct tevent_req *subreq); > >@@ -2197,6 +2217,12 @@ static void recovery_nodemap_done(struct tevent_req *subreq) > return; > } > >+ state->ban_credits = talloc_zero_array(state, uint32_t, >+ state->nodemap->num); >+ if (tevent_req_nomem(state->ban_credits, req)) { >+ return; >+ } >+ > ctdb_req_control_getvnnmap(&request); > subreq = ctdb_client_control_send(state, state->ev, state->client, > state->destnode, TIMEOUT(), >@@ -2523,7 +2549,8 @@ static void recovery_vnnmap_update_done(struct tevent_req *subreq) > subreq = db_recovery_send(state, state->ev, state->client, > state->dbmap, state->tun_list, > state->pnn_list, state->count, >- state->caps, state->vnnmap->generation); >+ state->caps, state->ban_credits, >+ state->vnnmap->generation); > if (tevent_req_nomem(subreq, req)) { > return; > } >@@ -2546,7 +2573,43 @@ static void recovery_db_recovery_done(struct tevent_req *subreq) > LOG("%d of %d databases recovered\n", count, state->dbmap->num); > > if (! 
status) { >- tevent_req_error(req, EIO); >+ uint32_t max_pnn = CTDB_UNKNOWN_PNN, max_credits = 0; >+ int i; >+ >+ /* Bans are not enabled */ >+ if (state->tun_list->enable_bans == 0) { >+ tevent_req_error(req, EIO); >+ return; >+ } >+ >+ for (i=0; i<state->count; i++) { >+ uint32_t pnn; >+ pnn = state->pnn_list[i]; >+ if (state->ban_credits[pnn] > max_credits) { >+ max_pnn = pnn; >+ max_credits = state->ban_credits[pnn]; >+ } >+ } >+ >+ /* If pulling database fails multiple times */ >+ if (max_credits >= NUM_RETRIES) { >+ struct ctdb_req_message message; >+ >+ LOG("Assigning banning credits to node %u\n", max_pnn); >+ >+ message.srvid = CTDB_SRVID_BANNING; >+ message.data.pnn = max_pnn; >+ >+ subreq = ctdb_client_message_send( >+ state, state->ev, state->client, >+ ctdb_client_pnn(state->client), >+ &message); >+ if (tevent_req_nomem(subreq, req)) { >+ return; >+ } >+ tevent_req_set_callback(subreq, recovery_failed_done, >+ req); >+ } > return; > } > >@@ -2561,6 +2624,22 @@ static void recovery_db_recovery_done(struct tevent_req *subreq) > tevent_req_set_callback(subreq, recovery_normal_done, req); > } > >+static void recovery_failed_done(struct tevent_req *subreq) >+{ >+ struct tevent_req *req = tevent_req_callback_data( >+ subreq, struct tevent_req); >+ int ret; >+ bool status; >+ >+ status = ctdb_client_message_recv(subreq, &ret); >+ TALLOC_FREE(subreq); >+ if (! status) { >+ LOG("failed to assign banning credits, ret=%d\n", ret); >+ } >+ >+ tevent_req_error(req, EIO); >+} >+ > static void recovery_normal_done(struct tevent_req *subreq) > { > struct tevent_req *req = tevent_req_callback_data( >-- >2.5.5 >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Flags:
martins
:
review+
Actions:
View
Attachments on
bug 11941
: 12170