From f4f54f04bb06220f35125f029bc283b7c5a6d4b8 Mon Sep 17 00:00:00 2001
From: Amitay Isaacs
Date: Thu, 18 May 2017 10:15:01 +1000
Subject: [PATCH 1/2] Revert "ctdb-readonly: Avoid a tight loop waiting for revoke to complete"

BUG: https://bugzilla.samba.org/show_bug.cgi?id=12697

This reverts commit ad758cb869ac83534993caa212abc9fe9905ec68.

This is an incomplete fix and introduces a regression.

Signed-off-by: Amitay Isaacs
Reviewed-by: Martin Schwenke

(cherry picked from commit a50b25d0ebbe731a766f8d2ce1924b34d6041668)
---
 ctdb/server/ctdb_call.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
index f9c2922..3478419 100644
--- a/ctdb/server/ctdb_call.c
+++ b/ctdb/server/ctdb_call.c
@@ -1599,6 +1599,7 @@ static int deferred_call_destructor(struct revokechild_deferred_call *deferred_c
 {
         struct ctdb_context *ctdb = deferred_call->ctdb;
         struct revokechild_requeue_handle *requeue_handle = talloc(ctdb, struct revokechild_requeue_handle);
+        struct ctdb_req_call_old *c = (struct ctdb_req_call_old *)deferred_call->hdr;
 
         requeue_handle->ctdb = ctdb;
         requeue_handle->hdr = deferred_call->hdr;
@@ -1606,12 +1607,9 @@ static int deferred_call_destructor(struct revokechild_deferred_call *deferred_c
         requeue_handle->ctx = deferred_call->ctx;
         talloc_steal(requeue_handle, requeue_handle->hdr);
 
-        /* Always delay revoke requests. Either wait for the read/write
-         * operation to complete, or if revoking failed wait for recovery to
-         * complete
-         */
+        /* when revoking, any READONLY requests have 1 second grace to let read/write finish first */
         tevent_add_timer(ctdb->ev, requeue_handle,
-                         timeval_current_ofs(1, 0),
+                         timeval_current_ofs(c->flags & CTDB_WANT_READONLY ? 1 : 0, 0),
                          deferred_call_requeue, requeue_handle);
 
         return 0;
-- 
2.9.4

From c0d0b5e08404214604610de9342b24ed5cb87c01 Mon Sep 17 00:00:00 2001
From: Amitay Isaacs
Date: Thu, 18 May 2017 11:50:09 +1000
Subject: [PATCH 2/2] ctdb-readonly: Avoid a tight loop waiting for revoke to complete

BUG: https://bugzilla.samba.org/show_bug.cgi?id=12697

During revocation of readonly delegations, if one of the nodes disappears,
there is no point retrying the revoke immediately.  The database needs to be
recovered before the revoke operation can succeed.

However, if the revoke is successful, then all the write requests need to be
processed immediately, before the read-only requests.  This avoids starving
write requests in case there are read-only requests coming from other nodes.

In deferred_call_destructor, the result of the revoke is not available, so the
deferred calls cannot be correctly ordered.  To order the deferred calls
correctly, process them in revokechild_destructor, where the result of the
revoke is known.
Signed-off-by: Amitay Isaacs
Reviewed-by: Martin Schwenke

(cherry picked from commit f5f05a644dadc0b1858c99c5f1f5af1ef80f3a28)
---
 ctdb/server/ctdb_call.c | 91 +++++++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
index 3478419..93642cd 100644
--- a/ctdb/server/ctdb_call.c
+++ b/ctdb/server/ctdb_call.c
@@ -1561,6 +1561,7 @@ void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
 
 
 struct revokechild_deferred_call {
+        struct revokechild_deferred_call *prev, *next;
         struct ctdb_context *ctdb;
         struct ctdb_req_header *hdr;
         deferred_requeue_fn fn;
@@ -1576,48 +1577,31 @@ struct revokechild_handle {
         int fd[2];
         pid_t child;
         TDB_DATA key;
-};
-
-struct revokechild_requeue_handle {
-        struct ctdb_context *ctdb;
-        struct ctdb_req_header *hdr;
-        deferred_requeue_fn fn;
-        void *ctx;
+        struct revokechild_deferred_call *deferred_call_list;
 };
 
 static void deferred_call_requeue(struct tevent_context *ev,
                                   struct tevent_timer *te,
                                   struct timeval t, void *private_data)
 {
-        struct revokechild_requeue_handle *requeue_handle = talloc_get_type(private_data, struct revokechild_requeue_handle);
+        struct revokechild_deferred_call *dlist = talloc_get_type_abort(
+                private_data, struct revokechild_deferred_call);
 
-        requeue_handle->fn(requeue_handle->ctx, requeue_handle->hdr);
-        talloc_free(requeue_handle);
-}
+        while (dlist != NULL) {
+                struct revokechild_deferred_call *dcall = dlist;
 
-static int deferred_call_destructor(struct revokechild_deferred_call *deferred_call)
-{
-        struct ctdb_context *ctdb = deferred_call->ctdb;
-        struct revokechild_requeue_handle *requeue_handle = talloc(ctdb, struct revokechild_requeue_handle);
-        struct ctdb_req_call_old *c = (struct ctdb_req_call_old *)deferred_call->hdr;
-
-        requeue_handle->ctdb = ctdb;
-        requeue_handle->hdr = deferred_call->hdr;
-        requeue_handle->fn = deferred_call->fn;
-        requeue_handle->ctx = deferred_call->ctx;
-        talloc_steal(requeue_handle, requeue_handle->hdr);
-
-        /* when revoking, any READONLY requests have 1 second grace to let read/write finish first */
-        tevent_add_timer(ctdb->ev, requeue_handle,
-                         timeval_current_ofs(c->flags & CTDB_WANT_READONLY ? 1 : 0, 0),
-                         deferred_call_requeue, requeue_handle);
-
-        return 0;
+                DLIST_REMOVE(dlist, dcall);
+                dcall->fn(dcall->ctx, dcall->hdr);
+                talloc_free(dcall);
+        }
 }
 
 
 static int revokechild_destructor(struct revokechild_handle *rc)
 {
+        struct revokechild_deferred_call *now_list = NULL;
+        struct revokechild_deferred_call *delay_list = NULL;
+
         if (rc->fde != NULL) {
                 talloc_free(rc->fde);
         }
@@ -1631,6 +1615,48 @@ static int revokechild_destructor(struct revokechild_handle *rc)
         ctdb_kill(rc->ctdb, rc->child, SIGKILL);
 
         DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
+
+        while (rc->deferred_call_list != NULL) {
+                struct revokechild_deferred_call *dcall;
+
+                dcall = rc->deferred_call_list;
+                DLIST_REMOVE(rc->deferred_call_list, dcall);
+
+                /* If revoke is successful, then first process all the calls
+                 * that need write access, and delay readonly requests by 1
+                 * second grace.
+                 *
+                 * If revoke is unsuccessful, most likely because of node
+                 * failure, delay all the pending requests, so database can
+                 * be recovered.
+                 */
+
+                if (rc->status == 0) {
+                        struct ctdb_req_call_old *c;
+
+                        c = (struct ctdb_req_call_old *)dcall->hdr;
+                        if (c->flags & CTDB_WANT_READONLY) {
+                                DLIST_ADD(delay_list, dcall);
+                        } else {
+                                DLIST_ADD(now_list, dcall);
+                        }
+                } else {
+                        DLIST_ADD(delay_list, dcall);
+                }
+        }
+
+        if (now_list != NULL) {
+                tevent_add_timer(rc->ctdb->ev, rc->ctdb_db,
+                                 tevent_timeval_current_ofs(0, 0),
+                                 deferred_call_requeue, now_list);
+        }
+
+        if (delay_list != NULL) {
+                tevent_add_timer(rc->ctdb->ev, rc->ctdb_db,
+                                 tevent_timeval_current_ofs(1, 0),
+                                 deferred_call_requeue, delay_list);
+        }
+
         return 0;
 }
 
@@ -1909,19 +1935,18 @@ int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_cont
                 return -1;
         }
 
-        deferred_call = talloc(rc, struct revokechild_deferred_call);
+        deferred_call = talloc(ctdb_db, struct revokechild_deferred_call);
         if (deferred_call == NULL) {
                 DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
                 return -1;
         }
 
         deferred_call->ctdb = ctdb;
-        deferred_call->hdr = hdr;
+        deferred_call->hdr = talloc_steal(deferred_call, hdr);
         deferred_call->fn = fn;
         deferred_call->ctx = call_context;
 
-        talloc_set_destructor(deferred_call, deferred_call_destructor);
-        talloc_steal(deferred_call, hdr);
+        DLIST_ADD(rc->deferred_call_list, deferred_call);
 
         return 0;
 }
-- 
2.9.4