From 9920e7b8089398578959f5bee53539c19dc7bde5 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 5 Feb 2010 19:14:45 -0800 Subject: [PATCH 1/4] s3-events: make the old timed events compatible with tevent tevent ensures that a timed event is only called once. The old events code relied on the called handler removing the event itself. If the handler removed the event after calling a function which invoked the event loop then the timed event could loop forever. This change makes the two timed event systems more compatible, by allowing the handler to free the te if it wants to, but ensuring it is off the linked list of events before the handler is called, and ensuring it is freed even if the handler doesn't free it. (cherry picked from commit 5dbf175c75bd6139f3238f36665000641f7f7f79) --- source3/lib/events.c | 21 +++++++++++++++++++-- 1 files changed, 19 insertions(+), 2 deletions(-) diff --git a/source3/lib/events.c b/source3/lib/events.c index 7a06ad0..75aa250 100644 --- a/source3/lib/events.c +++ b/source3/lib/events.c @@ -105,12 +105,29 @@ bool run_events(struct tevent_context *ev, if ((ev->timer_events != NULL) && (timeval_compare(&now, &ev->timer_events->next_event) >= 0)) { + /* this older events system did not auto-free timed + events on running them, and had a race condition + where the event could be called twice if the + talloc_free of the te happened after the callback + made a call which invoked the event loop. To avoid + this while still allowing old code which frees the + te, we need to create a temporary context which + will be used to ensure the te is freed. 
We also + remove the te from the timed event list before we + call the handler, to ensure we can't loop */ + + struct tevent_timer *te = ev->timer_events; + TALLOC_CTX *tmp_ctx = talloc_new(ev); DEBUG(10, ("Running timed event \"%s\" %p\n", ev->timer_events->handler_name, ev->timer_events)); - ev->timer_events->handler(ev, ev->timer_events, now, - ev->timer_events->private_data); + DLIST_REMOVE(ev->timer_events, te); + talloc_steal(tmp_ctx, te); + + te->handler(ev, te, now, te->private_data); + + talloc_free(tmp_ctx); return true; } -- 1.6.3.3 From c6d9e832ca09fc0c7bc2d31333caf751fcd1803a Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 5 Feb 2010 20:59:43 -0800 Subject: [PATCH 2/4] s3-brlock: add a minimum retry time for pending blocking locks When we are waiting on a pending byte range lock, another smbd might exit uncleanly, and therefore not notify us of the removal of the lock, and thus not trigger the lock to be retried. We coped with this up to now by adding a message_send_all() in the SIGCHLD and cluster reconfigure handlers to send a MSG_SMB_UNLOCK to all smbd processes. That would generate O(N^2) work when a large number of clients disconnected at once (such as on a network outage), which could leave the whole system unusable for a very long time (many minutes, or even longer). By adding a minimum re-check time for pending byte range locks we avoid this problem by ensuring that pending locks are retried at a more regular interval.
(cherry picked from commit 5b398edbee672392f2cea260ab17445ecca927d7) --- source3/smbd/blocking.c | 20 ++++++++++++++++++++ 1 files changed, 20 insertions(+), 0 deletions(-) diff --git a/source3/smbd/blocking.c b/source3/smbd/blocking.c index e33c0b6..3f49421 100644 --- a/source3/smbd/blocking.c +++ b/source3/smbd/blocking.c @@ -72,6 +72,7 @@ static bool recalc_brl_timeout(void) { struct blocking_lock_record *blr; struct timeval next_timeout; + int max_brl_timeout = lp_parm_int(-1, "brl", "recalctime", 5); TALLOC_FREE(brl_timeout); @@ -100,6 +101,25 @@ static bool recalc_brl_timeout(void) return True; } + /* + to account for unclean shutdowns by clients we need a + maximum timeout that we use for checking pending locks. If + we have any pending locks at all, then check if the pending + lock can continue at least every brl:recalctime seconds + (default 5 seconds). + + This saves us needing to do a message_send_all() in the + SIGCHLD handler in the parent daemon. That + message_send_all() caused O(n^2) work to be done when IP + failovers happened in clustered Samba, which could make the + entire system unusable for many minutes. 
+ */ + + if (max_brl_timeout > 0) { + struct timeval min_to = timeval_current_ofs(max_brl_timeout, 0); + next_timeout = timeval_min(&next_timeout, &min_to); + } + if (DEBUGLVL(10)) { struct timeval cur, from_now; -- 1.6.3.3 From 92b4f744b847d9ac65063f5c674afbf8b4df0df9 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 5 Feb 2010 21:02:24 -0800 Subject: [PATCH 3/4] s3-brlock: we don't need these MSG_SMB_UNLOCK calls now These have been replaced with the min timeout in blocking.c (cherry picked from commit 74267d652485cdcb711f734f0d80da0fb1495867) --- source3/lib/ctdbd_conn.c | 8 -------- source3/smbd/server.c | 2 -- 2 files changed, 0 insertions(+), 10 deletions(-) diff --git a/source3/lib/ctdbd_conn.c b/source3/lib/ctdbd_conn.c index 84bba3b..8ddb12a 100644 --- a/source3/lib/ctdbd_conn.c +++ b/source3/lib/ctdbd_conn.c @@ -542,15 +542,7 @@ static NTSTATUS ctdb_handle_message(uint8_t *buf, size_t length, messaging_send(conn->msg_ctx, procid_self(), MSG_SMB_BRL_VALIDATE, &data_blob_null); - /* - * it's possible that we have just rejoined the cluster after - * an outage. 
In that case our pending locks could have been - * removed from the lockdb, so retry them once more - */ - message_send_all(conn->msg_ctx, MSG_SMB_UNLOCK, NULL, 0, NULL); - TALLOC_FREE(buf); - return NT_STATUS_OK; } diff --git a/source3/smbd/server.c b/source3/smbd/server.c index f719961..56c7e2f 100644 --- a/source3/smbd/server.c +++ b/source3/smbd/server.c @@ -230,8 +230,6 @@ static void remove_child_pid(pid_t pid, bool unclean_shutdown) DEBUG(3,(__location__ " Unclean shutdown of pid %u\n", (unsigned int)pid)); messaging_send_buf(smbd_messaging_context(), procid_self(), MSG_SMB_BRL_VALIDATE, NULL, 0); - message_send_all(smbd_messaging_context(), - MSG_SMB_UNLOCK, NULL, 0, NULL); } for (child = children; child != NULL; child = child->next) { -- 1.6.3.3 From 55846f5a71b3b575029d99aedab2dbe754dc1a10 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 5 Feb 2010 21:08:56 -0800 Subject: [PATCH 4/4] s3-smbd: add a rate limited cleanup of brl, connections and locking db On unclean shutdown we can end up with stale entries in the brlock, connections and locking db. Previously we would do the cleanup on every unclean exit, but that can cause smbd to be completely unavailable for several minutes when a large number of child smbd processes exit. This adds a rate limited cleanup of the databases, with the default that cleanup happens at most every 20s (cherry picked from commit dd498d2eecf124a03b6117ddab892a1112f9e9db) --- source3/smbd/server.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 files changed, 42 insertions(+), 6 deletions(-) diff --git a/source3/smbd/server.c b/source3/smbd/server.c index 56c7e2f..0c59135 100644 --- a/source3/smbd/server.c +++ b/source3/smbd/server.c @@ -219,17 +219,53 @@ static void add_child_pid(pid_t pid) num_children += 1; } +/* + at most every smbd:cleanuptime seconds (default 20), we scan the BRL + and locking database for entries to cleanup. 
As a side effect this + also cleans up dead entries in the connections database (due to the + traversal in message_send_all()) + + Using a timer for this prevents a flood of traversals when a large + number of clients disconnect at the same time (perhaps due to a + network outage). +*/ + +static void cleanup_timeout_fn(struct event_context *event_ctx, + struct timed_event *te, + struct timeval now, + void *private_data) +{ + struct timed_event **cleanup_te = (struct timed_event **)private_data; + + DEBUG(1,("Cleaning up brl and lock database after unclean shutdown\n")); + message_send_all(smbd_messaging_context(), MSG_SMB_UNLOCK, NULL, 0, NULL); + messaging_send_buf(smbd_messaging_context(), procid_self(), + MSG_SMB_BRL_VALIDATE, NULL, 0); + /* mark the cleanup as having been done */ + (*cleanup_te) = NULL; +} + static void remove_child_pid(pid_t pid, bool unclean_shutdown) { struct child_pid *child; + static struct timed_event *cleanup_te; if (unclean_shutdown) { - /* a child terminated uncleanly so tickle all processes to see - if they can grab any of the pending locks - */ - DEBUG(3,(__location__ " Unclean shutdown of pid %u\n", (unsigned int)pid)); - messaging_send_buf(smbd_messaging_context(), procid_self(), - MSG_SMB_BRL_VALIDATE, NULL, 0); + /* a child terminated uncleanly so tickle all + processes to see if they can grab any of the + pending locks + */ + DEBUG(3,(__location__ " Unclean shutdown of pid %u\n", + (unsigned int)pid)); + if (!cleanup_te) { + /* call the cleanup timer, but not too often */ + int cleanup_time = lp_parm_int(-1, "smbd", "cleanuptime", 20); + cleanup_te = event_add_timed(smbd_event_context(), NULL, + timeval_current_ofs(cleanup_time, 0), + cleanup_timeout_fn, + &cleanup_te); + DEBUG(1,("Scheduled cleanup of brl and lock database after unclean shutdown\n")); + } } for (child = children; child != NULL; child = child->next) { -- 1.6.3.3