From fb1d5ec061a3f49e36064a7520f5806ba27712df Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Sun, 25 Oct 2009 16:12:12 +0100
Subject: [PATCH 01/32] s3: Implement global locks in a g_lock tdb

This is the basis to implement global locks in ctdb without depending on a
shared file system. The initial goal is to make ctdb persistent transactions
deterministic without too many timeouts.
(cherry picked from commit 4c1c3f2549f32fd069e0e7bf3aec299213f1e85b)
---
 source3/Makefile.in                    |    2 +
 source3/include/ctdbd_conn.h           |    2 +
 source3/include/g_lock.h               |   55 +++
 source3/lib/ctdbd_conn.c               |   73 ++++-
 source3/lib/g_lock.c                   |  594 ++++++++++++++++++++++++++++++++
 source3/librpc/gen_ndr/messaging.h     |    4 +-
 source3/librpc/gen_ndr/ndr_messaging.c |    1 +
 source3/librpc/idl/messaging.idl       |    3 +-
 source3/utils/net.c                    |    7 +
 source3/utils/net_g_lock.c             |  213 ++++++++++++
 source3/utils/net_proto.h              |    3 +
 11 files changed, 950 insertions(+), 7 deletions(-)
 create mode 100644 source3/include/g_lock.h
 create mode 100644 source3/lib/g_lock.c
 create mode 100644 source3/utils/net_g_lock.c

diff --git a/source3/Makefile.in b/source3/Makefile.in
index f84ed20..f58bb70 100644
--- a/source3/Makefile.in
+++ b/source3/Makefile.in
@@ -265,6 +265,7 @@ EXTRA_ALL_TARGETS = @EXTRA_ALL_TARGETS@
 TDB_LIB_OBJ = lib/util_tdb.o ../lib/util/util_tdb.o \
 	  lib/dbwrap.o lib/dbwrap_tdb.o \
 	  lib/dbwrap_ctdb.o \
+	  lib/g_lock.o \
 	  lib/dbwrap_rbt.o
 
 TDB_VALIDATE_OBJ = lib/tdb_validate.o
@@ -1011,6 +1012,7 @@ NET_OBJ1 = utils/net.o utils/net_ads.o utils/net_help.o \
 	   utils/net_conf.o utils/net_join.o utils/net_user.o \
 	   utils/net_group.o utils/net_file.o utils/net_registry.o \
 	   auth/token_util.o utils/net_dom.o utils/net_share.o \
+	   utils/net_g_lock.o \
 	   utils/net_eventlog.o
 
 # these are not processed by make proto
diff --git a/source3/include/ctdbd_conn.h b/source3/include/ctdbd_conn.h
index d721235..96e7aa0 100644
--- a/source3/include/ctdbd_conn.h
+++ b/source3/include/ctdbd_conn.h
@@ -73,5 +73,7 @@ NTSTATUS ctdbd_control_local(struct ctdbd_connection *conn, uint32 opcode,
 			     uint64_t srvid, uint32_t flags, TDB_DATA data, 
 			     TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
 			     int *cstatus);
+NTSTATUS ctdb_watch_us(struct ctdbd_connection *conn);
+NTSTATUS ctdb_unwatch(struct ctdbd_connection *conn);
 
 #endif /* _CTDBD_CONN_H */
diff --git a/source3/include/g_lock.h b/source3/include/g_lock.h
new file mode 100644
index 0000000..c0eed38
--- /dev/null
+++ b/source3/include/g_lock.h
@@ -0,0 +1,55 @@
+/*
+   Unix SMB/CIFS implementation.
+   global locks based on ctdb
+   Copyright (C) 2009 by Volker Lendecke
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _G_LOCK_H_
+#define _G_LOCK_H_
+
+#include "dbwrap.h"
+
+struct g_lock_ctx;
+
+enum g_lock_type {
+	G_LOCK_READ = 0,
+	G_LOCK_WRITE = 1,
+};
+
+/*
+ * Or'ed with g_lock_type
+ */
+#define G_LOCK_PENDING (2)
+
+struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
+				   struct messaging_context *msg);
+
+NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
+		     enum g_lock_type lock_type, struct timeval timeout);
+NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name);
+NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
+		struct server_id *pid);
+
+int g_lock_locks(struct g_lock_ctx *ctx,
+		 int (*fn)(const char *name, void *private_data),
+		 void *private_data);
+NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
+		     int (*fn)(struct server_id pid,
+			       enum g_lock_type lock_type,
+			       void *private_data),
+		     void *private_data);
+
+#endif
diff --git a/source3/lib/ctdbd_conn.c b/source3/lib/ctdbd_conn.c
index 8ddb12a..900fa34 100644
--- a/source3/lib/ctdbd_conn.c
+++ b/source3/lib/ctdbd_conn.c
@@ -361,10 +361,18 @@ static NTSTATUS ctdb_read_req(struct ctdbd_connection *conn, uint32 reqid,
 			goto next_pkt;
 		}
 
-		if (msg->srvid == CTDB_SRVID_RECONFIGURE) {
-			DEBUG(0,("Got cluster reconfigure message in ctdb_read_req\n"));
+		if ((msg->srvid == CTDB_SRVID_RECONFIGURE)
+		    || (msg->srvid == CTDB_SRVID_SAMBA_NOTIFY)) {
+
+			DEBUG(1, ("ctdb_read_req: Got %s message\n",
+				  (msg->srvid == CTDB_SRVID_RECONFIGURE)
+				  ? "cluster reconfigure" : "SAMBA_NOTIFY"));
+
 			messaging_send(conn->msg_ctx, procid_self(),
 				       MSG_SMB_BRL_VALIDATE, &data_blob_null);
+			messaging_send(conn->msg_ctx, procid_self(),
+				       MSG_DBWRAP_G_LOCK_RETRY,
+				       &data_blob_null);
 			TALLOC_FREE(hdr);
 			goto next_pkt;
 		}
@@ -493,6 +501,11 @@ NTSTATUS ctdbd_messaging_connection(TALLOC_CTX *mem_ctx,
 		goto fail;
 	}
 
+	status = register_with_ctdbd(conn, CTDB_SRVID_SAMBA_NOTIFY);
+	if (!NT_STATUS_IS_OK(status)) {
+		goto fail;
+	}
+
 	*pconn = conn;
 	return NT_STATUS_OK;
 
@@ -533,15 +546,21 @@ static NTSTATUS ctdb_handle_message(uint8_t *buf, size_t length,
 
 	SMB_ASSERT(conn->msg_ctx != NULL);
 
-	if (msg->srvid == CTDB_SRVID_RECONFIGURE) {
+	if ((msg->srvid == CTDB_SRVID_RECONFIGURE)
+	    || (msg->srvid == CTDB_SRVID_SAMBA_NOTIFY)){
 		DEBUG(0,("Got cluster reconfigure message\n"));
 		/*
-		 * when the cluster is reconfigured, we need to clean the brl
-		 * database
+		 * when the cluster is reconfigured or someone of the
+		 * family has passed away (SAMBA_NOTIFY), we need to
+		 * clean the brl database
 		 */
 		messaging_send(conn->msg_ctx, procid_self(),
 			       MSG_SMB_BRL_VALIDATE, &data_blob_null);
 
+		messaging_send(conn->msg_ctx, procid_self(),
+			       MSG_DBWRAP_G_LOCK_RETRY,
+			       &data_blob_null);
+
 		TALLOC_FREE(buf);
 		return NT_STATUS_OK;
 	}
@@ -1302,6 +1321,50 @@ NTSTATUS ctdbd_control_local(struct ctdbd_connection *conn, uint32 opcode,
 	return ctdbd_control(conn, CTDB_CURRENT_NODE, opcode, srvid, flags, data, mem_ctx, outdata, cstatus);
 }
 
+NTSTATUS ctdb_watch_us(struct ctdbd_connection *conn)
+{
+	struct ctdb_client_notify_register reg_data;
+	size_t struct_len;
+	NTSTATUS status;
+	int cstatus;
+
+	reg_data.srvid = CTDB_SRVID_SAMBA_NOTIFY;
+	reg_data.len = 1;
+	reg_data.notify_data[0] = 0;
+
+	struct_len = offsetof(struct ctdb_client_notify_register,
+			      notify_data) + reg_data.len;
+
+	status = ctdbd_control_local(
+		conn, CTDB_CONTROL_REGISTER_NOTIFY, conn->rand_srvid, 0,
+		make_tdb_data((uint8_t *)&reg_data, struct_len),
+		NULL, NULL, &cstatus);
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(1, ("ctdbd_control_local failed: %s\n",
+			  nt_errstr(status)));
+	}
+	return status;
+}
+
+NTSTATUS ctdb_unwatch(struct ctdbd_connection *conn)
+{
+	struct ctdb_client_notify_deregister dereg_data;
+	NTSTATUS status;
+	int cstatus;
+
+	dereg_data.srvid = CTDB_SRVID_SAMBA_NOTIFY;
+
+	status = ctdbd_control_local(
+		conn, CTDB_CONTROL_DEREGISTER_NOTIFY, conn->rand_srvid, 0,
+		make_tdb_data((uint8_t *)&dereg_data, sizeof(dereg_data)),
+		NULL, NULL, &cstatus);
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(1, ("ctdbd_control_local failed: %s\n",
+			  nt_errstr(status)));
+	}
+	return status;
+}
+
 #else
 
 NTSTATUS ctdbd_init_connection(TALLOC_CTX *mem_ctx,
diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
new file mode 100644
index 0000000..6508b39
--- /dev/null
+++ b/source3/lib/g_lock.c
@@ -0,0 +1,594 @@
+/*
+   Unix SMB/CIFS implementation.
+   global locks based on dbwrap and messaging
+   Copyright (C) 2009 by Volker Lendecke
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "g_lock.h"
+
+static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
+				    struct server_id pid);
+
+struct g_lock_ctx {
+	struct db_context *db;
+	struct messaging_context *msg;
+};
+
+/*
+ * The "g_lock.tdb" file contains records, indexed by the 0-terminated
+ * lockname. The record contains an array of "struct g_lock_rec"
+ * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
+ */
+
+struct g_lock_rec {
+	enum g_lock_type lock_type;
+	struct server_id pid;
+};
+
+struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
+				   struct messaging_context *msg)
+{
+	struct g_lock_ctx *result;
+
+	result = talloc(mem_ctx, struct g_lock_ctx);
+	if (result == NULL) {
+		return NULL;
+	}
+	result->msg = msg;
+
+	result->db = db_open(result, lock_path("g_lock.tdb"), 0,
+			     TDB_CLEAR_IF_FIRST, O_RDWR|O_CREAT, 0700);
+	if (result->db == NULL) {
+		DEBUG(1, ("g_lock_init: Could not open g_lock.tdb"));
+		TALLOC_FREE(result);
+		return NULL;
+	}
+	return result;
+}
+
+static bool g_lock_conflicts(enum g_lock_type lock_type,
+			     const struct g_lock_rec *rec)
+{
+	enum g_lock_type rec_lock = rec->lock_type;
+
+	if ((rec_lock & G_LOCK_PENDING) != 0) {
+		return false;
+	}
+
+	/*
+	 * Only tested write locks so far. Very likely this routine
+	 * needs to be fixed for read locks....
+	 */
+	if ((lock_type == G_LOCK_READ) && (rec_lock == G_LOCK_READ)) {
+		return false;
+	}
+	return true;
+}
+
+static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
+			 int *pnum_locks, struct g_lock_rec **plocks)
+{
+	int i, num_locks;
+	struct g_lock_rec *locks;
+
+	if ((data.dsize % sizeof(struct g_lock_rec)) != 0) {
+		DEBUG(1, ("invalid lock record length %d\n", (int)data.dsize));
+		return false;
+	}
+
+	num_locks = data.dsize / sizeof(struct g_lock_rec);
+	locks = talloc_array(mem_ctx, struct g_lock_rec, num_locks);
+	if (locks == NULL) {
+		DEBUG(1, ("talloc failed\n"));
+		return false;
+	}
+
+	memcpy(locks, data.dptr, data.dsize);
+
+	DEBUG(10, ("locks:\n"));
+	for (i=0; i<num_locks; i++) {
+		DEBUGADD(10, ("%s: %s %s\n",
+			      procid_str(talloc_tos(), &locks[i].pid),
+			      ((locks[i].lock_type & 1) == G_LOCK_READ) ?
+			      "read" : "write",
+			      (locks[i].lock_type & G_LOCK_PENDING) ?
+			      "(pending)" : "(owner)"));
+
+		if (process_exists(locks[i].pid)) {
+			continue;
+		}
+		DEBUGADD(10, ("%s does not exist -- discarding\n",
+			      procid_str(talloc_tos(), &locks[i].pid)));
+
+		if (i < (num_locks-1)) {
+			locks[i] = locks[num_locks-1];
+		}
+		num_locks -= 1;
+	}
+
+	*plocks = locks;
+	*pnum_locks = num_locks;
+	return true;
+}
+
+static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
+					struct g_lock_rec *locks,
+					int num_locks,
+					const struct server_id pid,
+					enum g_lock_type lock_type)
+{
+	struct g_lock_rec *result;
+
+	result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
+				num_locks+1);
+	if (result == NULL) {
+		return NULL;
+	}
+
+	result[num_locks].pid = pid;
+	result[num_locks].lock_type = lock_type;
+	return result;
+}
+
+static void g_lock_got_retry(struct messaging_context *msg,
+			     void *private_data,
+			     uint32_t msg_type,
+			     struct server_id server_id,
+			     DATA_BLOB *data);
+static void g_lock_timedout(struct tevent_context *ev,
+			    struct tevent_timer *te,
+			    struct timeval current_time,
+			    void *private_data);
+
+static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
+			       enum g_lock_type lock_type)
+{
+	struct db_record *rec = NULL;
+	struct g_lock_rec *locks = NULL;
+	int i, num_locks;
+	struct server_id self;
+	int our_index;
+	TDB_DATA data;
+	NTSTATUS status = NT_STATUS_OK;
+	NTSTATUS store_status;
+
+again:
+	rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
+				    string_term_tdb_data(name));
+	if (rec == NULL) {
+		DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
+		status = NT_STATUS_LOCK_NOT_GRANTED;
+		goto done;
+	}
+
+	if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
+		DEBUG(10, ("g_lock_parse for %s failed\n", name));
+		status = NT_STATUS_INTERNAL_ERROR;
+		goto done;
+	}
+
+	self = procid_self();
+	our_index = -1;
+
+	for (i=0; i<num_locks; i++) {
+		if (procid_equal(&self, &locks[i].pid)) {
+			if (our_index != -1) {
+				DEBUG(1, ("g_lock_trylock: Added ourself "
+					  "twice!\n"));
+				status = NT_STATUS_INTERNAL_ERROR;
+				goto done;
+			}
+			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
+				DEBUG(1, ("g_lock_trylock: Found ourself not "
+					  "pending!\n"));
+				status = NT_STATUS_INTERNAL_ERROR;
+				goto done;
+			}
+
+			our_index = i;
+
+			/* never conflict with ourself */
+			continue;
+		}
+		if (g_lock_conflicts(lock_type, &locks[i])) {
+			struct server_id pid = locks[i].pid;
+
+			if (!process_exists(pid)) {
+				TALLOC_FREE(locks);
+				TALLOC_FREE(rec);
+				status = g_lock_force_unlock(ctx, name, pid);
+				if (!NT_STATUS_IS_OK(status)) {
+					DEBUG(1, ("Could not unlock dead lock "
+						  "holder!\n"));
+					goto done;
+				}
+				goto again;
+			}
+			lock_type |= G_LOCK_PENDING;
+		}
+	}
+
+	if (our_index == -1) {
+		/* First round, add ourself */
+
+		locks = g_lock_addrec(talloc_tos(), locks, num_locks,
+				      self, lock_type);
+		if (locks == NULL) {
+			DEBUG(10, ("g_lock_addrec failed\n"));
+			status = NT_STATUS_NO_MEMORY;
+			goto done;
+		}
+	} else {
+		/*
+		 * Retry. We were pending last time. Overwrite the
+		 * stored lock_type with what we calculated, we might
+		 * have acquired the lock this time.
+		 */
+		locks[our_index].lock_type = lock_type;
+	}
+
+	data = make_tdb_data((uint8_t *)locks, talloc_get_size(locks));
+	store_status = rec->store(rec, data, 0);
+	if (!NT_STATUS_IS_OK(store_status)) {
+		DEBUG(1, ("rec->store failed: %s\n",
+			  nt_errstr(store_status)));
+		status = store_status;
+	}
+
+done:
+	TALLOC_FREE(locks);
+	TALLOC_FREE(rec);
+
+	if (NT_STATUS_IS_OK(status) && (lock_type & G_LOCK_PENDING) != 0) {
+		return STATUS_PENDING;
+	}
+
+	return NT_STATUS_OK;
+}
+
+NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
+		     enum g_lock_type lock_type, struct timeval timeout)
+{
+	struct tevent_timer *te = NULL;
+	NTSTATUS status;
+	bool retry = false;
+	bool timedout = false;
+
+	DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
+		   name));
+
+	if (lock_type & ~1) {
+		DEBUG(1, ("Got invalid lock type %d for %s\n",
+			  (int)lock_type, name));
+		return NT_STATUS_INVALID_PARAMETER;
+	}
+
+#ifdef CLUSTER_SUPPORT
+	if (lp_clustering()) {
+		status = ctdb_watch_us(messaging_ctdbd_connection());
+		if (!NT_STATUS_IS_OK(status)) {
+			DEBUG(10, ("could not register retry with ctdb: %s\n",
+				   nt_errstr(status)));
+			goto done;
+		}
+	}
+#endif
+
+	status = messaging_register(ctx->msg, &retry, MSG_DBWRAP_G_LOCK_RETRY,
+				    g_lock_got_retry);
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(10, ("messaging_register failed: %s\n",
+			   nt_errstr(status)));
+		return status;
+	}
+again:
+	retry = false;
+
+	status = g_lock_trylock(ctx, name, lock_type);
+	if (NT_STATUS_IS_OK(status)) {
+		DEBUG(10, ("Got lock %s\n", name));
+		goto done;
+	}
+	if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
+		DEBUG(10, ("g_lock_trylock failed: %s\n",
+			   nt_errstr(status)));
+		goto done;
+	}
+
+	if (retry) {
+		goto again;
+	}
+
+	DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
+
+	if (te == NULL) {
+		te = tevent_add_timer(
+			ctx->msg->event_ctx, talloc_tos(),
+			timeval_current_ofs(timeout.tv_sec, timeout.tv_usec),
+			g_lock_timedout, &timedout);
+		if (te == NULL) {
+			DEBUG(10, ("tevent_add_timer failed\n"));
+			status = NT_STATUS_NO_MEMORY;
+			goto done;
+		}
+	}
+
+	while (true) {
+		if (tevent_loop_once(ctx->msg->event_ctx) == -1) {
+			DEBUG(1, ("tevent_loop_once failed\n"));
+			status = NT_STATUS_INTERNAL_ERROR;
+			goto done;
+		}
+		if (retry) {
+			goto again;
+		}
+		if (timedout) {
+			DEBUG(10, ("g_lock_lock timed out\n"));
+
+			te = NULL;
+
+			status = NT_STATUS_LOCK_NOT_GRANTED;
+			goto done;
+		}
+	}
+done:
+
+	if (!NT_STATUS_IS_OK(status)) {
+		NTSTATUS unlock_status;
+
+		unlock_status = g_lock_unlock(ctx, name);
+
+		if (!NT_STATUS_IS_OK(unlock_status)) {
+			DEBUG(1, ("Could not remove ourself from the locking "
+				  "db: %s\n", nt_errstr(status)));
+		}
+	}
+
+	messaging_deregister(ctx->msg, MSG_DBWRAP_G_LOCK_RETRY, &retry);
+	TALLOC_FREE(te);
+
+	return status;
+}
+
+static void g_lock_got_retry(struct messaging_context *msg,
+			     void *private_data,
+			     uint32_t msg_type,
+			     struct server_id server_id,
+			     DATA_BLOB *data)
+{
+	bool *pretry = (bool *)private_data;
+
+	DEBUG(10, ("Got retry message from pid %s\n",
+		   procid_str(talloc_tos(), &server_id)));
+
+	*pretry = true;
+}
+
+static void g_lock_timedout(struct tevent_context *ev,
+			    struct tevent_timer *te,
+			    struct timeval current_time,
+			    void *private_data)
+{
+	bool *ptimedout = (bool *)private_data;
+	*ptimedout = true;
+	TALLOC_FREE(te);
+}
+
+static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
+				    struct server_id pid)
+{
+	struct db_record *rec = NULL;
+	struct g_lock_rec *locks = NULL;
+	int i, num_locks;
+	enum g_lock_type lock_type;
+	NTSTATUS status;
+
+	rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
+				    string_term_tdb_data(name));
+	if (rec == NULL) {
+		DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
+		status = NT_STATUS_INTERNAL_ERROR;
+		goto done;
+	}
+
+	if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
+		DEBUG(10, ("g_lock_parse for %s failed\n", name));
+		status = NT_STATUS_INTERNAL_ERROR;
+		goto done;
+	}
+
+	for (i=0; i<num_locks; i++) {
+		if (procid_equal(&pid, &locks[i].pid)) {
+			break;
+		}
+	}
+
+	if (i == num_locks) {
+		DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
+		status = NT_STATUS_INTERNAL_ERROR;
+		goto done;
+	}
+
+	lock_type = locks[i].lock_type;
+
+	if (i < (num_locks-1)) {
+		locks[i] = locks[num_locks-1];
+	}
+	num_locks -= 1;
+
+	if (num_locks == 0) {
+		status = rec->delete_rec(rec);
+	} else {
+		TDB_DATA data;
+		data = make_tdb_data((uint8_t *)locks,
+				     sizeof(struct g_lock_rec) * num_locks);
+		status = rec->store(rec, data, 0);
+	}
+
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
+			  nt_errstr(status)));
+		goto done;
+	}
+
+	if ((lock_type & G_LOCK_PENDING) == 0) {
+		/*
+		 * We've been the lock holder. Tell all others to retry.
+		 */
+		for (i=0; i<num_locks; i++) {
+			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
+				continue;
+			}
+
+			/*
+			 * Ping all waiters to retry
+			 */
+			status = messaging_send(ctx->msg, locks[i].pid,
+						MSG_DBWRAP_G_LOCK_RETRY,
+						&data_blob_null);
+			if (!NT_STATUS_IS_OK(status)) {
+				DEBUG(1, ("sending retry to %s failed: %s\n",
+					  procid_str(talloc_tos(),
+						     &locks[i].pid),
+					  nt_errstr(status)));
+			}
+		}
+	}
+done:
+
+	TALLOC_FREE(locks);
+	TALLOC_FREE(rec);
+	return status;
+}
+
+NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name)
+{
+	NTSTATUS status;
+
+	status = g_lock_force_unlock(ctx, name, procid_self());
+
+#ifdef CLUSTER_SUPPORT
+	if (lp_clustering()) {
+		ctdb_unwatch(messaging_ctdbd_connection());
+	}
+#endif
+	return status;
+}
+
+struct g_lock_locks_state {
+	int (*fn)(const char *name, void *private_data);
+	void *private_data;
+};
+
+static int g_lock_locks_fn(struct db_record *rec, void *priv)
+{
+	struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;
+
+	if ((rec->key.dsize == 0) || (rec->key.dptr[rec->key.dsize-1] != 0)) {
+		DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
+		return 0;
+	}
+	return state->fn((char *)rec->key.dptr, state->private_data);
+}
+
+int g_lock_locks(struct g_lock_ctx *ctx,
+		 int (*fn)(const char *name, void *private_data),
+		 void *private_data)
+{
+	struct g_lock_locks_state state;
+
+	state.fn = fn;
+	state.private_data = private_data;
+
+	return ctx->db->traverse_read(ctx->db, g_lock_locks_fn, &state);
+}
+
+NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
+		     int (*fn)(struct server_id pid,
+			       enum g_lock_type lock_type,
+			       void *private_data),
+		     void *private_data)
+{
+	TDB_DATA data;
+	int i, num_locks;
+	struct g_lock_rec *locks = NULL;
+	bool ret;
+
+	if (ctx->db->fetch(ctx->db, talloc_tos(), string_term_tdb_data(name),
+			   &data) != 0) {
+		return NT_STATUS_NOT_FOUND;
+	}
+
+	if ((data.dsize == 0) || (data.dptr == NULL)) {
+		return NT_STATUS_OK;
+	}
+
+	ret = g_lock_parse(talloc_tos(), data, &num_locks, &locks);
+
+	TALLOC_FREE(data.dptr);
+
+	if (!ret) {
+		DEBUG(10, ("g_lock_parse for %s failed\n", name));
+		return NT_STATUS_INTERNAL_ERROR;
+	}
+
+	for (i=0; i<num_locks; i++) {
+		if (fn(locks[i].pid, locks[i].lock_type, private_data) != 0) {
+			break;
+		}
+	}
+	TALLOC_FREE(locks);
+	return NT_STATUS_OK;
+}
+
+struct g_lock_get_state {
+	bool found;
+	struct server_id *pid;
+};
+
+static int g_lock_get_fn(struct server_id pid, enum g_lock_type lock_type,
+			 void *priv)
+{
+	struct g_lock_get_state *state = (struct g_lock_get_state *)priv;
+
+	if ((lock_type & G_LOCK_PENDING) != 0) {
+		return 0;
+	}
+
+	state->found = true;
+	*state->pid = pid;
+	return 1;
+}
+
+NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
+		    struct server_id *pid)
+{
+	struct g_lock_get_state state;
+	NTSTATUS status;
+
+	state.found = false;
+	state.pid = pid;
+
+	status = g_lock_dump(ctx, name, g_lock_get_fn, &state);
+	if (!NT_STATUS_IS_OK(status)) {
+		return status;
+	}
+	if (!state.found) {
+		return NT_STATUS_NOT_FOUND;
+	}
+	return NT_STATUS_OK;
+}
diff --git a/source3/librpc/gen_ndr/messaging.h b/source3/librpc/gen_ndr/messaging.h
index 225440a..1312c84 100644
--- a/source3/librpc/gen_ndr/messaging.h
+++ b/source3/librpc/gen_ndr/messaging.h
@@ -62,7 +62,8 @@ enum messaging_type
 	MSG_WINBIND_VALIDATE_CACHE=(int)(0x0408),
 	MSG_WINBIND_DUMP_DOMAIN_LIST=(int)(0x0409),
 	MSG_DUMP_EVENT_LIST=(int)(0x0500),
-	MSG_DBWRAP_TDB2_CHANGES=(int)(4001)
+	MSG_DBWRAP_TDB2_CHANGES=(int)(4001),
+	MSG_DBWRAP_G_LOCK_RETRY=(int)(4002)
 }
 #else
  { __donnot_use_enum_messaging_type=0x7FFFFFFF}
@@ -118,6 +119,7 @@ enum messaging_type
 #define MSG_WINBIND_DUMP_DOMAIN_LIST ( 0x0409 )
 #define MSG_DUMP_EVENT_LIST ( 0x0500 )
 #define MSG_DBWRAP_TDB2_CHANGES ( 4001 )
+#define MSG_DBWRAP_G_LOCK_RETRY ( 4002 )
 #endif
 ;
 
diff --git a/source3/librpc/gen_ndr/ndr_messaging.c b/source3/librpc/gen_ndr/ndr_messaging.c
index 3e2aa1f..1452630 100644
--- a/source3/librpc/gen_ndr/ndr_messaging.c
+++ b/source3/librpc/gen_ndr/ndr_messaging.c
@@ -74,6 +74,7 @@ _PUBLIC_ void ndr_print_messaging_type(struct ndr_print *ndr, const char *name,
 		case MSG_WINBIND_DUMP_DOMAIN_LIST: val = "MSG_WINBIND_DUMP_DOMAIN_LIST"; break;
 		case MSG_DUMP_EVENT_LIST: val = "MSG_DUMP_EVENT_LIST"; break;
 		case MSG_DBWRAP_TDB2_CHANGES: val = "MSG_DBWRAP_TDB2_CHANGES"; break;
+		case MSG_DBWRAP_G_LOCK_RETRY: val = "MSG_DBWRAP_G_LOCK_RETRY"; break;
 	}
 	ndr_print_enum(ndr, name, "ENUM", val, r);
 }
diff --git a/source3/librpc/idl/messaging.idl b/source3/librpc/idl/messaging.idl
index 0686585..08caa59 100644
--- a/source3/librpc/idl/messaging.idl
+++ b/source3/librpc/idl/messaging.idl
@@ -88,7 +88,8 @@ interface messaging
 		MSG_DUMP_EVENT_LIST		= 0x0500,
 
 		/* dbwrap messages 4001-4999 */
-		MSG_DBWRAP_TDB2_CHANGES		= 4001
+		MSG_DBWRAP_TDB2_CHANGES		= 4001,
+		MSG_DBWRAP_G_LOCK_RETRY		= 4002
 	} messaging_type;
 
 	/* messaging struct sent across the sockets and stored in the tdb */
diff --git a/source3/utils/net.c b/source3/utils/net.c
index 85c3c7d..0c5f080 100644
--- a/source3/utils/net.c
+++ b/source3/utils/net.c
@@ -612,6 +612,13 @@ static struct functable net_func[] = {
 		N_("  Use 'net help lookup' to get more information about 'net "
 		   "lookup' commands.")
 	},
+	{	"g_lock",
+		net_g_lock,
+		NET_TRANSPORT_LOCAL,
+		N_("Manipulate the global lock table"),
+		N_("  Use 'net help g_lock' to get more information about "
+		   "'net g_lock' commands.")
+	},
 	{	"join",
 		net_join,
 		NET_TRANSPORT_ADS | NET_TRANSPORT_RPC,
diff --git a/source3/utils/net_g_lock.c b/source3/utils/net_g_lock.c
new file mode 100644
index 0000000..f30ed33
--- /dev/null
+++ b/source3/utils/net_g_lock.c
@@ -0,0 +1,213 @@
+/*
+ * Samba Unix/Linux SMB client library
+ * Interface to the g_lock facility
+ * Copyright (C) Volker Lendecke 2009
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "includes.h"
+#include "net.h"
+#include "g_lock.h"
+
+static bool net_g_lock_init(TALLOC_CTX *mem_ctx,
+			    struct tevent_context **pev,
+			    struct messaging_context **pmsg,
+			    struct g_lock_ctx **pg_ctx)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+
+	ev = tevent_context_init(talloc_tos());
+	if (ev == NULL) {
+		d_fprintf(stderr, "ERROR: could not init event context\n");
+		goto fail;
+	}
+	msg = messaging_init(talloc_tos(), server_id_self(), ev);
+	if (msg == NULL) {
+		d_fprintf(stderr, "ERROR: could not init messaging context\n");
+		goto fail;
+	}
+	g_ctx = g_lock_ctx_init(talloc_tos(), msg);
+	if (g_ctx == NULL) {
+		d_fprintf(stderr, "ERROR: could not init g_lock context\n");
+		goto fail;
+	}
+
+	*pev = ev;
+	*pmsg = msg;
+	*pg_ctx = g_ctx;
+	return true;
+fail:
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return false;
+}
+
+
+static int net_g_lock_do(struct net_context *c, int argc, const char **argv)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+	const char *name, *cmd;
+	int timeout, res;
+	bool locked = false;
+	NTSTATUS status;
+	int ret = -1;
+
+	if (argc != 3) {
+		d_printf("Usage: net g_lock do <lockname> <timeout> "
+			 "<command>\n");
+		return -1;
+	}
+	name = argv[0];
+	timeout = atoi(argv[1]);
+	cmd = argv[2];
+
+	if (!net_g_lock_init(talloc_tos(), &ev, &msg, &g_ctx)) {
+		goto done;
+	}
+
+	status = g_lock_lock(g_ctx, name, G_LOCK_WRITE,
+			     timeval_set(timeout / 1000, timeout % 1000));
+	if (!NT_STATUS_IS_OK(status)) {
+		d_fprintf(stderr, "ERROR: Could not get lock: %s\n",
+			  nt_errstr(status));
+		goto done;
+	}
+	locked = true;
+
+	res = system(cmd);
+
+	if (res == -1) {
+		d_fprintf(stderr, "ERROR: system() returned %s\n",
+			  strerror(errno));
+		goto done;
+	}
+	d_fprintf(stderr, "command returned %d\n", res);
+
+	ret = 0;
+
+done:
+	if (locked) {
+		g_lock_unlock(g_ctx, name);
+	}
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return ret;
+}
+
+static int net_g_lock_dump_fn(struct server_id pid, enum g_lock_type lock_type,
+			      void *private_data)
+{
+	char *pidstr;
+
+	pidstr = procid_str(talloc_tos(), &pid);
+	d_printf("%s: %s (%s)\n", pidstr,
+		 (lock_type & 1) ? "WRITE" : "READ",
+		 (lock_type & G_LOCK_PENDING) ? "pending" : "holder");
+	TALLOC_FREE(pidstr);
+	return 0;
+}
+
+static int net_g_lock_dump(struct net_context *c, int argc, const char **argv)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+	NTSTATUS status;
+	int ret = -1;
+
+	if (argc != 1) {
+		d_printf("Usage: net g_lock dump <lockname>\n");
+		return -1;
+	}
+
+	if (!net_g_lock_init(talloc_tos(), &ev, &msg, &g_ctx)) {
+		goto done;
+	}
+
+	status = g_lock_dump(g_ctx, argv[0], net_g_lock_dump_fn, NULL);
+
+	ret = 0;
+done:
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return ret;
+}
+
+static int net_g_lock_locks_fn(const char *name, void *private_data)
+{
+	d_printf("%s\n", name);
+	return 0;
+}
+
+static int net_g_lock_locks(struct net_context *c, int argc, const char **argv)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+	int ret = -1;
+
+	if (argc != 0) {
+		d_printf("Usage: net g_lock locks\n");
+		return -1;
+	}
+
+	if (!net_g_lock_init(talloc_tos(), &ev, &msg, &g_ctx)) {
+		goto done;
+	}
+
+	ret = g_lock_locks(g_ctx, net_g_lock_locks_fn, NULL);
+done:
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return ret;
+}
+
+int net_g_lock(struct net_context *c, int argc, const char **argv)
+{
+	struct functable func[] = {
+		{
+			"do",
+			net_g_lock_do,
+			NET_TRANSPORT_LOCAL,
+			N_("Execute a shell command under a lock"),
+			N_("net g_lock do <lock name> <timeout> <command>\n")
+		},
+		{
+			"locks",
+			net_g_lock_locks,
+			NET_TRANSPORT_LOCAL,
+			N_("List all locknames"),
+			N_("net g_lock locks\n")
+		},
+		{
+			"dump",
+			net_g_lock_dump,
+			NET_TRANSPORT_LOCAL,
+			N_("Dump a g_lock locking table"),
+			N_("net g_lock dump <lock name>\n")
+		},
+		{NULL, NULL, 0, NULL, NULL}
+	};
+
+	return net_run_function(c, argc, argv, "net g_lock", func);
+}
diff --git a/source3/utils/net_proto.h b/source3/utils/net_proto.h
index 098e2a2..540d31e 100644
--- a/source3/utils/net_proto.h
+++ b/source3/utils/net_proto.h
@@ -497,4 +497,7 @@ NTSTATUS net_lookup_sid_from_name(struct net_context *c, TALLOC_CTX *ctx,
 char *stdin_new_passwd( void);
 char *get_pass( const char *prompt, bool stdin_get);
 
+/* The following definitions come from utils/net_g_lock.c  */
+int net_g_lock(struct net_context *c, int argc, const char **argv);
+
 #endif /*  _NET_PROTO_H_  */
-- 
1.6.3.3


From 7baaa858d2e60c7c18a0ced4df6b9fbefa63dcc2 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Fri, 4 Dec 2009 13:22:30 +0100
Subject: [PATCH 02/32] s3: Add ctdb_conn_msg_ctx()
 (cherry picked from commit 12abab711b58237ddccfa1d9bb526f8c7dbb6e9f)

---
 source3/include/ctdbd_conn.h |    1 +
 source3/lib/ctdbd_conn.c     |    5 +++++
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/source3/include/ctdbd_conn.h b/source3/include/ctdbd_conn.h
index 96e7aa0..71516c7 100644
--- a/source3/include/ctdbd_conn.h
+++ b/source3/include/ctdbd_conn.h
@@ -31,6 +31,7 @@ uint32 ctdbd_vnn(const struct ctdbd_connection *conn);
 
 NTSTATUS ctdbd_register_msg_ctx(struct ctdbd_connection *conn,
 				struct messaging_context *msg_ctx);
+struct messaging_context *ctdb_conn_msg_ctx(struct ctdbd_connection *conn);
 
 NTSTATUS ctdbd_messaging_send(struct ctdbd_connection *conn,
 			      uint32 dst_vnn, uint64 dst_srvid,
diff --git a/source3/lib/ctdbd_conn.c b/source3/lib/ctdbd_conn.c
index 900fa34..743594d 100644
--- a/source3/lib/ctdbd_conn.c
+++ b/source3/lib/ctdbd_conn.c
@@ -514,6 +514,11 @@ NTSTATUS ctdbd_messaging_connection(TALLOC_CTX *mem_ctx,
 	return status;
 }
 
+struct messaging_context *ctdb_conn_msg_ctx(struct ctdbd_connection *conn)
+{
+	return conn->msg_ctx;
+}
+
 /*
  * Packet handler to receive and handle a ctdb message
  */
-- 
1.6.3.3


From 3c72b313193b3baeb819f25882b0623919c26e59 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Mon, 7 Dec 2009 00:36:51 +0100
Subject: [PATCH 03/32] s3: setup debug for smbtorture
 (cherry picked from commit b13dd17840a598ae3441e48b130a2b2a2b940572)

---
 source3/torture/torture.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/source3/torture/torture.c b/source3/torture/torture.c
index 9abcabd..b0f5d82 100644
--- a/source3/torture/torture.c
+++ b/source3/torture/torture.c
@@ -7318,6 +7318,8 @@ static void usage(void)
 
 	load_case_tables();
 
+	setup_logging("smbtorture", true);
+
 	if (is_default_dyn_CONFIGFILE()) {
 		if(getenv("SMB_CONF_PATH")) {
 			set_dyn_CONFIGFILE(getenv("SMB_CONF_PATH"));
-- 
1.6.3.3


From a5dc99f425605ffeb93042c21d361cba55381a96 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Fri, 11 Dec 2009 15:37:52 +0100
Subject: [PATCH 04/32] s3:torture: add a test LOCAL-DBTRANS to torture dbwrap with transactions.

---
 source3/torture/torture.c |  130 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 130 insertions(+), 0 deletions(-)

diff --git a/source3/torture/torture.c b/source3/torture/torture.c
index b0f5d82..fe5bbf3 100644
--- a/source3/torture/torture.c
+++ b/source3/torture/torture.c
@@ -7031,6 +7031,135 @@ fail:
 	return result;
 }
 
+static bool dbtrans_inc(struct db_context *db)
+{
+	struct db_record *rec;
+	uint32_t *val;
+	bool ret = false;
+	NTSTATUS status;
+
+	rec = db->fetch_locked(db, db, string_term_tdb_data("transtest"));
+	if (rec == NULL) {
+		printf(__location__ "fetch_lock failed\n");
+		return false;
+	}
+
+	if (rec->value.dsize != sizeof(uint32_t)) {
+		printf(__location__ "value.dsize = %d\n",
+		       (int)rec->value.dsize);
+		goto fail;
+	}
+
+	val = (uint32_t *)rec->value.dptr;
+	*val += 1;
+
+	status = rec->store(rec, make_tdb_data((uint8_t *)val,
+					       sizeof(uint32_t)),
+			    0);
+	if (!NT_STATUS_IS_OK(status)) {
+		printf(__location__ "store failed: %s\n",
+		       nt_errstr(status));
+		goto fail;
+	}
+
+	ret = true;
+fail:
+	TALLOC_FREE(rec);
+	return ret;
+}
+
+static bool run_local_dbtrans(int dummy)
+{
+	struct db_context *db;
+	struct db_record *rec;
+	NTSTATUS status;
+	uint32_t initial;
+	int res;
+
+	db = db_open(talloc_tos(), "transtest.tdb", 0, TDB_DEFAULT,
+		     O_RDWR|O_CREAT, 0600);
+	if (db == NULL) {
+		printf("Could not open transtest.db\n");
+		return false;
+	}
+
+	res = db->transaction_start(db);
+	if (res == -1) {
+		printf(__location__ "transaction_start failed\n");
+		return false;
+	}
+
+	rec = db->fetch_locked(db, db, string_term_tdb_data("transtest"));
+	if (rec == NULL) {
+		printf(__location__ "fetch_lock failed\n");
+		return false;
+	}
+
+	if (rec->value.dptr == NULL) {
+		initial = 0;
+		status = rec->store(
+			rec, make_tdb_data((uint8_t *)&initial,
+					   sizeof(initial)),
+			0);
+		if (!NT_STATUS_IS_OK(status)) {
+			printf(__location__ "store returned %s\n",
+			       nt_errstr(status));
+			return false;
+		}
+	}
+
+	TALLOC_FREE(rec);
+
+	res = db->transaction_commit(db);
+	if (res == -1) {
+		printf(__location__ "transaction_commit failed\n");
+		return false;
+	}
+
+	while (true) {
+		uint32_t val, val2;
+		int i;
+
+		res = db->transaction_start(db);
+		if (res == -1) {
+			printf(__location__ "transaction_start failed\n");
+			break;
+		}
+
+		if (!dbwrap_fetch_uint32(db, "transtest", &val)) {
+			printf(__location__ "dbwrap_fetch_uint32 failed\n");
+			break;
+		}
+
+		for (i=0; i<10; i++) {
+			if (!dbtrans_inc(db)) {
+				return false;
+			}
+		}
+
+		if (!dbwrap_fetch_uint32(db, "transtest", &val2)) {
+			printf(__location__ "dbwrap_fetch_uint32 failed\n");
+			break;
+		}
+
+		if (val2 != val + 10) {
+			printf(__location__ "val=%d, val2=%d\n",
+			       (int)val, (int)val2);
+			break;
+		}
+
+		printf("val2=%d\r", val2);
+
+		res = db->transaction_commit(db);
+		if (res == -1) {
+			printf(__location__ "transaction_commit failed\n");
+			break;
+		}
+	}
+
+	TALLOC_FREE(db);
+	return true;
+}
 
 static double create_procs(bool (*fn)(int), bool *result)
 {
@@ -7205,6 +7334,7 @@ static struct {
 	{ "LOCAL-MEMCACHE", run_local_memcache, 0},
 	{ "LOCAL-STREAM-NAME", run_local_stream_name, 0},
 	{ "LOCAL-WBCLIENT", run_local_wbclient, 0},
+	{ "LOCAL-DBTRANS", run_local_dbtrans, 0},
 	{NULL, NULL, 0}};
 
 
-- 
1.6.3.3


From 29badc06940827bddd3bfe3554a14b914528f6cb Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Thu, 3 Dec 2009 18:43:49 +0100
Subject: [PATCH 05/32] s3: Add tdb_data_equal
 (cherry picked from commit ebc08b9938a4d266be16ca7e06d27813952cd00f)

---
 lib/util/util_tdb.c |    8 ++++++++
 lib/util/util_tdb.h |    1 +
 2 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/lib/util/util_tdb.c b/lib/util/util_tdb.c
index cda8dc7..46dbf6d 100644
--- a/lib/util/util_tdb.c
+++ b/lib/util/util_tdb.c
@@ -38,6 +38,14 @@ TDB_DATA make_tdb_data(const uint8_t *dptr, size_t dsize)
 	return ret;
 }
 
+bool tdb_data_equal(TDB_DATA t1, TDB_DATA t2)
+{
+	if (t1.dsize != t2.dsize) {
+		return false;
+	}
+	return (memcmp(t1.dptr, t2.dptr, t1.dsize) == 0);
+}
+
 TDB_DATA string_tdb_data(const char *string)
 {
 	return make_tdb_data((const uint8_t *)string, string ? strlen(string) : 0 );
diff --git a/lib/util/util_tdb.h b/lib/util/util_tdb.h
index da6378e..79c4671 100644
--- a/lib/util/util_tdb.h
+++ b/lib/util/util_tdb.h
@@ -6,6 +6,7 @@
  Make a TDB_DATA and keep the const warning in one place
 ****************************************************************/
 TDB_DATA make_tdb_data(const uint8_t *dptr, size_t dsize);
+bool tdb_data_equal(TDB_DATA t1, TDB_DATA t2);
 TDB_DATA string_tdb_data(const char *string);
 TDB_DATA string_term_tdb_data(const char *string);
 
-- 
1.6.3.3


From 15cfedd40fe948852c5906269a45715d17b29e2b Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Thu, 3 Dec 2009 17:29:54 +0100
Subject: [PATCH 06/32] s3:dbwrap_ctdb: start rewrite of transactions using the global lock (g_lock)

This simplifies the transaction code a lot:

* transaction_start essentially consists of acquiring a global lock.

* No write operations at all are performed on the local database
  until the transaction is committed: Every store operation is just
  going into the marshall buffer.

* The commit operation calls a new simplified TRANS3_COMMIT control
  in ctdb which rolls out thae changes to all nodes including the
  node that is performing the transaction.

Michael
(cherry picked from commit 16bc6ba2268e3660d026076264de8666356e00bf)
---
 source3/lib/dbwrap_ctdb.c |  480 +++++++++++++++------------------------------
 1 files changed, 160 insertions(+), 320 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 8e188d0..c32398a 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -22,10 +22,10 @@
 #include "ctdb.h"
 #include "ctdb_private.h"
 #include "ctdbd_conn.h"
+#include "g_lock.h"
 
 struct db_ctdb_transaction_handle {
 	struct db_ctdb_ctx *ctx;
-	bool in_replay;
 	/*
 	 * we store the reads and writes done under a transaction:
 	 * - one list stores both reads and writes (m_all),
@@ -35,6 +35,7 @@ struct db_ctdb_transaction_handle {
 	struct ctdb_marshall_buffer *m_write;
 	uint32_t nesting;
 	bool nested_cancel;
+	char *lock_name;
 };
 
 struct db_ctdb_ctx {
@@ -42,6 +43,7 @@ struct db_ctdb_ctx {
 	struct tdb_wrap *wtdb;
 	uint32 db_id;
 	struct db_ctdb_transaction_handle *transaction;
+	struct g_lock_ctx *lock_ctx;
 };
 
 struct db_ctdb_rec {
@@ -297,145 +299,21 @@ static struct ctdb_rec_data *db_ctdb_marshall_loop_next(struct ctdb_marshall_buf
 	return r;
 }
 
-
-static int32_t db_ctdb_transaction_active(uint32_t db_id)
-{
-	int32_t status;
-	NTSTATUS ret;
-	TDB_DATA indata;
-
-	indata.dptr = (uint8_t *)&db_id;
-	indata.dsize = sizeof(db_id);
-
-	ret = ctdbd_control_local(messaging_ctdbd_connection(),
-				  CTDB_CONTROL_TRANS2_ACTIVE, 0, 0,
-				  indata, NULL, NULL, &status);
-
-	if (!NT_STATUS_IS_OK(ret)) {
-		DEBUG(2, ("ctdb control TRANS2_ACTIVE failed\n"));
-		return -1;
-	}
-
-	return status;
-}
-
-
 /**
  * CTDB transaction destructor
  */
 static int db_ctdb_transaction_destructor(struct db_ctdb_transaction_handle *h)
 {
-	tdb_transaction_cancel(h->ctx->wtdb->tdb);
-	return 0;
-}
-
-/**
- * start a transaction on a ctdb database:
- * - lock the transaction lock key
- * - start the tdb transaction
- */
-static int db_ctdb_transaction_fetch_start(struct db_ctdb_transaction_handle *h)
-{
-	struct db_record *rh;
-	struct db_ctdb_rec *crec;
-	TDB_DATA key;
-	TALLOC_CTX *tmp_ctx;
-	const char *keyname = CTDB_TRANSACTION_LOCK_KEY;
-	int ret;
-	struct db_ctdb_ctx *ctx = h->ctx;
-	TDB_DATA data;
-	pid_t pid;
 	NTSTATUS status;
-	struct ctdb_ltdb_header header;
-	int32_t transaction_status;
 
-	key.dptr = (uint8_t *)discard_const(keyname);
-	key.dsize = strlen(keyname);
-
-again:
-	tmp_ctx = talloc_new(h);
-
-	rh = fetch_locked_internal(ctx, tmp_ctx, key, true);
-	if (rh == NULL) {
-		DEBUG(0,(__location__ " Failed to fetch_lock database\n"));		
-		talloc_free(tmp_ctx);
-		return -1;
-	}
-	crec = talloc_get_type_abort(rh->private_data, struct db_ctdb_rec);
-
-	transaction_status = db_ctdb_transaction_active(ctx->db_id);
-	if (transaction_status == 1) {
-		unsigned long int usec = (1000 + random()) % 100000;
-		DEBUG(3, ("Transaction already active on db_id[0x%08x]."
-			  "Re-trying after %lu microseconds...",
-			  ctx->db_id, usec));
-		talloc_free(tmp_ctx);
-		usleep(usec);
-		goto again;
-	}
-
-	/*
-	 * store the pid in the database:
-	 * it is not enought that the node is dmaster...
-	 */
-	pid = getpid();
-	data.dptr = (unsigned char *)&pid;
-	data.dsize = sizeof(pid_t);
-	crec->header.rsn++;
-	crec->header.dmaster = get_my_vnn();
-	status = db_ctdb_ltdb_store(ctx, key, &(crec->header), data);
+	status = g_lock_unlock(h->ctx->lock_ctx, h->lock_name);
 	if (!NT_STATUS_IS_OK(status)) {
-		DEBUG(0, (__location__ " Failed to store pid in transaction "
-			  "record: %s\n", nt_errstr(status)));
-		talloc_free(tmp_ctx);
+		DEBUG(0, ("g_lock_unlock failed: %s\n", nt_errstr(status)));
 		return -1;
 	}
-
-	talloc_free(rh);
-
-	ret = tdb_transaction_start(ctx->wtdb->tdb);
-	if (ret != 0) {
-		DEBUG(0,(__location__ " Failed to start tdb transaction\n"));
-		talloc_free(tmp_ctx);
-		return -1;
-	}
-
-	status = db_ctdb_ltdb_fetch(ctx, key, &header, tmp_ctx, &data);
-	if (!NT_STATUS_IS_OK(status)) {
-		DEBUG(0, (__location__ " failed to refetch transaction lock "
-			  "record inside transaction: %s - retrying\n",
-			  nt_errstr(status)));
-		tdb_transaction_cancel(ctx->wtdb->tdb);
-		talloc_free(tmp_ctx);
-		goto again;
-	}
-
-	if (header.dmaster != get_my_vnn()) {
-		DEBUG(3, (__location__ " refetch transaction lock record : "
-			  "we are not dmaster any more "
-			  "(dmaster[%u] != my_vnn[%u]) - retrying\n",
-			  header.dmaster, get_my_vnn()));
-		tdb_transaction_cancel(ctx->wtdb->tdb);
-		talloc_free(tmp_ctx);
-		goto again;
-	}
-
-	if ((data.dsize != sizeof(pid_t)) || (*(pid_t *)(data.dptr) != pid)) {
-		DEBUG(3, (__location__ " refetch transaction lock record: "
-			  "another local process has started a transaction "
-			  "(stored pid [%u] != my pid [%u]) - retrying\n",
-			  *(pid_t *)(data.dptr), pid));
-		tdb_transaction_cancel(ctx->wtdb->tdb);
-		talloc_free(tmp_ctx);
-		goto again;
-	}
-
-	talloc_free(tmp_ctx);
-
 	return 0;
 }
 
-
 /**
  * CTDB dbwrap API: transaction_start function
  * starts a transaction on a persistent database
@@ -443,7 +321,7 @@ again:
 static int db_ctdb_transaction_start(struct db_context *db)
 {
 	struct db_ctdb_transaction_handle *h;
-	int ret;
+	NTSTATUS status;
 	struct db_ctdb_ctx *ctx = talloc_get_type_abort(db->private_data,
 							struct db_ctdb_ctx);
 
@@ -466,9 +344,22 @@ static int db_ctdb_transaction_start(struct db_context *db)
 
 	h->ctx = ctx;
 
-	ret = db_ctdb_transaction_fetch_start(h);
-	if (ret != 0) {
-		talloc_free(h);
+	h->lock_name = talloc_asprintf(h, "transaction_db_0x%08x",
+				       (unsigned int)ctx->db_id);
+	if (h->lock_name == NULL) {
+		DEBUG(0, ("talloc_asprintf failed\n"));
+		TALLOC_FREE(h);
+		return -1;
+	}
+
+	/*
+	 * Wait a day, i.e. forever...
+	 */
+	status = g_lock_lock(ctx->lock_ctx, h->lock_name, G_LOCK_WRITE,
+			     timeval_set(86400, 0));
+	if (!NT_STATUS_IS_OK(status)) {
+		DEBUG(0, ("g_lock_lock failed: %s\n", nt_errstr(status)));
+		TALLOC_FREE(h);
 		return -1;
 	}
 
@@ -481,7 +372,65 @@ static int db_ctdb_transaction_start(struct db_context *db)
 	return 0;
 }
 
+static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer *buf,
+					     TDB_DATA key,
+					     struct ctdb_ltdb_header *pheader,
+					     TALLOC_CTX *mem_ctx,
+					     TDB_DATA *pdata)
+{
+	struct ctdb_rec_data *rec = NULL;
+	struct ctdb_ltdb_header h;
+	bool found;
+	TDB_DATA data;
+	int i;
+
+	if (buf == NULL) {
+		return false;
+	}
+
+	/*
+	 * Walk the list of records written during this
+	 * transaction. If we want to read one we have already
+	 * written, return the last written sample. Thus we do not do
+	 * a "break;" for the first hit, this record might have been
+	 * overwritten later.
+	 */
+
+	for (i=0; i<buf->count; i++) {
+		TDB_DATA tkey, tdata;
+		uint32_t reqid;
+
+		rec = db_ctdb_marshall_loop_next(buf, rec, &reqid, &h, &tkey,
+						 &tdata);
+		if (rec == NULL) {
+			return false;
+		}
+
+		if (tdb_data_equal(key, tkey)) {
+			found = true;
+			data = tdata;
+		}
+	}
+
+	if (!found) {
+		return false;
+	}
+
+	if (pdata != NULL) {
+		data.dptr = (uint8_t *)talloc_memdup(mem_ctx, data.dptr,
+						     data.dsize);
+		if ((data.dsize != 0) && (data.dptr == NULL)) {
+			return false;
+		}
+		*pdata = data;
+	}
+
+	if (pheader != NULL) {
+		*pheader = h;
+	}
 
+	return true;
+}
 
 /*
   fetch a record inside a transaction
@@ -492,6 +441,13 @@ static int db_ctdb_transaction_fetch(struct db_ctdb_ctx *db,
 {
 	struct db_ctdb_transaction_handle *h = db->transaction;
 	NTSTATUS status;
+	bool found;
+
+	found = pull_newest_from_marshall_buffer(h->m_write, key, NULL,
+						 mem_ctx, data);
+	if (found) {
+		return 0;
+	}
 
 	status = db_ctdb_ltdb_fetch(h->ctx, key, NULL, mem_ctx, data);
 
@@ -501,14 +457,14 @@ static int db_ctdb_transaction_fetch(struct db_ctdb_ctx *db,
 		return -1;
 	}
 
-	if (!h->in_replay) {
-		h->m_all = db_ctdb_marshall_add(h, h->m_all, h->ctx->db_id, 1, key, NULL, *data);
-		if (h->m_all == NULL) {
-			DEBUG(0,(__location__ " Failed to add to marshalling record\n"));
-			data->dsize = 0;
-			talloc_free(data->dptr);
-			return -1;
-		}
+	h->m_all = db_ctdb_marshall_add(h, h->m_all, h->ctx->db_id, 1, key,
+					NULL, *data);
+	if (h->m_all == NULL) {
+		DEBUG(0,(__location__ " Failed to add to marshalling "
+			 "record\n"));
+		data->dsize = 0;
+		talloc_free(data->dptr);
+		return -1;
 	}
 
 	return 0;
@@ -543,6 +499,11 @@ static struct db_record *db_ctdb_fetch_locked_transaction(struct db_ctdb_ctx *ct
 	result->store = db_ctdb_store_transaction;
 	result->delete_rec = db_ctdb_delete_transaction;
 
+	if (pull_newest_from_marshall_buffer(ctx->transaction->m_write, key,
+					     NULL, result, &result->value)) {
+		return result;
+	}
+
 	ctdb_data = tdb_fetch(ctx->wtdb->tdb, key);
 	if (ctdb_data.dptr == NULL) {
 		/* create the record */
@@ -619,27 +580,35 @@ static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
 				     TDB_DATA key, TDB_DATA data)
 {
 	TALLOC_CTX *tmp_ctx = talloc_new(h);
-	int ret;
 	TDB_DATA rec;
 	struct ctdb_ltdb_header header;
-	NTSTATUS status;
+
+	ZERO_STRUCT(header);
 
 	/* we need the header so we can update the RSN */
-	rec = tdb_fetch(h->ctx->wtdb->tdb, key);
-	if (rec.dptr == NULL) {
-		/* the record doesn't exist - create one with us as dmaster.
-		   This is only safe because we are in a transaction and this
-		   is a persistent database */
-		ZERO_STRUCT(header);
-	} else {
-		memcpy(&header, rec.dptr, sizeof(struct ctdb_ltdb_header));
-		rec.dsize -= sizeof(struct ctdb_ltdb_header);
-		/* a special case, we are writing the same data that is there now */
-		if (data.dsize == rec.dsize &&
-		    memcmp(data.dptr, rec.dptr + sizeof(struct ctdb_ltdb_header), data.dsize) == 0) {
-			SAFE_FREE(rec.dptr);
-			talloc_free(tmp_ctx);
-			return 0;
+
+	if (!pull_newest_from_marshall_buffer(h->m_write, key, &header,
+					      NULL, NULL)) {
+
+		rec = tdb_fetch(h->ctx->wtdb->tdb, key);
+
+		if (rec.dptr != NULL) {
+			memcpy(&header, rec.dptr,
+			       sizeof(struct ctdb_ltdb_header));
+			rec.dsize -= sizeof(struct ctdb_ltdb_header);
+
+			/*
+			 * a special case, we are writing the same
+			 * data that is there now
+			 */
+			if (data.dsize == rec.dsize &&
+			    memcmp(data.dptr,
+				   rec.dptr + sizeof(struct ctdb_ltdb_header),
+				   data.dsize) == 0) {
+				SAFE_FREE(rec.dptr);
+				talloc_free(tmp_ctx);
+				return 0;
+			}
 		}
 		SAFE_FREE(rec.dptr);
 	}
@@ -647,13 +616,13 @@ static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
 	header.dmaster = get_my_vnn();
 	header.rsn++;
 
-	if (!h->in_replay) {
-		h->m_all = db_ctdb_marshall_add(h, h->m_all, h->ctx->db_id, 0, key, NULL, data);
-		if (h->m_all == NULL) {
-			DEBUG(0,(__location__ " Failed to add to marshalling record\n"));
-			talloc_free(tmp_ctx);
-			return -1;
-		}
+	h->m_all = db_ctdb_marshall_add(h, h->m_all, h->ctx->db_id, 0, key,
+					NULL, data);
+	if (h->m_all == NULL) {
+		DEBUG(0,(__location__ " Failed to add to marshalling "
+			 "record\n"));
+		talloc_free(tmp_ctx);
+		return -1;
 	}
 
 	h->m_write = db_ctdb_marshall_add(h, h->m_write, h->ctx->db_id, 0, key, &header, data);
@@ -663,16 +632,8 @@ static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
 		return -1;
 	}
 
-	status = db_ctdb_ltdb_store(h->ctx, key, &header, data);
-	if (NT_STATUS_IS_OK(status)) {
-		ret = 0;
-	} else {
-		ret = -1;
-	}
-
 	talloc_free(tmp_ctx);
-
-	return ret;
+	return 0;
 }
 
 
@@ -708,64 +669,6 @@ static NTSTATUS db_ctdb_delete_transaction(struct db_record *rec)
 	return NT_STATUS_OK;
 }
 
-
-/*
-  replay a transaction
- */
-static int ctdb_replay_transaction(struct db_ctdb_transaction_handle *h)
-{
-	int ret, i;
-	struct ctdb_rec_data *rec = NULL;
-
-	h->in_replay = true;
-	talloc_free(h->m_write);
-	h->m_write = NULL;
-
-	ret = db_ctdb_transaction_fetch_start(h);
-	if (ret != 0) {
-		return ret;
-	}
-
-	for (i=0;i<h->m_all->count;i++) {
-		TDB_DATA key, data;
-
-		rec = db_ctdb_marshall_loop_next(h->m_all, rec, NULL, NULL, &key, &data);
-		if (rec == NULL) {
-			DEBUG(0, (__location__ " Out of records in ctdb_replay_transaction?\n"));
-			goto failed;
-		}
-
-		if (rec->reqid == 0) {
-			/* its a store */
-			if (db_ctdb_transaction_store(h, key, data) != 0) {
-				goto failed;
-			}
-		} else {
-			TDB_DATA data2;
-			TALLOC_CTX *tmp_ctx = talloc_new(h);
-
-			if (db_ctdb_transaction_fetch(h->ctx, tmp_ctx, key, &data2) != 0) {
-				talloc_free(tmp_ctx);
-				goto failed;
-			}
-			if (data2.dsize != data.dsize ||
-			    memcmp(data2.dptr, data.dptr, data.dsize) != 0) {
-				/* the record has changed on us - we have to give up */
-				talloc_free(tmp_ctx);
-				goto failed;
-			}
-			talloc_free(tmp_ctx);
-		}
-	}
-
-	return 0;
-
-failed:
-	tdb_transaction_cancel(h->ctx->wtdb->tdb);
-	return -1;
-}
-
-
 /*
   commit a transaction
  */
@@ -774,11 +677,8 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 	struct db_ctdb_ctx *ctx = talloc_get_type_abort(db->private_data,
 							struct db_ctdb_ctx);
 	NTSTATUS rets;
-	int ret;
 	int status;
-	int retries = 0;
 	struct db_ctdb_transaction_handle *h = ctx->transaction;
-	enum ctdb_controls failure_control = CTDB_CONTROL_TRANS2_ERROR;
 
 	if (h == NULL) {
 		DEBUG(0,(__location__ " transaction commit with no open transaction on db 0x%08x\n", ctx->db_id));
@@ -798,102 +698,29 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 
 	DEBUG(5,(__location__ " Commit transaction on db 0x%08x\n", ctx->db_id));
 
-	talloc_set_destructor(h, NULL);
-
-	/* our commit strategy is quite complex.
-
-	   - we first try to commit the changes to all other nodes
-
-	   - if that works, then we commit locally and we are done
-
-	   - if a commit on another node fails, then we need to cancel
-	     the transaction, then restart the transaction (thus
-	     opening a window of time for a pending recovery to
-	     complete), then replay the transaction, checking all the
-	     reads and writes (checking that reads give the same data,
-	     and writes succeed). Then we retry the transaction to the
-	     other nodes
-	*/
-
 again:
 	if (h->m_write == NULL) {
 		/* no changes were made, potentially after a retry */
-		tdb_transaction_cancel(h->ctx->wtdb->tdb);
-		talloc_free(h);
-		ctx->transaction = NULL;
-		return 0;
+		goto done;
 	}
 
 	/* tell ctdbd to commit to the other nodes */
-	rets = ctdbd_control_local(messaging_ctdbd_connection(), 
-				   retries==0?CTDB_CONTROL_TRANS2_COMMIT:CTDB_CONTROL_TRANS2_COMMIT_RETRY, 
+	rets = ctdbd_control_local(messaging_ctdbd_connection(),
+				   CTDB_CONTROL_TRANS3_COMMIT,
 				   h->ctx->db_id, 0,
-				   db_ctdb_marshall_finish(h->m_write), NULL, NULL, &status);
+				   db_ctdb_marshall_finish(h->m_write),
+				   NULL, NULL, &status);
 	if (!NT_STATUS_IS_OK(rets) || status != 0) {
-		tdb_transaction_cancel(h->ctx->wtdb->tdb);
-		sleep(1);
-
-		if (!NT_STATUS_IS_OK(rets)) {
-			failure_control = CTDB_CONTROL_TRANS2_ERROR;			
-		} else {
-			/* work out what error code we will give if we 
-			   have to fail the operation */
-			switch ((enum ctdb_trans2_commit_error)status) {
-			case CTDB_TRANS2_COMMIT_SUCCESS:
-			case CTDB_TRANS2_COMMIT_SOMEFAIL:
-			case CTDB_TRANS2_COMMIT_TIMEOUT:
-				failure_control = CTDB_CONTROL_TRANS2_ERROR;
-				break;
-			case CTDB_TRANS2_COMMIT_ALLFAIL:
-				failure_control = CTDB_CONTROL_TRANS2_FINISHED;
-				break;
-			}
-		}
-
-		if (++retries == 100) {
-			DEBUG(0,(__location__ " Giving up transaction on db 0x%08x after %d retries failure_control=%u\n", 
-				 h->ctx->db_id, retries, (unsigned)failure_control));
-			ctdbd_control_local(messaging_ctdbd_connection(), failure_control,
-					    h->ctx->db_id, CTDB_CTRL_FLAG_NOREPLY, 
-					    tdb_null, NULL, NULL, NULL);
-			h->ctx->transaction = NULL;
-			talloc_free(h);
-			ctx->transaction = NULL;
-			return -1;			
-		}
-
-		if (ctdb_replay_transaction(h) != 0) {
-			DEBUG(0,(__location__ " Failed to replay transaction failure_control=%u\n",
-				 (unsigned)failure_control));
-			ctdbd_control_local(messaging_ctdbd_connection(), failure_control,
-					    h->ctx->db_id, CTDB_CTRL_FLAG_NOREPLY, 
-					    tdb_null, NULL, NULL, NULL);
-			h->ctx->transaction = NULL;
-			talloc_free(h);
-			ctx->transaction = NULL;
-			return -1;
-		}
+		/*
+		 * TODO:
+		 * check the database sequence number and
+		 * compare it to the seqnum after applying the
+		 * marshall buffer. If it is the same: return success.
+		 */
 		goto again;
-	} else {
-		failure_control = CTDB_CONTROL_TRANS2_ERROR;
 	}
 
-	/* do the real commit locally */
-	ret = tdb_transaction_commit(h->ctx->wtdb->tdb);
-	if (ret != 0) {
-		DEBUG(0,(__location__ " Failed to commit transaction failure_control=%u\n",
-			 (unsigned)failure_control));
-		ctdbd_control_local(messaging_ctdbd_connection(), failure_control, h->ctx->db_id, 
-				    CTDB_CTRL_FLAG_NOREPLY, tdb_null, NULL, NULL, NULL);
-		h->ctx->transaction = NULL;
-		talloc_free(h);
-		return ret;
-	}
-
-	/* tell ctdbd that we are finished with our local commit */
-	ctdbd_control_local(messaging_ctdbd_connection(), CTDB_CONTROL_TRANS2_FINISHED, 
-			    h->ctx->db_id, CTDB_CTRL_FLAG_NOREPLY, 
-			    tdb_null, NULL, NULL, NULL);
+done:
 	h->ctx->transaction = NULL;
 	talloc_free(h);
 	return 0;
@@ -1314,6 +1141,7 @@ struct db_context *db_open_ctdb(TALLOC_CTX *mem_ctx,
 	struct db_context *result;
 	struct db_ctdb_ctx *db_ctdb;
 	char *db_path;
+	struct ctdbd_connection *conn;
 
 	if (!lp_clustering()) {
 		DEBUG(10, ("Clustering disabled -- no ctdb\n"));
@@ -1335,13 +1163,15 @@ struct db_context *db_open_ctdb(TALLOC_CTX *mem_ctx,
 	db_ctdb->transaction = NULL;
 	db_ctdb->db = result;
 
-	if (!NT_STATUS_IS_OK(ctdbd_db_attach(messaging_ctdbd_connection(),name, &db_ctdb->db_id, tdb_flags))) {
+	conn = messaging_ctdbd_connection();
+
+	if (!NT_STATUS_IS_OK(ctdbd_db_attach(conn, name, &db_ctdb->db_id, tdb_flags))) {
 		DEBUG(0, ("ctdbd_db_attach failed for %s\n", name));
 		TALLOC_FREE(result);
 		return NULL;
 	}
 
-	db_path = ctdbd_dbpath(messaging_ctdbd_connection(), db_ctdb, db_ctdb->db_id);
+	db_path = ctdbd_dbpath(conn, db_ctdb, db_ctdb->db_id);
 
 	result->persistent = ((tdb_flags & TDB_CLEAR_IF_FIRST) == 0);
 
@@ -1361,6 +1191,16 @@ struct db_context *db_open_ctdb(TALLOC_CTX *mem_ctx,
 	}
 	talloc_free(db_path);
 
+	if (result->persistent) {
+		db_ctdb->lock_ctx = g_lock_ctx_init(db_ctdb,
+						    ctdb_conn_msg_ctx(conn));
+		if (db_ctdb->lock_ctx == NULL) {
+			DEBUG(0, ("g_lock_ctx_init failed\n"));
+			TALLOC_FREE(result);
+			return NULL;
+		}
+	}
+
 	result->private_data = (void *)db_ctdb;
 	result->fetch_locked = db_ctdb_fetch_locked;
 	result->fetch = db_ctdb_fetch;
-- 
1.6.3.3


From f631e5f328ef4824d98aab3edd54ad14d1913beb Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 4 Dec 2009 11:49:21 +0100
Subject: [PATCH 07/32] build: Add a configure check for CTDB_CONTROL_TRANS3_COMMIT.

This is the new implementation of ctdb transactions using the
global lock feature. It is needed by the current dbwrap_ctdb code.

Michael
(cherry picked from commit d4c0afa841ecdae1cab955cc73360deae23f5873)
---
 source3/configure.in |   17 +++++++++++++++++
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/source3/configure.in b/source3/configure.in
index a9b9e47..99f2448 100644
--- a/source3/configure.in
+++ b/source3/configure.in
@@ -5267,6 +5267,23 @@ else
 	ctdb_broken="transaction support too old"
 fi
 
+AC_HAVE_DECL(CTDB_CONTROL_TRANS3_COMMIT,[
+#include "confdefs.h"
+#define NO_CONFIG_H
+#include "replace.h"
+#include "system/wait.h"
+#include "system/network.h"
+#include <talloc.h>
+#include <tdb.h>
+#include <ctdb.h>
+#include <ctdb_private.h>
+])
+if test x"$ac_cv_have_CTDB_CONTROL_TRANS3_COMMIT_decl" = x"yes"; then
+	ctdb_broken=no
+else
+	ctdb_broken="transaction support too old"
+fi
+
 # in ctdb 1.0.57 ctdb_control_tcp was temparary renamed to ctdb_tcp_client
 AC_CHECK_TYPE(struct ctdb_tcp_client,[
 	AC_DEFINE([ctdb_control_tcp],[ctdb_tcp_client],[ctdb ipv4 support])
-- 
1.6.3.3


From a6681c93cd643a785fbd6f686f0fdffa9f6d1256 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 11 Dec 2009 10:35:50 +0100
Subject: [PATCH 08/32] s3:dbwrap_ctdb: update (C)

Michael
(cherry picked from commit 5a0c42770b349877928a2b3fd8316903dd62e5b7)
---
 source3/lib/dbwrap_ctdb.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index c32398a..63b63fa 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -1,7 +1,8 @@
 /* 
    Unix SMB/CIFS implementation.
    Database interface wrapper around ctdbd
-   Copyright (C) Volker Lendecke 2007
+   Copyright (C) Volker Lendecke 2007-2009
+   Copyright (C) Michael Adam 2009
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
-- 
1.6.3.3


From f2f8abf24c89822b6e17d2f4e6077f93eeec7e33 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 11 Dec 2009 12:30:57 +0100
Subject: [PATCH 09/32] s3:dbwrap_ctdb: change db_ctdb_transaction_store() to return NTSTATUS.

The return values calculated by the callers were wrong anyways since
the new marshalling code does not set the local tdbs tdb error code.

Michael
(cherry picked from commit 26225d3e798892b39b3c238b0bee465bffac6550)
---
 source3/lib/dbwrap_ctdb.c |   30 ++++++++++++------------------
 1 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 63b63fa..0986083 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -577,8 +577,8 @@ static struct db_record *db_ctdb_fetch_locked_persistent(struct db_ctdb_ctx *ctx
 /*
   stores a record inside a transaction
  */
-static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h, 
-				     TDB_DATA key, TDB_DATA data)
+static NTSTATUS db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
+					  TDB_DATA key, TDB_DATA data)
 {
 	TALLOC_CTX *tmp_ctx = talloc_new(h);
 	TDB_DATA rec;
@@ -608,7 +608,7 @@ static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
 				   data.dsize) == 0) {
 				SAFE_FREE(rec.dptr);
 				talloc_free(tmp_ctx);
-				return 0;
+				return NT_STATUS_OK;
 			}
 		}
 		SAFE_FREE(rec.dptr);
@@ -623,18 +623,18 @@ static int db_ctdb_transaction_store(struct db_ctdb_transaction_handle *h,
 		DEBUG(0,(__location__ " Failed to add to marshalling "
 			 "record\n"));
 		talloc_free(tmp_ctx);
-		return -1;
+		return NT_STATUS_NO_MEMORY;
 	}
 
 	h->m_write = db_ctdb_marshall_add(h, h->m_write, h->ctx->db_id, 0, key, &header, data);
 	if (h->m_write == NULL) {
 		DEBUG(0,(__location__ " Failed to add to marshalling record\n"));
 		talloc_free(tmp_ctx);
-		return -1;
+		return NT_STATUS_NO_MEMORY;
 	}
 
 	talloc_free(tmp_ctx);
-	return 0;
+	return NT_STATUS_OK;
 }
 
 
@@ -645,13 +645,10 @@ static NTSTATUS db_ctdb_store_transaction(struct db_record *rec, TDB_DATA data,
 {
 	struct db_ctdb_transaction_handle *h = talloc_get_type_abort(
 		rec->private_data, struct db_ctdb_transaction_handle);
-	int ret;
+	NTSTATUS status;
 
-	ret = db_ctdb_transaction_store(h, rec->key, data);
-	if (ret != 0) {
-		return tdb_error_to_ntstatus(h->ctx->wtdb->tdb);
-	}
-	return NT_STATUS_OK;
+	status = db_ctdb_transaction_store(h, rec->key, data);
+	return status;
 }
 
 /* 
@@ -661,13 +658,10 @@ static NTSTATUS db_ctdb_delete_transaction(struct db_record *rec)
 {
 	struct db_ctdb_transaction_handle *h = talloc_get_type_abort(
 		rec->private_data, struct db_ctdb_transaction_handle);
-	int ret;
+	NTSTATUS status;
 
-	ret = db_ctdb_transaction_store(h, rec->key, tdb_null);
-	if (ret != 0) {
-		return tdb_error_to_ntstatus(h->ctx->wtdb->tdb);
-	}
-	return NT_STATUS_OK;
+	status =  db_ctdb_transaction_store(h, rec->key, tdb_null);
+	return status;
 }
 
 /*
-- 
1.6.3.3


From a069477de2af1dbc15563eed00b3354fc823e2ca Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 11 Dec 2009 14:07:28 +0100
Subject: [PATCH 10/32] s3:dbwrap_ctdb: maintain a database sequence number that bumps in transactions

For persistent databases, 64bit integer is kept in a special record
__db_sequence_number__. This record is incremented with each completed
transaction.

The retry mechanism for failing TRANS3_COMMIT controls inside the
db_ctdb_transaction_commit() function now relies one a modified
behaviour of ctdbd's treatment of persistent databases in recoveries.
Recently, a special treatment for persistent databases had been
introduced in ctdb (1.0.108) to work around the problems with the
orinal design of persistent transactions.
Now with the rewrite we need to revert to the old behaviour that
ctdb always takes the newest copies of all records.

This change also paves the way for a next step, which will make
recovery use the db seqnum to tell which node has the newest copy
of a persistent db and use that node's copy. This will greatly
reduce the amount of data transferred with each recovery.

Michael
(cherry picked from commit 3fe7ce141d6afe3825b06c5feb90558911e4df1e)
---
 source3/lib/dbwrap_ctdb.c |  121 +++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 0986083..fb99e1d 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -664,6 +664,65 @@ static NTSTATUS db_ctdb_delete_transaction(struct db_record *rec)
 	return status;
 }
 
+/**
+ * Fetch the db sequence number of a persistent db directly from the db.
+ */
+static NTSTATUS db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx *db,
+						uint64_t *seqnum)
+{
+	NTSTATUS status;
+	const char *keyname = CTDB_DB_SEQNUM_KEY;
+	TDB_DATA key;
+	TDB_DATA data;
+	struct ctdb_ltdb_header header;
+	TALLOC_CTX *mem_ctx = talloc_stackframe();
+
+	if (seqnum == NULL) {
+		return NT_STATUS_INVALID_PARAMETER;
+	}
+
+	key.dptr = (uint8_t *)discard_const(keyname);
+	key.dsize = strlen(keyname) + 1;
+
+	status = db_ctdb_ltdb_fetch(db, key, &header, mem_ctx, &data);
+	if (!NT_STATUS_IS_OK(status)) {
+		goto done;
+	}
+
+	if (data.dsize != sizeof(uint64_t)) {
+		*seqnum = 0;
+		goto done;
+	}
+
+	*seqnum = *(uint64_t *)data.dptr;
+
+done:
+	TALLOC_FREE(mem_ctx);
+	return status;
+}
+
+/**
+ * Store the database sequence number inside a transaction.
+ */
+static NTSTATUS db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle *h,
+					uint64_t seqnum)
+{
+	NTSTATUS status;
+	const char *keyname = CTDB_DB_SEQNUM_KEY;
+	TDB_DATA key;
+	TDB_DATA data;
+
+	key.dptr = (uint8_t *)discard_const(keyname);
+	key.dsize = strlen(keyname);
+
+	data.dptr = (uint8_t *)&seqnum;
+	data.dsize = sizeof(uint64_t);
+
+	status = db_ctdb_transaction_store(h, key, data);
+
+	return status;
+}
+
 /*
   commit a transaction
  */
@@ -674,6 +733,8 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 	NTSTATUS rets;
 	int status;
 	struct db_ctdb_transaction_handle *h = ctx->transaction;
+	uint64_t old_seqnum, new_seqnum;
+	int ret;
 
 	if (h == NULL) {
 		DEBUG(0,(__location__ " transaction commit with no open transaction on db 0x%08x\n", ctx->db_id));
@@ -693,6 +754,30 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 
 	DEBUG(5,(__location__ " Commit transaction on db 0x%08x\n", ctx->db_id));
 
+	/*
+	 * As the last db action before committing, bump the database sequence
+	 * number. Note that this undoes all changes to the seqnum records
+	 * performed under the transaction. This record is not meant to be
+	 * modified by user interaction. It is for internal use only...
+	 */
+	rets = db_ctdb_fetch_db_seqnum_from_db(ctx, &old_seqnum);
+	if (!NT_STATUS_IS_OK(rets)) {
+		DEBUG(1, (__location__ " failed to fetch the db sequence number "
+			  "in transaction commit on db 0x%08x\n", ctx->db_id));
+		ret = -1;
+		goto done;
+	}
+
+	new_seqnum = old_seqnum + 1;
+
+	rets = db_ctdb_store_db_seqnum(h, new_seqnum);
+	if (!NT_STATUS_IS_OK(rets)) {
+		DEBUG(1, (__location__ "failed to store the db sequence number "
+			  " in transaction commit on db 0x%08x\n", ctx->db_id));
+		ret = -1;
+		goto done;
+	}
+
 again:
 	if (h->m_write == NULL) {
 		/* no changes were made, potentially after a retry */
@@ -707,14 +792,40 @@ again:
 				   NULL, NULL, &status);
 	if (!NT_STATUS_IS_OK(rets) || status != 0) {
 		/*
-		 * TODO:
-		 * check the database sequence number and
-		 * compare it to the seqnum after applying the
-		 * marshall buffer. If it is the same: return success.
+		 * The TRANS3_COMMIT control should only possibly fail when a
+		 * recovery has been running concurrently. In any case, the db
+		 * will be the same on all nodes, either the new copy or the
+		 * old copy.  This can be detected by comparing the old and new
+		 * local sequence numbers.
+		 */
+		rets = db_ctdb_fetch_db_seqnum_from_db(ctx, &new_seqnum);
+		if (!NT_STATUS_IS_OK(rets)) {
+			DEBUG(1, (__location__ " failed to refetch db sequence "
+				  "number after failed TRANS3_COMMIT\n"));
+			ret = -1;
+			goto done;
+		}
+
+		if (new_seqnum == old_seqnum) {
+			/* Recovery prevented all our changes: retry. */
+			goto again;
+		} else if (new_seqnum != (old_seqnum + 1)) {
+			DEBUG(0, (__location__ " ERROR: new_seqnum[%lu] != "
+				  "old_seqnum[%lu] + (0 or 1) after failed "
+				  "TRANS3_COMMIT - this should not happen!\n",
+				  (unsigned long)new_seqnum,
+				  (unsigned long)old_seqnum));
+			ret = -1;
+			goto done;
+		}
+		/*
+		 * Recovery propagated our changes to all nodes, completing
+		 * our commit for us - succeed.
 		 */
-		goto again;
 	}
 
+	ret = 0;
+
 done:
 	h->ctx->transaction = NULL;
 	talloc_free(h);
-- 
1.6.3.3


From 6ae38ef5660945ff5846612907d57fc65c748ad4 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 11 Dec 2009 16:45:38 +0100
Subject: [PATCH 11/32] s3:build: remove checks for deprecated ctdb controls.

Michael
(cherry picked from commit 9113ce82b59c718ac709eb01b125e9e6746a96b7)
---
 source3/configure.in |   36 +-----------------------------------
 1 files changed, 1 insertions(+), 35 deletions(-)

diff --git a/source3/configure.in b/source3/configure.in
index 99f2448..3111d5a 100644
--- a/source3/configure.in
+++ b/source3/configure.in
@@ -5233,40 +5233,6 @@ AC_CHECK_HEADERS(ctdb.h ctdb_private.h,,,[
 #include <ctdb.h>
 ])
 
-AC_HAVE_DECL(CTDB_CONTROL_TRANS2_COMMIT_RETRY,[
-#include "confdefs.h"
-#define NO_CONFIG_H
-#include "replace.h"
-#include "system/wait.h"
-#include "system/network.h"
-#include <talloc.h>
-#include <tdb.h>
-#include <ctdb.h>
-#include <ctdb_private.h>
-])
-if test x"$ac_cv_have_CTDB_CONTROL_TRANS2_COMMIT_RETRY_decl" = x"yes"; then
-	ctdb_broken=no
-else
-	ctdb_broken="missing transaction support"
-fi
-
-AC_HAVE_DECL(CTDB_CONTROL_TRANS2_ACTIVE,[
-#include "confdefs.h"
-#define NO_CONFIG_H
-#include "replace.h"
-#include "system/wait.h"
-#include "system/network.h"
-#include <talloc.h>
-#include <tdb.h>
-#include <ctdb.h>
-#include <ctdb_private.h>
-])
-if test x"$ac_cv_have_CTDB_CONTROL_TRANS2_ACTIVE_decl" = x"yes"; then
-	ctdb_broken=no
-else
-	ctdb_broken="transaction support too old"
-fi
-
 AC_HAVE_DECL(CTDB_CONTROL_TRANS3_COMMIT,[
 #include "confdefs.h"
 #define NO_CONFIG_H
@@ -5281,7 +5247,7 @@ AC_HAVE_DECL(CTDB_CONTROL_TRANS3_COMMIT,[
 if test x"$ac_cv_have_CTDB_CONTROL_TRANS3_COMMIT_decl" = x"yes"; then
 	ctdb_broken=no
 else
-	ctdb_broken="transaction support too old"
+	ctdb_broken="ctdb transaction support missing or too old"
 fi
 
 # in ctdb 1.0.57 ctdb_control_tcp was temparary renamed to ctdb_tcp_client
-- 
1.6.3.3


From 8a8bd12a9eadd5007d5948a8519709ec835f46ca Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sat, 12 Dec 2009 00:30:37 +0100
Subject: [PATCH 12/32] s3:dbwrap_ctdb: fix db_ctdb_fetch_db_seqnum_from_db() when NT_STATUS_NOT_FOUND.

Don't treat this as an error but return seqnum 0 instead.

Michael
(cherry picked from commit 10a44ee6930bb51b4b20ce42f35bc455ac1b7293)
---
 source3/lib/dbwrap_ctdb.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index fb99e1d..4c4486c 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -685,10 +685,14 @@ static NTSTATUS db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx *db,
 	key.dsize = strlen(keyname) + 1;
 
 	status = db_ctdb_ltdb_fetch(db, key, &header, mem_ctx, &data);
-	if (!NT_STATUS_IS_OK(status)) {
+	if (!NT_STATUS_IS_OK(status) &&
+	    !NT_STATUS_EQUAL(status, NT_STATUS_NOT_FOUND))
+	{
 		goto done;
 	}
 
+	status = NT_STATUS_OK;
+
 	if (data.dsize != sizeof(uint64_t)) {
 		*seqnum = 0;
 		goto done;
-- 
1.6.3.3


From f3355db811b47ebf94bf260be5756ca075123f3d Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sat, 12 Dec 2009 00:38:14 +0100
Subject: [PATCH 13/32] s3:dbwrap_ctdb: fix two "may be used uninitialized" warnings

Michael
(cherry picked from commit fb981cdb8282d3b9b46d9ca515a5685add232a72)
---
 source3/lib/dbwrap_ctdb.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 4c4486c..d70637e 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -389,6 +389,9 @@ static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer *buf,
 		return false;
 	}
 
+	ZERO_STRUCT(h);
+	ZERO_STRUCT(data);
+
 	/*
 	 * Walk the list of records written during this
 	 * transaction. If we want to read one we have already
-- 
1.6.3.3


From dfbdef3e2890f2b8e8478eefd20e215edc800e0e Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Tue, 5 Jan 2010 16:17:27 +0100
Subject: [PATCH 14/32] s3:dbwrap_ctdb: fix an uninitialized variable.

Michael
(cherry picked from commit 1505b69dea6044a13a59f672e22f5833256cb981)
---
 source3/lib/dbwrap_ctdb.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index d70637e..d6fde35 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -381,7 +381,7 @@ static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer *buf,
 {
 	struct ctdb_rec_data *rec = NULL;
 	struct ctdb_ltdb_header h;
-	bool found;
+	bool found = false;
 	TDB_DATA data;
 	int i;
 
-- 
1.6.3.3


From 033d307def08666c6b8eb0a0edd3c31540aa4d97 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Wed, 6 Jan 2010 00:37:21 +0100
Subject: [PATCH 15/32] s3:dbwrap_ctdb: fix logic error in pull_newest_from_marshall_buffer().

The logic bug was that if a record was found in the marshall buffer,
then always the ctdb header of tha last record in the marshall buffer
was returned, and not the ctdb header of the last occurrence of the
requested record.

This is fixed by introducing an additional temporary variable.

Michael
(cherry picked from commit 524072b56bf659002410a817749bf86fe6f51e83)
---
 source3/lib/dbwrap_ctdb.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index d6fde35..4e97d26 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -403,8 +403,11 @@ static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer *buf,
 	for (i=0; i<buf->count; i++) {
 		TDB_DATA tkey, tdata;
 		uint32_t reqid;
+		struct ctdb_ltdb_header hdr;
 
-		rec = db_ctdb_marshall_loop_next(buf, rec, &reqid, &h, &tkey,
+		ZERO_STRUCT(hdr);
+
+		rec = db_ctdb_marshall_loop_next(buf, rec, &reqid, &hdr, &tkey,
 						 &tdata);
 		if (rec == NULL) {
 			return false;
@@ -413,6 +416,7 @@ static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer *buf,
 		if (tdb_data_equal(key, tkey)) {
 			found = true;
 			data = tdata;
+			h = hdr;
 		}
 	}
 
-- 
1.6.3.3


From 59d53e82c1512cc684de8110c1a125c69caecc6c Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Wed, 13 Jan 2010 23:51:34 +0100
Subject: [PATCH 16/32] s3:dbwrap_ctdb: fix brown paperbag bug in ctdb_transaction_commit.

I carefully prepared the return value only to "return 0;" at the bottom. :-(
This may well have hit us for instance in the nested cancel case
and produced random errors.

Michael
(cherry picked from commit 1d594bd734a2f7146ed52872456a16c5e41816f1)
---
 source3/lib/dbwrap_ctdb.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 4e97d26..c0b5fd5 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -840,7 +840,7 @@ again:
 done:
 	h->ctx->transaction = NULL;
 	talloc_free(h);
-	return 0;
+	return ret;
 }
 
 
-- 
1.6.3.3


From 79bdb10b65395bb5cd03a0422a1697e5e90db22c Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Wed, 13 Jan 2010 23:53:54 +0100
Subject: [PATCH 17/32] s3:dbwrap_ctdb: exit early when nothing has been written in transaction_commit.

This skips update of the __db_sequence_number__ record when nothing else has
been written. There are transactions that are just openend and then nothing
is written until transaction_commit is called. This is for instance the case
with registry initialization routines: They start a transaction and only
write somthing when the registry has not been initialized yet.
So this change will skip many db_seqnum bumps and TRANS3_COMMIT roundtrips.

Michael
(cherry picked from commit c311697aded87ce624d40cbf14e05d6e6377c257)
---
 source3/lib/dbwrap_ctdb.c |   14 +++++++++-----
 1 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index c0b5fd5..79c4c0c 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -763,6 +763,15 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 		return 0;
 	}
 
+	if (h->m_write == NULL) {
+		/*
+		 * No changes were made, so don't change the seqnum,
+		 * don't push to other node, just exit with success.
+		 */
+		ret = 0;
+		goto done;
+	}
+
 	DEBUG(5,(__location__ " Commit transaction on db 0x%08x\n", ctx->db_id));
 
 	/*
@@ -790,11 +799,6 @@ static int db_ctdb_transaction_commit(struct db_context *db)
 	}
 
 again:
-	if (h->m_write == NULL) {
-		/* no changes were made, potentially after a retry */
-		goto done;
-	}
-
 	/* tell ctdbd to commit to the other nodes */
 	rets = ctdbd_control_local(messaging_ctdbd_connection(),
 				   CTDB_CONTROL_TRANS3_COMMIT,
-- 
1.6.3.3


From 88c6be332667dda4d322f2a795fbad2df835d33e Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Mon, 18 Jan 2010 17:26:04 +0100
Subject: [PATCH 18/32] s3:dbwrap_ctdb: fix reading/storing of special key __db_sequence_number__

The key for reading and writing was inconsistent due to a
off by one data length.

Michael
(cherry picked from commit 1933214108d1a71bc6473a696ce35020a427d8f4)
---
 source3/lib/dbwrap_ctdb.c |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 79c4c0c..ddc8868 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -688,8 +688,7 @@ static NTSTATUS db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx *db,
 		return NT_STATUS_INVALID_PARAMETER;
 	}
 
-	key.dptr = (uint8_t *)discard_const(keyname);
-	key.dsize = strlen(keyname) + 1;
+	key = string_term_tdb_data(keyname);
 
 	status = db_ctdb_ltdb_fetch(db, key, &header, mem_ctx, &data);
 	if (!NT_STATUS_IS_OK(status) &&
@@ -723,8 +722,7 @@ static NTSTATUS db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle *h,
 	TDB_DATA key;
 	TDB_DATA data;
 
-	key.dptr = (uint8_t *)discard_const(keyname);
-	key.dsize = strlen(keyname);
+	key = string_term_tdb_data(keyname);
 
 	data.dptr = (uint8_t *)&seqnum;
 	data.dsize = sizeof(uint64_t);
-- 
1.6.3.3


From 05afe8843cf656376f61b3f4af887fe4eac3e5ee Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 22 Jan 2010 15:56:28 +0100
Subject: [PATCH 19/32] s3:g_lock: remove an unreached code path.

Michael
(cherry picked from commit 8e306b51b79d3dacd68be9f13aa8455e2eb4c03f)
---
 source3/lib/g_lock.c |    4 ----
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 6508b39..0eae3f2 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -309,10 +309,6 @@ again:
 		goto done;
 	}
 
-	if (retry) {
-		goto again;
-	}
-
 	DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
 
 	if (te == NULL) {
-- 
1.6.3.3


From 88430eacb4203ce02f592db2ab87255119ace70f Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sat, 23 Jan 2010 00:05:15 +0100
Subject: [PATCH 20/32] s3:ctdb_conn: add ctdbd_conn_get_fd() to get the fd out of the ctdb connection

Michael
(cherry picked from commit e4af0bc5af2c3ee025ca7fac251c3672ba2c8dd5)
---
 source3/include/ctdbd_conn.h |    2 ++
 source3/lib/ctdbd_conn.c     |    5 +++++
 2 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/source3/include/ctdbd_conn.h b/source3/include/ctdbd_conn.h
index 71516c7..c5ba572 100644
--- a/source3/include/ctdbd_conn.h
+++ b/source3/include/ctdbd_conn.h
@@ -33,6 +33,8 @@ NTSTATUS ctdbd_register_msg_ctx(struct ctdbd_connection *conn,
 				struct messaging_context *msg_ctx);
 struct messaging_context *ctdb_conn_msg_ctx(struct ctdbd_connection *conn);
 
+int ctdbd_conn_get_fd(struct ctdbd_connection *conn);
+
 NTSTATUS ctdbd_messaging_send(struct ctdbd_connection *conn,
 			      uint32 dst_vnn, uint64 dst_srvid,
 			      struct messaging_rec *msg);
diff --git a/source3/lib/ctdbd_conn.c b/source3/lib/ctdbd_conn.c
index 743594d..6b50009 100644
--- a/source3/lib/ctdbd_conn.c
+++ b/source3/lib/ctdbd_conn.c
@@ -519,6 +519,11 @@ struct messaging_context *ctdb_conn_msg_ctx(struct ctdbd_connection *conn)
 	return conn->msg_ctx;
 }
 
+int ctdbd_conn_get_fd(struct ctdbd_connection *conn)
+{
+	return packet_get_fd(conn->pkt);
+}
+
 /*
  * Packet handler to receive and handle a ctdb message
  */
-- 
1.6.3.3


From e787b6e8c18bc8c60cdbe432cf6c88cc0c6dc413 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sat, 23 Jan 2010 01:17:06 +0100
Subject: [PATCH 21/32] s3:g_lock: remove a nested event loop, replacing the inner loop by select

This made smbd crash in g_lock_lock() when trying to start a
transaction on a db with an already started transaction,
e.g. in a tcon_and_X where the share_info.tdb was not yet
initialized but share_info.tdb was already locked by another
process or writing acces to the winreg rpc pipe where the
registry tdb was already locked by another process.

What we really _want_ to do here by design is to react to
MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
by a client doing g_lock_unlock or by ourselves when
we receive a CTDB_SRVID_SAMBA_NOTIFY or
CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
either a client holding a lock or a complete node
has died.

Doing this properly involves calling tevent_loop_once(),
but doing this here with the main ctdbd messaging context
creates a nested event loop when g_lock_lock() is called
from the main event loop.

So as a quick fix, we act a little corasely here: we do
a select on the ctdb connection fd and when it is readable
or we get EINTR, then we retry without actually parsing
any ctdb packages or dispatching messages. This means that
we retry more often than necessary and intended by design,
but this does not harm and it is unobtrusive. When we have
finished, the main loop will pick up all the messages and
ctdb packets. The only extra twist is that we cannot use
timed events here but have to handcode a timeout for select.

Michael
(cherry picked from commit 83fffbeb44441a87569e543054af21d975eb20ae)
---
 source3/lib/g_lock.c |  139 ++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 101 insertions(+), 38 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 0eae3f2..8a44b22 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -266,7 +266,9 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 	struct tevent_timer *te = NULL;
 	NTSTATUS status;
 	bool retry = false;
-	bool timedout = false;
+	struct timeval timeout_end;
+	struct timeval timeout_remaining;
+	struct timeval time_now;
 
 	DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
 		   name));
@@ -295,52 +297,113 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 			   nt_errstr(status)));
 		return status;
 	}
-again:
-	retry = false;
 
-	status = g_lock_trylock(ctx, name, lock_type);
-	if (NT_STATUS_IS_OK(status)) {
-		DEBUG(10, ("Got lock %s\n", name));
-		goto done;
-	}
-	if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
-		DEBUG(10, ("g_lock_trylock failed: %s\n",
-			   nt_errstr(status)));
-		goto done;
-	}
-
-	DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
-
-	if (te == NULL) {
-		te = tevent_add_timer(
-			ctx->msg->event_ctx, talloc_tos(),
-			timeval_current_ofs(timeout.tv_sec, timeout.tv_usec),
-			g_lock_timedout, &timedout);
-		if (te == NULL) {
-			DEBUG(10, ("tevent_add_timer failed\n"));
-			status = NT_STATUS_NO_MEMORY;
-			goto done;
-		}
-	}
+	time_now = timeval_current();
+	timeout_end = timeval_sum(&time_now, &timeout);
 
 	while (true) {
-		if (tevent_loop_once(ctx->msg->event_ctx) == -1) {
-			DEBUG(1, ("tevent_loop_once failed\n"));
-			status = NT_STATUS_INTERNAL_ERROR;
-			goto done;
+		fd_set _r_fds;
+		fd_set *r_fds = NULL;
+		int max_fd = 0;
+		int ret;
+
+		status = g_lock_trylock(ctx, name, lock_type);
+		if (NT_STATUS_IS_OK(status)) {
+			DEBUG(10, ("Got lock %s\n", name));
+			break;
+		}
+		if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
+			DEBUG(10, ("g_lock_trylock failed: %s\n",
+				   nt_errstr(status)));
+			break;
 		}
-		if (retry) {
-			goto again;
+
+		DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
+
+		/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+		 *             !!! HACK ALERT --- FIX ME !!!
+		 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+		 * What we really want to do here is to react to
+		 * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
+		 * by a client doing g_lock_unlock or by ourselves when
+		 * we receive a CTDB_SRVID_SAMBA_NOTIFY or
+		 * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
+		 * either a client holding a lock or a complete node
+		 * has died.
+		 *
+		 * Doing this properly involves calling tevent_loop_once(),
+		 * but doing this here with the main ctdbd messaging context
+		 * creates a nested event loop when g_lock_lock() is called
+		 * from the main event loop, e.g. in a tcon_and_X where the
+		 * share_info.tdb needs to be initialized and is locked by
+		 * another process, or when the remore registry is accessed
+		 * for writing and some other process already holds a lock
+		 * on the registry.tdb.
+		 *
+		 * So as a quick fix, we act a little corasely here: we do
+		 * a select on the ctdb connection fd and when it is readable
+		 * or we get EINTR, then we retry without actually parsing
+		 * any ctdb packages or dispatching messages. This means that
+		 * we retry more often than intended by design, but this does
+		 * not harm and it is unobtrusive. When we have finished,
+		 * the main loop will pick up all the messages and ctdb
+		 * packets. The only extra twist is that we cannot use timed
+		 * events here but have to handcode a timeout.
+		 */
+
+#ifdef CLUSTER_SUPPORT
+		if (lp_clustering()) {
+			struct ctdbd_connection *conn = messaging_ctdbd_connection();
+
+			r_fds = &_r_fds;
+			FD_ZERO(r_fds);
+			max_fd = ctdbd_conn_get_fd(conn);
+			FD_SET(max_fd, r_fds);
 		}
-		if (timedout) {
-			DEBUG(10, ("g_lock_lock timed out\n"));
+#endif
 
-			te = NULL;
+		time_now = timeval_current();
+		timeout_remaining = timeval_until(&time_now, &timeout_end);
 
-			status = NT_STATUS_LOCK_NOT_GRANTED;
-			goto done;
+		ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
+				 &timeout_remaining);
+
+		if (ret == -1) {
+			if (errno != EINTR) {
+				DEBUG(1, ("error calling select: %s\n",
+					  strerror(errno)));
+				status = NT_STATUS_INTERNAL_ERROR;
+				break;
+			}
+			/*
+			 * errno == EINTR:
+			 * This means a signal was received.
+			 * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
+			 * ==> retry
+			 */
+		} else if (ret == 0) {
+			if (timeval_expired(&timeout_end)) {
+				DEBUG(10, ("g_lock_lock timed out\n"));
+				status = NT_STATUS_LOCK_NOT_GRANTED;
+				break;
+			} else {
+				DEBUG(10, ("select returned 0 but timeout not "
+					   "not expired: strange - retrying\n"));
+			}
+		} else if (ret != 1) {
+			DEBUG(1, ("invalid return code of select: %d\n", ret));
+			status = NT_STATUS_INTERNAL_ERROR;
+			break;
 		}
+		/*
+		 * ret == 1:
+		 * This means ctdbd has sent us some data.
+		 * Might be a CTDB_SRVID_RECONFIGURE or a
+		 * CTDB_SRVID_SAMBA_NOTIFY message.
+		 * ==> retry
+		 */
 	}
+
 done:
 
 	if (!NT_STATUS_IS_OK(status)) {
-- 
1.6.3.3


From bdf4071ddb2db94969a6d1bf0d9e5d29daa60945 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jra@samba.org>
Date: Fri, 12 Feb 2010 22:21:19 -0800
Subject: [PATCH 22/32] Fix warning messages on compile in g_lock.c Volker & Michael please check.

Jeremy.
(cherry picked from commit 10e54fb422d9f1ae6d33e5fabbf8c651b0e57a8c)
---
 source3/lib/g_lock.c |   18 ++++--------------
 1 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 8a44b22..37fb7ce 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -149,10 +149,6 @@ static void g_lock_got_retry(struct messaging_context *msg,
 			     uint32_t msg_type,
 			     struct server_id server_id,
 			     DATA_BLOB *data);
-static void g_lock_timedout(struct tevent_context *ev,
-			    struct tevent_timer *te,
-			    struct timeval current_time,
-			    void *private_data);
 
 static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
 			       enum g_lock_type lock_type)
@@ -302,7 +298,9 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 	timeout_end = timeval_sum(&time_now, &timeout);
 
 	while (true) {
+#ifdef CLUSTER_SUPPORT
 		fd_set _r_fds;
+#endif
 		fd_set *r_fds = NULL;
 		int max_fd = 0;
 		int ret;
@@ -404,7 +402,9 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		 */
 	}
 
+#ifdef CLUSTER_SUPPORT
 done:
+#endif
 
 	if (!NT_STATUS_IS_OK(status)) {
 		NTSTATUS unlock_status;
@@ -437,16 +437,6 @@ static void g_lock_got_retry(struct messaging_context *msg,
 	*pretry = true;
 }
 
-static void g_lock_timedout(struct tevent_context *ev,
-			    struct tevent_timer *te,
-			    struct timeval current_time,
-			    void *private_data)
-{
-	bool *ptimedout = (bool *)private_data;
-	*ptimedout = true;
-	TALLOC_FREE(te);
-}
-
 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 				    struct server_id pid)
 {
-- 
1.6.3.3


From 7732a4128e41ca8379211ad16109e88095d941a0 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Mon, 15 Feb 2010 16:35:06 +0100
Subject: [PATCH 23/32] s3: Fix a typo
 (cherry picked from commit bac235dd302570850bb25194ff4bd39b6d653f0d)

---
 source3/lib/g_lock.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 37fb7ce..26b079d 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -338,7 +338,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		 * for writing and some other process already holds a lock
 		 * on the registry.tdb.
 		 *
-		 * So as a quick fix, we act a little corasely here: we do
+		 * So as a quick fix, we act a little coarsely here: we do
 		 * a select on the ctdb connection fd and when it is readable
 		 * or we get EINTR, then we retry without actually parsing
 		 * any ctdb packages or dispatching messages. This means that
-- 
1.6.3.3


From 80bd6c73d5008682bc8ad148436020ac4a495da2 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Mon, 15 Feb 2010 16:49:46 +0100
Subject: [PATCH 24/32] s3: Fix handling of processes that died in g_lock

g_lock_parse might have thrown away entries from the locks array because the
processes were not around anymore. Don't store the orphaned entries.
(cherry picked from commit f3bdb163f461175c50b4930fa3464beaee30f4a8)
---
 source3/lib/g_lock.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 26b079d..42c0397 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -127,11 +127,12 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
 
 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
 					struct g_lock_rec *locks,
-					int num_locks,
+					int *pnum_locks,
 					const struct server_id pid,
 					enum g_lock_type lock_type)
 {
 	struct g_lock_rec *result;
+	int num_locks = *pnum_locks;
 
 	result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
 				num_locks+1);
@@ -141,6 +142,7 @@ static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
 
 	result[num_locks].pid = pid;
 	result[num_locks].lock_type = lock_type;
+	*pnum_locks += 1;
 	return result;
 }
 
@@ -221,7 +223,7 @@ again:
 	if (our_index == -1) {
 		/* First round, add ourself */
 
-		locks = g_lock_addrec(talloc_tos(), locks, num_locks,
+		locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
 				      self, lock_type);
 		if (locks == NULL) {
 			DEBUG(10, ("g_lock_addrec failed\n"));
@@ -237,7 +239,7 @@ again:
 		locks[our_index].lock_type = lock_type;
 	}
 
-	data = make_tdb_data((uint8_t *)locks, talloc_get_size(locks));
+	data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
 	store_status = rec->store(rec, data, 0);
 	if (!NT_STATUS_IS_OK(store_status)) {
 		DEBUG(1, ("rec->store failed: %s\n",
-- 
1.6.3.3


From 0b6177bb8f683034916084b1fd3da669563a138b Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Mon, 15 Feb 2010 16:57:16 +0100
Subject: [PATCH 25/32] s3: Optimize g_lock_lock for a heavily contended case

Only check the existence of the lock owner in g_lock_parse, check the rest of
the records only when we got the lock successfully. This reduces the load on
process_exists which can involve a network roundtrip in the clustered case.
(cherry picked from commit 07978bd175395e0dc770f68fff5b8bd8b0fdeb51)
---
 source3/lib/g_lock.c |   39 ++++++++++++++++++++++++++++++++++++---
 1 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 42c0397..e262025 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -108,6 +108,34 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
 			      (locks[i].lock_type & G_LOCK_PENDING) ?
 			      "(pending)" : "(owner)"));
 
+		if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
+		    && !process_exists(locks[i].pid)) {
+
+			DEBUGADD(10, ("lock owner %s died -- discarding\n",
+				      procid_str(talloc_tos(),
+						 &locks[i].pid)));
+
+			if (i < (num_locks-1)) {
+				locks[i] = locks[num_locks-1];
+			}
+			num_locks -= 1;
+		}
+	}
+
+	*plocks = locks;
+	*pnum_locks = num_locks;
+	return true;
+}
+
+static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
+{
+	int i, num_locks;
+
+	num_locks = *pnum_locks;
+
+	DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));
+
+	for (i=0; i<num_locks; i++) {
 		if (process_exists(locks[i].pid)) {
 			continue;
 		}
@@ -119,10 +147,8 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
 		}
 		num_locks -= 1;
 	}
-
-	*plocks = locks;
 	*pnum_locks = num_locks;
-	return true;
+	return;
 }
 
 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
@@ -239,6 +265,13 @@ again:
 		locks[our_index].lock_type = lock_type;
 	}
 
+	if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
+		/*
+		 * Walk through the list of locks, search for dead entries
+		 */
+		g_lock_cleanup(&num_locks, locks);
+	}
+
 	data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
 	store_status = rec->store(rec, data, 0);
 	if (!NT_STATUS_IS_OK(store_status)) {
-- 
1.6.3.3


From 1914a31880ac136cfadf9e74cefd6e3caf468cc7 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Tue, 16 Feb 2010 12:22:08 +0100
Subject: [PATCH 26/32] s3: Avoid a thundering herd in g_lock_unlock

Only notify the first 5 pending lock waiters. This avoids a thundering herd
problem that is really nasty in a cluster. It also makes acquiring a lock a bit
more FIFO, lock waiters are added to the end of the array.
(cherry picked from commit 725b3654f831fbe0388cc09f46269903c9eef1d7)
---
 source3/lib/g_lock.c |   17 ++++++++++++++++-
 1 files changed, 16 insertions(+), 1 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index e262025..512c068 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -530,13 +530,23 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 	}
 
 	if ((lock_type & G_LOCK_PENDING) == 0) {
+		int num_wakeups = 0;
+
 		/*
-		 * We've been the lock holder. Tell all others to retry.
+		 * We've been the lock holder. Others to retry. Don't
+		 * tell all others to avoid a thundering herd. In case
+		 * this leads to a complete stall because we miss some
+		 * processes, the loop in g_lock_lock tries at least
+		 * once a minute.
 		 */
+
 		for (i=0; i<num_locks; i++) {
 			if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
 				continue;
 			}
+			if (!process_exists(locks[i].pid)) {
+				continue;
+			}
 
 			/*
 			 * Ping all waiters to retry
@@ -549,6 +559,11 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 					  procid_str(talloc_tos(),
 						     &locks[i].pid),
 					  nt_errstr(status)));
+			} else {
+				num_wakeups += 1;
+			}
+			if (num_wakeups > 5) {
+				break;
 			}
 		}
 	}
-- 
1.6.3.3


From fe5db728b6eb7e647458e0b9921ce7ce29c1a1d7 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Tue, 16 Feb 2010 12:28:53 +0100
Subject: [PATCH 27/32] s3: Avoid starving locks when many processes die at the same time

In g_lock_unlock we have a little race between the process_exists and
messaging_send call: We only send to 5 waiters now, they all might have died
between us checking their existence and sending the message. This change makes
g_lock_lock retry at least once every minute.
(cherry picked from commit be919d6faed198cdc29322a4d9491946c0b044b3)
---
 source3/lib/g_lock.c |   10 ++++------
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 512c068..33ebe94 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -298,7 +298,6 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 	NTSTATUS status;
 	bool retry = false;
 	struct timeval timeout_end;
-	struct timeval timeout_remaining;
 	struct timeval time_now;
 
 	DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
@@ -339,6 +338,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		fd_set *r_fds = NULL;
 		int max_fd = 0;
 		int ret;
+		struct timeval select_timeout;
 
 		status = g_lock_trylock(ctx, name, lock_type);
 		if (NT_STATUS_IS_OK(status)) {
@@ -395,12 +395,10 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		}
 #endif
 
-		time_now = timeval_current();
-		timeout_remaining = timeval_until(&time_now, &timeout_end);
+		select_timeout = timeval_set(60, 0);
 
 		ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
-				 &timeout_remaining);
-
+				 &select_timeout);
 		if (ret == -1) {
 			if (errno != EINTR) {
 				DEBUG(1, ("error calling select: %s\n",
@@ -421,7 +419,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 				break;
 			} else {
 				DEBUG(10, ("select returned 0 but timeout not "
-					   "not expired: strange - retrying\n"));
+					   "not expired, retrying\n"));
 			}
 		} else if (ret != 1) {
 			DEBUG(1, ("invalid return code of select: %d\n", ret));
-- 
1.6.3.3


From 16965425a0aa2d213e7ddadc898b09b2399617de Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Tue, 16 Feb 2010 12:31:58 +0100
Subject: [PATCH 28/32] s3: Slightly increase parallelism in g_lock

There's no need to still hold the g_lock tdb-level lock while telling the
waiters to retry
(cherry picked from commit 83542d973ca771353109c7da4b0391d6ba910f53)
---
 source3/lib/g_lock.c |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 33ebe94..add670c 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -527,6 +527,8 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 		goto done;
 	}
 
+	TALLOC_FREE(rec);
+
 	if ((lock_type & G_LOCK_PENDING) == 0) {
 		int num_wakeups = 0;
 
@@ -566,9 +568,13 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
 		}
 	}
 done:
+	/*
+	 * For the error path, TALLOC_FREE(rec) as well. In the good
+	 * path we have already freed it.
+	 */
+	TALLOC_FREE(rec);
 
 	TALLOC_FREE(locks);
-	TALLOC_FREE(rec);
 	return status;
 }
 
-- 
1.6.3.3


From a6c06b24ba0730b41240d1db652012f628ff1a73 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Tue, 16 Feb 2010 15:21:25 +0100
Subject: [PATCH 29/32] s3: Fix timeout calculation if g_lock_lock is given a timeout < 60s

Detected while showing this code to obnox :-)
(cherry picked from commit f8b246e44c819b909b23b4b98ef0999c84d2f4ff)
---
 source3/lib/g_lock.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index add670c..e4c6d7c 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -338,7 +338,7 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		fd_set *r_fds = NULL;
 		int max_fd = 0;
 		int ret;
-		struct timeval select_timeout;
+		struct timeval timeout_remaining, select_timeout;
 
 		status = g_lock_trylock(ctx, name, lock_type);
 		if (NT_STATUS_IS_OK(status)) {
@@ -395,8 +395,13 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
 		}
 #endif
 
+		time_now = timeval_current();
+		timeout_remaining = timeval_until(&time_now, &timeout_end);
 		select_timeout = timeval_set(60, 0);
 
+		select_timeout = timeval_min(&select_timeout,
+					     &timeout_remaining);
+
 		ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
 				 &select_timeout);
 		if (ret == -1) {
-- 
1.6.3.3


From aee580c4722377e8d1260467ad652937cee2de29 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Fri, 12 Mar 2010 14:22:54 +0100
Subject: [PATCH 30/32] s3: Add "g_lock_do" as a convenience wrapper function
 (cherry picked from commit 79100c242153ea174a4405afd45cbf635da313aa)

---
 source3/include/g_lock.h   |    4 +++
 source3/lib/g_lock.c       |   64 ++++++++++++++++++++++++++++++++++++++++++++
 source3/utils/net_g_lock.c |   49 +++++++++++++++------------------
 3 files changed, 90 insertions(+), 27 deletions(-)

diff --git a/source3/include/g_lock.h b/source3/include/g_lock.h
index c0eed38..b2d2c2d 100644
--- a/source3/include/g_lock.h
+++ b/source3/include/g_lock.h
@@ -43,6 +43,10 @@ NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name);
 NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
 		struct server_id *pid);
 
+NTSTATUS g_lock_do(const char *name, enum g_lock_type lock_type,
+		   struct timeval timeout,
+		   void (*fn)(void *private_data), void *private_data);
+
 int g_lock_locks(struct g_lock_ctx *ctx,
 		 int (*fn)(const char *name, void *private_data),
 		 void *private_data);
diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index e4c6d7c..1726047 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -700,3 +700,67 @@ NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
 	}
 	return NT_STATUS_OK;
 }
+
+static bool g_lock_init_all(TALLOC_CTX *mem_ctx,
+			    struct tevent_context **pev,
+			    struct messaging_context **pmsg,
+			    struct g_lock_ctx **pg_ctx)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+
+	ev = tevent_context_init(mem_ctx);
+	if (ev == NULL) {
+		d_fprintf(stderr, "ERROR: could not init event context\n");
+		goto fail;
+	}
+	msg = messaging_init(mem_ctx, procid_self(), ev);
+	if (msg == NULL) {
+		d_fprintf(stderr, "ERROR: could not init messaging context\n");
+		goto fail;
+	}
+	g_ctx = g_lock_ctx_init(mem_ctx, msg);
+	if (g_ctx == NULL) {
+		d_fprintf(stderr, "ERROR: could not init g_lock context\n");
+		goto fail;
+	}
+
+	*pev = ev;
+	*pmsg = msg;
+	*pg_ctx = g_ctx;
+	return true;
+fail:
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return false;
+}
+
+NTSTATUS g_lock_do(const char *name, enum g_lock_type lock_type,
+		   struct timeval timeout,
+		   void (*fn)(void *private_data), void *private_data)
+{
+	struct tevent_context *ev = NULL;
+	struct messaging_context *msg = NULL;
+	struct g_lock_ctx *g_ctx = NULL;
+	NTSTATUS status;
+
+	if (!g_lock_init_all(talloc_tos(), &ev, &msg, &g_ctx)) {
+		status = NT_STATUS_ACCESS_DENIED;
+		goto done;
+	}
+
+	status = g_lock_lock(g_ctx, name, lock_type, timeout);
+	if (!NT_STATUS_IS_OK(status)) {
+		goto done;
+	}
+	fn(private_data);
+	g_lock_unlock(g_ctx, name);
+
+done:
+	TALLOC_FREE(g_ctx);
+	TALLOC_FREE(msg);
+	TALLOC_FREE(ev);
+	return status;
+}
diff --git a/source3/utils/net_g_lock.c b/source3/utils/net_g_lock.c
index f30ed33..a070510 100644
--- a/source3/utils/net_g_lock.c
+++ b/source3/utils/net_g_lock.c
@@ -57,17 +57,24 @@ fail:
 	return false;
 }
 
+struct net_g_lock_do_state {
+	const char *cmd;
+	int result;
+};
+
+static void net_g_lock_do_fn(void *private_data)
+{
+	struct net_g_lock_do_state *state =
+		(struct net_g_lock_do_state *)private_data;
+	state->result = system(state->cmd);
+}
 
 static int net_g_lock_do(struct net_context *c, int argc, const char **argv)
 {
-	struct tevent_context *ev = NULL;
-	struct messaging_context *msg = NULL;
-	struct g_lock_ctx *g_ctx = NULL;
+	struct net_g_lock_do_state state;
 	const char *name, *cmd;
-	int timeout, res;
-	bool locked = false;
+	int timeout;
 	NTSTATUS status;
-	int ret = -1;
 
 	if (argc != 3) {
 		d_printf("Usage: net g_lock do <lockname> <timeout> "
@@ -78,38 +85,26 @@ static int net_g_lock_do(struct net_context *c, int argc, const char **argv)
 	timeout = atoi(argv[1]);
 	cmd = argv[2];
 
-	if (!net_g_lock_init(talloc_tos(), &ev, &msg, &g_ctx)) {
-		goto done;
-	}
+	state.cmd = cmd;
+	state.result = -1;
 
-	status = g_lock_lock(g_ctx, name, G_LOCK_WRITE,
-			     timeval_set(timeout / 1000, timeout % 1000));
+	status = g_lock_do(name, G_LOCK_WRITE,
+			   timeval_set(timeout / 1000, timeout % 1000),
+			   net_g_lock_do_fn, &state);
 	if (!NT_STATUS_IS_OK(status)) {
-		d_fprintf(stderr, "ERROR: Could not get lock: %s\n",
+		d_fprintf(stderr, "ERROR: g_lock_do failed: %s\n",
 			  nt_errstr(status));
 		goto done;
 	}
-	locked = true;
-
-	res = system(cmd);
-
-	if (res == -1) {
+	if (state.result == -1) {
 		d_fprintf(stderr, "ERROR: system() returned %s\n",
 			  strerror(errno));
 		goto done;
 	}
-	d_fprintf(stderr, "command returned %d\n", res);
-
-	ret = 0;
+	d_fprintf(stderr, "command returned %d\n", state.result);
 
 done:
-	if (locked) {
-		g_lock_unlock(g_ctx, name);
-	}
-	TALLOC_FREE(g_ctx);
-	TALLOC_FREE(msg);
-	TALLOC_FREE(ev);
-	return ret;
+	return state.result;
 }
 
 static int net_g_lock_dump_fn(struct server_id pid, enum g_lock_type lock_type,
-- 
1.6.3.3


From 3b6283cce22113f0f46dbebdc8c58bce4c0f8cb0 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Fri, 5 Mar 2010 15:28:39 +0100
Subject: [PATCH 31/32] s3: db->persistent==true was handled earlier, make this more obvious
 (cherry picked from commit c7835a4845bbc7e4d340a75229866b2d4946f6eb)

---
 source3/lib/dbwrap_ctdb.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index ddc8868..05ac777 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -1050,7 +1050,7 @@ static struct db_record *db_ctdb_fetch_locked(struct db_context *db,
 		return db_ctdb_fetch_locked_persistent(ctx, mem_ctx, key);
 	}
 
-	return fetch_locked_internal(ctx, mem_ctx, key, db->persistent);
+	return fetch_locked_internal(ctx, mem_ctx, key, false);
 }
 
 /*
-- 
1.6.3.3


From 7b200cd8809c22aa1978fa8919b002a26053c483 Mon Sep 17 00:00:00 2001
From: Volker Lendecke <vl@samba.org>
Date: Fri, 5 Mar 2010 15:30:22 +0100
Subject: [PATCH 32/32] s3: Remove the unused parameter "persistent" from fetch_locked_internal
 (cherry picked from commit a5db27936e9c6aad99300ea46808481803f57e08)

---
 source3/lib/dbwrap_ctdb.c |   10 ++--------
 1 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/source3/lib/dbwrap_ctdb.c b/source3/lib/dbwrap_ctdb.c
index 05ac777..938a312 100644
--- a/source3/lib/dbwrap_ctdb.c
+++ b/source3/lib/dbwrap_ctdb.c
@@ -52,11 +52,6 @@ struct db_ctdb_rec {
 	struct ctdb_ltdb_header header;
 };
 
-static struct db_record *fetch_locked_internal(struct db_ctdb_ctx *ctx,
-					       TALLOC_CTX *mem_ctx,
-					       TDB_DATA key,
-					       bool persistent);
-
 static NTSTATUS tdb_error_to_ntstatus(struct tdb_context *tdb)
 {
 	NTSTATUS status;
@@ -921,8 +916,7 @@ static int db_ctdb_record_destr(struct db_record* data)
 
 static struct db_record *fetch_locked_internal(struct db_ctdb_ctx *ctx,
 					       TALLOC_CTX *mem_ctx,
-					       TDB_DATA key,
-					       bool persistent)
+					       TDB_DATA key)
 {
 	struct db_record *result;
 	struct db_ctdb_rec *crec;
@@ -1050,7 +1044,7 @@ static struct db_record *db_ctdb_fetch_locked(struct db_context *db,
 		return db_ctdb_fetch_locked_persistent(ctx, mem_ctx, key);
 	}
 
-	return fetch_locked_internal(ctx, mem_ctx, key, false);
+	return fetch_locked_internal(ctx, mem_ctx, key);
 }
 
 /*
-- 
1.6.3.3