From 6c0c1d20e651ec4366e606dab81535b3889a6c13 Mon Sep 17 00:00:00 2001 From: Andrew Bartlett Date: Thu, 9 Mar 2023 17:02:35 +1300 Subject: [PATCH 1/2] selftest/drs: Demonstrate ERROR(ldb): uncaught exception - Deleted target CN=NTDS Settings... in join "samba-tool domain join" uses the replication API in a strange way, perhaps no longer required, except that we often still have folks upgrading from very old Samba versions. By deferring the writing out to the DB of link replication to the very end, we have a better chance that all the objects required are present, however the situation may have changed during the cycle, and a link could still be sent, pointing to a deleted object. We currently fail in this situation. BUG: https://bugzilla.samba.org/show_bug.cgi?id=15329 Signed-off-by: Andrew Bartlett Reviewed-by: Joseph Sutton (cherry picked from commit 2d41bcce83a976b85636c92d6fc38c63fdde5431) --- .../knownfail.d/replicate_against_deleted | 1 + source4/dsdb/pydsdb.c | 2 + source4/dsdb/samdb/samdb.h | 2 + source4/torture/drs/python/ridalloc_exop.py | 135 ++++++++++++++++++ 4 files changed, 140 insertions(+) create mode 100644 selftest/knownfail.d/replicate_against_deleted diff --git a/selftest/knownfail.d/replicate_against_deleted b/selftest/knownfail.d/replicate_against_deleted new file mode 100644 index 00000000000..9caa5346a45 --- /dev/null +++ b/selftest/knownfail.d/replicate_against_deleted @@ -0,0 +1 @@ +samba4.drs.ridalloc_exop.python\(.*\).ridalloc_exop.DrsReplicaSyncTestCase.test_replicate_against_deleted_objects_transaction diff --git a/source4/dsdb/pydsdb.c b/source4/dsdb/pydsdb.c index 804007e9e86..fd7cda21a43 100644 --- a/source4/dsdb/pydsdb.c +++ b/source4/dsdb/pydsdb.c @@ -1768,5 +1768,7 @@ MODULE_INIT_FUNC(dsdb) ADD_DSDB_STRING(DS_GUID_SCHEMA_CLASS_MANAGED_SERVICE_ACCOUNT); ADD_DSDB_STRING(DS_GUID_SCHEMA_CLASS_USER); + ADD_DSDB_STRING(DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME); + return m; } diff --git a/source4/dsdb/samdb/samdb.h b/source4/dsdb/samdb/samdb.h index f2f738121f9..1121801941f 100644 --- a/source4/dsdb/samdb/samdb.h +++ b/source4/dsdb/samdb/samdb.h @@ -370,6 +370,8 @@ struct dsdb_extended_dn_store_format { #define DSDB_OPAQUE_PARTITION_MODULE_MSG_OPAQUE_NAME "DSDB_OPAQUE_PARTITION_MODULE_MSG" +#define DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME "DSDB_FULL_JOIN_REPLICATION_COMPLETED" + #define DSDB_ACL_CHECKS_DIRSYNC_FLAG 0x1 #define DSDB_SAMDB_MINIMUM_ALLOWED_RID 1000 diff --git a/source4/torture/drs/python/ridalloc_exop.py b/source4/torture/drs/python/ridalloc_exop.py index 7869967cf1d..8ce4098bc06 100644 --- a/source4/torture/drs/python/ridalloc_exop.py +++ b/source4/torture/drs/python/ridalloc_exop.py @@ -43,6 +43,7 @@ from samba.auth import system_session, admin_session from samba.dbchecker import dbcheck from samba.ndr import ndr_pack from samba.dcerpc import security +from samba import drs_utils, dsdb class DrsReplicaSyncTestCase(drs_base.DrsBaseTestCase): @@ -673,3 +674,137 @@ class DrsReplicaSyncTestCase(drs_base.DrsBaseTestCase): finally: self._test_force_demote(fsmo_owner['dns_name'], "RIDALLOCTEST7") shutil.rmtree(targetdir, ignore_errors=True) + + def test_replicate_against_deleted_objects_transaction(self): + """Not related to RID allocation, but uses the infrastructure here. + Do a join, create a link between two objects remotely, but + remove the target locally. Show that we need to set a magic + opaque if there is an outer transaction. + + """ + fsmo_dn = ldb.Dn(self.ldb_dc1, "CN=RID Manager$,CN=System," + self.ldb_dc1.domain_dn()) + (fsmo_owner, fsmo_not_owner) = self._determine_fSMORoleOwner(fsmo_dn) + + test_user4 = "ridalloctestuser4" + test_group = "ridalloctestgroup1" + + self.ldb_dc1.newuser(test_user4, "P@ssword!") + + self.addCleanup(self.ldb_dc1.deleteuser, test_user4) + + self.ldb_dc1.newgroup(test_group) + self.addCleanup(self.ldb_dc1.deletegroup, test_group) + + targetdir = self._test_join(self.dnsname_dc1, "RIDALLOCTEST8") + try: + # Connect to the database + ldb_url = "tdb://%s" % os.path.join(targetdir, "private/sam.ldb") + lp = self.get_loadparm() + + new_ldb = SamDB(ldb_url, + session_info=system_session(lp), lp=lp) + + destination_dsa_guid = misc.GUID(new_ldb.get_ntds_GUID()) + + repl = drs_utils.drs_Replicate(f'ncacn_ip_tcp:{self.dnsname_dc1}[seal]', + lp, + self.get_credentials(), + new_ldb, + destination_dsa_guid) + + source_dsa_invocation_id = misc.GUID(self.ldb_dc1.invocation_id) + + # Add the link on the remote DC + self.ldb_dc1.add_remove_group_members(test_group, [test_user4]) + + # Starting a transaction overrides, currently the logic + # inside repl.replicatate to retry with GET_TGT which in + # turn tells the repl_meta_data module that the most up to + # date info is already available + new_ldb.transaction_start() + repl.replicate(self.ldb_dc1.domain_dn(), + source_dsa_invocation_id, + destination_dsa_guid) + + # Delete the user locally, before applying the links. + # This simulates getting the delete in the replciation + # stream. + new_ldb.deleteuser(test_user4) + + # This fails as the user has been deleted locally but a remote link is sent + self.assertRaises(ldb.LdbError, new_ldb.transaction_commit) + + new_ldb.transaction_start() + repl.replicate(self.ldb_dc1.domain_dn(), + source_dsa_invocation_id, + destination_dsa_guid) + + # Delete the user locally (the previous transaction + # doesn't apply), before applying the links. This + # simulates getting the delete in the replciation stream. + new_ldb.deleteuser(test_user4) + + new_ldb.set_opaque_integer(dsdb.DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME, + 1) + + # This should now work + try: + new_ldb.transaction_commit() + except ldb.LdbError as e: + self.fail(f"Failed to replicate despite setting opaque with {e.args[1]}") + + finally: + self._test_force_demote(self.dnsname_dc1, "RIDALLOCTEST8") + shutil.rmtree(targetdir, ignore_errors=True) + + def test_replicate_against_deleted_objects_normal(self): + """Not related to RID allocation, but uses the infrastructure here. + Do a join, create a link between two objects remotely, but + remove the target locally. . + + """ + fsmo_dn = ldb.Dn(self.ldb_dc1, "CN=RID Manager$,CN=System," + self.ldb_dc1.domain_dn()) + (fsmo_owner, fsmo_not_owner) = self._determine_fSMORoleOwner(fsmo_dn) + + test_user5 = "ridalloctestuser5" + test_group2 = "ridalloctestgroup2" + + self.ldb_dc1.newuser(test_user5, "P@ssword!") + self.addCleanup(self.ldb_dc1.deleteuser, test_user5) + + self.ldb_dc1.newgroup(test_group2) + self.addCleanup(self.ldb_dc1.deletegroup, test_group2) + + targetdir = self._test_join(self.dnsname_dc1, "RIDALLOCTEST9") + try: + # Connect to the database + ldb_url = "tdb://%s" % os.path.join(targetdir, "private/sam.ldb") + lp = self.get_loadparm() + + new_ldb = SamDB(ldb_url, + session_info=system_session(lp), lp=lp) + + destination_dsa_guid = misc.GUID(new_ldb.get_ntds_GUID()) + + repl = drs_utils.drs_Replicate(f'ncacn_ip_tcp:{self.dnsname_dc1}[seal]', + lp, + self.get_credentials(), + new_ldb, + destination_dsa_guid) + + source_dsa_invocation_id = misc.GUID(self.ldb_dc1.invocation_id) + + # Add the link on the remote DC + self.ldb_dc1.add_remove_group_members(test_group2, [test_user5]) + + # Delete the user locally + new_ldb.deleteuser(test_user5) + + # Confirm replication copes with a link to a locally deleted user + repl.replicate(self.ldb_dc1.domain_dn(), + source_dsa_invocation_id, + destination_dsa_guid) + + finally: + self._test_force_demote(self.dnsname_dc1, "RIDALLOCTEST9") + shutil.rmtree(targetdir, ignore_errors=True) -- 2.25.1 From 0bfd3d7dddb629c03e80602f1bd561c4f1fa45da Mon Sep 17 00:00:00 2001 From: Andrew Bartlett Date: Thu, 9 Mar 2023 20:25:06 +1300 Subject: [PATCH 2/2] dsdb: Avoid ERROR(ldb): uncaught exception - Deleted target CN=NTDS Settings... in join "samba-tool domain join" uses the replication API in a strange way, perhaps no longer required, except that we often still have folks upgrading from very old Samba versions. When deferring the writing out to the DB of link replication to the very end, there is a greater opportunity for the deletion of an object to have been sent with the other objects, and have the link applied later. This tells the repl_meta_data code to behave as if GET_TGT had been sent at the time the link was returned, allowing a link to a deleted object to be silently discarded. BUG: https://bugzilla.samba.org/show_bug.cgi?id=15329 Signed-off-by: Andrew Bartlett Reviewed-by: Joseph Sutton (cherry picked from commit bfc33b47bb428233e100f75e7a725ac52179f823) --- python/samba/join.py | 19 +++++++++++++++++++ .../knownfail.d/replicate_against_deleted | 1 - .../dsdb/samdb/ldb_modules/repl_meta_data.c | 13 ++++++++++++- 3 files changed, 31 insertions(+), 2 deletions(-) delete mode 100644 selftest/knownfail.d/replicate_against_deleted diff --git a/python/samba/join.py b/python/samba/join.py index 70b3c9729b0..6c1ab3be7b4 100644 --- a/python/samba/join.py +++ b/python/samba/join.py @@ -50,6 +50,7 @@ import tempfile from collections import OrderedDict from samba.common import get_string from samba.netcmd import CommandError +from samba import dsdb class DCJoinException(Exception): @@ -937,6 +938,10 @@ class DCJoinContext(object): """Replicate the SAM.""" ctx.logger.info("Starting replication") + + # A global transaction is started so that linked attributes + # are applied at the very end, once all partitions are + # replicated. This helps get all cross-partition links. ctx.local_samdb.transaction_start() try: source_dsa_invocation_id = misc.GUID(ctx.samdb.get_invocation_id()) @@ -1057,7 +1062,21 @@ class DCJoinContext(object): ctx.local_samdb.transaction_cancel() raise else: + + # This is a special case, we have completed a full + # replication so if a link comes to us that points to a + # deleted object, and we asked for all objects already, we + # just have to ignore it, the chance to re-try the + # replication with GET_TGT has long gone. This can happen + # if the object is deleted and sent to us after the link + # was sent, as we are processing all links in the + # transaction_commit(). + if not ctx.domain_replica_flags & drsuapi.DRSUAPI_DRS_CRITICAL_ONLY: + ctx.local_samdb.set_opaque_integer(dsdb.DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME, + 1) ctx.local_samdb.transaction_commit() + ctx.local_samdb.set_opaque_integer(dsdb.DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME, + 0) ctx.logger.info("Committed SAM database") # A large replication may have caused our LDB connection to the diff --git a/selftest/knownfail.d/replicate_against_deleted b/selftest/knownfail.d/replicate_against_deleted deleted file mode 100644 index 9caa5346a45..00000000000 --- a/selftest/knownfail.d/replicate_against_deleted +++ /dev/null @@ -1 +0,0 @@ -samba4.drs.ridalloc_exop.python\(.*\).ridalloc_exop.DrsReplicaSyncTestCase.test_replicate_against_deleted_objects_transaction diff --git a/source4/dsdb/samdb/ldb_modules/repl_meta_data.c b/source4/dsdb/samdb/ldb_modules/repl_meta_data.c index c1ea5ad90f8..175a02d3ba7 100644 --- a/source4/dsdb/samdb/ldb_modules/repl_meta_data.c +++ b/source4/dsdb/samdb/ldb_modules/repl_meta_data.c @@ -7533,6 +7533,16 @@ static int replmd_allow_missing_target(struct ldb_module *module, source_dn, target_dn); if (is_in_same_nc) { + /* + * We allow the join.py code to point out that all + * replication is completed, so failing now would just + * trigger errors, rather than trigger a GET_TGT + */ + int *finished_full_join_ptr = + talloc_get_type(ldb_get_opaque(ldb, + DSDB_FULL_JOIN_REPLICATION_COMPLETED_OPAQUE_NAME), + int); + bool finished_full_join = finished_full_join_ptr && *finished_full_join_ptr; /* * if the target is already be up-to-date there's no point in @@ -7540,7 +7550,8 @@ static int replmd_allow_missing_target(struct ldb_module *module, * on a one-way link was deleted. We ignore the link rather * than failing the replication cycle completely */ - if (dsdb_repl_flags & DSDB_REPL_FLAG_TARGETS_UPTODATE) { + if (finished_full_join + || dsdb_repl_flags & DSDB_REPL_FLAG_TARGETS_UPTODATE) { *ignore_link = true; DBG_WARNING("%s is %s " "but up to date. Ignoring link from %s\n", -- 2.25.1