Thread 1 (Thread 0x7fa156c7e900 (LWP 28123)): #0 0x00007fa146969387 in raise () from /lib64/libc.so.6 No symbol table info available. #1 0x00007fa14696aa78 in abort () from /lib64/libc.so.6 No symbol table info available. #2 0x00007fa155e71b78 in dump_core () at ../../source3/lib/dumpcore.c:338 called = true __FUNCTION__ = "dump_core" #3 0x00007fa155e56e18 in smb_panic_s3 (why=<optimized out>) at ../../source3/lib/util.c:847 cmd = <optimized out> result = <optimized out> __FUNCTION__ = "smb_panic_s3" #4 0x00007fa15665c6cd in smb_panic (why=0x7fa147e3fb28 "Bad talloc magic value - access after free") at ../../lib/util/fault.c:170 No locals. #5 0x00007fa147e368af in talloc_chunk_from_ptr.part.4 () from /lib64/libtalloc.so.2 No symbol table info available. #6 0x00007fa147e3763c in _tc_free_internal () from /lib64/libtalloc.so.2 No symbol table info available. #7 0x00007fa1561fb426 in open_directory (conn=conn@entry=0x5614e88f71a0, req=req@entry=0x5614e8924b00, smb_dname=smb_dname@entry=0x5614e89225d0, access_mask=access_mask@entry=1048705, share_access=share_access@entry=7, create_disposition=create_disposition@entry=1, create_options=create_options@entry=1, file_attributes=16, file_attributes@entry=0, pinfo=0x7ffde8dc8ee8, result=0x7ffde8dc8f00) at ../../source3/smbd/open.c:4313 fsp = 0x5614e88f1b70 dir_existed = <optimized out> lck = 0x5614e892aa70 status = <optimized out> mtimespec = {tv_sec = 0, tv_nsec = 0} info = <optimized out> ok = <optimized out> __FUNCTION__ = "open_directory" #8 0x00007fa1561ff2b3 in create_file_unixpath (conn=conn@entry=0x5614e88f71a0, req=req@entry=0x5614e8924b00, smb_fname=smb_fname@entry=0x5614e89225d0, access_mask=access_mask@entry=1048705, share_access=share_access@entry=7, create_disposition=create_disposition@entry=1, create_options=create_options@entry=1, file_attributes=file_attributes@entry=0, oplock_request=oplock_request@entry=0, lease=lease@entry=0x0, allocation_size=allocation_size@entry=0, private_flags=private_flags@entry=0, sd=sd@entry=0x0, ea_list=ea_list@entry=0x0, result=result@entry=0x7ffde8dc9038, pinfo=pinfo@entry=0x7ffde8dc9034) at ../../source3/smbd/open.c:5331 info = 1 base_fsp = 0x0 fsp = 0x0 status = <optimized out> __func__ = "create_file_unixpath" __FUNCTION__ = "create_file_unixpath" #9 0x00007fa1562012f9 in create_file_default (conn=0x5614e88f71a0, req=0x5614e8924b00, root_dir_fid=<optimized out>, smb_fname=0x5614e89225d0, access_mask=1048705, share_access=7, create_disposition=create_disposition@entry=1, create_options=create_options@entry=1, file_attributes=file_attributes@entry=0, oplock_request=oplock_request@entry=0, lease=lease@entry=0x0, allocation_size=allocation_size@entry=0, private_flags=private_flags@entry=0, sd=sd@entry=0x0, ea_list=ea_list@entry=0x0, result=result@entry=0x5614e892b2a8, pinfo=pinfo@entry=0x5614e892b2bc, in_context_blobs=in_context_blobs@entry=0x7ffde8dc9280, out_context_blobs=out_context_blobs@entry=0x5614e8922e90) at ../../source3/smbd/open.c:5701 info = 1 fsp = 0x0 status = <optimized out> stream_name = <optimized out> __func__ = "create_file_default" __FUNCTION__ = "create_file_default" #10 0x00007fa15616595e in vfswrap_create_file (handle=<optimized out>, req=<optimized out>, root_dir_fid=<optimized out>, smb_fname=<optimized out>, access_mask=<optimized out>, share_access=<optimized out>, create_disposition=1, create_options=1, file_attributes=0, oplock_request=0, lease=0x0, allocation_size=0, private_flags=0, sd=0x0, ea_list=0x0, result=0x5614e892b2a8, pinfo=0x5614e892b2bc, in_context_blobs=0x7ffde8dc9280, out_context_blobs=0x5614e8922e90) at ../../source3/modules/vfs_default.c:582 No locals. #11 0x00007fa156208458 in smb_vfs_call_create_file (handle=<optimized out>, req=req@entry=0x5614e8924b00, root_dir_fid=root_dir_fid@entry=0, smb_fname=<optimized out>, access_mask=access_mask@entry=1048705, share_access=share_access@entry=7, create_disposition=1, create_options=create_options@entry=1, file_attributes=file_attributes@entry=0, oplock_request=0, lease=lease@entry=0x0, allocation_size=allocation_size@entry=0, private_flags=private_flags@entry=0, sd=sd@entry=0x0, ea_list=ea_list@entry=0x0, result=result@entry=0x5614e892b2a8, pinfo=pinfo@entry=0x5614e892b2bc, in_context_blobs=in_context_blobs@entry=0x7ffde8dc9280, out_context_blobs=out_context_blobs@entry=0x5614e8922e90) at ../../source3/smbd/vfs.c:1627 No locals. #12 0x00007fa156238f7b in smbd_smb2_create_send (in_context_blobs=..., in_name=<optimized out>, in_create_options=1, in_create_disposition=<optimized out>, in_share_access=7, in_file_attributes=0, in_desired_access=1048705, in_impersonation_level=2, in_oplock_level=<optimized out>, smb2req=0x5614e8916de0, ev=<optimized out>, mem_ctx=0x5614e8916de0) at ../../source3/smbd/smb2_create.c:938 state = 0x5614e892b250 status = <optimized out> smb1req = 0x5614e8924b00 req = 0x5614e892b0a0 smb_fname = 0x5614e89225d0 ucf_flags = <optimized out> #13 smbd_smb2_request_process_create (smb2req=smb2req@entry=0x5614e8916de0) at ../../source3/smbd/smb2_create.c:230 inbody = <optimized out> indyniov = <optimized out> in_oplock_level = <optimized out> in_impersonation_level = 2 in_desired_access = 1048705 in_file_attributes = 0 in_share_access = 7 in_create_disposition = <optimized out> in_create_options = 33 in_name_offset = <optimized out> in_name_length = <optimized out> in_name_buffer = <optimized out> in_name_string = 0x5614e89171d0 "" in_name_string_size = 0 name_offset = <optimized out> name_available_length = <optimized out> in_context_offset = <optimized out> in_context_length = <optimized out> in_context_blobs = {num_blobs = 2, blobs = 0x5614e8922400} context_offset = <optimized out> context_available_length = <optimized out> dyn_offset = <optimized out> status = <optimized out> ok = <optimized out> #14 0x00007fa15622f27b in smbd_smb2_request_dispatch (req=req@entry=0x5614e8916de0) at ../../source3/smbd/smb2_server.c:2675 xconn = <optimized out> call = <optimized out> intf_v = <optimized out> inhdr = <optimized out> opcode = <optimized out> flags = <optimized out> mid = <optimized out> status = <optimized out> session_status = <optimized out> allowed_flags = <optimized out> return_value = <optimized out> x = <optimized out> signing_required = <optimized out> encryption_desired = <optimized out> encryption_required = <optimized out> __FUNCTION__ = "smbd_smb2_request_dispatch" #15 0x00007fa156230c45 in smbd_smb2_io_handler (fde_flags=<optimized out>, xconn=0x5614e8919df0) at ../../source3/smbd/smb2_server.c:3979 err = <optimized out> sconn = 0x5614e8919540 state = 0x5614e8919ea8 req = 0x5614e8916de0 min_recvfile_size = 4294967295 ret = <optimized out> retry = false status = <optimized out> now = <optimized out> #16 smbd_smb2_connection_handler (ev=<optimized out>, fde=<optimized out>, flags=<optimized out>, private_data=<optimized out>) at ../../source3/smbd/smb2_server.c:4017 xconn = 0x5614e8919df0 status = <optimized out>
Andreas, what *exact* version of the code is this ? I need this to investigate for the line numbers to mean anything :-). There's been a lot of code changes in master in open.c. Is this reproducible ? Have you tried valgrinding it so we know where the initial free occurs ?
The backtrace is from Samba 4.10.4. I don't have a reproducer for this which is the big issue. My next step would be to check smbtorture tests which work with share mode locks and build smbd with AddressSanitizer.
(In reply to Andreas Schneider from comment #2) we have a very similar report, same-ish version 4.10 #0 0x00007fed7ce862a7 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:55 #1 0x00007fed7ce8767a in __GI_abort () at abort.c:78 #2 0x00007fed7f9e6ba9 in dump_core () at ../../source3/lib/dumpcore.c:338 #3 0x00007fed7f9c0138 in smb_panic_s3 (why=<optimized out>) at ../../source3/lib/util.c:847 #4 0x00007fed80f0071d in smb_panic (why=0x7fed7d83ac98 "Bad talloc magic value - access after free") at ../../lib/util/fault.c:174 #5 0x00007fed7d831bdf in talloc_abort_access_after_free () at ../../talloc.c:508 #6 talloc_chunk_from_ptr (ptr=ptr@entry=0x560ccf3bdac0) at ../../talloc.c:525 #7 0x00007fed7d83247c in talloc_chunk_from_ptr (ptr=0x560ccf3bdac0) at /usr/include/bits/string3.h:90 #8 talloc_parent_chunk (ptr=0x560ccf3bdac0) at ../../talloc.c:574 #9 _tc_free_children_internal (location=0x7fed80bec030 "../../source3/smbd/open.c:4312", ptr=0x560ccf3be180, tc=0x560ccf3be120) at ../../talloc.c:1647 #10 _tc_free_internal (location=0x7fed80bec030 "../../source3/smbd/open.c:4312", tc=0x560ccf3be120) at ../../talloc.c:1163 #11 _tc_free_children_internal (location=0x7fed80bec030 "../../source3/smbd/open.c:4312", ptr=0x560ccf3bd8b0, tc=0x560ccf3bd850) at ../../talloc.c:1646 #12 _tc_free_internal (tc=0x560ccf3bd850, location=0x7fed80bec030 "../../source3/smbd/open.c:4312") at ../../talloc.c:1163 #13 0x00007fed80a8da36 in open_directory (conn=conn@entry=0x560ccf3c1570, req=req@entry=0x560ccf3c1320, smb_dname=smb_dname@entry=0x560ccf38c4d0, access_mask=access_mask@entry=128, share_access=share_access@entry=7, create_disposition=create_disposition@entry=1, create_options=create_options@entry=0, file_attributes=16, file_attributes@entry=0, pinfo=0x7ffc00231468, result=0x7ffc00231480) at ../../source3/smbd/open.c:4312 (gdb) print *lck->data $1 = {sequence_number = 8801807969726352176, servicepath = 0x560ccf3be050 "/home/data/austausch_test_prod", base_name = 0x560ccf3be110 ".", stream_name = 0x0, num_share_modes = 145259, share_modes = 0x7fed66096070, num_leases = 0, leases = 0x560ccf3bde90, num_delete_tokens = 0, delete_tokens = 0x560ccf3bdef0, old_write_time = {tv_sec = 0, tv_nsec = 0}, changed_write_time = {tv_sec = 0, tv_nsec = 0}, fresh = 0 '\000', modified = 0 '\000', record = 0x0, id = {devid = 65030, inode = 2719762, extid = 0}} and at frame 10 (gdb) down #10 _tc_free_internal (location=0x7fed80bec030 "../../source3/smbd/open.c:4312", tc=0x560ccf3be120) at ../../talloc.c:1163 1163 _tc_free_children_internal(tc, ptr, location); (gdb) print tc $10 = (struct talloc_chunk *) 0x560ccf3be120 (gdb) print *tc $11 = {flags = 3236755050, next = 0x0, prev = 0x0, parent = 0x560ccf3bd850, child = 0x0, refs = 0x0, destructor = 0x0, name = 0x7fed80c42b17 "struct share_mode_lock", size = 8, limit = 0x0, pool = 0x560ccf3bc1b0} the real ptr is tc + TALLOC_HDR_SIZE (gdb) print (char*)tc + 96 $15 = 0x560ccf3be180 (according to tc above should be a share_mode_lock) (gdb) print (struct share_mode_lock*)((char*)tc + 96) $16 = (struct share_mode_lock *) 0x560ccf3be180 (gdb) print *(struct share_mode_lock*)((char*)tc + 96) $17 = {data = 0x560ccf3bdac0} (the_lock ???) (gdb) down #9 _tc_free_children_internal (location=0x7fed80bec030 "../../source3/smbd/open.c:4312", ptr=0x560ccf3be180, tc=0x560ccf3be120) at ../../talloc.c:1647 gdb) down #8 talloc_parent_chunk (ptr=0x560ccf3bdac0) at ../../talloc.c:574 so it is at the point where we try to get the talloc chunk associated with (0x560ccf3bdac0) the access after free is triggered, so it seems 0x560ccf3bdac0 (or lck->data) has been freed from the memory cache (as we can see when we examine the talloc_chunk associated with 0x560ccf3bdac0) (gdb) print *(struct talloc_chunk*)((char*)ptr - 96) $19 = {flags = 3927309435, next = 0x0, prev = 0x0, parent = 0x0, child = 0x0, refs = 0x0, destructor = 0x0, name = 0x7fed80f52878 "../../lib/util/memcache.c:218", size = 152, limit = 0x0, pool = 0x560ccf3bc1b0} so... the only way I could think of for this to happen is if that item had been evicted (and freed) from the memcache if we look back above at the print *lck->data from frame 13 we can see lck->data->num_share_modes = 145259 some additional cores also show the same num share modes. This number of share modes will exceed the default size of the memcache and force the cache to evict entries. Note: In this case there is only one entry (but a very very big one) after the error print *smbd_memcache_ctx $2 = {mru = 0x0, tree = {rb_node = 0x0}, size = 0, max_size = 524288} and memcache trim only removes lru entries (e.g those at the tail of mru) until the size limits are ok so the fact mru & tree are both NULL would seem to indicate there was only one entry. Like Andreas we don't have a reproducer (or even logs with elevated debug, have asked for these, hopefully we get them at some point) I suspect some application is doing something crazy with share mode locks Anyway I wrote a small program to test the theory, I'll attach it and results here (the core was very simiar)
Created attachment 16418 [details] test code to test theory
./bin/cache-test setting memcache size to 524288 main: ## iter 0 testmemfoo: #1 created lock testmemfoo: #1 allocating just one share_mode lock testmemfoo: #1 free-ing lock share_mode_data_destructor: about to unparse share_mode_memcache_delete: deleting entry for file . seq 2a134251882c3a6e key 1:2:0 share_mode_data_destructor: about to store cache entry share_mode_memcache_store: stored entry for file . seq 2a134251882c3a6f key 1:2:0 testmemfoo: #1 freed lock share_mode_memcache_fetch: deleted entry for file . seq 2a134251882c3a6f key 1:2:0 share_mode_memcache_fetch: fetched entry for file . seq 2a134251882c3a6f key 1:2:0 testmemfoo: #2 created lock testmemfoo: #2 allocating many many (10000) share_mode locks testmemfoo: #2 free-ing lock share_mode_data_destructor: about to unparse share_mode_memcache_delete: deleting entry for file . seq 2a134251882c3a6f key 1:2:0 share_mode_data_destructor: about to store cache entry share_mode_memcache_store: stored entry for file . seq 2a134251882c3a70 key 1:2:0 memcache_trim: evicting a cache item Bad talloc magic value - unknown value Aborted (core dumped)
share_mode_data_destructor() doesn't exist in 4.11.0 and above. This *may* be purely a 4.10.x problem.
(In reply to Noel Power from comment #3) Great analysis! On my system the test program even already crashes at step #1 in testmemfoo(): $ sudo ./bin/cache-test -d 0 [sudo] password for slow: setting memcache size to 524288 main: ## iter 0 failed to find entry for key 1:2:0 testmemfoo: #1 created lock testmemfoo: #1 allocating just one share_mode lock testmemfoo: #1 free-ing lock share_mode_data_destructor: about to unparse share_mode_memcache_delete: deleting entry for file . seq 3133868bc5207a5d key 1:2:0 share_mode_data_destructor: about to store cache entry share_mode_memcache_store: stored entry for file . seq 3133868bc5207a5e key 1:2:0 memcache_trim: evicting a cache item Bad talloc magic value - unknown value Aborted
setting memcache size to 524288 main: ## iter 0 testmemfoo: #1 created lock testmemfoo: #1 allocating just one share_mode lock testmemfoo: #1 free-ing lock share_mode_data_destructor: about to unparse share_mode_memcache_delete: deleting entry for file . seq fa5223454cbde6c5 key 1:2:0 share_mode_data_destructor: about to store cache entry share_mode_memcache_store: stored entry for file . seq fa5223454cbde6c6 key 1:2:0 testmemfoo: #1 freed lock share_mode_memcache_fetch: deleted entry for file . seq fa5223454cbde6c6 key 1:2:0 share_mode_memcache_fetch: fetched entry for file . seq fa5223454cbde6c6 key 1:2:0 testmemfoo: #2 created lock testmemfoo: #2 allocating many many (10000) share_mode locks testmemfoo: #2 free-ing lock share_mode_data_destructor: about to unparse share_mode_memcache_delete: deleting entry for file . seq fa5223454cbde6c6 key 1:2:0 share_mode_data_destructor: about to store cache entry share_mode_memcache_store: stored entry for file . seq fa5223454cbde6c7 key 1:2:0 memcache_trim: evicting a cache item ================================================================= ==871494==ERROR: AddressSanitizer: heap-use-after-free on address 0x611000002eb0 at pc 0x7f134c1bc400 bp 0x7ffc66960290 sp 0x7ffc66960288 READ of size 8 at 0x611000002eb0 thread T0 #0 0x7f134c1bc3ff in _tc_free_internal ../../lib/talloc/talloc.c:1142 #1 0x7f134c1bc0d5 in _tc_free_children_internal ../../lib/talloc/talloc.c:1646 #2 0x7f134c1bc0d5 in _tc_free_internal ../../lib/talloc/talloc.c:1163 #3 0x7f134c1bc0d5 in _tc_free_children_internal ../../lib/talloc/talloc.c:1646 #4 0x7f134c1bc0d5 in _tc_free_internal ../../lib/talloc/talloc.c:1163 #5 0x7f134c1bd5d1 in _talloc_free_internal ../../lib/talloc/talloc.c:1227 #6 0x7f134c1bd5d1 in _talloc_free ../../lib/talloc/talloc.c:1769 #7 0x56098b334a42 in testmemfoo ../../source3/utils/cache-test.c:85 #8 0x56098b334fe5 in main ../../source3/utils/cache-test.c:113 #9 0x7f134b23e1e1 in __libc_start_main ../csu/libc-start.c:314 #10 0x56098b3340cd in _start (/home/asn/workspace/projects/samba/bin/default/source3/utils/cache-test+0x40cd) 0x611000002eb0 is located 48 bytes inside of 248-byte region [0x611000002e80,0x611000002f78) freed by thread T0 here: #0 0x7f134c2760c7 in __interceptor_free (/lib64/libasan.so.6+0xab0c7) #1 0x7f134c1bc9c2 in _tc_free_internal ../../lib/talloc/talloc.c:1201 #2 0x7f134c1bd5d1 in _talloc_free_internal ../../lib/talloc/talloc.c:1227 #3 0x7f134c1bd5d1 in _talloc_free ../../lib/talloc/talloc.c:1769 #4 0x7f134b45e867 in memcache_delete_element ../../lib/util/memcache.c:218 #5 0x7f134b45ff2f in memcache_trim ../../lib/util/memcache.c:234 #6 0x7f134b45ff2f in memcache_add ../../lib/util/memcache.c:355 #7 0x7f134b46019c in memcache_add_talloc ../../lib/util/memcache.c:373 #8 0x7f134bc3c950 in share_mode_memcache_store ../../source3/locking/share_mode_lock.c:186 #9 0x7f134bc3c950 in share_mode_data_destructor ../../source3/locking/share_mode_lock.c:448 #10 0x7f134c1bc35b in _tc_free_internal ../../lib/talloc/talloc.c:1137 #11 0x7f134c1bc0d5 in _tc_free_children_internal ../../lib/talloc/talloc.c:1646 #12 0x7f134c1bc0d5 in _tc_free_internal ../../lib/talloc/talloc.c:1163 #13 0x7f134c1bc0d5 in _tc_free_children_internal ../../lib/talloc/talloc.c:1646 #14 0x7f134c1bc0d5 in _tc_free_internal ../../lib/talloc/talloc.c:1163 #15 0x7f134c1bd5d1 in _talloc_free_internal ../../lib/talloc/talloc.c:1227 #16 0x7f134c1bd5d1 in _talloc_free ../../lib/talloc/talloc.c:1769 #17 0x56098b334a42 in testmemfoo ../../source3/utils/cache-test.c:85 #18 0x56098b334fe5 in main ../../source3/utils/cache-test.c:113 #19 0x7f134b23e1e1 in __libc_start_main ../csu/libc-start.c:314 previously allocated by thread T0 here: #0 0x7f134c2763cf in __interceptor_malloc (/lib64/libasan.so.6+0xab3cf) #1 0x7f134c1bfb75 in __talloc_with_prefix ../../lib/talloc/talloc.c:763 #2 0x7f134c1c100e in __talloc ../../lib/talloc/talloc.c:804 #3 0x7f134c1c100e in _talloc_named_const ../../lib/talloc/talloc.c:961 #4 0x7f134c1c100e in _talloc_zero ../../lib/talloc/talloc.c:2402 #5 0x7f134bc3d54d in fresh_share_mode_lock ../../source3/locking/share_mode_lock.c:469 #6 0x7f134bc3d54d in get_share_mode_lock_internal ../../source3/locking/share_mode_lock.c:525 #7 0x7f134bc3d54d in get_share_mode_lock ../../source3/locking/share_mode_lock.c:584 #8 0x56098b3348ce in testmemfoo ../../source3/utils/cache-test.c:65 #9 0x56098b334fe5 in main ../../source3/utils/cache-test.c:113 #10 0x7f134b23e1e1 in __libc_start_main ../csu/libc-start.c:314 SUMMARY: AddressSanitizer: heap-use-after-free ../../lib/talloc/talloc.c:1142 in _tc_free_internal Shadow bytes around the buggy address: 0x0c227fff8580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0c227fff8590: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0c227fff85a0: fa fa fa fa fa fa fa fa 00 00 00 00 00 00 00 00 0x0c227fff85b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0x0c227fff85c0: 00 00 00 00 00 00 00 00 fa fa fa fa fa fa fa fa =>0x0c227fff85d0: fd fd fd fd fd fd[fd]fd fd fd fd fd fd fd fd fd 0x0c227fff85e0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fa 0x0c227fff85f0: fa fa fa fa fa fa fa fa fd fd fd fd fd fd fd fd 0x0c227fff8600: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd 0x0c227fff8610: fd fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 0x0c227fff8620: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa Shadow byte legend (one shadow byte represents 8 application bytes): Addressable: 00 Partially addressable: 01 02 03 04 05 06 07 Heap left redzone: fa Freed heap region: fd Stack left redzone: f1 Stack mid redzone: f2 Stack right redzone: f3 Stack after return: f5 Stack use after scope: f8 Global redzone: f9 Global init order: f6 Poisoned by user: f7 Container overflow: fc Array cookie: ac Intra object redzone: bb ASan internal: fe Left alloca redzone: ca Right alloca redzone: cb Shadow gap: cc ==871494==ABORTING
(In reply to Ralph Böhme from comment #7) wow, don't know how you got that to work (or more correctly fail :-) ) with the first step
a maybe solution (I was discussing with Andreas on irc) diff --git a/source3/locking/share_mode_lock.c b/source3/locking/share_mode_lock.c index c5ac18dbeb5..a42773f3874 100644 --- a/source3/locking/share_mode_lock.c +++ b/source3/locking/share_mode_lock.c @@ -182,6 +182,8 @@ static void share_mode_memcache_store(struct share_mode_data *d) */ talloc_set_destructor(d, NULL); + /* orphan the memory temporarily */ + (void)talloc_move(NULL, &d); /* Cache will own d after this call. */ memcache_add_talloc(NULL, SHARE_MODE_LOCK_CACHE, so as you can see no error checking (admittedly I don't know what to do here on error if talloc_move fails) before memcache_add_talloc is called above, lck->data (e.g. d) is owned by lck (see source3/locking/share_mode_lock.c:548) The idea is we could protect against the pathological case here by orphaning the memory which would protect lck (which is in the middle of being free-ed) and which currently owns 'd' from the situation where memcache_add_talloc deletes 'd' instead of talking ownership of it
(In reply to Noel Power from comment #10) after that talloc_move() d is set to NULL, so you're passing d=NULL to memcache_add_talloc(). Is that what you want?
(In reply to Ralph Böhme from comment #11) >after that talloc_move() d is set to NULL, so you're passing d=NULL to >memcache_add_talloc(). Is that what you want? why not!!! well seriously obviously no, talloc_steal instead ? hmm that doesn't seem to work (still get core) :-(
(In reply to Noel Power from comment #12) d = talloc_move(NULL, &d); is probably what you want, but that doesn't fix the crash for me.
According to Address Sanitizer We do memcache_add_talloc -> memcache_add However the cache is full so memcache_trim is called and the item deletes itself: share_mode_memcache_store: stored entry for file . seq 53a13bdbfc50b13c key 1:2:0 memcache_add_talloc: XXXXXX memcache_add_talloc mtv.ptr=0x611000002ee0 memcache_trim: evicting a cache item memcache_delete_element: XXXXXX memcache_delete_element mtv.ptr=0x611000002ee0
(In reply to Andreas Schneider from comment #14) Woohoo! I think you nailed it! Great work, Noel and Andreas! Now we only need a test plus the fix. :)
I dunno how to fix this. Removing the memcache_trim() or increasing the memcache size fixes the issue. The question is why the TALLOC_FREE(mtv.ptr) in memcache_delete_element() is poisonous in this case. Is this an issue only with share mode locks or could this affect other users of memcache. Why do we put this in the memcache and then check the size and remove it again. Shouldn't we check if it fits in before?
(In reply to Andreas Schneider from comment #16) I guess I would pass memcache_element e to memcache_trim() and adjust memcache_trim() so that it ignores e when walking the list freeing elements.
(In reply to Ralph Böhme from comment #15) Have to say still speculation that this is what happens with both our reproducers. I mean this lots of share_modes scenario e.g. lck->data->num_share_modes = 145259 I don't know is this really just some dirty memory reading (but it looks valid and I saw similar numbers in other cores supplied at the same time). If this really is the repro situation I am not how this actually happens in reality, I don't know enough about the locking dynamics. Only logs with tell I guess, also this is now in an unsupported EOL code stream so there wont be a fix here (meaning upstream) especially since not only EOL but this code no longer exists
Created attachment 16421 [details] possible fix I agree with Noel. We have an empty cache and add an object which is bigger than the cache and because of that we directly free it again. The patch doesn't delete it from the cache if we exceed it but the next call would do it.
Great analysis and fix ! (Sorry, dealing with pandemic home issues at the moment mean this is hard to focus on). The fix inside memcache_trim() to prevent trimming the element we're trying to add would seem the correct one. I think we just need a smbtorture3 local.X memcache test for this and that fix should also go into master and all supported versions as well as the specific 4.10.x as the core memcache bug still exists.
Comment on attachment 16421 [details] possible fix lgtm, thanks Andreas, sorry I didn't have a chance today to have a crack at the fix myself
Comment on attachment 16421 [details] possible fix Patch looks good functiona wise, but please move the opening brace of the multiline for condition on the next line where it stands on its own, cf README.Coding. Thanks!
I will look into writing a unit test for this.
This bug was referenced in samba master: bebbf621d6052f797c5cf19a2a9bbc13e699d3f0 00543ab3b29e3fbfe8314e51919629803e14ede6 0bdbe50fac680be3fe21043246b8c75005611351
Created attachment 16427 [details] patch for 4.14
Created attachment 16428 [details] patch for 4.13
Created attachment 16429 [details] patch for 4.12
Backported patches for 4.10 can be found here: https://gitlab.com/samba-redhat/samba/-/commits/v4-10-redhat/
Reassigning to Karolin for inclusion in 4.12, 4.13 and 4.14.
(In reply to Ralph Böhme from comment #29) Pushed to autobuild-v4-{14,13,12}-test.
This bug was referenced in samba v4-12-test: a658dd19627ebe43d4e4fc494ff0d60507554a84 65b3648c55332d6186c6a03c0ef56e683a7afa46 40e503732dc0129b8e5f7bc84ca027514d2ef02e
This bug was referenced in samba v4-13-test: 32d62beae343a58e8ddc1bccff8dbf9bfe8f869e a3fa41c742900d7a91adc2e80a388e86e5b05f17 cf9066b21534de4d0a05a696ba2c28ae2813192d
This bug was referenced in samba v4-14-test: 805b8be9497e03f3dbcfc94cc1aa3634b135a065 27d93c8d46ac5dd61bd47b9ba88d5bedb651b86b 938b89616a18161d3cef32f62c3bc48f80adeb70
Pushed to all branches. Closing out bug report. Thanks!
This bug was referenced in samba v4-14-stable (Release samba-4.14.0rc3): 805b8be9497e03f3dbcfc94cc1aa3634b135a065 27d93c8d46ac5dd61bd47b9ba88d5bedb651b86b 938b89616a18161d3cef32f62c3bc48f80adeb70
This bug was referenced in samba v4-13-stable (Release samba-4.13.5): 32d62beae343a58e8ddc1bccff8dbf9bfe8f869e a3fa41c742900d7a91adc2e80a388e86e5b05f17 cf9066b21534de4d0a05a696ba2c28ae2813192d
This bug was referenced in samba v4-12-stable (Release samba-4.12.12): a658dd19627ebe43d4e4fc494ff0d60507554a84 65b3648c55332d6186c6a03c0ef56e683a7afa46 40e503732dc0129b8e5f7bc84ca027514d2ef02e