Hello, I have a problem with CTDB recovery. I have a 3-node (100.7.44.121, .126, .129) Ceph cluster. When I stop an OSD, system memory usage becomes very high on two of the nodes (121 and 126). Their ctdbd then becomes abnormal and node 129 becomes the recovery master. During recovery, core files are generated on node 129. Please help. The stacks are as follows:

(1) Core was generated by `/usr/libexec/ctdb/ctdb_recovery_helper 50 48 /var/run/ctdb/ctdbd.socket 6546685'.
Program terminated with signal 11, Segmentation fault.
#0  pull_database_handler (srvid=17294104044079415299, data=..., private_data=<optimized out>) at ../ctdb/server/ctdb_recovery_helper.c:612
612             if (srvid != state->srvid) {
Missing separate debuginfos, use: debuginfo-install bzip2-libs-1.0.6-13.el7.x86_64 elfutils-libelf-0.163-3.el7.x86_64 elfutils-libs-0.163-3.el7.x86_64 glibc-2.17-105.el7.x86_64 libattr-2.4.46-12.el7.x86_64 libcap-2.22-8.el7.x86_64 libgcc-4.8.5-4.el7.x86_64 libgcrypt-1.5.3-12.el7_1.1.x86_64 libgpg-error-1.12-3.el7.x86_64 libselinux-2.5-11.el7.x86_64 pcre-8.32-15.el7.x86_64 systemd-libs-219-19.el7.x86_64 xz-libs-5.1.2-12alpha.el7.x86_64 zlib-1.2.7-15.el7.x86_64
(gdb) bt
#0  pull_database_handler (srvid=17294104044079415299, data=..., private_data=<optimized out>) at ../ctdb/server/ctdb_recovery_helper.c:612
#1  0x00007f880da66ee1 in srvid_dispatch (srv=0x7f880f22cb90, srvid=17294104044079415299, srvid_all=srvid_all@entry=18446744073709551615, data=...) at ../ctdb/common/srvid.c:253
#2  0x00007f880da7060e in ctdb_client_req_message (client=client@entry=0x7f880f22c960, buf=buf@entry=0x7f880f234f88 "H\001", buflen=buflen@entry=328, reqid=<optimized out>) at ../ctdb/client/client_message.c:161
#3  0x00007f880da6f101 in client_read_handler (buf=0x7f880f234f88 "H\001", buflen=<optimized out>, private_data=<optimized out>) at ../ctdb/client/client_connect.c:193
#4  0x00007f880da6770b in comm_read_done (subreq=0x7f880f25de30) at ../ctdb/common/comm.c:208
#5  0x00007f880da675fd in comm_fd_handler (ev=0x7f880f22c5d0, fde=0x7f880f2356b0, flags=<optimized out>, private_data=<optimized out>) at ../ctdb/common/comm.c:378
#6  0x00007f880c766c7b in epoll_event_loop (tvalp=0x7ffeb6fd01d0, epoll_ev=0x7f880f22c8a0) at ../lib/tevent/tevent_epoll.c:728
#7  epoll_event_loop_once (ev=<optimized out>, location=<optimized out>) at ../lib/tevent/tevent_epoll.c:926
#8  0x00007f880c765137 in std_event_loop_once (ev=0x7f880f22c5d0, location=0x7f880c7673d8 "../lib/tevent/tevent_req.c:264") at ../lib/tevent/tevent_standard.c:114
#9  0x00007f880c76138d in _tevent_loop_once (ev=ev@entry=0x7f880f22c5d0, location=location@entry=0x7f880c7673d8 "../lib/tevent/tevent_req.c:264") at ../lib/tevent/tevent.c:533
#10 0x00007f880c76255f in tevent_req_poll (req=req@entry=0x7f880f237c30, ev=ev@entry=0x7f880f22c5d0) at ../lib/tevent/tevent_req.c:264
#11 0x00007f880da66435 in main (argc=<optimized out>, argv=<optimized out>) at ../ctdb/server/ctdb_recovery_helper.c:2895
(gdb) f 1
#1  0x00007f880da66ee1 in srvid_dispatch (srv=0x7f880f22cb90, srvid=17294104044079415299, srvid_all=srvid_all@entry=18446744073709551615, data=...) at ../ctdb/common/srvid.c:253
253             h->handler(srvid, data, h->private_data);
(gdb) p *list->h
$1 = {prev = 0x7f880f251c00, next = 0x0, list = 0x7f880f24ba40, handler = 0x7f880da6e3b0 <pull_database_handler>, private_data = 0x7f880f25bbe0}
(gdb) p *(struct tevent_req *)(list->h->private_data)
$2 = {async = {fn = 0x7f880da71230 <ctdb_client_control_multi_done>, private_data = 0x7f880f25f1a0}, data = 0x7f880f25bdd0, private_print = 0x0, private_cancel = 0x0, private_cleanup = {fn = 0x0, state = TEVENT_REQ_INIT}, internal = {private_type = 0x7f880da8c280 "struct ctdb_client_control_state", create_location = 0x7f880da8c320 "../ctdb/client/client_control.c:74", finish_location = 0x0, cancel_location = 0x0, state = TEVENT_REQ_IN_PROGRESS, error = 0, trigger = 0x7f880f25bcf0, defer_callback_ev = 0x0, timer = 0x7f880f251db0}}
(gdb)

(2) Core was generated by `/usr/libexec/ctdb/ctdb_recovery_helper 52 50 /var/run/ctdb/ctdbd.socket 4284972'.
Program terminated with signal 11, Segmentation fault.
#0  __talloc_get_name (ptr=0x7f7579ac2a20) at ../lib/talloc/talloc.c:1617
1617            if (unlikely(tc->name == TALLOC_MAGIC_REFERENCE)) {
Missing separate debuginfos, use: debuginfo-install bzip2-libs-1.0.6-13.el7.x86_64 elfutils-libelf-0.163-3.el7.x86_64 elfutils-libs-0.163-3.el7.x86_64 glibc-2.17-105.el7.x86_64 libattr-2.4.46-12.el7.x86_64 libcap-2.22-8.el7.x86_64 libgcc-4.8.5-4.el7.x86_64 libgcrypt-1.5.3-12.el7_1.1.x86_64 libgpg-error-1.12-3.el7.x86_64 libselinux-2.5-11.el7.x86_64 pcre-8.32-15.el7.x86_64 systemd-libs-219-19.el7.x86_64 xz-libs-5.1.2-12alpha.el7.x86_64 zlib-1.2.7-15.el7.x86_64
(gdb) bt
#0  __talloc_get_name (ptr=0x7f7579ac2a20) at ../lib/talloc/talloc.c:1617
#1  _talloc_get_type_abort (ptr=0x7f7579ac2a20, name=name@entry=0x7f7578d7f53a "struct tevent_req", location=location@entry=0x7f7578d83378 "../ctdb/server/ctdb_recovery_helper.c:605") at ../lib/talloc/talloc.c:1673
#2  0x00007f7578d673d9 in pull_database_handler (srvid=17294104044079415299, data=..., private_data=<optimized out>) at ../ctdb/server/ctdb_recovery_helper.c:604
#3  0x00007f7578d5fee1 in srvid_dispatch (srv=0x7f7579ab3b90, srvid=17294104044079415299, srvid_all=srvid_all@entry=18446744073709551615, data=...) at ../ctdb/common/srvid.c:253
#4  0x00007f7578d6960e in ctdb_client_req_message (client=client@entry=0x7f7579ab3960, buf=buf@entry=0x7f7579abbf88 "H\001", buflen=buflen@entry=328, reqid=<optimized out>) at ../ctdb/client/client_message.c:161
#5  0x00007f7578d68101 in client_read_handler (buf=0x7f7579abbf88 "H\001", buflen=<optimized out>, private_data=<optimized out>) at ../ctdb/client/client_connect.c:193
#6  0x00007f7578d6070b in comm_read_done (subreq=0x7f7579ab9a70) at ../ctdb/common/comm.c:208
#7  0x00007f7578d605fd in comm_fd_handler (ev=0x7f7579ab35d0, fde=0x7f7579abc6b0, flags=<optimized out>, private_data=<optimized out>) at ../ctdb/common/comm.c:378
#8  0x00007f7577a5fc7b in epoll_event_loop (tvalp=0x7ffc42bfbee0, epoll_ev=0x7f7579ab38a0) at ../lib/tevent/tevent_epoll.c:728
#9  epoll_event_loop_once (ev=<optimized out>, location=<optimized out>) at ../lib/tevent/tevent_epoll.c:926
#10 0x00007f7577a5e137 in std_event_loop_once (ev=0x7f7579ab35d0, location=0x7f7577a603d8 "../lib/tevent/tevent_req.c:264") at ../lib/tevent/tevent_standard.c:114
#11 0x00007f7577a5a38d in _tevent_loop_once (ev=ev@entry=0x7f7579ab35d0, location=location@entry=0x7f7577a603d8 "../lib/tevent/tevent_req.c:264") at ../lib/tevent/tevent.c:533
#12 0x00007f7577a5b55f in tevent_req_poll (req=req@entry=0x7f7579abec30, ev=ev@entry=0x7f7579ab35d0) at ../lib/tevent/tevent_req.c:264
#13 0x00007f7578d5f435 in main (argc=<optimized out>, argv=<optimized out>) at ../ctdb/server/ctdb_recovery_helper.c:2895
(gdb) f 0
#0  __talloc_get_name (ptr=0x7f7579ac2a20) at ../lib/talloc/talloc.c:1617
1617            if (unlikely(tc->name == TALLOC_MAGIC_REFERENCE)) {
(gdb) p ptr
$1 = (const void *) 0x7f7579ac2a20
(gdb) p (struct talloc_chunk *)(ptr - ((sizeof(struct talloc_chunk)+15)&~15))
$2 = (struct talloc_chunk *) 0x7f7579ac2990
(gdb) p *(struct talloc_chunk *)(ptr - ((sizeof(struct talloc_chunk)+15)&~15))
$3 = {flags = 0, next = 0x0, prev = 0x0, parent = 0x1, child = 0x0, refs = 0x0, destructor = 0x1, name = 0x7f7578d85528 "../ctdb/client/client_control.c:302", size = 140142515233328, limit = 0x7f7579ac4d40, pool = 0x7f7579ac2bc0, talloc_context_mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x7f7578d85280}}, __size = '\000' <repeats 32 times>, "\200R\330xu\177\000", __align = 0}, talloc_mutex_attr = {__size = " S\330x", __align = 2027442976}}
(gdb)
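To illustrate what fails in frame #1 of core (2): pull_database_handler() passes its private_data through talloc_get_type_abort(..., struct tevent_req), but the talloc chunk behind that pointer is named "../ctdb/client/client_control.c:302" instead of "struct tevent_req", which looks like the tevent_req the handler was registered with has already been freed and its memory reused. Below is a minimal standalone sketch of that name check, written by me against the public talloc API only (it is not ctdb code; the file name demo.c and the variable names are made up for illustration):

/* demo.c - hypothetical illustration, not from the ctdb sources.
 * Shows how the talloc type/name check behaves for a pointer whose
 * chunk name no longer matches the expected type, which is what the
 * stale private_data in core (2) looks like.
 * Build with: gcc demo.c -o demo -ltalloc
 */
#include <stdio.h>
#include <talloc.h>

int main(void)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);

	/* A live request-like chunk: its name matches the expected type. */
	void *good = talloc_named_const(mem_ctx, 64, "struct tevent_req");

	/* A chunk like the one the stale pointer landed on in core (2):
	 * named with a talloc location string from client_control.c. */
	void *stale = talloc_named_const(mem_ctx, 64,
			"../ctdb/client/client_control.c:302");

	/* talloc_check_name() is the non-aborting variant of the check
	 * performed by talloc_get_type_abort() in frame #1 of core (2). */
	printf("good : name=\"%s\" -> %s\n", talloc_get_name(good),
	       talloc_check_name(good, "struct tevent_req") ? "ok" : "would abort");
	printf("stale: name=\"%s\" -> %s\n", talloc_get_name(stale),
	       talloc_check_name(stale, "struct tevent_req") ? "ok" : "would abort");

	talloc_free(mem_ctx);
	return 0;
}

In core (1) the same dispatched message instead lands on a pointer that now belongs to a different in-flight request (its private_type is "struct ctdb_client_control_state"), so both cores seem consistent with the registered message handler outliving the tevent_req its private_data points at.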
What is the CTDB version you are using?
Created attachment 14141 [details] the ctdb log of node129
Created attachment 14142 [details] the ctdb log of node121
Created attachment 14143 [details] the ctdb log of node126
(In reply to Amitay Isaacs from comment #1)
Thank you for your attention. The CTDB version is 4.5.1.
The core dump happened at 15:56.
(In reply to doubenjun from comment #5)
Samba/CTDB 4.5.x is end of life. Please upgrade to a supported version:
https://wiki.samba.org/index.php/Samba_Release_Planning
This bug has been fixed in the newer releases.
The bug has been fixed in more recent versions. Closing this bug report.