From ec21124dfbac45b0473cd2a17140a04a93312e7d Mon Sep 17 00:00:00 2001
From: David Disseldorp
Date: Tue, 5 Apr 2011 13:26:29 +0200
Subject: [PATCH] client: handle transient connection errors

Client connections to the ctdbd unix domain socket may fail
intermittently while the server is under heavy load. This change
introduces a client connect retry loop.

During failure the client will retry for a maximum of 64 seconds; the
ctdb --timelimit option can be used to cap client runtime.
---
 client/ctdb_client.c |   34 +++++++++++++++++++++++++++++-----
 1 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/client/ctdb_client.c b/client/ctdb_client.c
index 7caa5cb..ede4542 100644
--- a/client/ctdb_client.c
+++ b/client/ctdb_client.c
@@ -253,16 +253,40 @@ done:
 }
 
 /*
-  connect to a unix domain socket
+  connect with exponential backoff, thanks Stevens
 */
-int ctdb_socket_connect(struct ctdb_context *ctdb)
+#define CONNECT_MAXSLEEP 64
+static int ctdb_connect_retry(struct ctdb_context *ctdb)
 {
 	struct sockaddr_un addr;
+	int nsec;
+	int ret = 0;
 
 	memset(&addr, 0, sizeof(addr));
 	addr.sun_family = AF_UNIX;
 	strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path));
 
+	for (nsec = 1; nsec <= CONNECT_MAXSLEEP; nsec <<= 1) {
+		ret = connect(ctdb->daemon.sd, (struct sockaddr *)&addr,
+			      sizeof(addr));
+		if ((ret == 0) || (errno != EAGAIN))
+			break;
+
+		if (nsec <= (CONNECT_MAXSLEEP / 2)) {
+			DEBUG(DEBUG_ERR,("connect failed: %s, retry in %d second(s)\n",
+				strerror(errno), nsec));
+			sleep(nsec);
+		}
+	}
+
+	return ret;
+}
+
+/*
+  connect to a unix domain socket
+*/
+int ctdb_socket_connect(struct ctdb_context *ctdb)
+{
 	ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
 	if (ctdb->daemon.sd == -1) {
 		DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
@@ -271,11 +295,11 @@ int ctdb_socket_connect(struct ctdb_context *ctdb)
 
 	set_nonblocking(ctdb->daemon.sd);
 	set_close_on_exec(ctdb->daemon.sd);
-	
-	if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+
+	if (ctdb_connect_retry(ctdb) == -1) {
 		close(ctdb->daemon.sd);
 		ctdb->daemon.sd = -1;
-		DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno));
+		DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon\n"));
 		return -1;
 	}
 
-- 
1.7.1