From c9ce2ea476c88cb59d5174288edd835fe905f719 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Tue, 5 Apr 2011 13:26:29 +0200 Subject: [PATCH 1/2] client: handle transient connection errors Client connections to the ctdbd unix domain socket may fail intermittently while the server is under heavy load. This change introduces a client connect retry loop. During failure the client will retry for a maximum of 64 seconds; the ctdb --timelimit option can be used to cap client runtime. --- client/ctdb_client.c | 35 ++++++++++++++++++++++++++++++----- 1 files changed, 30 insertions(+), 5 deletions(-) diff --git a/client/ctdb_client.c b/client/ctdb_client.c index 2d3c176..3edcd1a 100644 --- a/client/ctdb_client.c +++ b/client/ctdb_client.c @@ -246,16 +246,41 @@ done: } /* - connect to a unix domain socket + connect with exponential backoff, thanks Stevens */ -int ctdb_socket_connect(struct ctdb_context *ctdb) +#define CONNECT_MAXSLEEP 64 +static int ctdb_connect_retry(struct ctdb_context *ctdb) { struct sockaddr_un addr; + int secs; + int ret = 0; memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)); + for (secs = 1; secs <= CONNECT_MAXSLEEP; secs *= 2) { + ret = connect(ctdb->daemon.sd, (struct sockaddr *)&addr, + sizeof(addr)); + if ((ret == 0) || (errno != EAGAIN)) { + break; + } + + if (secs <= (CONNECT_MAXSLEEP / 2)) { + DEBUG(DEBUG_ERR,("connect failed: %s, retry in %d second(s)\n", + strerror(errno), secs)); + sleep(secs); + } + } + + return ret; +} + +/* + connect to a unix domain socket +*/ +int ctdb_socket_connect(struct ctdb_context *ctdb) +{ ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0); if (ctdb->daemon.sd == -1) { DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. 
Errno:%s(%d)\n", strerror(errno), errno)); @@ -264,11 +289,11 @@ int ctdb_socket_connect(struct ctdb_context *ctdb) set_nonblocking(ctdb->daemon.sd); set_close_on_exec(ctdb->daemon.sd); - - if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + + if (ctdb_connect_retry(ctdb) == -1) { + DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno)); close(ctdb->daemon.sd); ctdb->daemon.sd = -1; - DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno)); return -1; } -- 1.7.1