[pgpool-hackers: 399] Test down node in master/slave streaming replication mode, and reattach it.

Wed Nov 13 13:59:53 JST 2013

In master/slave streaming replication mode, we currently test only the online
nodes in the function check_replication_time_lag.

I have added a new feature: all nodes are now tested, including down nodes.

If a down slave node is alive again, I compare its time lag with the master.
This check is repeated several times (new config parameter
sr_check_max_retries). If the slave node is not far behind, I mark the node
as NODE_UP.

If a down master node is online again, I check the time lag on all slave
nodes. If lag=0, I consider this master to be working fine and reattach it.

This should solve the problem described in ticket #17.
My implementation is in the attached diff.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.sraoss.jp/pipermail/pgpool-hackers/attachments/20131113/26316ce5/attachment.html>
-------------- next part --------------

diff --git a/pool_config.h b/pool_config.h
index a5a48f1..4668a68 100644
--- a/pool_config.h
+++ b/pool_config.h
@@ -124,6 +124,7 @@ typedef struct {
 	int sr_check_period;		/* streming replication check period */
 	char *sr_check_user;		/* PostgreSQL user name streaming replication check */
 	char *sr_check_password;	/* password for sr_check_user */
+	int sr_check_max_retries;	/* streming replication max retries to allow down node */
 	char *failover_command;     /* execute command when failover happens */
 	char *follow_master_command; /* execute command when failover is ended */
 	char *failback_command;     /* execute command when failback happens */
diff --git a/pool_config.l b/pool_config.l
index b560e65..8e221e0 100644
--- a/pool_config.l
+++ b/pool_config.l
@@ -195,6 +195,7 @@ int pool_init_config(void)
 	pool_config->sr_check_period = 0;
 	pool_config->sr_check_user = "nobody";
 	pool_config->sr_check_password = "";
+	pool_config->sr_check_max_retries = 0;
 	pool_config->failover_command = "";
 	pool_config->follow_master_command = "";
 	pool_config->failback_command = "";
@@ -1265,6 +1266,20 @@ int pool_get_config(char *confpath, POOL_CONFIG_CONTEXT context)
 			pool_config->sr_check_password = str;
 		}
 
+		else if (!strcmp(key, "sr_check_max_retries") &&
+				 CHECK_CONTEXT(INIT_CONFIG|RELOAD_CONFIG, context))
+		{
+			int v = atoi(yytext);
+
+			if (token != POOL_INTEGER || v < 0)
+			{
+				pool_error("pool_config: %s must be equal or higher than 0 numeric value", key);
+				fclose(fd);
+				return(-1);
+			}
+			pool_config->sr_check_max_retries = v;
+		}
+
 		else if (!strcmp(key, "failover_command") &&
 				 CHECK_CONTEXT(INIT_CONFIG|RELOAD_CONFIG, context))
 		{
diff --git a/pool_worker_child.c b/pool_worker_child.c
index 429a953..58d2871 100644
--- a/pool_worker_child.c
+++ b/pool_worker_child.c
@@ -62,6 +62,9 @@ extern char **myargv;
 
 char remote_ps_data[NI_MAXHOST];		/* used for set_ps_display */
 static POOL_CONNECTION_POOL_SLOT	*slots[MAX_NUM_BACKENDS];
+int pg_standby[MAX_NUM_BACKENDS]; // -1 not defined, 0 - master, 1 - standby mode.
+int pg_retrycn[MAX_NUM_BACKENDS]; // retry count
+
 static volatile sig_atomic_t reload_config_request = 0;
 static volatile sig_atomic_t restart_request = 0;
 
@@ -91,6 +94,9 @@ static void reload_config(void);
 */
 void do_worker_child(void)
 {
+	int i;
+	BackendInfo *bkinfo;
+
 	pool_debug("I am %d", getpid());
 
 	/* Identify myself via ps */
@@ -114,6 +120,13 @@ void do_worker_child(void)
 	/* Initialize per process context */
 	pool_init_process_context();
 
+
+	for (i=0;i<NUM_BACKENDS;i++)
+	{
+		pg_retrycn[i] = 0;
+		pg_standby[i] = -1;
+	}
+
 	for (;;)
 	{
 		CHECK_REQUEST;
@@ -137,6 +150,57 @@ void do_worker_child(void)
 
 			/* Discard persistent connections */
 			discard_persistent_connection();
+
+			if (pool_config->sr_check_max_retries > 0)
+			{
+				for (i=0;i<NUM_BACKENDS;i++)
+				{
+					if (VALID_BACKEND(i))
+						continue;
+
+					if (pg_standby[i] == 1) // up stanby node
+					{
+						bkinfo = pool_get_node_info(i);
+						if (bkinfo->standby_delay == 0)
+							pg_retrycn[i]++;
+						else
+							pg_retrycn[i] = 0;
+
+						pool_log("Standby node:%d now is online (retry=%d)",i,pg_retrycn[i]);
+						if (pg_retrycn[i] >= pool_config->sr_check_max_retries)
+						{
+							send_failback_request(i);
+							pg_retrycn[i] = 0;
+							break;
+						}
+					}
+					else if (pg_standby[i] == 0) // up master node
+					{
+						int lag = 0;
+						int j;
+						// To allow master node need lag=0 for all standby servers
+						for (j=0;j<NUM_BACKENDS;j++)
+							if (VALID_BACKEND(j) && pg_standby[i] != -1)
+							{
+								bkinfo = pool_get_node_info(j);
+								lag = lag + bkinfo->standby_delay;
+							}
+						
+						if (lag == 0)
+							pg_retrycn[i]++;
+						else
+							pg_retrycn[i] = 0;
+						
+						pool_log("Master node:%d now is online (retry=%d)",i,pg_retrycn[i]);
+						if (pg_retrycn[i] >= pool_config->sr_check_max_retries)
+						{
+							send_failback_request(i);
+							pg_retrycn[i] = 0;
+							break;
+						}
+					}
+				}
+			}
 		}
 		sleep(pool_config->sr_check_period);
 	}
@@ -154,9 +218,7 @@ static void establish_persistent_connection(void)
 
 	for (i=0;i<NUM_BACKENDS;i++)
 	{
-		if (!VALID_BACKEND(i))
-			continue;
-
+		pg_standby[i] = -1;
 		if (slots[i] == NULL)
 		{
 			bkinfo = pool_get_node_info(i);
@@ -168,7 +230,10 @@ static void establish_persistent_connection(void)
 			if (s)
 				slots[i] = s;
 			else
+			{
 				slots[i] = NULL;
+				pg_retrycn[i] = 0;
+			}
 		}
 	}
 }
@@ -203,20 +268,17 @@ static void check_replication_time_lag(void)
 	char *query;
 	BackendInfo *bkinfo;
 	unsigned long long int lag;
-
-	if (NUM_BACKENDS <= 1)
-	{
-		/* If there's only one node, there's no point to do checking */
-		return;
-	}
+	int real_primary = -1;
 
 	/* Count healthy nodes */
 	for (i=0;i<NUM_BACKENDS;i++)
 	{
-		if (VALID_BACKEND(i))
+		if (slots[i])
 			active_nodes++;
 	}
 
+	pool_debug("check_replication_time_lag: active_nodes %d",active_nodes);
+
 	if (active_nodes <= 1)
 	{
 		/* If there's only one or less active node, there's no point
@@ -226,25 +288,46 @@ static void check_replication_time_lag(void)
 
 	for (i=0;i<NUM_BACKENDS;i++)
 	{
-		if (!VALID_BACKEND(i))
+		pool_debug("check_replication_time_lag: check DB node %d",i);
+		pg_standby[i]=-1;
+
+		if (!(VALID_BACKEND(i) || slots[i]))
 			continue;
 
 		if (!slots[i])
 		{
 			pool_debug("check_replication_time_lag: DB node is valid but no persistent connection");
 			pool_error("check_replication_time_lag: could not connect to DB node %d, check sr_check_user and sr_check_password", i);
-
-			return;
+			continue;
 		}
 
-		if (PRIMARY_NODE_ID == i)
+		query = "SELECT pg_current_xlog_location()";
+		sts = do_query(slots[i]->con, "SELECT pg_is_in_recovery()",
+						  &res, PROTO_MAJOR_V3);
+		if (res->numrows <= 0)
 		{
-			query = "SELECT pg_current_xlog_location()";
+			pool_error("check_replication_time_lag: do_query returns no rows");
 		}
-		else
+		if (res->data[0] == NULL)
+		{
+			pool_error("check_replication_time_lag: do_query returns no data");
+		}
+		if (res->nullflags[0] == -1)
+		{
+			pool_error("check_replication_time_lag: do_query returns NULL");
+		}
+		if (res->data[0] && !strcmp(res->data[0], "t"))
 		{
+			pg_standby[i]=1;
 			query = "SELECT pg_last_xlog_replay_location()";
+			pool_debug("check_replication_time_lag: standby node id is %d", i);
 		}
+		else
+		{
+			pg_standby[i]=0;
+			pool_debug("check_replication_time_lag: primary node id is %d", i);
+		}
+		free_select_result(res);
 
 		sts = do_query(slots[i]->con, query, &res, PROTO_MAJOR_V3);
 		if (sts != POOL_CONTINUE)
@@ -287,16 +370,52 @@ static void check_replication_time_lag(void)
 
 	for (i=0;i<NUM_BACKENDS;i++)
 	{
-		if (!VALID_BACKEND(i))
+		if (pg_standby[i] == 0)
+		{
+			if (real_primary == -1)
+			{
+				real_primary = i;
+			}
+			else
+			{
+				if (VALID_BACKEND(i))
+				{
+					pg_standby[i] = -1;
+					pg_standby[real_primary] = -1;
+					pool_error("check_replication_time_lag: second master found id is %d, keep old master id %d",i,PRIMARY_NODE_ID);
+					real_primary = PRIMARY_NODE_ID;
+					pg_standby[real_primary] = 0;
+				}
+				else
+				{
+					pg_standby[i] = -1;
+					pool_error("check_replication_time_lag: second master found id is %d",i);
+				}
+
+			}
+		}
+	}
+
+	if (real_primary < 0)
+	{
+		pool_debug("check_replication_time_lag: no primary node found");
+		return;
+	}
+
+	for (i=0;i<NUM_BACKENDS;i++)
+	{
+		if (!slots[i] || pg_standby[i] == -1)
 			continue;
 
 		/* Set standby delay value */
 		bkinfo = pool_get_node_info(i);
-		lag = (lsn[PRIMARY_NODE_ID] > lsn[i]) ? lsn[PRIMARY_NODE_ID] - lsn[i] : 0;
+		lag = (lsn[real_primary] > lsn[i]) ? lsn[real_primary] - lsn[i] : 0;
 
-		if (PRIMARY_NODE_ID == i)
+		if (real_primary == i)
 		{
 			bkinfo->standby_delay = 0;
+			if (!VALID_BACKEND(i))
+				pool_log("Master replication node:%d now is online",i);
 		}
 		else
 		{
@@ -308,8 +427,12 @@ static void check_replication_time_lag(void)
 				 !strcmp(pool_config->log_standby_delay, "if_over_threshold") &&
 				 lag > pool_config->delay_threshold))
 			{
-				pool_log("Replication of node:%d is behind %llu bytes from the primary server (node:%d)",
-				         i, lsn[PRIMARY_NODE_ID] - lsn[i], PRIMARY_NODE_ID);
+				if (PRIMARY_NODE_ID != real_primary)
+					pool_log("Replication of node:%d is behind %llu bytes from the new primary server (node:%d)",
+				        i, lag, real_primary);
+				else
+					pool_log("Replication of node:%d is behind %llu bytes from the primary server (node:%d)",
+				        i, lag, real_primary);
 			}
 		}
 	}