[pgpool-hackers: 2239] WIP patch for 0000289: Inconsistent backend state

Muhammad Usama m.usama at gmail.com
Thu Apr 6 03:23:20 JST 2017


Hi Ishii-San

Can you please have a look at the attached patch? It syncs the backend node
states over the watchdog when a Pgpool-II node joins the watchdog cluster as
a standby node.

Currently Pgpool-II syncs the backend node states only at startup, which
works fine for almost all cases, except when the watchdog cluster becomes
partitioned (because of some network problem) and, after recovering from the
partition, the Pgpool-II nodes (that are already up and serving) rejoin the
cluster. At that point the backend node status can become out of sync among
the Pgpool-II nodes, which can be a major problem. The same scenario is also
described in bug report 0000289: Inconsistent backend state (
http://www.pgpool.net/mantisbt/view.php?id=289)

This work-in-progress patch solves this by syncing the backend status from
the master watchdog node every time the Pgpool-II node's state changes back
to STANDBY. The tricky part is how to handle the pgpool-II child processes
after updating the backend node status.
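
To make the child-process handling concrete, the restart decision in the
patch boils down to roughly the following (a simplified sketch for
illustration only, not the patch code itself; the real function also walks
each child's connection pools to find the children that use a detached node):

/* Simplified sketch of the restart decision in sync_backend_from_watchdog() */
#include <stdbool.h>
#include <stdio.h>

typedef enum
{
	RESTART_LAZY,       /* only set need_to_restart; child exits when convenient */
	RESTART_AFFECTED,   /* immediately restart children using a detached node */
	RESTART_ALL         /* immediately restart every child */
} RestartMode;

static RestartMode
decide_restart(bool primary_changed, bool node_went_down, bool all_backends_down)
{
	if (primary_changed)
		return RESTART_ALL;     /* primary moved: all children must reconnect */

	if (!node_went_down)
		return RESTART_LAZY;    /* nodes only came up: no urgent restart needed */

	/* some node was detached; a partial restart is enough unless
	 * every backend is down anyway */
	return all_backends_down ? RESTART_ALL : RESTART_AFFECTED;
}

int main(void)
{
	printf("%d\n", decide_restart(false, true, false)); /* prints 1 (RESTART_AFFECTED) */
	return 0;
}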

So can you please review the attached patch, specifically the following two
changes:

1) The sync_backend_from_watchdog() function logic, where it decides when to
restart the pgpool-II child processes. (Most of the logic is adapted from the
failover() function.)

2) The multiplexing of the SIGUSR1 signal in the pgpool-II main process.
SIGUSR1 is now used for two types of interrupts: a) failover and b) watchdog
state change notifications. A rough sketch of the pattern follows.
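
Here is a minimal, self-contained sketch of that multiplexing pattern
(simplified to a single process with plain globals; in the patch the reason
flags live in a shared memory slot and the sender signals the parent pid, see
signal_user1_to_parent_with_reason() and siguser1_interupt_processor() below):

#include <signal.h>
#include <stdio.h>

typedef enum
{
	SIG_FAILOVER_INTERRUPT,     /* start a failover */
	SIG_WATCHDOG_STATE_CHANGED, /* local watchdog node changed its state */
	MAX_INTERUPTS               /* must be last */
} User1SignalReason;

static volatile sig_atomic_t siguser1_request = 0;
static volatile sig_atomic_t signal_flags[MAX_INTERUPTS];

/* sender side: record the reason first, then raise the (single) signal */
static void signal_with_reason(User1SignalReason reason)
{
	signal_flags[reason] = 1;
	raise(SIGUSR1);
}

/* the handler only records that SIGUSR1 arrived */
static void sigusr1_handler(int sig)
{
	siguser1_request = 1;
}

/* main-loop processor: inspect each reason flag and act on it */
static void process_sigusr1(void)
{
	if (signal_flags[SIG_WATCHDOG_STATE_CHANGED])
	{
		signal_flags[SIG_WATCHDOG_STATE_CHANGED] = 0;
		printf("watchdog state changed: sync backend status if we are STANDBY\n");
	}
	if (signal_flags[SIG_FAILOVER_INTERRUPT])
	{
		signal_flags[SIG_FAILOVER_INTERRUPT] = 0;
		printf("failover requested\n");
	}
}

int main(void)
{
	signal(SIGUSR1, sigusr1_handler);

	signal_with_reason(SIG_WATCHDOG_STATE_CHANGED);
	signal_with_reason(SIG_FAILOVER_INTERRUPT);

	if (siguser1_request)
	{
		siguser1_request = 0;
		process_sigusr1();
	}
	return 0;
}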

Please note that the patch is generated against the master branch and is
still a work in progress, so it has some extra and elevated log messages,
which I will adjust after completing the testing.


Thanks
Best Regards
Muhammad Usama
-------------- next part --------------
diff --git a/src/include/pool.h b/src/include/pool.h
index 3feef1f3..98bb00cd 100644
--- a/src/include/pool.h
+++ b/src/include/pool.h
@@ -514,6 +514,7 @@ extern char remote_port[];	/* client port */
 /*
  * public functions
  */
+extern void register_watchdog_state_change_interupt(void);
 extern bool register_node_operation_request(POOL_REQUEST_KIND kind, int* node_id_set, int count, bool switch_over, unsigned int wd_failover_id);
 extern char *get_config_file_name(void);
 extern char *get_hba_file_name(void);
diff --git a/src/include/watchdog/watchdog.h b/src/include/watchdog/watchdog.h
index f8dda212..63604217 100644
--- a/src/include/watchdog/watchdog.h
+++ b/src/include/watchdog/watchdog.h
@@ -27,6 +27,7 @@
 #define WATCHDOG_H
 
 #include <sys/time.h>
+#include "pool_config.h"
 
 #define WD_TIME_INIT(tv)      ((tv).tv_sec = (tv).tv_usec = 0)
 #define WD_TIME_ISSET(tv)     ((tv).tv_sec || (tv).tv_usec)
diff --git a/src/include/watchdog/wd_ipc_commands.h b/src/include/watchdog/wd_ipc_commands.h
index bc5bd16d..a395c939 100644
--- a/src/include/watchdog/wd_ipc_commands.h
+++ b/src/include/watchdog/wd_ipc_commands.h
@@ -28,6 +28,7 @@
 
 #include "watchdog/wd_ipc_defines.h"
 #include "watchdog/wd_json_data.h"
+#include "watchdog/watchdog.h"
 
 typedef enum WdCommandResult
 {
@@ -45,6 +46,17 @@ typedef struct WDIPCCmdResult
 	char*	data;
 }WDIPCCmdResult;
 
+typedef struct WDClusterInfo
+{
+	WD_STATES	wd_state;
+	int			master_node_id;
+	int			quorum_status;
+	bool		local_node_escalated;
+}WDClusterInfo;
+
+extern WD_STATES get_watchdog_local_node_state(void);
+extern void set_watchdog_local_node_state(WD_STATES wd_state);
+
 extern void wd_ipc_initialize_data(void);
 extern char* get_watchdog_ipc_address(void);
 extern unsigned int* get_ipc_shared_key(void);
diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c
index 2f7620ba..e9ae334e 100644
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -63,6 +63,21 @@
 #include "watchdog/watchdog.h"
 
 /*
+ * Reasons for signalling a pgpool-II main process
+ */
+typedef enum
+{
+	SIG_FAILOVER_INTERRUPT,		/* signal main to start failover */
+	SIG_WATCHDOG_STATE_CHANGED,	/* notify main about local watchdog node state changed */
+	MAX_INTERUPTS				/* Must be last! */
+} User1SignalReason;
+
+
+typedef struct User1SignalSlot
+{
+	sig_atomic_t	signalFlags[MAX_INTERUPTS];
+}User1SignalSlot;
+/*
  * Process pending signal actions.
  */
 #define CHECK_REQUEST \
@@ -72,10 +87,10 @@
 			wakeup_children(); \
 			wakeup_request = 0; \
 		} \
-		if (failover_request) \
+		if (siguser1_request) \
 		{ \
-			failover(); \
-			failover_request = 0; \
+			siguser1_interupt_processor(); \
+			siguser1_request = 0; \
 		} \
 		if (sigchld_request) \
 		{ \
@@ -98,6 +113,7 @@
 
 static int process_backend_health_check_failure(int health_check_node_id, int retrycnt);
 static bool do_health_check(bool use_template_db, volatile int *health_check_node_id);
+static void signal_user1_to_parent_with_reason(User1SignalReason reason);
 
 static void FileUnlink(int code, Datum path);
 static pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file);
@@ -107,6 +123,7 @@ static int create_unix_domain_socket(struct sockaddr_un un_addr_tmp);
 static int create_inet_domain_socket(const char *hostname, const int port);
 static int *create_inet_domain_sockets(const char *hostname, const int port);
 static void failover(void);
+static bool check_all_backend_down(void);
 static void reaper(void);
 static void wakeup_children(void);
 static void reload_config(void);
@@ -117,7 +134,8 @@ static pid_t fork_follow_child(int old_master, int new_primary, int old_primary)
 static int read_status_file(bool discard_status);
 static RETSIGTYPE exit_handler(int sig);
 static RETSIGTYPE reap_handler(int sig);
-static RETSIGTYPE failover_handler(int sig);
+static RETSIGTYPE SIGUSR1_handler(int sig);
+static void siguser1_interupt_processor(void);
 static RETSIGTYPE reload_config_handler(int sig);
 static RETSIGTYPE health_check_timer_handler(int sig);
 static RETSIGTYPE wakeup_handler(int sig);
@@ -131,11 +149,12 @@ static int find_primary_node_repeatedly(void);
 static void terminate_all_childrens();
 static void system_will_go_down(int code, Datum arg);
 static char* process_name_from_pid(pid_t pid);
-static void initialize_backend_status_from_watchdog(void);
+static void sync_backend_from_watchdog(void);
 
 static struct sockaddr_un un_addr;		/* unix domain socket path */
 static struct sockaddr_un pcp_un_addr;  /* unix domain socket path for PCP */
 ProcessInfo *process_info = NULL;		/* Per child info table on shmem */
+volatile User1SignalSlot	*user1SignalSlot = NULL;/* User 1 signal slot on shmem */
 struct timeval random_start_time;
 
 /*
@@ -159,12 +178,12 @@ extern char conf_file[POOLMAXPATHLEN+1];
 extern char hba_file[POOLMAXPATHLEN+1];
 
 static int exiting = 0;		/* non 0 if I'm exiting */
-static int switching = 0;		/* non 0 if I'm fail overing or degenerating */
+static int switching = 0;		/* non 0 if I'm failing over or degenerating */
 
 POOL_REQUEST_INFO *Req_info;		/* request info area in shared memory */
 volatile sig_atomic_t *InRecovery; /* non 0 if recovery is started */
 volatile sig_atomic_t reload_config_request = 0;
-static volatile sig_atomic_t failover_request = 0;
+static volatile sig_atomic_t siguser1_request = 0;
 static volatile sig_atomic_t sigchld_request = 0;
 static volatile sig_atomic_t wakeup_request = 0;
 
@@ -250,11 +269,11 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 		 */
 		pool_signal(SIGUSR2, wakeup_handler);
 		pool_signal(SIGCHLD, reap_handler);
-		pool_signal(SIGUSR1, failover_handler);
+		pool_signal(SIGUSR1, SIGUSR1_handler);
 
 		/*
 		 * okay as we need to wait until watchdog is in stable state
-		 * so only wait for SIGUSR2, SIGCHLD, and signals those are
+		 * so only wait for SIGUSR1, SIGCHLD, and signals those are
 		 * necessary to make sure we respond to user requests of shutdown
 		 * if it arrives while we are in waiting state.
 		 *
@@ -265,7 +284,7 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 		 * once our backend status will be synchronized across the cluster
 		 */
 		sigfillset(&mask);
-		sigdelset(&mask, SIGUSR2);
+		sigdelset(&mask, SIGUSR1);
 		sigdelset(&mask, SIGCHLD);
 		sigdelset(&mask, SIGTERM);
 		sigdelset(&mask, SIGINT);
@@ -273,7 +292,7 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 		watchdog_pid = initialize_watchdog();
 		ereport (LOG,
 				 (errmsg("waiting for watchdog to initialize")));
-		while (wakeup_request == 0 && sigchld_request == 0)
+		while (siguser1_request == 0 && sigchld_request == 0)
 		{
 			sigsuspend(&mask);
 		}
@@ -292,8 +311,11 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 		 */
 		wd_lifecheck_pid = initialize_watchdog_lifecheck();
 
-		/* load the backend node status from watchdog cluster */
-		initialize_backend_status_from_watchdog();
+		if (siguser1_request)
+		{
+			siguser1_interupt_processor();
+			siguser1_request = 0;
+		}
 	}
 
 	fds[0] = create_unix_domain_socket(un_addr);
@@ -347,7 +369,7 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 	pool_signal(SIGINT, exit_handler);
 	pool_signal(SIGQUIT, exit_handler);
 	pool_signal(SIGCHLD, reap_handler);
-	pool_signal(SIGUSR1, failover_handler);
+	pool_signal(SIGUSR1, SIGUSR1_handler);
 	pool_signal(SIGUSR2, wakeup_handler);
 	pool_signal(SIGHUP, reload_config_handler);
 
@@ -644,13 +666,23 @@ bool register_node_operation_request(POOL_REQUEST_KIND kind, int* node_id_set, i
 		POOL_SETMASK(&oldmask);
 		if(failover_in_progress == false)
 		{
-			pool_signal_parent(SIGUSR1);
+			signal_user1_to_parent_with_reason(SIG_FAILOVER_INTERRUPT);
 		}
 	}
 
 	return true;
 }
 
+void register_watchdog_state_change_interupt(void)
+{
+	signal_user1_to_parent_with_reason(SIG_WATCHDOG_STATE_CHANGED);
+}
+static void signal_user1_to_parent_with_reason(User1SignalReason reason)
+{
+	user1SignalSlot->signalFlags[reason] = true;
+	pool_signal_parent(SIGUSR1);
+}
+
 /*
  * fork a child for PCP
  */
@@ -1500,19 +1532,19 @@ static int get_next_master_node(void)
  * handle SIGUSR1
  *
  */
-static RETSIGTYPE failover_handler(int sig)
+static RETSIGTYPE SIGUSR1_handler(int sig)
 {
 	int save_errno = errno;
 
 	POOL_SETMASK(&BlockSig);
-	failover_request = 1;
+	siguser1_request = 1;
 
 	write(pipe_fds[1], "\0", 1);
 
 #ifdef NOT_USED
 	if(write(pipe_fds[1], "\0", 1) < 0)
         ereport(WARNING,
-                (errmsg("failover_handler: write to pipe failed with error \"%s\"", strerror(errno))));
+                (errmsg("SIGUSR1_handler: write to pipe failed with error \"%s\"", strerror(errno))));
 #endif
 
 	POOL_SETMASK(&UnBlockSig);
@@ -1520,6 +1552,60 @@ static RETSIGTYPE failover_handler(int sig)
 	errno = save_errno;
 }
 
+static void siguser1_interupt_processor(void)
+{
+	ereport(LOG,
+			(errmsg("Pgpool-II parent process received SIGUSR1")));
+
+	if (user1SignalSlot->signalFlags[SIG_WATCHDOG_STATE_CHANGED])
+	{
+		ereport(LOG,
+				(errmsg("Pgpool-II parent process received SIGUSR1 from watchdog")));
+
+		user1SignalSlot->signalFlags[SIG_WATCHDOG_STATE_CHANGED] = false;
+		if (get_watchdog_local_node_state() == WD_STANDBY)
+		{
+			ereport(LOG,
+					(errmsg("we have joined the watchdog cluster as STANDBY node"),
+					 errdetail("syncing the backend states from MASTER watchdog node")));
+			sync_backend_from_watchdog();
+		}
+	}
+	if (user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT])
+	{
+		ereport(LOG,
+				(errmsg("Pgpool-II parent process has received failover request")));
+		user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT] = false;
+		if (processState == INITIALIZING)
+		{
+			ereport(LOG,
+					(errmsg("ignoring the failover request, since we are still starting up")));
+		}
+		else
+		{
+			failover();
+		}
+	}
+}
+
+/* returns true if all backends are down */
+static bool check_all_backend_down(void)
+{
+	int i;
+	/* Check to see if all backends are down */
+	for (i=0;i<NUM_BACKENDS;i++)
+	{
+		if (BACKEND_INFO(i).backend_status != CON_DOWN &&
+			BACKEND_INFO(i).backend_status != CON_UNUSED)
+		{
+			ereport(LOG,
+					(errmsg("Node %d is not down (status: %d)",
+							i, BACKEND_INFO(i).backend_status)));
+			return false;
+		}
+	}
+	return true;
+}
 
 /*
  * backend connection error, failover/failback request, if possible
@@ -1659,18 +1745,7 @@ static void failover(void)
 					 BACKEND_INFO(node_id).backend_port)));
 
 			/* Check to see if all backends are down */
-			for (i=0;i<NUM_BACKENDS;i++)
-			{
-				if (BACKEND_INFO(i).backend_status != CON_DOWN &&
-					BACKEND_INFO(i).backend_status != CON_UNUSED)
-				{
-					ereport(LOG,
-							(errmsg("Node %d is not down (status: %d)",
-									i, BACKEND_INFO(i).backend_status)));
-					all_backend_down = false;
-					break;
-				}
-			}
+			all_backend_down = check_all_backend_down();
 
 			BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT;	/* unset down status */
 			(void)write_status_file();
@@ -1778,7 +1853,7 @@ static void failover(void)
 		if (STREAM && reqkind == NODE_UP_REQUEST && all_backend_down == false)
 		{
 			ereport(LOG,
-					(errmsg("Do not restart children because we are failbacking node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
+					(errmsg("Do not restart children because we are failing back node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
 					 BACKEND_INFO(node_id).backend_hostname,
 					 BACKEND_INFO(node_id).backend_port)));
 
@@ -3026,6 +3101,7 @@ static void initialize_shared_mem_objects(bool clear_memcache_oidmaps)
 		process_info[i].connection_info = pool_coninfo(i,0,0);
 	}
 
+	user1SignalSlot = pool_shared_memory_create(sizeof(User1SignalSlot));
 	/* create fail over/switch over event area */
 	Req_info = pool_shared_memory_create(sizeof(POOL_REQUEST_INFO));
 
@@ -3442,72 +3518,213 @@ int pool_frontend_exists(void)
 	return -1;
 }
 
-static void initialize_backend_status_from_watchdog(void)
+static void sync_backend_from_watchdog(void)
 {
-	if (pool_config->use_watchdog)
+	bool primary_changed = false;
+	bool node_status_was_changed_to_down = false;
+	bool node_status_was_changed_to_up = false;
+	bool need_to_restart_children = false;
+	bool partial_restart = false;
+	bool reload_maste_node_id = false;
+
+	int down_node_ids[MAX_NUM_BACKENDS];
+	int down_node_ids_index = 0;
+	int i;
+
+	WDPGBackendStatus* backendStatus = get_pg_backend_status_from_master_wd_node();
+	if (!backendStatus)
+	{
+		ereport(WARNING,
+				(errmsg("failed to get the backend status from the master watchdog node"),
+				 errdetail("using the local backend node status")));
+		return;
+	}
+	if (backendStatus->node_count <= 0)
 	{
-		WDPGBackendStatus* backendStatus = get_pg_backend_status_from_master_wd_node();
-		if (backendStatus)
+		/*
+		 * -ve node count is returned by watchdog when the node itself is a master
+		 * and in that case we need to use the local backend node status
+		 */
+		ereport(LOG,
+			(errmsg("I am the master watchdog node"),
+				 errdetail("using the local backend node status")));
+		pfree(backendStatus);
+		return;
+	}
+
+	ereport(LOG,
+			(errmsg("master watchdog node \"%s\" returned status for %d backend nodes",backendStatus->nodeName,backendStatus->node_count)));
+
+	ereport(LOG,
+			(errmsg("primary node on master watchdog node \"%s\" is %d",backendStatus->nodeName,backendStatus->primary_node_id)));
+	if (Req_info->primary_node_id != backendStatus->primary_node_id)
+	{
+		ereport(LOG,
+				(errmsg("primary node on master watchdog node \"%s\" is %d which is different from local primary %d",
+						backendStatus->nodeName,backendStatus->primary_node_id,Req_info->primary_node_id)));
+
+		Req_info->primary_node_id = backendStatus->primary_node_id;
+		primary_changed = true;
+	}
+
+	for (i = 0; i < backendStatus->node_count; i++)
+	{
+		if (backendStatus->backend_status[i] == CON_DOWN)
 		{
-			if (backendStatus->node_count <= 0)
+			if (BACKEND_INFO(i).backend_status != CON_DOWN)
 			{
-				/*
-				 * -ve node count is returned by watchdog when the node itself is a master
-				 * and in that case we need to use the loacl backend node status
-				 */
+				BACKEND_INFO(i).backend_status = CON_DOWN;
+				my_backend_status[i] = &(BACKEND_INFO(i).backend_status);
+				reload_maste_node_id = true;
+				node_status_was_changed_to_down = true;
 				ereport(LOG,
-						(errmsg("I am the master watchdog node"),
-						 errdetail("using the local backend node status")));
+						(errmsg("backend status from \"%s\" backend:%d is set to down status",backendStatus->nodeName, i)));
+				down_node_ids[down_node_ids_index++] = i;
 			}
-			else
+		}
+		else if (backendStatus->backend_status[i] == CON_CONNECT_WAIT ||
+				 backendStatus->backend_status[i] == CON_UP)
+		{
+			if (BACKEND_INFO(i).backend_status != CON_CONNECT_WAIT)
 			{
-				int i;
-				bool reload_maste_node_id = false;
-				ereport(LOG,
-						(errmsg("master watchdog node \"%s\" returned status for %d backend nodes",backendStatus->nodeName,backendStatus->node_count)));
-				ereport(LOG,
-						(errmsg("primary node on master watchdog node \"%s\" is %d",backendStatus->nodeName,backendStatus->primary_node_id)));
+				if (BACKEND_INFO(i).backend_status == CON_DOWN)
+					node_status_was_changed_to_up = true;
 
-				Req_info->primary_node_id = backendStatus->primary_node_id;
+				BACKEND_INFO(i).backend_status = CON_CONNECT_WAIT;
+				my_backend_status[i] = &(BACKEND_INFO(i).backend_status);
+				reload_maste_node_id = true;
+			}
+		}
+	}
+	pfree(backendStatus);
 
-				for (i = 0; i < backendStatus->node_count; i++)
-				{
-					if (backendStatus->backend_status[i] == CON_DOWN)
-					{
-						if (BACKEND_INFO(i).backend_status != CON_DOWN)
-						{
+	if (reload_maste_node_id)
+	{
+		Req_info->master_node_id = get_next_master_node();
+	}
 
-							BACKEND_INFO(i).backend_status = CON_DOWN;
-							my_backend_status[i] = &(BACKEND_INFO(i).backend_status);
-							reload_maste_node_id = true;
-							ereport(LOG,
-									(errmsg("backend status from \"%s\" backend:%d is set to down status",backendStatus->nodeName, i)));
-						}
-					}
-					else if (backendStatus->backend_status[i] == CON_CONNECT_WAIT ||
-								backendStatus->backend_status[i] == CON_UP)
+	/* If we are in initializing phase, nothing more is needed */
+	if (processState == INITIALIZING)
+		return;
+
+	/* Check if no node state was changed */
+	 if (node_status_was_changed_to_up == false &&
+		 node_status_was_changed_to_down == false &&
+		 primary_changed == false)
+	 {
+		 ereport(LOG,
+				(errmsg("backend nodes status remains same after the sync from \"%s\"",backendStatus->nodeName)));
+		 return;
+	 }
+	/* if Primary node was changed, We should restart all
+	 * children
+	 */
+	if (primary_changed)
+	{
+		need_to_restart_children = true;
+		partial_restart = false;
+		ereport(LOG,
+			(errmsg("primary node was changed after the sync from \"%s\"",backendStatus->nodeName),
+			 errdetail("all children need to be restarted")));
+
+	}
+	else
+	{
+		if (node_status_was_changed_to_down == false)
+		{
+			/* no node was detached, So no need to restart
+			 * any child process
+			 */
+			need_to_restart_children = false;
+			partial_restart = false;
+			ereport(LOG,
+					(errmsg("No backend node was detached because of backend status sync from \"%s\"",backendStatus->nodeName),
+					 errdetail("no need to restart children")));
+
+		}
+		else
+		{
+			ereport(LOG,
+				(errmsg("%d backend node(s) were detached because of backend status sync from \"%s\"",down_node_ids_index,backendStatus->nodeName),
+					 errdetail("restarting the children processes")));
+
+			need_to_restart_children = true;
+			partial_restart = !check_all_backend_down();
+		}
+	}
+
+	/* Kill children and restart them if needed */
+	if (need_to_restart_children)
+	{
+		for (i=0;i<pool_config->num_init_children;i++)
+		{
+			/*
+			 * Try to kill pgpool child because previous kill signal
+			 * may not be received by pgpool child. This could happen
+			 * if multiple PostgreSQL are going down (or even starting
+			 * pgpool, without starting PostgreSQL can trigger this).
+			 * Child calls degenerate_backend() and it tries to acquire
+			 * semaphore to write a failover request. In this case the
+			 * signal mask is set as well, thus signals are never
+			 * received.
+			 */
+
+			bool restart = false;
+
+			if (partial_restart)
+			{
+				int j, k;
+				for (j=0;j<pool_config->max_pool;j++)
+				{
+					for (k=0;k<NUM_BACKENDS;k++)
 					{
-						if (BACKEND_INFO(i).backend_status != CON_CONNECT_WAIT)
+						int idx;
+						ConnectionInfo *con = pool_coninfo(i, j, k);
+						for (idx = 0; idx < down_node_ids_index; idx ++)
 						{
-							BACKEND_INFO(i).backend_status = CON_CONNECT_WAIT;
-							my_backend_status[i] = &(BACKEND_INFO(i).backend_status);
-							reload_maste_node_id = true;
+							int node_id = down_node_ids[idx];
+							if (con->connected && con->load_balancing_node == node_id)
+							{
+								ereport(LOG,
+										(errmsg("child pid %d needs to restart because pool %d uses backend %d",
+												process_info[i].pid, j, node_id)));
+								restart = true;
+								break;
+							}
+							if (restart)
+								break;
 						}
 					}
 				}
+			}
+			else
+			{
+				restart = true;
+			}
 
-				if (reload_maste_node_id)
+			if (restart)
+			{
+				if (process_info[i].pid)
 				{
-					Req_info->master_node_id = get_next_master_node();
+					kill(process_info[i].pid, SIGQUIT);
+
+					process_info[i].pid = fork_a_child(fds, i);
+					process_info[i].start_time = time(NULL);
 				}
 			}
-			pfree(backendStatus);
+			else
+				process_info[i].need_to_restart = 1;
 		}
-		else
+	}
+
+	else
+	{
+		/* Set restart request to each child. Children will exit(1)
+		 * whenever they are convenient.
+		 */
+		for (i=0;i<pool_config->num_init_children;i++)
 		{
-			ereport(WARNING,
-				(errmsg("failed to get the backend status from the master watchdog node"),
-					 errdetail("using the local backend node status")));
+			process_info[i].need_to_restart = 1;
 		}
 	}
 }
diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c
index 1b6ff32f..e07d909f 100644
--- a/src/watchdog/watchdog.c
+++ b/src/watchdog/watchdog.c
@@ -489,7 +489,6 @@ static int wd_create_recv_socket(int port);
 static void wd_check_config(void);
 static pid_t watchdog_main(void);
 static pid_t fork_watchdog_child(void);
-static void cluster_in_stable_state(void);
 static bool check_IPC_client_authentication(json_value *rootObj, bool internal_client_only);
 static bool check_and_report_IPC_authentication(WDCommandData* ipcCommand);
 
@@ -4704,16 +4703,6 @@ static bool wd_commands_packet_processor(WD_EVENTS event, WatchdogNode* wdNode,
 }
 
 
-static void cluster_in_stable_state(void)
-{
-	if (g_cluster.clusterInitialized == false)
-	{
-		g_cluster.clusterInitialized = true;
-		/* Inform the parent */
-		kill(getppid(), SIGUSR2);
-	}
-}
-
 static void update_interface_status(void)
 {
 	struct ifaddrs *ifAddrStruct=NULL;
@@ -5350,7 +5339,7 @@ static int watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode* wdN
 							 errdetail("our declare coordinator message is accepted by all nodes")));
 
 					g_cluster.masterNode = g_cluster.localNode;
-					cluster_in_stable_state();
+					register_watchdog_state_change_interupt();
 
 					/*
 					 * Check if the quorum is present then start the escalation process
@@ -6025,7 +6014,7 @@ static int watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode* wdNode,
 				if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
 					clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
 				{
-					cluster_in_stable_state();
+					register_watchdog_state_change_interupt();
 
 					ereport(LOG,
 						(errmsg("successfully joined the watchdog cluster as standby node"),
@@ -6220,7 +6209,7 @@ static int get_mimimum_nodes_required_for_quorum(void)
 
 
 /*
- * sets the state of local watchdog node, and fires an state change event
+ * sets the state of local watchdog node, and fires a state change event
  * if the new and old state differes
  */
 static int set_state(WD_STATES newState)
@@ -6229,6 +6218,7 @@ static int set_state(WD_STATES newState)
 	g_cluster.localNode->state = newState;
 	if (oldState != newState)
 	{
+		set_watchdog_local_node_state(newState);
 		gettimeofday(&g_cluster.localNode->current_state_time, NULL);
 		/* if we changing from the coordinator state, do the de-escalation if required */
 		if (oldState == WD_COORDINATOR)
diff --git a/src/watchdog/wd_commands.c b/src/watchdog/wd_commands.c
index 08d5fe9a..944722e9 100644
--- a/src/watchdog/wd_commands.c
+++ b/src/watchdog/wd_commands.c
@@ -76,6 +76,7 @@ unsigned int *ipc_shared_key = NULL;   /* key lives in shared memory
 										* used to identify the ipc internal
 										* clients
 										*/
+WDClusterInfo* wdSharedClusterInfo = NULL;
 
 void wd_ipc_initialize_data(void)
 {
@@ -110,6 +111,25 @@ void wd_ipc_initialize_data(void)
 		watchdog_node_escalated = pool_shared_memory_create(sizeof(bool));
 		*watchdog_node_escalated = false;
 	}
+
+	if (wdSharedClusterInfo == NULL)
+	{
+		wdSharedClusterInfo = pool_shared_memory_create(sizeof(WDClusterInfo));
+		wdSharedClusterInfo->wd_state = WD_DEAD;
+		wdSharedClusterInfo->master_node_id = -1;
+		wdSharedClusterInfo->quorum_status = -1;
+		wdSharedClusterInfo->local_node_escalated = false;
+	}
+}
+
+WD_STATES get_watchdog_local_node_state(void)
+{
+	return wdSharedClusterInfo->wd_state;
+}
+
+void set_watchdog_local_node_state(WD_STATES wd_state)
+{
+	wdSharedClusterInfo->wd_state = wd_state;
 }
 
 char* get_watchdog_ipc_address(void)

