diff --git a/src/include/watchdog/watchdog.h b/src/include/watchdog/watchdog.h
index 1146a4ae..83f6651b 100644
--- a/src/include/watchdog/watchdog.h
+++ b/src/include/watchdog/watchdog.h
@@ -79,7 +79,8 @@ typedef enum
 	WD_IN_NW_TROUBLE,
 	/* the following states are only valid on remote nodes */
 	WD_SHUTDOWN,
-	WD_ADD_MESSAGE_SENT
+	WD_ADD_MESSAGE_SENT,
+	WD_NETWORK_ISOLATION
 }			WD_STATES;
 
 typedef enum
@@ -114,9 +115,21 @@ typedef enum
 	WD_EVENT_NODE_CON_LOST,
 	WD_EVENT_NODE_CON_FOUND,
 	WD_EVENT_CLUSTER_QUORUM_CHANGED,
-	WD_EVENT_WD_STATE_REQUIRE_RELOAD
+	WD_EVENT_WD_STATE_REQUIRE_RELOAD,
+	WD_EVENT_I_AM_APPEARING_LOST,
+	WD_EVENT_I_AM_APPEARING_FOUND
 }			WD_EVENTS;
 
+typedef enum {
+	NODE_LOST_UNKNOWN_REASON,
+	NODE_LOST_BY_LIFECHECK,
+	NODE_LOST_BY_SEND_FAILURE,
+	NODE_LOST_BY_MISSING_BEACON,
+	NODE_LOST_BY_RECEIVE_TIMEOUT,
+	NODE_LOST_BY_NOT_REACHABLE,
+	NODE_LOST_SHUTDOWN
+} WD_NODE_LOST_REASONS;
+
 typedef struct SocketConnection
 {
 	int			sock;			/* socket descriptor */
@@ -135,6 +148,20 @@ typedef struct WatchdogNode
 									 * from the node */
 	struct timeval last_sent_time;	/* timestamp when last packet was sent on
 									 * the node */
+	bool   has_lost_us;             /*
+									 * True when this remote node thinks
+									 * we are lost
+									 */
+	int    sending_failures_count;  /* number of times we have failed
+									 * to send message to the node.
+									 * Gets reset after successfull sent
+									 */
+	int    missed_beacon_count;     /* number of times the node has
+									 * failed to reply for beacon.
+									 * message
+									 */
+	WD_NODE_LOST_REASONS node_lost_reason;
+
 	char		pgp_version[MAX_VERSION_STR_LEN];		/* Pgpool-II version */
 	int			wd_data_major_version;	/* watchdog messaging version major*/
 	int			wd_data_minor_version;  /* watchdog messaging version minor*/
diff --git a/src/test/regression/regress.sh b/src/test/regression/regress.sh
index a891c2ae..37efee02 100755
--- a/src/test/regression/regress.sh
+++ b/src/test/regression/regress.sh
@@ -38,7 +38,7 @@ function install_pgpool
 
 	test -d $log || mkdir $log
         
-	make install HEALTHCHECK_DEBUG=1 -C $dir/../../ -e prefix=${PGPOOL_PATH} >& regression.log 2>&1
+	make install HEALTHCHECK_DEBUG=1 WATCHDOG_DEBUG=1 -C $dir/../../ -e prefix=${PGPOOL_PATH} >& regression.log 2>&1
 
 	if [ $? != 0 ];then
 	    echo "make install failed"
diff --git a/src/watchdog/Makefile.am b/src/watchdog/Makefile.am
index bb4c2204..0400459f 100644
--- a/src/watchdog/Makefile.am
+++ b/src/watchdog/Makefile.am
@@ -1,6 +1,6 @@
 top_builddir = ../..
 AM_CPPFLAGS = -D_GNU_SOURCE -I @PGSQL_INCLUDE_DIR@
-
+WATCHDOG_DEBUG=0
 noinst_LIBRARIES = lib-watchdog.a
 
 lib_watchdog_a_SOURCES = \
@@ -16,3 +16,4 @@ lib_watchdog_a_SOURCES = \
 	wd_utils.c \
 	wd_escalation.c
 
+DEFS = @DEFS@ -DWATCHDOG_DEBUG_OPTS=$(WATCHDOG_DEBUG)
diff --git a/src/watchdog/Makefile.in b/src/watchdog/Makefile.in
index 891d6058..103dd318 100644
--- a/src/watchdog/Makefile.in
+++ b/src/watchdog/Makefile.in
@@ -189,7 +189,7 @@ COLLATEINDEX = @COLLATEINDEX@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
+DEFS = @DEFS@ -DWATCHDOG_DEBUG_OPTS=$(WATCHDOG_DEBUG)
 DLLTOOL = @DLLTOOL@
 DOCBOOKSTYLE = @DOCBOOKSTYLE@
 DSYMUTIL = @DSYMUTIL@
@@ -312,6 +312,7 @@ top_build_prefix = @top_build_prefix@
 top_builddir = ../..
 top_srcdir = @top_srcdir@
 AM_CPPFLAGS = -D_GNU_SOURCE -I @PGSQL_INCLUDE_DIR@
+WATCHDOG_DEBUG = 0
 noinst_LIBRARIES = lib-watchdog.a
 lib_watchdog_a_SOURCES = \
 	watchdog.c \
diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c
index 0a0e81ef..5793a6bd 100644
--- a/src/watchdog/watchdog.c
+++ b/src/watchdog/watchdog.c
@@ -89,6 +89,15 @@ typedef enum IPC_CMD_PREOCESS_RES
 #define	MAX_SECS_WAIT_FOR_REPLY_FROM_NODE	5	/* time in seconds to wait for
 												 * the reply from remote
 												 * watchdog node */
+
+#define MAX_ALLOWED_SEND_FAILURES           3	/* number of times sending message failure
+                                                  * can be tolerated
+                                                  */
+#define MAX_ALLOWED_BEACON_REPLY_MISS       3	/* number of times missing beacon message reply
+                                                  * can be tolerated
+                                                  */
+
+
 #define	FAILOVER_COMMAND_FINISH_TIMEOUT		15	/* timeout in seconds to wait
 												 * for Pgpool-II to build
 												 * consensus for failover */
@@ -129,6 +138,8 @@ typedef enum IPC_CMD_PREOCESS_RES
 #define CLUSTER_IAM_RESIGNING_FROM_MASTER	'R'
 #define CLUSTER_NODE_INVALID_VERSION		'V'
 #define CLUSTER_NODE_REQUIRE_TO_RELOAD		'I'
+#define CLUSTER_NODE_APPEARING_LOST 		'Y'
+#define CLUSTER_NODE_APPEARING_FOUND 		'Z'
 
 #define WD_MASTER_NODE getMasterWatchdogNode()
 
@@ -199,7 +210,8 @@ char	   *wd_event_name[] =
 	"NODE CONNECTION LOST",
 	"NODE CONNECTION FOUND",
 	"CLUSTER QUORUM STATUS CHANGED",
-	"NODE REQUIRE TO RELOAD STATE"
+	"NODE REQUIRE TO RELOAD STATE",
+	"I AM APPEARING LOST"
 };
 
 char	   *wd_state_names[] = {
@@ -214,7 +226,18 @@ char	   *wd_state_names[] = {
 	"LOST",
 	"IN NETWORK TROUBLE",
 	"SHUTDOWN",
-	"ADD MESSAGE SENT"
+	"ADD MESSAGE SENT",
+	"NETWORK ISOLATION"
+};
+
+char *wd_node_lost_reasons[] = {
+	"UNKNOWN REASON",
+	"REPORTED BY LIFECHECK",
+	"SEND MESSAGE FAILURES",
+	"MISSING BEACON REPLIES",
+	"RECEIVE TIMEOUT",
+	"NOT REACHABLE",
+	"SHUTDOWN"
 };
 
 typedef struct WDPacketData
@@ -353,6 +376,22 @@ typedef struct WDFailoverObject
 	int			state;
 }			WDFailoverObject;
 
+#ifdef WATCHDOG_DEBUG_OPTS
+#if WATCHDOG_DEBUG_OPTS > 0
+#define WATCHDOG_DEBUG
+#endif
+#endif
+
+static bool check_debug_request_do_not_send_beacon(void);
+static bool check_debug_request_do_not_reply_beacon(void);
+static bool check_debug_request_kill_all_communication(void);
+static bool check_debug_request_kill_all_receivers(void);
+static bool check_debug_request_kill_all_senders(void);
+
+
+#ifdef WATCHDOG_DEBUG
+static void load_watchdog_debug_test_option(void);
+#endif
 
 static void process_remote_failover_command_on_coordinator(WatchdogNode * wdNode, WDPacketData * pkt);
 static WDFailoverObject * get_failover_object(POOL_REQUEST_KIND reqKind, int nodesCount, int *nodeList);
@@ -422,7 +461,7 @@ static void service_internal_command(void);
 static unsigned int get_next_commandID(void);
 static WatchdogNode * parse_node_info_message(WDPacketData * pkt, char **authkey);
 static void update_quorum_status(void);
-static int	get_mimimum_remote_nodes_required_for_quorum(void);
+static int	get_minimum_remote_nodes_required_for_quorum(void);
 static int	get_minimum_votes_to_resolve_consensus(void);
 
 static bool write_packet_to_socket(int sock, WDPacketData * pkt, bool ipcPacket);
@@ -462,6 +501,7 @@ static int	watchdog_state_machine_joining(WD_EVENTS event, WatchdogNode * wdNode
 static int	watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
 static int	watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
 static int	watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
+static int watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand);
 
 static int	I_am_master_and_cluser_in_split_brain(WatchdogNode * otherMasterNode);
 static void handle_split_brain(WatchdogNode * otherMasterNode, WDPacketData * pkt);
@@ -530,6 +570,7 @@ static void set_cluster_master_node(WatchdogNode * wdNode);
 static void clear_standby_nodes_list(void);
 static int	standby_node_left_cluster(WatchdogNode * wdNode);
 static int	standby_node_join_cluster(WatchdogNode * wdNode);
+static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
 
 /* global variables */
 wd_cluster	g_cluster;
@@ -1171,6 +1212,9 @@ watchdog_main(void)
 				g_timeout_sec = 0;
 			}
 		}
+#ifdef WATCHDOG_DEBUG
+		load_watchdog_debug_test_option();
+#endif
 		if (select_ret > 0)
 		{
 			int			processed_fds = 0;
@@ -1410,10 +1454,14 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 
 				if (pkt)
 				{
-					watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
-					/* since a packet is received reset last sent time */
-					wdNode->last_sent_time.tv_sec = 0;
-					wdNode->last_sent_time.tv_usec = 0;
+					if (check_debug_request_kill_all_communication() == false &&
+						check_debug_request_kill_all_receivers() == false)
+					{
+						watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+						/* since a packet is received reset last sent time */
+						wdNode->last_sent_time.tv_sec = 0;
+						wdNode->last_sent_time.tv_usec = 0;
+					}
 					free_packet(pkt);
 				}
 				else
@@ -1437,10 +1485,14 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 
 				if (pkt)
 				{
-					watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
-					/* since a packet is received reset last sent time */
-					wdNode->last_sent_time.tv_sec = 0;
-					wdNode->last_sent_time.tv_usec = 0;
+					if (check_debug_request_kill_all_communication() == false &&
+						check_debug_request_kill_all_receivers() == false)
+					{
+						watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+						/* since a packet is received reset last sent time */
+						wdNode->last_sent_time.tv_sec = 0;
+						wdNode->last_sent_time.tv_usec = 0;
+					}
 					free_packet(pkt);
 				}
 				else
@@ -1470,6 +1522,7 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 			pkt = read_packet_of_type(conn, WD_ADD_NODE_MESSAGE);
 			if (pkt)
 			{
+				struct timeval 	previous_startup_time;
 				char	   *authkey = NULL;
 				WatchdogNode *tempNode = parse_node_info_message(pkt, &authkey);
 
@@ -1486,6 +1539,7 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 					/* verify this node */
 					if (authenticated)
 					{
+						WD_STATES oldNodeState = WD_DEAD;
 						for (i = 0; i < g_cluster.remoteNodeCount; i++)
 						{
 							wdNode = &(g_cluster.remoteNodes[i]);
@@ -1495,6 +1549,9 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 							{
 								/* We have found the match */
 								found = true;
+								previous_startup_time.tv_sec = wdNode->startup_time.tv_sec;
+								oldNodeState = wdNode->state;
+
 								close_socket_connection(&wdNode->server_socket);
 								strlcpy(wdNode->delegate_ip, tempNode->delegate_ip, WD_MAX_HOST_NAMELEN);
 								strlcpy(wdNode->nodeName, tempNode->nodeName, WD_MAX_HOST_NAMELEN);
@@ -1528,7 +1585,35 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 											   wdNode->wd_data_major_version,
 											   wdNode->wd_data_minor_version)));
 
-							watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+							if (oldNodeState == WD_SHUTDOWN)
+							{
+								ereport(LOG,
+										(errmsg("The newly joined node:\"%s\" had left the cluster because it was shutdown",wdNode->nodeName)));
+								watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+
+							}
+							else if (oldNodeState == WD_LOST)
+							{
+								ereport(LOG,
+										(errmsg("The newly joined node:\"%s\" had left the cluster because it was lost",wdNode->nodeName),
+										 errdetail("lost reason was \"%s\" and startup time diff = %d",
+												   wd_node_lost_reasons[wdNode->node_lost_reason],
+												   abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)))));
+
+								if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 &&
+									wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+								{
+									ereport(LOG,
+										(errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
+											 errdetail("only lifecheck process can mark this node alive again")));
+									/* restore the node's lost state */
+									wdNode->state = oldNodeState;
+								}
+								else
+									watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+
+							}
+
 						}
 						else
 							ereport(NOTICE,
@@ -1721,6 +1806,12 @@ write_ipc_command_with_result_data(WDCommandData * ipcCommand, char type, char *
 				(errmsg("not replying to IPC, Invalid IPC command.")));
 		return false;
 	}
+	/* DEBUG AID */
+	if (ipcCommand->commandSource == COMMAND_SOURCE_REMOTE &&
+		(check_debug_request_kill_all_senders() ||
+		check_debug_request_kill_all_communication()))
+		return false;
+
 	return write_packet_to_socket(ipcCommand->sourceIPCSocket, &pkt, true);
 }
 
@@ -1932,7 +2023,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_get_runtime_variable_value_request(WDCom
 		json_value_free(root);
 		ereport(NOTICE,
 				(errmsg("failed to process get local variable IPC command"),
-				 errdetail("unable to parse json data")));
+				 errdetail("unable to parse JSON data")));
 		return IPC_CMD_ERROR;
 	}
 
@@ -2031,7 +2122,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_nodeStatusChange_command(WDCommandData *
 	{
 		ereport(NOTICE,
 				(errmsg("failed to process NODE STATE CHANGE IPC command"),
-				 errdetail("unable to parse json data")));
+				 errdetail("unable to parse JSON data")));
 		return IPC_CMD_ERROR;
 	}
 
@@ -2086,7 +2177,10 @@ fire_node_status_event(int nodeID, int nodeStatus)
 		if (wdNode == g_cluster.localNode)
 			watchdog_state_machine(WD_EVENT_LOCAL_NODE_LOST, wdNode, NULL, NULL);
 		else
+		{
+			wdNode->node_lost_reason = NODE_LOST_BY_LIFECHECK;
 			watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+		}
 	}
 	else if (nodeStatus == WD_LIFECHECK_NODE_STATUS_ALIVE)
 	{
@@ -2188,7 +2282,7 @@ service_expired_failovers(void)
 										BACKEND_INFO(node_id).role == ROLE_PRIMARY)
 									{
 										ereport(LOG,
-												(errmsg("We are not able to build consensus for our primary node failover request, got %d votesonly for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
+												(errmsg("We are not able to build consensus for our primary node failover request, got %d votes only for failover request ID:%d", failoverObj->request_count, failoverObj->failoverID),
 												 errdetail("resigning from the coordinator")));
 										need_to_resign = true;
 									}
@@ -2790,7 +2884,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_failover_indication(WDCommandData * ipcC
 			{
 				ereport(LOG,
 						(errmsg("unable to process failover indication"),
-						 errdetail("invalid json data in command packet")));
+						 errdetail("invalid JSON data in command packet")));
 				res = FAILOVER_RES_INVALID_FUNCTION;
 			}
 			if (root)
@@ -2801,7 +2895,7 @@ static IPC_CMD_PREOCESS_RES process_IPC_failover_indication(WDCommandData * ipcC
 		{
 			ereport(LOG,
 					(errmsg("unable to process failover indication"),
-					 errdetail("invalid json data in command packet")));
+					 errdetail("invalid JSON data in command packet")));
 			res = FAILOVER_RES_INVALID_FUNCTION;
 		}
 		else if (failoverState == 0)	/* start */
@@ -2844,7 +2938,7 @@ failover_start_indication(WDCommandData * ipcCommand)
 	}
 	else if (get_local_node_state() == WD_STANDBY)
 	{
-		/* The node might be performing the locl quarantine opetaion */
+		/* The node might be performing the local quarantine operation */
 		ereport(DEBUG1,
 				(errmsg("main process is starting the local quarantine operation")));
 		return FAILOVER_RES_PROCEED;
@@ -2871,7 +2965,7 @@ failover_end_indication(WDCommandData * ipcCommand)
 	}
 	else if (get_local_node_state() == WD_STANDBY)
 	{
-		/* The node might be performing the locl quarantine opetaion */
+		/* The node might be performing the local quarantine operation */
 		ereport(DEBUG1,
 				(errmsg("main process is ending the local quarantine operation")));
 		return FAILOVER_RES_PROCEED;
@@ -3250,7 +3344,7 @@ update_successful_outgoing_cons(fd_set *wmask, int pending_fds_count)
 				{
 					ereport(DEBUG1,
 							(errmsg("error in outbound connection to %s:%d ", wdNode->hostname, wdNode->wd_port),
-							 errdetail("getsockopt faile with error \"%s\"", strerror(errno))));
+							 errdetail("getsockopt failed with error \"%s\"", strerror(errno))));
 					close_socket_connection(&wdNode->client_socket);
 					wdNode->client_socket.sock_state = WD_SOCK_ERROR;
 
@@ -3775,13 +3869,34 @@ cluster_service_message_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 		}
 			break;
 
+		case CLUSTER_NODE_APPEARING_LOST:
+		{
+			ereport(LOG,
+				(errmsg("remote node \"%s\" is reporting that it has lost us",
+							wdNode->nodeName)));
+			wdNode->has_lost_us = true;
+			watchdog_state_machine(WD_EVENT_I_AM_APPEARING_LOST, wdNode, NULL, NULL);
+		}
+			break;
+
+		case CLUSTER_NODE_APPEARING_FOUND:
+		{
+			ereport(LOG,
+				(errmsg("remote node \"%s\" is reporting that it has found us again",
+							wdNode->nodeName)));
+			wdNode->has_lost_us = false;
+			watchdog_state_machine(WD_EVENT_I_AM_APPEARING_FOUND, wdNode, NULL, NULL);
+		}
+			break;
+
 		case CLUSTER_NODE_INVALID_VERSION:
 			{
 				/*
 				 * this should never happen means something is seriously wrong
 				 */
 				ereport(FATAL,
-						(errmsg("\"%s\" node has found serious issues in our watchdog messages",
+						(return_code(POOL_EXIT_FATAL),
+						 errmsg("\"%s\" node has found serious issues in our watchdog messages",
 								wdNode->nodeName),
 						 errdetail("shutting down")));
 			}
@@ -3846,16 +3961,6 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 			}
 			break;
 
-		case WD_EVENT_REMOTE_NODE_FOUND:
-			{
-				ereport(LOG,
-						(errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
-						 errdetail("requesting the node info")));
-				send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
-				break;
-			}
-			break;
-
 		case WD_ADD_NODE_MESSAGE:
 		case WD_REQ_INFO_MESSAGE:
 			replyPkt = get_mynode_info_message(pkt);
@@ -3955,6 +4060,33 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 				{
 					standby_node_left_cluster(wdNode);
 				}
+				if (oldNodeState == WD_LOST)
+				{
+					/*
+					 * We have received the message from lost node
+					 * add it back to cluster if it was not marked by
+					 * life-check
+					 * Node lost by life-check processes can only be
+					 * added back when we get alive notification for the
+					 * node from life-check
+					 */
+					ereport(LOG,
+						(errmsg("we have received the NODE INFO message from the node:\"%s\" that was lost",wdNode->nodeName),
+						 errdetail("we had lost this node because of \"%s\"",wd_node_lost_reasons[wdNode->node_lost_reason])));
+
+					if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+					{
+						ereport(LOG,
+							(errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
+								 errdetail("only life-check process can mark this node alive again")));
+						/* restore the node's lost state */
+						wdNode->state = oldNodeState;
+					}
+					else
+					{
+						watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
+					}
+				}
 			}
 			break;
 
@@ -3989,11 +4121,16 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 
 					send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
 				}
-				else
+				else if (WD_MASTER_NODE != NULL)
 				{
 					replyPkt = get_mynode_info_message(pkt);
 					beacon_message_received_from_node(wdNode, pkt);
 				}
+				/*
+				 * if (WD_MASTER_NODE == NULL)
+				 * do not reply to beacon if we are not connected to
+				 * any master node
+				 */
 			}
 			break;
 
@@ -4014,6 +4151,10 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
 static bool
 send_message_to_connection(SocketConnection * conn, WDPacketData * pkt)
 {
+	if (check_debug_request_kill_all_communication() == true ||
+		check_debug_request_kill_all_senders() == true)
+		return false;
+
 	if (conn->sock > 0 && conn->sock_state == WD_SOCK_CONNECTED)
 	{
 		if (write_packet_to_socket(conn->sock, pkt, false) == true)
@@ -4040,6 +4181,8 @@ send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt)
 	}
 	if (ret)
 	{
+		/* reset the sending error counter */
+		wdNode->sending_failures_count = 0;
 		/* we only update the last sent time if reply for packet is expected */
 		switch (pkt->type)
 		{
@@ -4054,6 +4197,7 @@ send_message_to_node(WatchdogNode * wdNode, WDPacketData * pkt)
 	}
 	else
 	{
+		wdNode->sending_failures_count++;
 		ereport(DEBUG1,
 				(errmsg("sending packet %c to node \"%s\" failed", pkt->type, wdNode->nodeName)));
 	}
@@ -4123,7 +4267,7 @@ static IPC_CMD_PREOCESS_RES wd_command_processor_for_node_lost_event(WDCommandDa
 				if (nodeResult->cmdState == COMMAND_STATE_SENT)
 				{
 					ereport(LOG,
-							(errmsg("remote node \"%s\" lost while ipc command was in progress ", wdLostNode->nodeName)));
+							(errmsg("remote node \"%s\" lost while IPC command was in progress ", wdLostNode->nodeName)));
 
 					/*
 					 * since the node is lost and will be removed from the
@@ -4342,15 +4486,36 @@ service_unreachable_nodes(void)
 							(errmsg("remote node \"%s\" is not replying..", wdNode->nodeName),
 							 errdetail("marking the node as lost")));
 					/* mark the node as lost */
+					wdNode->node_lost_reason = NODE_LOST_BY_RECEIVE_TIMEOUT;
 					watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
 				}
 			}
+			else if (wdNode->sending_failures_count > MAX_ALLOWED_SEND_FAILURES)
+			{
+				ereport(LOG,
+						(errmsg("not able to send messages to remote node \"%s\"",wdNode->nodeName),
+						 errdetail("marking the node as lost")));
+				/* mark the node as lost */
+				wdNode->node_lost_reason = NODE_LOST_BY_SEND_FAILURE;
+				watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+			}
+			else if (wdNode->missed_beacon_count > MAX_ALLOWED_BEACON_REPLY_MISS)
+			{
+				ereport(LOG,
+						(errmsg("remote node \"%s\" is not responding to our beacon messages",wdNode->nodeName),
+						 errdetail("marking the node as lost")));
+				/* mark the node as lost */
+				wdNode->node_lost_reason = NODE_LOST_BY_MISSING_BEACON;
+				wdNode->missed_beacon_count = 0; /* Reset the counter */
+				watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
+			}
 		}
 		else
 		{
 			ereport(LOG,
 					(errmsg("remote node \"%s\" is not reachable", wdNode->nodeName),
 					 errdetail("marking the node as lost")));
+			wdNode->node_lost_reason = NODE_LOST_BY_NOT_REACHABLE;
 			watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
 		}
 	}
@@ -4379,8 +4544,6 @@ watchdog_internal_command_packet_processor(WatchdogNode * wdNode, WDPacketData *
 	for (i = 0; i < g_cluster.remoteNodeCount; i++)
 	{
 		WDCommandNodeResult *nodeRes = &clusterCommand->nodeResults[i];
-
-		clear_command_node_result(nodeRes);
 		if (nodeRes->wdNode == wdNode)
 		{
 			nodeResult = nodeRes;
@@ -4523,7 +4686,7 @@ issue_watchdog_internal_command(WatchdogNode * wdNode, WDPacketData * pkt, int t
 				if (send_message_to_node(nodeResult->wdNode, pkt) == false)
 				{
 					ereport(DEBUG1,
-							(errmsg("failed to send watchdog internla command packet %s", nodeResult->wdNode->nodeName),
+							(errmsg("failed to send watchdog internal command packet %s", nodeResult->wdNode->nodeName),
 							 errdetail("saving the packet. will try to resend it if connection recovers")));
 
 					/* failed to send. May be try again later */
@@ -4917,9 +5080,6 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 
 	if (event == WD_EVENT_REMOTE_NODE_LOST)
 	{
-		/* close all socket connections to the node */
-		close_socket_connection(&wdNode->client_socket);
-		close_socket_connection(&wdNode->server_socket);
 
 		if (wdNode->state == WD_SHUTDOWN)
 		{
@@ -4931,6 +5091,8 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 			wdNode->state = WD_LOST;
 			ereport(LOG,
 					(errmsg("remote node \"%s\" is lost", wdNode->nodeName)));
+			/* Inform the node, that it is lost for us */
+			 send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_LOST);
 		}
 		if (wdNode == WD_MASTER_NODE)
 		{
@@ -4939,11 +5101,29 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 			set_cluster_master_node(NULL);
 		}
 
+		/* close all socket connections to the node */
+		close_socket_connection(&wdNode->client_socket);
+		close_socket_connection(&wdNode->server_socket);
+
 		/* clear the wait timer on the node */
 		wdNode->last_sent_time.tv_sec = 0;
 		wdNode->last_sent_time.tv_usec = 0;
+		wdNode->sending_failures_count = 0;
 		node_lost_while_ipc_command(wdNode);
 	}
+	else if (event == WD_EVENT_REMOTE_NODE_FOUND)
+	{
+		ereport(LOG,
+				(errmsg("remote node \"%s\" became reachable again", wdNode->nodeName),
+				 errdetail("requesting the node info")));
+		/*
+		 * remove the lost state from the node
+		 * and change it to joining for now
+		 */
+		wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
+		wdNode->state = WD_LOADING;
+		send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
+	}
 	else if (event == WD_EVENT_PACKET_RCV)
 	{
 		print_packet_node_info(pkt, wdNode, false);
@@ -4958,6 +5138,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 		if (pkt->type == WD_INFORM_I_AM_GOING_DOWN)
 		{
 			wdNode->state = WD_SHUTDOWN;
+			wdNode->node_lost_reason = NODE_LOST_SHUTDOWN;
 			return watchdog_state_machine(WD_EVENT_REMOTE_NODE_LOST, wdNode, NULL, NULL);
 		}
 
@@ -4982,7 +5163,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 		if (any_interface_available() == false)
 		{
 			ereport(WARNING,
-					(errmsg("network event has occured and all monitored interfaces are down"),
+					(errmsg("network event has occurred and all monitored interfaces are down"),
 					 errdetail("changing the state to in network trouble")));
 
 			set_state(WD_IN_NW_TROUBLE);
@@ -5021,7 +5202,7 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 	else if (event == WD_EVENT_LOCAL_NODE_LOST)
 	{
 		ereport(WARNING,
-				(errmsg("watchdog lifecheck reported, we are disconnected from the network"),
+				(errmsg("watchdog life-check reported, we are disconnected from the network"),
 				 errdetail("changing the state to LOST")));
 		set_state(WD_LOST);
 	}
@@ -5056,6 +5237,9 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pk
 		case WD_IN_NW_TROUBLE:
 			watchdog_state_machine_nw_error(event, wdNode, pkt, clusterCommand);
 			break;
+		case WD_NETWORK_ISOLATION:
+			watchdog_state_machine_nw_isolation(event, wdNode, pkt, clusterCommand);
+			break;
 		default:
 			/* Should never ever happen */
 			ereport(WARNING,
@@ -5111,7 +5295,7 @@ watchdog_state_machine_loading(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
 						{
 							/*
-							 * We are loading but a note is already contesting
+							 * We are loading but a node is already contesting
 							 * for coordinator node well we can ignore it but
 							 * then this could eventually mean a lower
 							 * priority node can became a coordinator node. So
@@ -5368,8 +5552,10 @@ watchdog_state_machine_standForCord(WD_EVENTS event, WatchdogNode * wdNode, WDPa
 							if (pkt->type == WD_ERROR_MESSAGE)
 							{
 								ereport(LOG,
-										(errmsg("our stand for coordinator request is rejected by node \"%s\"", wdNode->nodeName)));
-								set_state(WD_JOINING);
+										(errmsg("our stand for coordinator request is rejected by node \"%s\"",wdNode->nodeName),
+										 errdetail("we might be in partial network isolation and cluster already have a valid master"),
+										 errhint("please verify the watchdog life-check and network is working properly")));
+								set_state(WD_NETWORK_ISOLATION);
 							}
 							else if (pkt->type == WD_REJECT_MESSAGE)
 							{
@@ -5474,6 +5660,7 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 
 				send_cluster_command(NULL, WD_DECLARE_COORDINATOR_MESSAGE, 4);
 				set_timeout(MAX_SECS_WAIT_FOR_REPLY_FROM_NODE);
+				update_missed_beacon_count(NULL,true);
 				ereport(LOG,
 						(errmsg("I am announcing my self as master/coordinator watchdog node")));
 
@@ -5546,6 +5733,8 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 
 				else if (clusterCommand->commandPacket.type == WD_IAM_COORDINATOR_MESSAGE)
 				{
+					update_missed_beacon_count(clusterCommand,false);
+
 					if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED)
 					{
 						ereport(DEBUG1,
@@ -5669,11 +5858,49 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 
 		case WD_EVENT_TIMEOUT:
 			{
-				send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
+				if (check_debug_request_do_not_send_beacon() == false)
+					send_cluster_command(NULL, WD_IAM_COORDINATOR_MESSAGE, 5);
 				set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
 			}
 			break;
 
+		case WD_EVENT_I_AM_APPEARING_LOST:
+		{
+			/* The remote node has lost us, It would have already marked
+			 * us as lost, So remove it from standby*/
+			standby_node_left_cluster(wdNode);
+		}
+			break;
+
+		case WD_EVENT_I_AM_APPEARING_FOUND:
+		{
+			/* The remote node has found us again */
+			if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
+			{
+				/*
+				 * Since data version 1.1 we support CLUSTER_NODE_REQUIRE_TO_RELOAD
+				 * which makes the standby nodes to re-send the join master node
+				 */
+				ereport(DEBUG1,
+					(errmsg("asking remote node \"%s\" to rejoin master", wdNode->nodeName),
+						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
+
+				send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_REQUIRE_TO_RELOAD);
+			}
+			else
+			{
+				/*
+				 * The node is on older version
+				 * So ask it to re-join the cluster
+				 */
+				ereport(DEBUG1,
+					(errmsg("asking remote node \"%s\" to rejoin cluster", wdNode->nodeName),
+						errdetail("watchdog data version %s",WD_MESSAGE_DATA_VERSION)));
+				send_cluster_service_message(wdNode, pkt, CLUSTER_NEEDS_ELECTION);
+			}
+		}
+			break;
+
 		case WD_EVENT_REMOTE_NODE_LOST:
 			{
 				standby_node_left_cluster(wdNode);
@@ -5685,6 +5912,7 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 			ereport(LOG,
 				(errmsg("remote node \"%s\" is reachable again", wdNode->nodeName),
 					errdetail("trying to add it back as a standby")));
+			wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
 			/* If I am the cluster master. Ask for the node info and to re-send the join message */
 			send_message_of_type(wdNode, WD_REQ_INFO_MESSAGE, NULL);
 			if (wdNode->wd_data_major_version >= 1 && wdNode->wd_data_minor_version >= 1)
@@ -5717,12 +5945,21 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 			{
 				switch (pkt->type)
 				{
+					case WD_ADD_NODE_MESSAGE:
+						/* In case we received the ADD node message from
+						 * one of our standby, Remove that standby from
+						 * the list
+						 */
+						standby_node_left_cluster(wdNode);
+						standard_packet_processor(wdNode, pkt);
+						break;
+
 					case WD_STAND_FOR_COORDINATOR_MESSAGE:
 						reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
 						break;
 					case WD_DECLARE_COORDINATOR_MESSAGE:
 						ereport(NOTICE,
-								(errmsg("We are corrdinator and another node tried a coup")));
+								(errmsg("We are coordinator and another node tried a coup")));
 						reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
 						break;
 
@@ -5757,14 +5994,24 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
 
 					case WD_JOIN_COORDINATOR_MESSAGE:
 						{
-							reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
-
 							/*
-							 * Also get the configurations from the standby
-							 * node
+							 * If the node is marked as lost because of
+							 * life-check, Do not let it join the cluster
 							 */
-							send_message_of_type(wdNode, WD_ASK_FOR_POOL_CONFIG, NULL);
-							standby_node_join_cluster(wdNode);
+							if (wdNode->state == WD_LOST && wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
+							{
+								ereport(LOG,
+										(errmsg("lost remote node \"%s\" is requesting to join the cluster",wdNode->nodeName),
+										 errdetail("rejecting the request until life-check inform us that it is reachable again")));
+								reply_with_minimal_message(wdNode, WD_REJECT_MESSAGE, pkt);
+							}
+							else
+							{
+								reply_with_minimal_message(wdNode, WD_ACCEPT_MESSAGE, pkt);
+								/* Also get the configurations from the standby node */
+								send_message_of_type(wdNode,WD_ASK_FOR_POOL_CONFIG,NULL);
+								standby_node_join_cluster(wdNode);
+							}
 						}
 						break;
 
@@ -5795,7 +6042,7 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPac
  * watchdog node when the network becomes reachable, but there is a problem.
  *
  * Once the cable on the system is unplugged or when the node gets isolated from the
- * cluster there is every likelihood that the backend healthcheck of the isolated node
+ * cluster there is every likelihood that the backend health-check of the isolated node
  * start reporting the backend node failure and the pgpool-II proceeds to perform
  * the failover for all attached backend nodes. Since the pgpool-II is yet not
  * smart enough to figure out it is because of the network failure of its own
@@ -5872,6 +6119,42 @@ watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode * wdNode, WDPacket
 	return 0;
 }
 
+/*
+ * we could end up in tis state if we were connected to the
+ * master node as standby and got lost on the master.
+ * Here we just wait for BEACON_MESSAGE_INTERVAL_SECONDS
+ * and retry to join the cluster.
+ */
+static int
+watchdog_state_machine_nw_isolation(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt, WDCommandData * clusterCommand)
+{
+	switch (event)
+	{
+		case WD_EVENT_WD_STATE_CHANGED:
+			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
+			break;
+
+		case WD_EVENT_PACKET_RCV:
+			standard_packet_processor(wdNode, pkt);
+			break;
+
+		case WD_EVENT_REMOTE_NODE_FOUND:
+		case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
+		case WD_EVENT_I_AM_APPEARING_FOUND:
+		case WD_EVENT_TIMEOUT:
+			/* fall through */
+		case WD_EVENT_NW_IP_IS_ASSIGNED:
+			ereport(LOG,
+				(errmsg("trying again to join the cluster")));
+			set_state(WD_JOINING);
+			break;
+
+		default:
+			break;
+	}
+	return 0;
+}
+
 static bool
 beacon_message_received_from_node(WatchdogNode * wdNode, WDPacketData * pkt)
 {
@@ -6044,7 +6327,7 @@ handle_split_brain(WatchdogNode * otherMasterNode, WDPacketData * pkt)
 				(errmsg("We are in split brain, and \"%s\" node is the best candidate for master/coordinator"
 						,otherMasterNode->nodeName),
 				 errdetail("re-initializing the local watchdog cluster state")));
-		/* brodcast the message about I am not the true master node */
+		/* broadcast the message about I am not the true master node */
 		send_cluster_service_message(NULL, pkt, CLUSTER_IAM_NOT_TRUE_MASTER);
 		set_state(WD_JOINING);
 	}
@@ -6070,8 +6353,8 @@ start_escalated_node(void)
 	while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0)
 	{
 		/*
-		 * de_escalation proceess was already running and we are esclating
-		 * again. give some time to de-escalation process to exit normaly
+		 * de_escalation process was already running and we are escalating
+		 * again. give some time to de-escalation process to exit normally
 		 */
 		ereport(LOG,
 				(errmsg("waiting for de-escalation process to exit before starting escalation")));
@@ -6112,8 +6395,8 @@ resign_from_escalated_node(void)
 	while (g_cluster.escalation_pid > 0 && wait_secs-- > 0)
 	{
 		/*
-		 * escalation proceess was already running and we are resigning from
-		 * it. wait for the escalation process to exit normaly
+		 * escalation process was already running and we are resigning from
+		 * it. wait for the escalation process to exit normally
 		 */
 		ereport(LOG,
 				(errmsg("waiting for escalation process to exit before starting de-escalation")));
@@ -6209,10 +6492,11 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 			send_cluster_command(WD_MASTER_NODE, WD_JOIN_COORDINATOR_MESSAGE, 5);
 			/* Also reset my priority as per the original configuration */
 			g_cluster.localNode->wd_priority = pool_config->wd_priority;
+			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
 			break;
 
 		case WD_EVENT_TIMEOUT:
-			set_timeout(5);
+			set_timeout(BEACON_MESSAGE_INTERVAL_SECONDS);
 			break;
 
 		case WD_EVENT_WD_STATE_REQUIRE_RELOAD:
@@ -6224,27 +6508,54 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 			break;
 
 		case WD_EVENT_COMMAND_FINISHED:
+		{
+			if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
 			{
-				if (clusterCommand->commandPacket.type == WD_JOIN_COORDINATOR_MESSAGE)
+				if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
+					clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
 				{
-					if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
-						clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
-					{
-						register_watchdog_state_change_interupt();
+					register_watchdog_state_change_interupt();
+
+					ereport(LOG,
+							(errmsg("successfully joined the watchdog cluster as standby node"),
+							 errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName)));
+				}
+				else
+				{
+					ereport(NOTICE,
+							(errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
+							 errhint("rejoining the cluster.")));
 
+					if (WD_MASTER_NODE->has_lost_us)
+					{
 						ereport(LOG,
-								(errmsg("successfully joined the watchdog cluster as standby node"),
-								 errdetail("our join coordinator request is accepted by cluster leader node \"%s\"", WD_MASTER_NODE->nodeName)));
+								(errmsg("master node \"%s\" thinks we are lost, and \"%s\" is not letting us join",WD_MASTER_NODE->nodeName,wdNode->nodeName),
+								 errhint("please verify the watchdog life-check and network is working properly")));
+						set_state(WD_NETWORK_ISOLATION);
 					}
 					else
 					{
-						ereport(NOTICE,
-								(errmsg("our join coordinator is rejected by node \"%s\"", wdNode->nodeName),
-								 errhint("rejoining the cluster.")));
 						set_state(WD_JOINING);
 					}
 				}
 			}
+		}
+			break;
+
+		case WD_EVENT_I_AM_APPEARING_LOST:
+		{
+			/* The remote node has lost us, and if it
+			 * was our coordinator we might already be
+			 * removed from it's standby list
+			 * So re-Join the cluster
+			 */
+			if (WD_MASTER_NODE == wdNode)
+			{
+				ereport(LOG,
+						(errmsg("we are lost on the master node \"%s\"",wdNode->nodeName)));
+				set_state(WD_JOINING);
+			}
+		}
 			break;
 
 		case WD_EVENT_REMOTE_NODE_LOST:
@@ -6266,6 +6577,22 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 			{
 				switch (pkt->type)
 				{
+					case WD_ADD_NODE_MESSAGE:
+					{
+						/* In case we received the ADD node message from
+						 * our coordinator. Reset the cluster state
+						 */
+						if (wdNode == WD_MASTER_NODE)
+						{
+							ereport(LOG,
+									(errmsg("received ADD NODE message from the master node \"%s\"", wdNode->nodeName),
+									 errdetail("re-joining the cluster")));
+							set_state(WD_JOINING);
+						}
+						standard_packet_processor(wdNode, pkt);
+					}
+						break;
+
 					case WD_FAILOVER_END:
 						{
 							register_backend_state_sync_req_interupt();
@@ -6281,8 +6608,11 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 							}
 							else
 							{
+								ereport(LOG,
+										(errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to become a master",WD_MASTER_NODE->nodeName, wdNode->nodeName)));
 								reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
-								set_state(WD_JOINING);
+								/* Ask master to re-send its node info */
+								send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL);
 							}
 						}
 						break;
@@ -6293,14 +6623,14 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 							{
 								/*
 								 * we already have a master node and we got a
-								 * new node trying to be master re-initialize
-								 * the cluster, something is wrong
+								 * new node trying to be master
 								 */
+								ereport(LOG,
+										(errmsg("We are connected to master node \"%s\" and another node \"%s\" is trying to declare itself as a master",WD_MASTER_NODE->nodeName, wdNode->nodeName)));
 								reply_with_minimal_message(wdNode, WD_ERROR_MESSAGE, pkt);
-							}
-							else
-							{
-								set_state(WD_JOINING);
+								/* Ask master to re-send its node info */
+								send_message_of_type(WD_MASTER_NODE, WD_REQ_INFO_MESSAGE, NULL);
+
 							}
 						}
 						break;
@@ -6320,7 +6650,7 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 
 								send_cluster_service_message(NULL, pkt, CLUSTER_IN_SPLIT_BRAIN);
 							}
-							else
+							else if (check_debug_request_do_not_reply_beacon() == false)
 							{
 								send_message_of_type(wdNode, WD_INFO_MESSAGE, pkt);
 								beacon_message_received_from_node(wdNode, pkt);
@@ -6350,7 +6680,7 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 		gettimeofday(&currTime, NULL);
 		int			last_rcv_sec = WD_TIME_DIFF_SEC(currTime, WD_MASTER_NODE->last_rcv_time);
 
-		if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
+		if (last_rcv_sec >= (3 * BEACON_MESSAGE_INTERVAL_SECONDS))
 		{
 			/* we have missed atleast two beacons from master node */
 			ereport(WARNING,
@@ -6358,9 +6688,8 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
 							WD_MASTER_NODE->nodeName),
 					 errdetail("re-initializing the cluster")));
 			set_state(WD_JOINING);
-
 		}
-		else if (last_rcv_sec >= BEACON_MESSAGE_INTERVAL_SECONDS)
+		else if (last_rcv_sec >= (2 * BEACON_MESSAGE_INTERVAL_SECONDS))
 		{
 			/*
 			 * We have not received a last becacon from master ask for the
@@ -6381,10 +6710,10 @@ watchdog_state_machine_standby(WD_EVENTS event, WatchdogNode * wdNode, WDPacketD
  * The function identifies the current quorum state
  * quorum values:
  * -1:
- *     quorum is lost or does not exisits
+ *     quorum is lost or does not exists
  * 0:
  *     The quorum is on the edge. (when participating cluster is configured
- *     with even number of nodes, and we have exectly 50% nodes
+ *     with even number of nodes, and we have exactly 50% nodes
  * 1:
  *     quorum exists
  */
@@ -6393,11 +6722,11 @@ update_quorum_status(void)
 {
 	int			quorum_status = g_cluster.quorum_status;
 
-	if (g_cluster.clusterMasterInfo.standby_nodes_count > get_mimimum_remote_nodes_required_for_quorum())
+	if (g_cluster.clusterMasterInfo.standby_nodes_count > get_minimum_remote_nodes_required_for_quorum())
 	{
 		g_cluster.quorum_status = 1;
 	}
-	else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_mimimum_remote_nodes_required_for_quorum())
+	else if (g_cluster.clusterMasterInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
 	{
 		if (g_cluster.remoteNodeCount % 2 != 0)
 		{
@@ -6424,10 +6753,10 @@ update_quorum_status(void)
  * returns the minimum number of remote nodes required for quorum
  */
 static int
-get_mimimum_remote_nodes_required_for_quorum(void)
+get_minimum_remote_nodes_required_for_quorum(void)
 {
 	/*
-	 * Even numner of remote nodes, That means total number of nodes are odd,
+	 * Even number of remote nodes, That means total number of nodes are odd,
 	 * so minimum quorum is just remote/2.
 	 */
 	if (g_cluster.remoteNodeCount % 2 == 0)
@@ -6447,11 +6776,11 @@ static int
 get_minimum_votes_to_resolve_consensus(void)
 {
 	/*
-	 * Since get_mimimum_remote_nodes_required_for_quorum() returns
+	 * Since get_minimum_remote_nodes_required_for_quorum() returns
 	 * the number of remote nodes required to complete the quorum
 	 * that is always one less than the total number of nodes required
 	 * for the cluster to build quorum or consensus, reason being
-	 * in get_mimimum_remote_nodes_required_for_quorum()
+	 * in get_minimum_remote_nodes_required_for_quorum()
 	 * we always consider the local node as a valid pre-casted vote.
 	 * But when it comes to count the number of votes required to build
 	 * consensus for any type of decision, for example for building the
@@ -6463,8 +6792,8 @@ get_minimum_votes_to_resolve_consensus(void)
 	 * For example
 	 * If Total nodes in cluster = 4
 	 * 		remote node will be = 3
-	 * 		get_mimimum_remote_nodes_required_for_quorum() return = 1
-	 *		Minimum number of votes required for consensu will be
+	 * 		get_minimum_remote_nodes_required_for_quorum() return = 1
+	 *		Minimum number of votes required for consensus will be
 	 *
 	 *		if(pool_config->enable_consensus_with_half_votes = true)
 	 *			(exact 50% n/2) ==> 4/2 = 2
@@ -6474,13 +6803,13 @@ get_minimum_votes_to_resolve_consensus(void)
 	 *
 	 */
 
-	int required_node_count = get_mimimum_remote_nodes_required_for_quorum()  + 1;
+	int required_node_count = get_minimum_remote_nodes_required_for_quorum()  + 1;
 	/*
 	 * When the total number of nodes in the watchdog cluster including the
 	 * local node are even, The number of votes required for the consensus
 	 * depends on the enable_consensus_with_half_votes.
 	 * So for even number of nodes when enable_consensus_with_half_votes is
-	 * not allowed than we would nedd one more vote than exact 50%
+	 * not allowed than we would add one more vote than exact 50%
 	 */
 	if (g_cluster.remoteNodeCount % 2 != 0)
 	{
@@ -6493,7 +6822,7 @@ get_minimum_votes_to_resolve_consensus(void)
 
 /*
  * sets the state of local watchdog node, and fires a state change event
- * if the new and old state differes
+ * if the new and old state differs
  */
 static int
 set_state(WD_STATES newState)
@@ -6952,7 +7281,7 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only)
 	if (json_get_int_value_for_key(rootObj, WD_IPC_SHARED_KEY, (int *) &packet_key))
 	{
 		ereport(DEBUG2,
-				(errmsg("IPC json data packet does not contain shared key")));
+				(errmsg("IPC JSON data packet does not contain shared key")));
 		has_shared_key = false;
 	}
 	else
@@ -6973,15 +7302,15 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only)
 		if (has_shared_key == false)
 		{
 			ereport(LOG,
-					(errmsg("invalid json data packet"),
-					 errdetail("authentication shared key not found in json data")));
+					(errmsg("invalid JSON data packet"),
+					 errdetail("authentication shared key not found in JSON data")));
 			return false;
 		}
 		/* compare if shared keys match */
 		if (*shared_key != packet_key)
 			return false;
 
-		/* providing a valid shared key for inetenal clients is enough */
+		/* providing a valid shared key for internal clients is enough */
 		return true;
 	}
 
@@ -6993,14 +7322,14 @@ check_IPC_client_authentication(json_value * rootObj, bool internal_client_only)
 	if (has_shared_key == true && *shared_key == packet_key)
 		return true;
 
-	/* shared key is out of question validate the authKey valurs */
+	/* shared key is out of question validate the authKey values */
 	packet_auth_key = json_get_string_value_for_key(rootObj, WD_IPC_AUTH_KEY);
 
 	if (packet_auth_key == NULL)
 	{
 		ereport(DEBUG1,
-				(errmsg("invalid json data packet"),
-				 errdetail("authentication key not found in json data")));
+				(errmsg("invalid JSON data packet"),
+				 errdetail("authentication key not found in JSON data")));
 		return false;
 	}
 
@@ -7244,7 +7573,7 @@ set_cluster_master_node(WatchdogNode * wdNode)
 	{
 		if (wdNode == NULL)
 			ereport(LOG,
-					(errmsg("unassigning the %s node \"%s\" from watchdog cluster master",
+					(errmsg("removing the %s node \"%s\" from watchdog cluster master",
 							(g_cluster.localNode == WD_MASTER_NODE) ? "local" : "remote",
 							WD_MASTER_NODE->nodeName)));
 		else
@@ -7344,3 +7673,202 @@ clear_standby_nodes_list(void)
 	g_cluster.clusterMasterInfo.standby_nodes_count = 0;
 	g_cluster.localNode->standby_nodes_count = 0;
 }
+
+static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear)
+{
+	int i;
+	for (i=0; i< g_cluster.remoteNodeCount; i++)
+	{
+		if (clear)
+		{
+			WatchdogNode* wdNode = &(g_cluster.remoteNodes[i]);
+			wdNode->missed_beacon_count = 0;
+		}
+		else
+		{
+			WDCommandNodeResult* nodeResult = &ipcCommand->nodeResults[i];
+			if (ipcCommand->commandStatus == COMMAND_IN_PROGRESS )
+				return;
+
+			if (nodeResult->cmdState == COMMAND_STATE_SENT)
+			{
+				if (nodeResult->wdNode->state == WD_STANDBY)
+				{
+					nodeResult->wdNode->missed_beacon_count++;
+					if (nodeResult->wdNode->missed_beacon_count > 1)
+						ereport(LOG,
+							(errmsg("remote node \"%s\" is not replying to our beacons",nodeResult->wdNode->nodeName),
+							 errdetail("missed beacon reply count:%d",nodeResult->wdNode->missed_beacon_count)));
+				}
+				else
+					nodeResult->wdNode->missed_beacon_count = 0;
+			}
+			if (nodeResult->cmdState == COMMAND_STATE_REPLIED)
+			{
+				if (nodeResult->wdNode->missed_beacon_count > 0)
+					ereport(LOG,
+							(errmsg("remote node \"%s\" is replying again after missing %d beacons",nodeResult->wdNode->nodeName,
+									nodeResult->wdNode->missed_beacon_count)));
+				nodeResult->wdNode->missed_beacon_count = 0;
+			}
+		}
+	}
+}
+
+#ifdef WATCHDOG_DEBUG
+/*
+ * Node down request file. In the file, each line consists of watchdog
+ * debug command.  The possible commands are same as the defines below
+ * for example to stop Pgpool-II from sending the reply to beacon messages
+ * from the master node write DO_NOT_REPLY_TO_BEACON in watchdog_debug_requests
+ *
+ *
+ * echo "DO_NOT_REPLY_TO_BEACON" > pgpool_logdir/watchdog_debug_requests
+ */
+
+typedef struct watchdog_debug_commands
+{
+	char		command[100];
+	unsigned int code;
+}			watchdog_debug_commands;
+
+unsigned int watchdog_debug_command = 0;
+
+
+#define WATCHDOG_DEBUG_FILE	"watchdog_debug_requests"
+
+#define DO_NOT_REPLY_TO_BEACON 	1
+#define DO_NOT_SEND_BEACON 		2
+#define KILL_ALL_COMMUNICATION	4
+#define KILL_ALL_RECEIVERS		8
+#define KILL_ALL_SENDERS		16
+
+
+watchdog_debug_commands wd_debug_commands[] = {
+	{"DO_NOT_REPLY_TO_BEACON", DO_NOT_REPLY_TO_BEACON},
+	{"DO_NOT_SEND_BEACON",     DO_NOT_SEND_BEACON},
+	{"KILL_ALL_COMMUNICATION", KILL_ALL_COMMUNICATION},
+	{"KILL_ALL_RECEIVERS",     KILL_ALL_RECEIVERS},
+	{"KILL_ALL_SENDERS",       KILL_ALL_SENDERS},
+	{"", 0}
+};
+
+static bool
+check_debug_request_kill_all_communication(void)
+{
+	return (watchdog_debug_command & KILL_ALL_COMMUNICATION);
+}
+static bool
+check_debug_request_kill_all_receivers(void)
+{
+	return (watchdog_debug_command & KILL_ALL_RECEIVERS);
+}
+static bool
+check_debug_request_kill_all_senders(void)
+{
+	return (watchdog_debug_command & KILL_ALL_SENDERS);
+}
+
+static bool
+check_debug_request_do_not_send_beacon(void)
+{
+	return (watchdog_debug_command & DO_NOT_SEND_BEACON);
+}
+
+static bool
+check_debug_request_do_not_reply_beacon(void)
+{
+	return (watchdog_debug_command & DO_NOT_REPLY_TO_BEACON);
+}
+/*
+ * Check watchdog debug request options file for debug commands
+ * each line should contain only one command
+ *
+ * Possible commands
+ * 		DO_NOT_REPLY_TO_BEACON
+ *		DO_NOT_SEND_BEACON
+ *		KILL_ALL_COMMUNICATION
+ *		KILL_ALL_RECEIVERS
+ *		KILL_ALL_SENDERS
+ */
+
+static void
+load_watchdog_debug_test_option(void)
+{
+	static char wd_debug_request_file[POOLMAXPATHLEN];
+	FILE	   *fd;
+	int			i;
+#define MAXLINE 128
+	char		readbuf[MAXLINE];
+
+	watchdog_debug_command = 0;
+
+	if (wd_debug_request_file[0] == '\0')
+	{
+		snprintf(wd_debug_request_file, sizeof(wd_debug_request_file),
+				 "%s/%s", pool_config->logdir, WATCHDOG_DEBUG_FILE);
+	}
+
+	fd = fopen(wd_debug_request_file, "r");
+	if (!fd)
+	{
+		ereport(DEBUG3,
+				(errmsg("load_watchdog_debug_test_option: failed to open file %s",
+						wd_debug_request_file),
+				 errdetail("\"%s\"", strerror(errno))));
+		return;
+	}
+
+	for (i = 0;; i++)
+	{
+		int cmd = 0;
+		bool valid_command = false;
+		readbuf[MAXLINE - 1] = '\0';
+		if (fgets(readbuf, MAXLINE - 1, fd) == 0)
+			break;
+		for (cmd =0 ;; cmd++)
+		{
+			if (strlen(wd_debug_commands[cmd].command) == 0 || wd_debug_commands[cmd].code == 0)
+				break;
+
+			if (strncasecmp(wd_debug_commands[cmd].command,readbuf,strlen(wd_debug_commands[cmd].command)) == 0)
+			{
+				ereport(DEBUG3,
+						(errmsg("Watchdog DEBUG COMMAND %d: \"%s\" request found",
+								cmd,wd_debug_commands[cmd].command)));
+
+				watchdog_debug_command |= wd_debug_commands[cmd].code;
+				valid_command = true;
+				break;
+			}
+		}
+		if (!valid_command)
+			ereport(WARNING,
+				(errmsg("%s file contains invalid command",
+							wd_debug_request_file),
+					 errdetail("\"%s\" not recognized", readbuf)));
+	}
+
+	fclose(fd);
+}
+#else
+/*
+ * All these command checks return false when WATCHDOG_DEBUG is
+ * not enabled
+ */
+static bool
+check_debug_request_do_not_send_beacon(void)
+{return false;}
+static bool
+check_debug_request_do_not_reply_beacon(void)
+{return false;}
+static bool
+check_debug_request_kill_all_communication(void)
+{return false;}
+static bool
+check_debug_request_kill_all_receivers(void)
+{return false;}
+static bool
+check_debug_request_kill_all_senders(void)
+{return false;}
+#endif