[pgpool-hackers: 1278] Re: pgpool zombie with wathdog 3.5
Muhammad Usama
m.usama at gmail.com
Tue Dec 29 07:32:00 JST 2015
Hi Yugo
Many thanks for the patch. Although the patch takes care of zombie
processes, but I think blocking the watchdog process at time of escalation
and de-escalation might lead to some un-expected results, because at that
time when the watchdog process will be waiting for the
escalation/de-escalation process to finish it would not be replying to
other nodes.
So can you please have a look at the attached path which uses the SIGCHILD
to handle the zombie process problem
Thanks
Best regards
Muhammad Usama
On Mon, Dec 28, 2015 at 2:11 PM, Yugo Nagata <nagata at sraoss.co.jp> wrote:
> Hi Usama,
>
> I write the patch to fix it.
>
> In current, escalation process is forked but not waited by watchdog
> process. Fix is to wait the escalation process by waitpid. That is,
> watchdog is blocked while waiting the escalation finishing.
>
> Any comment?
>
> Regards,
>
> On Mon, 21 Dec 2015 16:46:33 +0900
> Yugo Nagata <nagata at sraoss.co.jp> wrote:
>
> > Usama,
> >
> > A zombie process appeared after pgpool started with watchdog.
> >
> > $ ps aux | grep pgpool
> > ...
> > yugo-n 32156 0.0 0.0 19880 1244 pts/27 S 16:31 0:00 pgpool:
> watchdog
> > yugo-n 32165 0.0 0.0 0 0 pts/27 Z 16:31 0:00
> [pgpool] <defunct>
> > yugo-n 32166 0.0 0.0 17792 768 pts/27 S 16:31 0:00 pgpool:
> lifecheck
> > ...
> >
> > I found PID 32165 was pid of the escalation process.
> >
> > 2015-12-21 16:31:33: pid 32156: LOG: escalation process started with
> PID:32165
> > 2015-12-21 16:31:33: pid 32165: LOG: watchdog: escalation started
> >
> > I can reproduce this on Ubuntu but not CentOS.
> >
> > Regarads,
> > --
> > Yugo Nagata <nagata at sraoss.co.jp>
> > _______________________________________________
> > pgpool-hackers mailing list
> > pgpool-hackers at pgpool.net
> > http://www.pgpool.net/mailman/listinfo/pgpool-hackers
>
>
> --
> Yugo Nagata <nagata at sraoss.co.jp>
>
> _______________________________________________
> pgpool-hackers mailing list
> pgpool-hackers at pgpool.net
> http://www.pgpool.net/mailman/listinfo/pgpool-hackers
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.sraoss.jp/pipermail/pgpool-hackers/attachments/20151229/972ea1d3/attachment-0001.html>
-------------- next part --------------
diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c
index cf443ba..2763a3c 100644
--- a/src/watchdog/watchdog.c
+++ b/src/watchdog/watchdog.c
@@ -65,6 +65,12 @@ typedef enum IPC_CMD_PREOCESS_RES
#define MIN_SECS_CONNECTION_RETRY 10 /* Time in seconds to
* retry connection with
* node once it was failed */
+
+#define MAX_SECS_ESC_PROC_EXIT_WAIT 5 /* maximum amount of seconds to
+ * wait for escalation/de-esclation process to
+ * exit normaly before moving on
+ */
+
#define BEACON_MESSAGE_INTERVAL_SECONDS 10 /* interval between beacon messages */
@@ -255,6 +261,7 @@ typedef struct wd_cluster
WDCommandData currentCommand;
unsigned int nextCommandID;
pid_t escalation_pid;
+ pid_t de_escalation_pid;
int command_server_sock;
int network_monitor_sock;
bool holding_vip;
@@ -272,9 +279,11 @@ typedef struct wd_cluster
}wd_cluster;
volatile sig_atomic_t reload_config_signal = 0;
+volatile sig_atomic_t sigchld_request = 0;
-static void check_config_reload(void);
-static RETSIGTYPE reload_config_handler(int sig);
+static void check_signals(void);
+static void wd_child_signal_handler(void);
+static RETSIGTYPE watchdog_signal_handler(int sig);
static void FileUnlink(int code, Datum path);
static void wd_child_exit(int exit_signo);
@@ -358,6 +367,7 @@ static JsonNode* get_node_list_json(int id);
static bool add_nodeinfo_to_json(JsonNode* jNode, WatchdogNode* node);
static bool fire_node_status_event(int nodeID, int nodeStatus);
static void resign_from_escalated_node(void);
+static void start_escalated_node(void);
static void init_wd_packet(WDPacketData* pkt);
static bool wd_commands_packet_processor(WD_EVENTS event, WatchdogNode* wdNode, WDPacketData* pkt);
@@ -502,6 +512,7 @@ static void wd_cluster_initialize(void)
g_cluster.clusterInitialized = false;
g_cluster.holding_vip = false;
g_cluster.escalation_pid = 0;
+ g_cluster.de_escalation_pid = 0;
g_cluster.unidentified_socks = NULL;
g_cluster.command_server_sock = 0;
g_cluster.notify_clients = NULL;
@@ -750,15 +761,18 @@ static bool connect_to_node(WatchdogNode* wdNode)
return (wdNode->client_socket.sock_state != WD_SOCK_ERROR);
}
-/* SIGHUP handler */
-static RETSIGTYPE reload_config_handler(int sig)
+/* signal handler for SIGHUP and SIGCHILD handler */
+static RETSIGTYPE watchdog_signal_handler(int sig)
{
- reload_config_signal = 1;
+ if (sig == SIGHUP)
+ reload_config_signal = 1;
+ else if (sig == SIGCHLD)
+ sigchld_request = 1;
}
-static void check_config_reload(void)
+static void check_signals(void)
{
- /* reload config file */
+ /* reload config file signal? */
if (reload_config_signal)
{
MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
@@ -766,6 +780,10 @@ static void check_config_reload(void)
MemoryContextSwitchTo(oldContext);
reload_config_signal = 0;
}
+ else if (sigchld_request)
+ {
+ wd_child_signal_handler();
+ }
}
@@ -815,8 +833,8 @@ watchdog_main(void)
pool_signal(SIGTERM, wd_child_exit);
pool_signal(SIGINT, wd_child_exit);
pool_signal(SIGQUIT, wd_child_exit);
- pool_signal(SIGHUP, reload_config_handler);
- pool_signal(SIGCHLD, SIG_DFL);
+ pool_signal(SIGHUP, watchdog_signal_handler);
+ pool_signal(SIGCHLD, watchdog_signal_handler);
pool_signal(SIGUSR1, SIG_IGN);
pool_signal(SIGUSR2, SIG_IGN);
pool_signal(SIGPIPE, SIG_IGN);
@@ -880,7 +898,7 @@ watchdog_main(void)
MemoryContextSwitchTo(ProcessLoopContext);
MemoryContextResetAndDeleteChildren(ProcessLoopContext);
- check_config_reload();
+ check_signals();
fd_max = prepare_fds(&rmask,&wmask,&emask);
tv.tv_sec = select_timeout;
@@ -2121,7 +2139,6 @@ static int set_local_node_state(WD_STATES newState)
}
-
static void
wd_child_exit(int exit_signo)
{
@@ -2130,11 +2147,63 @@ wd_child_exit(int exit_signo)
sigaddset(&mask, SIGTERM);
sigaddset(&mask, SIGINT);
sigaddset(&mask, SIGQUIT);
- sigaddset(&mask, SIGCHLD);
sigprocmask(SIG_BLOCK, &mask, NULL);
exit(0);
}
+static void wd_child_signal_handler(void)
+{
+ pid_t pid;
+ int status;
+
+ ereport(DEBUG1,
+ (errmsg("watchdog child signal handler")));
+
+ /* clear SIGCHLD request */
+ sigchld_request = 0;
+
+ while ((pid = pool_waitpid(&status)) > 0)
+ {
+ char *exiting_process_name;
+
+ if (g_cluster.de_escalation_pid == pid)
+ {
+ exiting_process_name = "de-escalation";
+ g_cluster.de_escalation_pid = 0;
+ }
+ else if (g_cluster.escalation_pid == pid)
+ {
+ exiting_process_name = "escalation";
+ g_cluster.escalation_pid = 0;
+ }
+ else
+ exiting_process_name = "unknown";
+
+ if(WIFEXITED(status))
+ {
+ if(WEXITSTATUS(status) == POOL_EXIT_FATAL)
+ ereport(LOG,
+ (errmsg("watchdog %s process with pid: %d exit with FATAL ERROR.",exiting_process_name, pid)));
+ else if(WEXITSTATUS(status) == POOL_EXIT_SUCCESS)
+ ereport(LOG,
+ (errmsg("watchdog %s process with pid: %d exit with SUCCESS.",exiting_process_name, pid)));
+ }
+ if (WIFSIGNALED(status))
+ {
+ /* Child terminated by segmentation fault. Report it */
+ if(WTERMSIG(status) == SIGSEGV)
+ ereport(WARNING,
+ (errmsg("watchdog %s process with pid: %d was terminated by segmentation fault",exiting_process_name,pid)));
+ else
+ ereport(LOG,
+ (errmsg("watchdog %s process with pid: %d exits with status %d by signal %d",exiting_process_name, pid, status, WTERMSIG(status))));
+ }
+ else
+ ereport(LOG,
+ (errmsg("watchdog %s process with pid: %d exits with status %d",exiting_process_name,pid, status)));
+ }
+}
+
/* Function invoked when watchdog process is about to exit */
static void wd_system_will_go_down(int code, Datum arg)
{
@@ -2158,6 +2227,15 @@ static void wd_system_will_go_down(int code, Datum arg)
/* close network monitoring socket */
if (g_cluster.network_monitor_sock > 0)
close(g_cluster.network_monitor_sock);
+ /* wait for sub-processes to exit */
+ if (g_cluster.de_escalation_pid > 0 || g_cluster.escalation_pid > 0)
+ {
+ pid_t wpid;
+ do
+ {
+ wpid = wait(NULL);
+ }while (wpid > 0 || (wpid == -1 && errno == EINTR));
+ }
}
static void close_socket_connection(SocketConnection* conn)
@@ -3822,21 +3900,7 @@ static int watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode* wdN
{
ereport(LOG,
(errmsg("I am the cluster leader node. Starting escalation process")));
- g_cluster.escalation_pid = fork_escalation_process();
- if (g_cluster.escalation_pid > 0)
- {
- g_cluster.escalated = true;
- ereport(LOG,
- (errmsg("escalation process started with PID:%d",g_cluster.escalation_pid)));
- if (strlen(g_cluster.localNode->delegate_ip) > 0)
- g_cluster.holding_vip = true;
- }
- else
- {
- g_cluster.escalated = false;
- ereport(LOG,
- (errmsg("failed to start escalation process")));
- }
+ start_escalated_node();
}
}
else
@@ -3975,26 +4039,9 @@ static int watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode* wdN
if (g_cluster.quorum_status >= 0)
{
ereport(LOG,
- (errmsg("quorum is complete after node \"%s\" joined the cluster",wdNode->nodeName),
+ (errmsg("quorum is complete after node \"%s\" joined the cluster",wdNode->nodeName),
errdetail("starting escalation process")));
-
- g_cluster.escalation_pid = fork_escalation_process();
- if (g_cluster.escalation_pid > 0)
- {
- g_cluster.escalated = true;
- ereport(LOG,
- (errmsg("escalation process started with PID:%d",g_cluster.escalation_pid)));
- if (strlen(g_cluster.localNode->delegate_ip) > 0)
- g_cluster.holding_vip = true;
- else
- g_cluster.holding_vip = false;
- }
- else
- {
- g_cluster.escalated = false;
- ereport(LOG,
- (errmsg("failed to start escalation process")));
- }
+ start_escalated_node();
}
}
}
@@ -4038,21 +4085,7 @@ static int watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode* wdN
ereport(LOG,
(errmsg("quorum is complete after node \"%s\" joined the cluster",wdNode->nodeName),
errdetail("starting escalation process")));
- g_cluster.escalation_pid = fork_escalation_process();
- if (g_cluster.escalation_pid > 0)
- {
- g_cluster.escalated = true;
- ereport(LOG,
- (errmsg("escalation process started with PID:%d",g_cluster.escalation_pid)));
- if (strlen(g_cluster.localNode->delegate_ip) > 0)
- g_cluster.holding_vip = true;
- }
- else
- {
- g_cluster.escalated = false;
- ereport(LOG,
- (errmsg("failed to start escalation process")));
- }
+ start_escalated_node();
}
}
}
@@ -4131,12 +4164,71 @@ static int watchdog_state_machine_nw_error(WD_EVENTS event, WatchdogNode* wdNode
return 0;
}
+static void start_escalated_node(void)
+{
+ int wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
+ if (g_cluster.escalated == true) /* already escalated */
+ return;
+
+ while (g_cluster.de_escalation_pid > 0 && wait_secs-- > 0)
+ {
+ /*
+ * de_escalation proceess was already running and we are
+ * esclating again.
+ * give some time to de-escalation process to exit normaly
+ */
+ ereport(LOG,
+ (errmsg("waiting for de-escalation process to exit before starting escalation")));
+ if (sigchld_request)
+ wd_child_signal_handler();
+ sleep (1);
+ }
+ if (g_cluster.de_escalation_pid > 0)
+ ereport(LOG,
+ (errmsg("de-escalation process does not exited in time."),
+ errdetail("starting the escalation anyway")));
+
+ g_cluster.escalation_pid = fork_escalation_process();
+ if (g_cluster.escalation_pid > 0)
+ {
+ g_cluster.escalated = true;
+ ereport(LOG,
+ (errmsg("escalation process started with PID:%d",g_cluster.escalation_pid)));
+ if (strlen(g_cluster.localNode->delegate_ip) > 0)
+ g_cluster.holding_vip = true;
+ }
+ else
+ {
+ g_cluster.escalated = false;
+ ereport(LOG,
+ (errmsg("failed to start escalation process")));
+ }
+}
+
static void resign_from_escalated_node(void)
{
+ int wait_secs = MAX_SECS_ESC_PROC_EXIT_WAIT;
if (g_cluster.escalated == false)
return;
- fork_plunging_process();
+ while (g_cluster.escalation_pid > 0 && wait_secs-- > 0)
+ {
+ /*
+ * escalation proceess was already running and we are
+ * resigning from it.
+ * wait for the escalation process to exit normaly
+ */
+ ereport(LOG,
+ (errmsg("waiting for escalation process to exit before starting de-escalation")));
+ if (sigchld_request)
+ wd_child_signal_handler();
+ sleep (1);
+ }
+ if (g_cluster.escalation_pid > 0)
+ ereport(LOG,
+ (errmsg("escalation process does not exited in time"),
+ errdetail("starting the de-escalation anyway")));
+ g_cluster.de_escalation_pid = fork_plunging_process();
g_cluster.holding_vip = false;
g_cluster.escalated = false;
}
More information about the pgpool-hackers
mailing list