diff --git a/doc.ja/src/sgml/failover.sgml b/doc.ja/src/sgml/failover.sgml index 630f240..13f5151 100644 --- a/doc.ja/src/sgml/failover.sgml +++ b/doc.ja/src/sgml/failover.sgml @@ -819,6 +819,74 @@ GRANT pg_monitor TO sr_check_user; + + auto_failback (boolean) + + auto_failback 設定パラメータ + + + + + +onに設定した場合、ストリーミングレプリケーションが正常に動作しており、かつバックエンドノードのステータスがダウンのとき、スタンバイノードを自動で復帰させることができます。 +これは、一時的なネットワーク障害などによりスタンバイノードが認識できず縮退が行われた場合などに役立ちます。 + + + + +この機能を使用するためには、ストリーミングレプリケーションのチェックが有効であり、バックエンドノードとしてPostgreSQLが9.6以降である必要があります。 +プライマリノードのpg_stat_replicationを使用しており、自動フェイルバックはスタンバイノードに対してのみ実行されます。 +メンテナンスなどで、一時的にスタンバイノードを切り離す場合は、このパラメータをOFFにしてから実施してください。 +意図せずスタンバイノードが復帰してしまう可能性があります。 + + + +デフォルトはoffです。 +このパラメータはPgpool-IIの設定を再読み込みすることで変更可能です。 + + + + + + auto_failback_interval (integer) + + auto_failback_interval 設定パラメータ + + + + + +自動フェイルバックの実行間隔の最小時間を秒単位で指定します。 +次の自動フェイルバックは前回の自動フェイルバックから指定した時間経過するまで実行されません。 +ネットワークのエラーなどによりPgpool-IIが頻繁にバックエンドのDOWNを検出するような場合、大きい値を設定することでフェイルバックとフェイルオーバが繰り返される事を防ぐことができます。 +デフォルトは60です。 +0を指定すると自動フェイルバックは待ちません。 +このパラメータはPgpool-IIの設定を再読み込みすることで変更可能です。 + + + + diff --git a/doc/src/sgml/failover.sgml b/doc/src/sgml/failover.sgml index 5c96496..765db4d 100644 --- a/doc/src/sgml/failover.sgml +++ b/doc/src/sgml/failover.sgml @@ -618,6 +618,52 @@ GRANT pg_monitor TO sr_check_user; + + auto_failback (boolean) + + auto_failback configuration parameter + + + + + When set to on, a standby node is automatically failed back if its node status + is down but streaming replication works normally. This is useful when + a standby node is degenerated by pgpool because of a temporary network failure. + + + + To use this feature, streaming replication check + must be enabled, and PostgreSQL 9.1 or later + is required as backend nodes. This feature uses pg_stat_replication + on the primary node; the automatic failback is performed on standby nodes only. + Note that failback_command will be executed as well if failback_command is not empty. 
+ If you plan to detach a standby node for maintenance, set this parameter to off beforehand. + Otherwise the standby node may be reattached against your intention. + + + The default is off. This parameter can be changed by reloading the Pgpool-II configurations. + + + + + + auto_failback_interval (integer) + + auto_failback_interval configuration parameter + + + + + Specifies the minimum interval, in seconds, between executions of auto failback. + The next auto failback won't execute until the specified time has passed + after the previous auto failback. When Pgpool-II frequently detects + backend down because of network errors, for example, you may avoid repeating + failover and failback by setting this parameter to a large enough value. + The default is 60. Setting this parameter to 0 means that auto failback doesn't wait. + + + + diff --git a/src/config/pool_config_variables.c b/src/config/pool_config_variables.c index e10b793..f291f40 100644 --- a/src/config/pool_config_variables.c +++ b/src/config/pool_config_variables.c @@ -597,6 +597,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"auto_failback", CFGCXT_RELOAD, FAILOVER_CONFIG, + "Enables nodes automatically reattach, when dettached node continue streaming replication.", + CONFIG_VAR_TYPE_BOOL, false, 0 + }, + &g_pool_config.auto_failback, + false, + NULL, NULL, NULL + }, + /* End-of-list marker */ EMPTY_CONFIG_BOOL @@ -1895,6 +1905,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"auto_failback_interval", CFGCXT_RELOAD, FAILOVER_CONFIG, + "min interval of executing auto_failback in seconds", + CONFIG_VAR_TYPE_INT, false, 0 + }, + &g_pool_config.auto_failback_interval, + 60, + 0, INT_MAX, + NULL, NULL, NULL + }, + /* End-of-list marker */ EMPTY_CONFIG_INT }; diff --git a/src/include/pool_config.h b/src/include/pool_config.h index 4f7ad70..dab3fa3 100644 --- a/src/include/pool_config.h +++ b/src/include/pool_config.h @@ -229,6 +229,10 @@ 
typedef struct * false, just abort the * transaction to keep * the consistency. */ + bool auto_failback; /* If true, backend node reattach, + * when backend node detached and + * replication_status is 'stream' */ + int auto_failback_interval; /* min interval of executing auto_failback */ bool replicate_select; /* replicate SELECT statement when load * balancing is disabled. */ char **reset_query_list; /* comma separated list of queries to be diff --git a/src/main/health_check.c b/src/main/health_check.c index 9e67a02..c084d55 100644 --- a/src/main/health_check.c +++ b/src/main/health_check.c @@ -66,6 +66,7 @@ char remote_ps_data[NI_MAXHOST]; /* used for set_ps_display */ static POOL_CONNECTION_POOL_SLOT * slot; static volatile sig_atomic_t reload_config_request = 0; static volatile sig_atomic_t restart_request = 0; +static time_t auto_failback_interval = 0; /* resume time of auto_failback */ static bool establish_persistent_connection(int node); static void discard_persistent_connection(int node); static RETSIGTYPE my_signal_handler(int sig); @@ -243,6 +244,8 @@ establish_persistent_connection(int node) { BackendInfo *bkinfo; int retry_cnt; + bool check_failback = false; + time_t now; bkinfo = pool_get_node_info(node); @@ -254,7 +257,20 @@ establish_persistent_connection(int node) */ if (bkinfo->backend_status == CON_UNUSED || (bkinfo->backend_status == CON_DOWN && bkinfo->quarantine == false)) - return false; + { + /* get current time to use auto_faliback_interval */ + now = time(NULL); + + if (pool_config->auto_failback && auto_failback_interval < now && + STREAM && !strcmp(bkinfo->replication_state, "streaming") && !Req_info->switching) + { + ereport(DEBUG1, + (errmsg("health check DB node: %d (status:%d) for auto_failback", node, bkinfo->backend_status))); + check_failback = true; + } + else + return false; + } /* * If database is not specified, "postgres" database is assumed. 
@@ -326,6 +342,21 @@ establish_persistent_connection(int node) if (password) pfree(password); + if (check_failback && !Req_info->switching && slot) + { + ereport(LOG, + (errmsg("request auto failback, node id:%d", node))); + /* get current time to use auto_faliback_interval */ + now = time(NULL); + auto_failback_interval = now + pool_config->auto_failback_interval; + + send_failback_request(node, true, REQ_DETAIL_CONFIRMED); + } + } + /* if check_failback is true, backend_status is DOWN or UNUSED. */ + if (check_failback) + { + return false; } return true; } diff --git a/src/protocol/child.c b/src/protocol/child.c index ecc797b..6c1d72c 100644 --- a/src/protocol/child.c +++ b/src/protocol/child.c @@ -1819,7 +1819,7 @@ check_restart_request(void) if (pool_get_my_process_info()->need_to_restart) { ereport(LOG, - (errmsg("failback event detected"), + (errmsg("failover or failback event detected"), errdetail("restarting myself"))); pool_get_my_process_info()->need_to_restart = 0; @@ -2294,7 +2294,7 @@ retry_startup: { ereport(LOG, (errmsg("selecting backend connection"), - errdetail("failback event detected, discarding existing connections"))); + errdetail("failover or failback event detected, discarding existing connections"))); pool_get_my_process_info()->need_to_restart = 0; close_idle_connection(0); diff --git a/src/sample/pgpool.conf.sample b/src/sample/pgpool.conf.sample index b6176b7..65d016f 100644 --- a/src/sample/pgpool.conf.sample +++ b/src/sample/pgpool.conf.sample @@ -507,6 +507,13 @@ search_primary_node_timeout = 300 # 0 means no timeout, keep searching # for a primary node forever. +auto_failback = off + # Dettached backend node reattach automatically + # if replication_state is 'streaming'. +auto_failback_interval = 60 + # Min interval of executing auto_failback in + # seconds. 
+ #------------------------------------------------------------------------------ # ONLINE RECOVERY #------------------------------------------------------------------------------ diff --git a/src/sample/pgpool.conf.sample-logical b/src/sample/pgpool.conf.sample-logical index ecd4226..8f6caa9 100644 --- a/src/sample/pgpool.conf.sample-logical +++ b/src/sample/pgpool.conf.sample-logical @@ -491,6 +491,13 @@ search_primary_node_timeout = 300 # 0 means no timeout, keep searching # for a primary node forever. +auto_failback = off + # Dettached backend node reattach automatically + # if replication_state is 'streaming'. +auto_failback_interval = 60 + # Min interval of executing auto_failback in + # seconds. + #------------------------------------------------------------------------------ # ONLINE RECOVERY #------------------------------------------------------------------------------ diff --git a/src/sample/pgpool.conf.sample-master-slave b/src/sample/pgpool.conf.sample-master-slave index 6e8860c..99725d9 100644 --- a/src/sample/pgpool.conf.sample-master-slave +++ b/src/sample/pgpool.conf.sample-master-slave @@ -503,6 +503,13 @@ search_primary_node_timeout = 300 # 0 means no timeout, keep searching # for a primary node forever. +auto_failback = off + # Dettached backend node reattach automatically + # if replication_state is 'streaming'. +auto_failback_interval = 60 + # Min interval of executing auto_failback in + # seconds. + #------------------------------------------------------------------------------ # ONLINE RECOVERY #------------------------------------------------------------------------------ diff --git a/src/sample/pgpool.conf.sample-replication b/src/sample/pgpool.conf.sample-replication index 9f9881a..7a1913a 100644 --- a/src/sample/pgpool.conf.sample-replication +++ b/src/sample/pgpool.conf.sample-replication @@ -501,6 +501,13 @@ search_primary_node_timeout = 300 # 0 means no timeout, keep searching # for a primary node forever. 
+auto_failback = off + # Dettached backend node reattach automatically + # if replication_state is 'streaming'. +auto_failback_interval = 60 + # Min interval of executing auto_failback in + # seconds. + #------------------------------------------------------------------------------ # ONLINE RECOVERY #------------------------------------------------------------------------------ diff --git a/src/sample/pgpool.conf.sample-stream b/src/sample/pgpool.conf.sample-stream index 0f3e45b..c82de21 100644 --- a/src/sample/pgpool.conf.sample-stream +++ b/src/sample/pgpool.conf.sample-stream @@ -531,6 +531,12 @@ client_idle_limit_in_recovery = 0 # 0 means no disconnection # -1 means immediate disconnection +auto_failback = off + # Dettached backend node reattach automatically + # if replication_state is 'streaming'. +auto_failback_interval = 60 + # Min interval of executing auto_failback in + # seconds. #------------------------------------------------------------------------------ # WATCHDOG diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c index 40096ea..f731419 100644 --- a/src/streaming_replication/pool_worker_child.c +++ b/src/streaming_replication/pool_worker_child.c @@ -308,7 +308,7 @@ check_replication_time_lag(void) active_nodes++; } - if (active_nodes <= 1) + if (active_nodes <= 1 && !pool_config->auto_failback) { /* * If there's only one or less active node, there's no point to do @@ -403,8 +403,6 @@ check_replication_time_lag(void) if (i == PRIMARY_NODE_ID) continue; - if (!VALID_BACKEND(i)) - continue; if (*stat_rep_query == '\0') continue; @@ -428,7 +426,7 @@ check_replication_time_lag(void) strlcpy(bkinfo->replication_state, s, NAMEDATALEN); s = res_rep->data[1]? 
res_rep->data[2] : ""; strlcpy(bkinfo->replication_sync_state, s, NAMEDATALEN); - free_select_result(res_rep); + free_select_result(res_rep); } pfree(query_buf); } @@ -601,26 +599,6 @@ get_query_result(POOL_CONNECTION_POOL_SLOT * *slots, int backend_id, char *query return sts; } -/* - if ((*res)->data[0] == NULL) - { - free_select_result(*res); - ereport(LOG, - (errmsg("get_query_result: no rows returned"), - errdetail("node id (%d)", backend_id))); - return sts; - } - - - if ((*res)->nullflags[0] == -1) - { - free_select_result(*res); - ereport(LOG, - (errmsg("get_query_result: NULL data returned"), - errdetail("node id (%d)", backend_id))); - return sts; - } -*/ sts = 0; return sts; } diff --git a/src/test/regression/clean.sh b/src/test/regression/clean.sh index 6b73a95..cfbc7b8 100644 --- a/src/test/regression/clean.sh +++ b/src/test/regression/clean.sh @@ -3,6 +3,7 @@ dir=`pwd` export TESTLIBS=$dir/libs.sh export PGPOOL_SETUP=$HOME/bin/pgpool_setup +export WATCHDOG_SETUP=$HOME/bin/watchdog_setup log=$dir/log rm -fr $log diff --git a/src/test/regression/regress.sh b/src/test/regression/regress.sh index 32f71c6..a891c2a 100755 --- a/src/test/regression/regress.sh +++ b/src/test/regression/regress.sh @@ -48,6 +48,9 @@ function install_pgpool echo "moving pgpool_setup to temporary installation path ..." cp $dir/../pgpool_setup ${PGPOOL_PATH}/pgpool_setup export PGPOOL_SETUP=$PGPOOL_PATH/pgpool_setup + echo "moving watchdog_setup to temporary installation path ..." 
+ cp $dir/../watchdog_setup ${PGPOOL_PATH}/watchdog_setup + export WATCHDOG_SETUP=$PGPOOL_PATH/watchdog_setup } function verify_pginstallation @@ -73,6 +76,7 @@ function export_env_vars # check if pgpool is in the path PGPOOL_PATH=/usr/local export PGPOOL_SETUP=$HOME/bin/pgpool_setup + export WATCHDOG_SETUP=$HOME/bin/watchdog_setup fi if [[ -z "$PGBENCH_PATH" ]]; then @@ -163,6 +167,7 @@ elif [ "$MODE" = "noinstall" ]; then PGPOOL_PATH=$PGPOOL_INSTALL_PATH fi export PGPOOL_SETUP=$dir/../pgpool_setup + export WATCHDOG_SETUP=$dir/../watchdog_setup else echo $MODE : Invalid mode exit -1 diff --git a/src/utils/pool_process_reporting.c b/src/utils/pool_process_reporting.c index 7be0a47..10d0ec1 100644 --- a/src/utils/pool_process_reporting.c +++ b/src/utils/pool_process_reporting.c @@ -645,6 +645,16 @@ get_config(int *nrows) StrNCpy(status[i].desc, "detach false primary", POOLCONFIG_MAXDESCLEN); i++; + StrNCpy(status[i].name, "auto_failback", POOLCONFIG_MAXNAMELEN); + snprintf(status[i].value, POOLCONFIG_MAXVALLEN, "%d", pool_config->auto_failback); + StrNCpy(status[i].desc, "auto_failback", POOLCONFIG_MAXDESCLEN); + i++; + + StrNCpy(status[i].name, "auto_failback_interval", POOLCONFIG_MAXNAMELEN); + snprintf(status[i].value, POOLCONFIG_MAXVALLEN, "%d", pool_config->auto_failback_interval); + StrNCpy(status[i].desc, "auto_failback_interval", POOLCONFIG_MAXDESCLEN); + i++; + /* ONLINE RECOVERY */ StrNCpy(status[i].name, "recovery_user", POOLCONFIG_MAXNAMELEN);