[pgpool-general: 1997] Re: Problem with watchdog...

Fernando Buzon fbuzon at creddefense.com
Tue Aug 6 06:08:16 JST 2013


OK thanks a lot again!
Below are both config files (pgpool.conf):

======================================================pgpool-01============================================
listen_addresses = '*'
port = 5432
socket_dir = '/tmp'

pcp_port = 9898
pcp_socket_dir = '/tmp'

backend_hostname0 = '10.0.0.250'
backend_port0 = 5432
backend_weight0 = 1
backend_data_directory0 = '/var/lib/postgresql/9.1/main/'
backend_flag0 = 'ALLOW_TO_FAILOVER'
backend_hostname1 = '10.0.1.54'
backend_port1 = 5432
backend_weight1 = 1000
backend_data_directory1 = '/var/lib/postgresql/9.1/main/'
backend_flag1 = 'ALLOW_TO_FAILOVER'

enable_pool_hba = on
pool_passwd = 'pool_passwd'
authentication_timeout = 60

ssl = off

num_init_children = 32
max_pool = 4

child_life_time = 300
child_max_connections = 0
connection_life_time = 0
client_idle_limit = 0

log_destination = 'stderr'

print_timestamp = on

log_connections = off
log_hostname = off
log_statement = off
log_per_node_statement = off
log_standby_delay = 'none'

syslog_facility = 'LOCAL0'
syslog_ident = 'pgpool'

debug_level = 0

pid_file_name = '/var/run/pgpool/pgpool.pid'
logdir = '/tmp'

connection_cache = on

reset_query_list = 'ABORT; DISCARD ALL'

replication_mode = off
replicate_select = off

replication_mode = off
replicate_select = off

insert_lock = on
lobj_lock_table = ''

replication_stop_on_mismatch = off

failover_if_affected_tuples_mismatch = off

load_balance_mode = off
ignore_leading_white_space = on
white_function_list = ''
black_function_list = 'nextval,setval'

master_slave_mode = on
master_slave_sub_mode = 'stream'

sr_check_period = 0
sr_check_user = 'postgres'
sr_check_password = 'protected'
delay_threshold = 0

follow_master_command = ''

parallel_mode = off
pgpool2_hostname = ''

system_db_hostname  = 'localhost'
system_db_port = 5432
system_db_dbname = 'pgpool'
system_db_schema = 'pgpool_catalog'
system_db_user = 'pgpool'
system_db_password = ''

health_check_period = 1
health_check_timeout = 20
health_check_user = 'postgres'
health_check_password = 'protected'
health_check_max_retries = 0
health_check_retry_delay = 1

failover_command = '/usr/local/bin/failover_stream.sh %d %H /tmp/trigger_file'
failback_command = ''

fail_over_on_backend_error = on

recovery_user = 'postgres'
recovery_password = ''
recovery_1st_stage_command = ''
recovery_2nd_stage_command = ''
recovery_timeout = 90
client_idle_limit_in_recovery = 0

use_watchdog = on
trusted_servers = ''
delegate_IP = '10.11.12.13'
wd_hostname = '10.0.0.21'
wd_port = 9000
wd_interval = 10
ping_path = '/bin'
ifconfig_path = '/sbin'
if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
if_down_cmd = 'ifconfig eth0:0 down'

arping_cmd = 'arping -U $_IP_$ -w 1'
wd_life_point = 3
wd_lifecheck_query = 'SELECT 1'

other_pgpool_hostname0 = '10.0.1.21'
other_pgpool_port0 = 9999
other_wd_port0 = 9000

relcache_expire = 0

relcache_size = 256

check_temp_table = on

memory_cache_enabled = off
memqcache_method = 'shmem'
memqcache_memcached_host = 'localhost'
memqcache_memcached_port = 11211
memqcache_total_size = 67108864
memqcache_max_num_cache = 1000000
memqcache_expire = 0
memqcache_auto_cache_invalidation = on
memqcache_maxcache = 409600
memqcache_cache_block_size = 1048576
memqcache_oiddir = '/var/log/pgpool/oiddir'
white_memqcache_table_list = ''
black_memqcache_table_list = ''
=========================================================FIM===============================================

======================================================pgpool-02============================================
listen_addresses = '*'
port = 5432
socket_dir = '/tmp'

pcp_port = 9898
pcp_socket_dir = '/tmp'

backend_hostname0 = '10.0.0.250'
backend_port0 = 5432
backend_weight0 = 1
backend_data_directory0 = '/var/lib/postgresql/9.1/main/'
backend_flag0 = 'ALLOW_TO_FAILOVER'
backend_hostname1 = '10.0.1.54'
backend_port1 = 5432
backend_weight1 = 1000
backend_data_directory1 = '/var/lib/postgresql/9.1/main/'
backend_flag1 = 'ALLOW_TO_FAILOVER'

enable_pool_hba = on
pool_passwd = 'pool_passwd'
authentication_timeout = 60

ssl = off

num_init_children = 32
max_pool = 4

child_life_time = 300
child_max_connections = 0
connection_life_time = 0
client_idle_limit = 0

log_destination = 'stderr'

print_timestamp = on

log_connections = off
log_hostname = off
log_statement = off
log_per_node_statement = off
log_standby_delay = 'none'

syslog_facility = 'LOCAL0'
syslog_ident = 'pgpool'

debug_level = 0

pid_file_name = '/var/run/pgpool/pgpool.pid'
logdir = '/tmp'

connection_cache = on

reset_query_list = 'ABORT; DISCARD ALL'

replication_mode = off
replicate_select = off

insert_lock = on
lobj_lock_table = ''

replication_stop_on_mismatch = off

failover_if_affected_tuples_mismatch = off

load_balance_mode = off
ignore_leading_white_space = on
white_function_list = ''
black_function_list = 'nextval,setval'

master_slave_mode = on
master_slave_sub_mode = 'stream'

sr_check_period = 0
sr_check_user = 'postgres'
sr_check_password = 'protected'
delay_threshold = 0

follow_master_command = ''

parallel_mode = off
pgpool2_hostname = ''

system_db_hostname  = 'localhost'
system_db_port = 5432
system_db_dbname = 'pgpool'
system_db_schema = 'pgpool_catalog'
system_db_user = 'pgpool'
system_db_password = ''

health_check_period = 1
health_check_timeout = 20
health_check_user = 'postgres'
health_check_password = 'protected'
health_check_max_retries = 0
health_check_retry_delay = 1

failover_command = '/usr/local/bin/failover_stream.sh %d %H /tmp/trigger_file'
failback_command = ''

fail_over_on_backend_error = on

recovery_user = 'postgres'
recovery_password = ''
recovery_1st_stage_command = ''
recovery_2nd_stage_command = ''
recovery_timeout = 90
client_idle_limit_in_recovery = 0

use_watchdog = on
trusted_servers = ''
delegate_IP = '10.11.12.13'
wd_hostname = '10.0.1.21'
wd_port = 9000
wd_interval = 10
ping_path = '/bin'
ifconfig_path = '/sbin'
if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
if_down_cmd = 'ifconfig eth0:0 down'

arping_cmd = 'arping -U $_IP_$ -w 1'
wd_life_point = 3
wd_lifecheck_query = 'SELECT 1'

other_pgpool_hostname0 = '10.0.0.21'
other_pgpool_port0 = 9999
other_wd_port0 = 9000

relcache_expire = 0

relcache_size = 256

check_temp_table = on

memory_cache_enabled = off
memqcache_method = 'shmem'
memqcache_memcached_host = 'localhost'
memqcache_memcached_port = 11211
memqcache_total_size = 67108864
memqcache_max_num_cache = 1000000
memqcache_expire = 0
memqcache_auto_cache_invalidation = on
memqcache_maxcache = 409600
memqcache_cache_block_size = 1048576
memqcache_oiddir = '/var/log/pgpool/oiddir'
white_memqcache_table_list = ''
black_memqcache_table_list = ''
=========================================================FIM===============================================


01 - I start the pgpool-01:
/usr/local/bin/pgpool -n

log:
2013-08-05 16:11:55 LOG:   pid 17526: wd_chk_sticky: ifup[/sbin/ifconfig]
doesn't have sticky bit
2013-08-05 16:11:55 LOG:   pid 17526: wd_create_send_socket: connect()
reports failure (Connection refused). You can safely ignore this while
starting up.
2013-08-05 16:12:07 LOG:   pid 17526: wd_escalation: escalated to master
pgpool
2013-08-05 16:12:07 LOG:   pid 17526: wd_create_send_socket: connect()
reports failure (Connection refused). You can safely ignore this while
starting up.
2013-08-05 16:12:07 LOG:   pid 17526: wd_escalation:  escalated to
delegate_IP holder
2013-08-05 16:12:07 LOG:   pid 17526: wd_init: start watchdog
2013-08-05 16:12:07 LOG:   pid 17526: pgpool-II successfully started.
version 3.2.5 (namameboshi)
2013-08-05 16:12:07 LOG:   pid 17526: find_primary_node: primary node id is
1

ifconfig:
eth0:0    Link encap:Ethernet  HWaddr 16:45:08:ce:61:8b
          inet addr:10.11.12.13  Bcast:10.11.12.255  Mask:255.255.255.0
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          Interrupt:25

I can use the delegate IP from the application and it works fine:
fbuzon at fernando-Inspiron-7520:~$ psql -h 10.11.12.13 -U postgres -W
Password for user postgres:
psql (9.1.9)
Type "help" for help.

postgres=#


02 - I start the pgpool-02:
/usr/local/bin/pgpool -n

log:
2013-08-05 16:12:28 LOG:   pid 11382: wd_chk_sticky: ifup[/sbin/ifconfig]
doesn't have sticky bit
2013-08-05 16:12:28 LOG:   pid 11382: wd_init: start watchdog
2013-08-05 16:12:28 LOG:   pid 11382: pgpool-II successfully started.
version 3.2.5 (namameboshi)
2013-08-05 16:12:28 LOG:   pid 11382: find_primary_node: primary node id is
1
2013-08-05 16:12:29 LOG:   pid 11385: watchdog: lifecheck started

ifconfig doesn't show the delegate_IP. OK.


03 - I stop the pgpool-01:
pgpool stop
stop request sent to pgpool. waiting for termination................done.

log pgpool-01:
2013-08-05 16:13:09 LOG:   pid 17526: received smart shutdown request
2013-08-05 16:13:09 LOG:   pid 17526: watchdog_pid: 17533
2013-08-05 16:13:21 LOG:   pid 17533: wd_IP_down: ifconfig down succeeded

ifconfig no longer shows the delegate_ip. OK.

log pgpool-02:
2013-08-05 16:13:30 LOG:   pid 11384: wd_escalation: escalated to master
pgpool
2013-08-05 16:13:30 LOG:   pid 11384: wd_escalation:  escalated to
delegate_IP holder
2013-08-05 16:13:30 LOG:   pid 11384: wd_escalation: escalated to master
pgpool
2013-08-05 16:13:30 LOG:   pid 11384: wd_escalation:  escalated to
delegate_IP holder
2013-08-05 16:13:45 LOG:   pid 11385: wd_lifecheck: lifecheck failed 3
times. pgpool 1 (10.0.0.21:5432) seems not to be working
2013-08-05 16:13:55 LOG:   pid 11385: wd_lifecheck: lifecheck failed 3
times. pgpool 1 (10.0.0.21:5432) seems not to be working

ifconfig:
eth0:0    Link encap:Ethernet  HWaddr 16:45:08:e5:3f:34
          inet addr:10.11.12.13  Bcast:10.11.12.255  Mask:255.255.255.0
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          Interrupt:25

I am still using the delegate IP from the application and it works fine:
fbuzon at fernando-Inspiron-7520:~$ psql -h 10.11.12.13 -U postgres -W
Password for user postgres:
psql (9.1.9)
Type "help" for help.

postgres=#

04 - I start again the pgpool-01:
/usr/local/bin/pgpool -n

log:
2013-08-05 16:17:24 LOG:   pid 17576: wd_chk_sticky: ifup[/sbin/ifconfig]
doesn't have sticky bit
2013-08-05 16:17:24 LOG:   pid 17576: wd_init: start watchdog
2013-08-05 16:17:24 LOG:   pid 17576: pgpool-II successfully started.
version 3.2.5 (namameboshi)
2013-08-05 16:17:24 LOG:   pid 17576: find_primary_node: primary node id is
1
2013-08-05 16:17:25 LOG:   pid 17579: watchdog: lifecheck started

ifconfig no longer shows the delegate_ip. OK.


05 - I stop the pgpool-02:
pgpool stop
stop request sent to pgpool. waiting for termination................done.

log pgpool-02:
2013-08-05 16:18:53 LOG:   pid 11382: received smart shutdown request
2013-08-05 16:18:53 LOG:   pid 11382: watchdog_pid: 11385
2013-08-05 16:19:05 LOG:   pid 11385: wd_IP_down: ifconfig down succeeded

log pgpool-01:
2013-08-05 16:18:56 LOG:   pid 17578: wd_escalation: escalated to master
pgpool
2013-08-05 16:18:56 LOG:   pid 17578: wd_escalation:  escalated to
delegate_IP holder


06 - Start again the pgpool-02
log:

2013-08-05 16:21:30 LOG:   pid 11521: wd_chk_sticky: ifup[/sbin/ifconfig]
doesn't have sticky bit
2013-08-05 16:21:30 LOG:   pid 11521: wd_init: start watchdog
2013-08-05 16:21:30 LOG:   pid 11521: pgpool-II successfully started.
version 3.2.5 (namameboshi)
2013-08-05 16:21:30 LOG:   pid 11521: find_primary_node: primary node id is
1
2013-08-05 16:21:31 LOG:   pid 11524: watchdog: lifecheck started


FINAL:
As I said, everything is working nicely.
And now I have both pgpools up and working again.
The escalated pgpool is pgpool-01.
I stopped it with "killall -9 pgpool" and this time wd_lifecheck worked fine on
pgpool-02!
I don't know what the problem was earlier, but now it is working!

log on pgpool-02:
2013-08-05 17:52:42 LOG:   pid 11524: wd_lifecheck: lifecheck failed 3
times. pgpool 1 (10.0.0.21:5432) seems not to be working
2013-08-05 17:52:42 LOG:   pid 11524: wd_escalation: escalated to master
pgpool
2013-08-05 17:52:42 LOG:   pid 11524: wd_escalation:  escalated to
delegate_IP holder
2013-08-05 17:52:52 LOG:   pid 11524: wd_lifecheck: lifecheck failed 3
times. pgpool 1 (10.0.0.21:5432) seems not to be working

So only one problem remains: how do I bring down the delegate_ip on
pgpool-01?
This matters because both servers are responding to the delegate_ip.


Thanks!


2013/8/5 Jeff Frost <jeff at pgexperts.com>

>  On 08/05/13 07:51, Fernando Buzon wrote:
>
>
> 2013/8/2 Jeff Frost <jeff at pgexperts.com>
>
>>  So are you indicating that your test is:
>>
>>  killall -9 pgpool
>>
>>  on the primary pgpool server?
>>
> YES
>
>
>
>>
>>  I think I may have lost track of the sequence of events.  Maybe you
>> could lay it out as a simple test case for us?
>>
> What should I do?
>
>
>
>
> You should post a step-by-step method for reproducing your issue with
> appropriate log output after each step is completed.
>
> That will make it more obvious what's going on.
>
>
> Ex:
>
> pgpool.conf files from both nodes
> successful psql connecting to the delegate IP
> log lines indicating pgpool is up and running fine (on both hosts)
> ip addr or ifconfig output indicating which host has the delegate ip
>
> kill -9
> log output (on both hosts)
> ip addr or ifconfig output
> ......
>
> so it's obvious how to reproduce your issue (or tell you how to avoid it).
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.sraoss.jp/pipermail/pgpool-general/attachments/20130805/67a4b670/attachment-0001.html>


More information about the pgpool-general mailing list