[pgpool-general: 1047] Re: Raw failover not working as expected on pgpool-II v3.2.0

Tatsuo Ishii ishii at postgresql.org
Fri Sep 28 15:48:56 JST 2012


> replication_mode = off
> master_slave_mode = off

That's the expected behavior because you turned off both modes (this is
called "raw mode").
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese: http://www.sraoss.co.jp

> I'm experiencing an issue with pgpool-II v3.2.0 while testing raw failover capability with Postgres v8.4.1 on CentOS 5. I'm not sure if it's a configuration issue on my part or a bug.
> 
> I'm running pgpool on a Tomcat application server and want it to fail over to a secondary database server if my primary server becomes unavailable. I will set up log shipping and warm standby later, but right now I just want to confirm that failover is working. I have defined the two backend nodes, disabled all "other" modes, and enabled health checking.
> 
> The problem is that while the master node (backend_hostname0) is up, everything is fine. However, if it is taken offline, pgpool performs the expected failover actions but sets the master node id back to '0', and my database connections fail because it's unavailable. The health checking detects when nodes are up or down, so it should be shifting connections to the slave node id 1 (backend_hostname1), but the master node id is always set to node 0.
> 
> Thanks in advance for your thoughts/insight.
> 
> "show pool_nodes" when both nodes are up.
> 
> node_id |    hostname     | port | status | lb_weight |  role
> ---------+-----------------+------+--------+-----------+--------
> 0       | node01          | 5432 | 2      | 0.500000  | master
> 1       | node02          | 5432 | 1      | 0.500000  | slave
> 
> If I stop the node01 database and then try to connect with a psql session, I get errors like the ones below. It seems to me that it's not picking the correct node id when it fails over.
> 
> 2012-09-25 14:26:37 LOG:   pid 27473: postmaster on DB node 0 was shutdown by administrative command
> 2012-09-25 14:26:37 LOG:   pid 27473: degenerate_backend_set: 0 fail over request from pid 27473
> 2012-09-25 14:26:37 LOG:   pid 25992: starting degeneration. shutdown host node01(5432)
> 2012-09-25 14:26:37 LOG:   pid 25992: Restart all children
> 2012-09-25 14:26:37 LOG:   pid 25992: execute command: /bin/true
> 2012-09-25 14:26:37 LOG:   pid 25992: failover: set new primary node: -1
> 2012-09-25 14:26:37 LOG:   pid 25992: failover: set new master node: 0  <-- this doesn't seem right to me.
> 2012-09-25 14:26:37 LOG:   pid 25992: failover done. shutdown host node01(5432)
> 2012-09-25 14:26:37 LOG:   pid 27481: worker process received restart request
> 2012-09-25 14:26:38 LOG:   pid 25992: worker child 27481 exits with status 256
> 2012-09-25 14:26:38 LOG:   pid 27482: pcp child process received restart request
> 2012-09-25 14:26:38 LOG:   pid 25992: fork a new worker child pid 27903
> 2012-09-25 14:26:38 LOG:   pid 25992: PCP child 27482 exits with status 256
> 2012-09-25 14:26:38 LOG:   pid 25992: fork a new PCP child pid 27904
> 2012-09-25 14:26:52 LOG:   pid 27902: connection received: host=X.X.X.X port=57167
> 2012-09-25 14:26:52 ERROR: pid 27902: connect_inet_domain_socket: connect() failed: Connection refused
> 2012-09-25 14:26:52 ERROR: pid 27902: connection to node01(5432) failed
> 2012-09-25 14:26:52 ERROR: pid 27902: new_connection: create_cp() failed
> 2012-09-25 14:26:52 LOG:   pid 27902: degenerate_backend_set: 0 fail over request from pid 27902
> 2012-09-25 14:26:52 LOG:   pid 25992: starting degeneration. shutdown host node01(5432)
> 2012-09-25 14:26:52 LOG:   pid 25992: Restart all children
> 2012-09-25 14:26:52 LOG:   pid 25992: execute command: /bin/true
> 2012-09-25 14:26:52 LOG:   pid 25992: failover: set new primary node: -1
> 2012-09-25 14:26:52 LOG:   pid 25992: failover: set new master node: 0  <-- this doesn't seem right to me.
> 2012-09-25 14:26:52 LOG:   pid 25992: failover done. shutdown host node01(5432)
> 2012-09-25 14:26:52 LOG:   pid 27903: worker process received restart request
> 2012-09-25 14:26:53 LOG:   pid 25992: worker child 27903 exits with status 256
> 2012-09-25 14:26:53 LOG:   pid 27904: pcp child process received restart request
> 2012-09-25 14:26:53 LOG:   pid 25992: fork a new worker child pid 27914
> 2012-09-25 14:26:53 LOG:   pid 25992: PCP child 27904 exits with status 256
> 2012-09-25 14:26:53 LOG:   pid 25992: fork a new PCP child pid 27915
> 
> My pgpool.conf has all modes set to off....
> 
> replication_mode = off
> load_balance_mode = off
> master_slave_mode = off
> parallel_mode = off
> 
> ... and pgpool.conf in full.
> 
> listen_addresses = '*'
> port = 9999
> socket_dir = '/tmp'
> pcp_port = 9898
> pcp_socket_dir = '/tmp'
> backend_hostname0 = 'node01'
> backend_port0 = 5432
> backend_weight0 = 1
> backend_data_directory0 = '/data'
> backend_flag0 = 'ALLOW_TO_FAILOVER'
> backend_hostname1 = 'node02'
> backend_port1 = 5432
> backend_weight1 = 1
> backend_data_directory1 = '/data'
> backend_flag1 = 'ALLOW_TO_FAILOVER'
> enable_pool_hba = off
> authentication_timeout = 60
> ssl = off
> num_init_children = 8
> max_pool = 4
> child_life_time = 300
> child_max_connections = 0
> connection_life_time = 0
> client_idle_limit = 0
> log_destination = 'stderr'
> print_timestamp = on
> log_connections = on
> log_hostname = off
> log_statement = off
> log_per_node_statement = off
> log_standby_delay = 'none'
> syslog_facility = 'LOCAL0'
> syslog_ident = 'pgpool'
> debug_level = 0
> pid_file_name = '/var/run/pgpool/pgpool.pid'
> logdir = '/tmp'
> connection_cache = off
> reset_query_list = 'ABORT; DISCARD ALL'
> replication_mode = off
> replicate_select = off
> insert_lock = off
> lobj_lock_table = ''
> replication_stop_on_mismatch = off
> failover_if_affected_tuples_mismatch = off
> load_balance_mode = off
> ignore_leading_white_space = on
> white_function_list = ''
> black_function_list = 'currval,lastval,nextval,setval'
> master_slave_mode = off
> master_slave_sub_mode = 'slony'
> sr_check_period = 0
> sr_check_user = 'nobody'
> sr_check_password = ''
> delay_threshold = 0
> follow_master_command = ''
> parallel_mode = off
> pgpool2_hostname = ''
> health_check_period = 5
> health_check_timeout = 20
> health_check_user = 'postgres'
> health_check_password = 'XXsecretXX'
> health_check_max_retries = 3
> health_check_retry_delay = 1
> failover_command = '/bin/true'
> failback_command = '/bin/true'
> fail_over_on_backend_error = on
> recovery_user = 'nobody'
> recovery_password = ''
> recovery_1st_stage_command = ''
> recovery_2nd_stage_command = ''
> recovery_timeout = 90
> client_idle_limit_in_recovery = 0
> use_watchdog = off
> trusted_servers = ''
> delegate_IP = ''
> wd_hostname = ''
> wd_port = 9000
> wd_interval = 10
> ping_path = '/bin'
> ifconfig_path = '/sbin'
> if_up_cmd = 'ifconfig eth0:0 inet $_IP_$ netmask 255.255.255.0'
> if_down_cmd = 'ifconfig eth0:0 down'
> arping_cmd = 'arping -U $_IP_$ -w 1'
> wd_life_point = 3
> wd_lifecheck_query = 'SELECT 1'
> relcache_expire = 0
> relcache_size = 256
> check_temp_table = on
> memory_cache_enabled = off
> memqcache_method = 'shmem'
> memqcache_memcached_host = 'localhost'
> memqcache_memcached_port = 11211
> memqcache_total_size = 67108864
> memqcache_max_num_cache = 1000000
> memqcache_expire = 0
> memqcache_auto_cache_invalidation = on
> memqcache_maxcache = 409600
> memqcache_cache_block_size = 1048576
> memqcache_oiddir = '/var/log/pgpool/oiddir'
> white_memqcache_table_list = ''
> black_memqcache_table_list = ''
> 
> Thanks again,
> 
> Quentin
> ________________________________
> 
> This communication is confidential and may contain privileged information.
> If you are not the named recipient, please erase this communication and contact the sender immediately.
> You must not copy, use or disclose this communication, or any attachments or information contained within, without prior consent.
> 
> Please consider the environment before printing this email.


More information about the pgpool-general mailing list