5.1 故障模拟
01:在db01(Master)服务器上查看主从复制及mha是否正常
02:停止db01(Master)服务器上的mysql服务
03:在db04上检查MHA的日志(/var/log/mha/app/app1/manager.log)
04:查看VIP是否飘移到新Master的服务器上,在新Master上查看主从复制的状态
05:在db04服务器上查看mha服务的状态及配置文件的变化
#在db01(Master)服务器上查看主从复制及mha是否正常
[root@db01 ~]# mysql -uroot -pchenliang -S /data/3306/mysql.sock
mysql> show processlist\G
*************************** 1. row ***************************
Id: 4
User: rep
Host: 172.16.1.12:36522
db: NULL
Command: Binlog Dump GTID
Time: 14070
State: Master has sent all binlog to slave; waiting for more updates
Info: NULL
*************************** 2. row ***************************
Id: 5
User: rep
Host: 172.16.1.13:59189
db: NULL
Command: Binlog Dump GTID
Time: 13380
State: Master has sent all binlog to slave; waiting for more updates
Info: NULL
*************************** 3. row ***************************
Id: 6
User: rep
Host: 172.16.1.14:22492
db: NULL
Command: Binlog Dump GTID
Time: 12999
State: Master has sent all binlog to slave; waiting for more updates
Info: NULL
*************************** 4. row ***************************
Id: 33
User: mha
Host: 172.16.1.14:22720
db: NULL
Command: Sleep
Time: 2
State:
Info: NULL
*************************** 5. row ***************************
Id: 34
User: root
Host: localhost
db: NULL
Command: Query
Time: 0
State: starting
Info: show processlist
5 rows in set (0.00 sec)
#停止db01(Master)服务器上的mysql服务
[root@db01 ~]# /data/3306/mysqld stop
MySQL [3306] is not running
[root@db01 ~]# lsof -i :3306
[root@db01 ~]#
#在db04上检查MHA的日志(/var/log/mha/app/app1/manager.log)
[root@db04 ~]# tailf /var/log/mha/app/app1/manager.log
Started automated(non-interactive) failover.
Invalidated master IP address on 172.16.1.11(172.16.1.11:3306)
Selected 172.16.1.12(172.16.1.12:3306) as a new master.
172.16.1.12(172.16.1.12:3306): OK: Applying all logs succeeded.
172.16.1.12(172.16.1.12:3306): OK: Activated master IP address.
172.16.1.14(172.16.1.14:3306): OK: Slave started, replicating from 172.16.1.12(172.16.1.12:3306)
172.16.1.13(172.16.1.13:3306): OK: Slave started, replicating from 172.16.1.12(172.16.1.12:3306)
172.16.1.12(172.16.1.12:3306): Resetting slave info succeeded.
Master failover to 172.16.1.12(172.16.1.12:3306) completed successfully.
^=可以看出Master failover(故障转移)至172.16.1.12服务器上成功
^=那么就可以去172.16.1.12服务器上检查是否有VIP地址(172.16.1.10)及主从复制的状态
#查看VIP是否飘移到新Master的服务器上,在新Master(db02)上查看主从复制的状态
[root@db02 ~]# ifconfig eth1:1
eth1:1 Link encap:Ethernet HWaddr 00:0C:29:D3:59:E8
inet addr:172.16.1.10 Bcast:172.16.1.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
[root@db02 ~]# mysql -uroot -pchenliang -S /data/3306/mysql.sock
mysql> show processlist\G
*************************** 1. row ***************************
Id: 41
User: rep
Host: 172.16.1.14:45688
db: NULL
Command: Binlog Dump GTID
Time: 269
State: Master has sent all binlog to slave; waiting for more updates
Info: NULL
*************************** 2. row ***************************
Id: 42
User: rep
Host: 172.16.1.13:16598
db: NULL
Command: Binlog Dump GTID
Time: 269
State: Master has sent all binlog to slave; waiting for more updates
Info: NULL
*************************** 3. row ***************************
Id: 43
User: root
Host: localhost
db: NULL
Command: Query
Time: 0
State: starting
Info: show processlist
3 rows in set (0.00 sec)
^=从上面可以看出,当前Master(db02)的从库有172.16.1.13和172.16.1.14这两台服务器
^=processlist中已经没有mha用户的连接,说明mha服务也已经停止了(因为mha成功切换一次主库后,它的服务会自动停止)
#在db04服务器上查看mha服务的状态及配置文件的变化
[root@db04 ~]# ps -ef|grep mha|grep -v grep
[root@db04 ~]#
[root@db04 ~]# cat /etc/mha/app/app1/app1.cnf
[server default]
manager_log=/var/log/mha/app/app1/manager.log
manager_workdir=/var/log/mha/app/app1
master_binlog_dir=/data/3306/binlog
master_ip_failover_script=/server/scripts/master_ip_failover
password=mha
ping_interval=2
repl_password=chenliang
repl_user=rep
ssh_port=921
ssh_user=toor
user=mha
[server2]
hostname=172.16.1.12
port=3306
[server3]
hostname=172.16.1.13
port=3306
[server4]
hostname=172.16.1.14
no_master=1
port=3306
^=从上面可以看出,mha服务已停止(正常的),配置文件中少了[server1]标签(正常的,因为masterha_manager启动时带了--remove_dead_master_conf参数,故障切换后会自动移除已宕机主库的配置)
5.2 故障恢复
01:启动db01服务器上的mysql服务
02:在db04服务器的mha日志中找到change master语句
03:在db01服务器操作change master语句,把db01指向为新master(db02)的从库
04:在mha的配置文件(在db04服务器上)中加入[server1]标签
05:在db04服务器上的toor普通用户下再次启动mha服务
#启动db01服务器上的mysql服务
[root@db01 ~]# /data/3306/mysqld start
Start MySQL [3306] [ OK ]
[root@db01 ~]# netstat -lntup|grep mysqld
tcp 0 0 :::3306 :::* LISTEN 6184/mysqld
#在db04服务器的mha日志中找到change master语句
[root@db04 ~]# grep -i "change master" /var/log/mha/app/app1/manager.log
Fri Nov 16 14:15:16 2018 - [info] All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='172.16.1.12', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='rep', MASTER_PASSWORD='xxx';
Fri Nov 16 14:15:17 2018 - [info] Executed CHANGE MASTER.
Fri Nov 16 14:15:17 2018 - [info] Executed CHANGE MASTER.
#在db01服务器操作change master语句,把db01指向为新master(db02)的从库(注意:日志中的MASTER_PASSWORD显示为'xxx',执行时需替换为真实的复制用户密码)
[root@db01 ~]# mysql -uroot -pchenliang -S /data/3306/mysql.sock
mysql>
mysql> CHANGE MASTER TO MASTER_HOST='172.16.1.12', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='rep', MASTER_PASSWORD='chenliang';
Query OK, 0 rows affected, 2 warnings (0.07 sec)
mysql> start slave;
Query OK, 0 rows affected (0.06 sec)
mysql> show slave status\G
*************************** 1. row ***************************
Slave_IO_State: Waiting for master to send event
Master_Host: 172.16.1.12
Master_User: rep
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: db02_mysql_bin.000003
Read_Master_Log_Pos: 1495
Relay_Log_File: db01_relay_bin.000003
Relay_Log_Pos: 469
Relay_Master_Log_File: db02_mysql_bin.000003
Slave_IO_Running: Yes
Slave_SQL_Running: Yes
Replicate_Do_DB:
Replicate_Ignore_DB:
Replicate_Do_Table:
Replicate_Ignore_Table:
Replicate_Wild_Do_Table:
Replicate_Wild_Ignore_Table:
Last_Errno: 0
Last_Error:
Skip_Counter: 0
Exec_Master_Log_Pos: 1495
Relay_Log_Space: 1294
Until_Condition: None
Until_Log_File:
Until_Log_Pos: 0
Master_SSL_Allowed: No
Master_SSL_CA_File:
Master_SSL_CA_Path:
Master_SSL_Cert:
Master_SSL_Cipher:
Master_SSL_Key:
Seconds_Behind_Master: 0
Master_SSL_Verify_Server_Cert: No
Last_IO_Errno: 0
Last_IO_Error:
Last_SQL_Errno: 0
Last_SQL_Error:
Replicate_Ignore_Server_Ids:
Master_Server_Id: 12
Master_UUID: 1386976f-e7b8-11e8-b34b-000c29d359de
Master_Info_File: mysql.slave_master_info
SQL_Delay: 0
SQL_Remaining_Delay: NULL
Slave_SQL_Running_State: Slave has read all relay log; waiting for more updates
Master_Retry_Count: 86400
Master_Bind:
Last_IO_Error_Timestamp:
Last_SQL_Error_Timestamp:
Master_SSL_Crl:
Master_SSL_Crlpath:
Retrieved_Gtid_Set: 1386976f-e7b8-11e8-b34b-000c29d359de:1-2
Executed_Gtid_Set: 1386976f-e7b8-11e8-b34b-000c29d359de:1-2,
3ad8129b-e7b2-11e8-817e-000c296b2e4b:1-6
Auto_Position: 1
Replicate_Rewrite_DB:
Channel_Name:
Master_TLS_Version:
1 row in set (0.03 sec)
#在mha的配置文件(在db04服务器上)中加入[server1]标签
[root@db04 ~]# cat /etc/mha/app/app1/app1.cnf
[server default]
manager_log=/var/log/mha/app/app1/manager.log
manager_workdir=/var/log/mha/app/app1
master_binlog_dir=/data/3306/binlog
master_ip_failover_script=/server/scripts/master_ip_failover
password=mha
ping_interval=2
repl_password=chenliang
repl_user=rep
ssh_port=921
ssh_user=toor
user=mha
[server1]
hostname=172.16.1.11
port=3306
[server2]
hostname=172.16.1.12
port=3306
[server3]
hostname=172.16.1.13
port=3306
[server4]
hostname=172.16.1.14
no_master=1
port=3306
#在db04服务器上的toor普通用户下再次启动mha服务
[root@db04 ~]# su - toor
[toor@db04 ~]$ masterha_check_ssh --conf=/etc/mha/app/app1/app1.cnf
.........................
Fri Nov 16 14:56:42 2018 - [info] All SSH connection tests passed successfully.
[toor@db04 ~]$ masterha_check_repl --conf=/etc/mha/app/app1/app1.cnf
........
172.16.1.12(172.16.1.12:3306) (current master)
+--172.16.1.11(172.16.1.11:3306)
+--172.16.1.13(172.16.1.13:3306)
+--172.16.1.14(172.16.1.14:3306)
..........
MySQL Replication Health is OK
[toor@db04 ~]$ ps -ef|grep mha|grep -v grep
toor 6349 1 4 14:58 pts/0 00:00:00 perl /usr/bin/masterha_manager --conf=/etc/mha/app/app1/app1.cnf --remove_dead_master_conf --ignore_last_failover
5.3 再次提升原master为主库
[root@db02 ~]# /data/3306/mysqld stop
Stop MySQL[3306]
[root@db04 ~]# grep -i "change master" /var/log/mha/app/app1/manager.log
Fri Nov 16 15:50:29 2018 - [info] All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='172.16.1.11', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='rep', MASTER_PASSWORD='xxx';
Fri Nov 16 15:50:29 2018 - [info] Executed CHANGE MASTER.
Fri Nov 16 15:50:29 2018 - [info] Executed CHANGE MASTER.
[root@db01 ~]# ifconfig eth1:1
eth1:1 Link encap:Ethernet HWaddr 00:0C:29:6B:2E:55
inet addr:172.16.1.10 Bcast:172.16.1.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
[root@db02 ~]# /data/3306/mysqld start
Start MySQL [3306] [ OK ]
[root@db02 ~]# mysql -uroot -pchenliang -S /data/3306/mysql.sock
mysql> CHANGE MASTER TO MASTER_HOST='172.16.1.11', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='rep', MASTER_PASSWORD='chenliang';
Query OK, 0 rows affected, 2 warnings (0.07 sec)
mysql> start slave;
Query OK, 0 rows affected (0.20 sec)
mysql> show slave status\G
*************************** 1. row ***************************
Slave_IO_State: Waiting for master to send event
Master_Host: 172.16.1.11
Master_User: rep
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: db01_mysql_bin.000010
Read_Master_Log_Pos: 234
Relay_Log_File: db02_relay_bin.000002
Relay_Log_Pos: 377
Relay_Master_Log_File: db01_mysql_bin.000010
Slave_IO_Running: Yes
Slave_SQL_Running: Yes
Replicate_Do_DB:
Replicate_Ignore_DB:
Replicate_Do_Table:
Replicate_Ignore_Table:
Replicate_Wild_Do_Table:
Replicate_Wild_Ignore_Table:
Last_Errno: 0
Last_Error:
Skip_Counter: 0
Exec_Master_Log_Pos: 234
Relay_Log_Space: 583
Until_Condition: None
Until_Log_File:
Until_Log_Pos: 0
Master_SSL_Allowed: No
Master_SSL_CA_File:
Master_SSL_CA_Path:
Master_SSL_Cert:
Master_SSL_Cipher:
Master_SSL_Key:
Seconds_Behind_Master: 0
Master_SSL_Verify_Server_Cert: No
Last_IO_Errno: 0
Last_IO_Error:
Last_SQL_Errno: 0
Last_SQL_Error:
Replicate_Ignore_Server_Ids:
Master_Server_Id: 11
Master_UUID: 3ad8129b-e7b2-11e8-817e-000c296b2e4b
Master_Info_File: mysql.slave_master_info
SQL_Delay: 0
SQL_Remaining_Delay: NULL
Slave_SQL_Running_State: Slave has read all relay log; waiting for more updates
Master_Retry_Count: 86400
Master_Bind:
Last_IO_Error_Timestamp:
Last_SQL_Error_Timestamp:
Master_SSL_Crl:
Master_SSL_Crlpath:
Retrieved_Gtid_Set:
Executed_Gtid_Set: 1386976f-e7b8-11e8-b34b-000c29d359de:1-2,
3ad8129b-e7b2-11e8-817e-000c296b2e4b:1-6
Auto_Position: 1
Replicate_Rewrite_DB:
Channel_Name:
Master_TLS_Version:
1 row in set (0.00 sec)
[root@db04 ~]# cat /etc/mha/app/app1/app1.cnf
[server default]
manager_log=/var/log/mha/app/app1/manager.log
manager_workdir=/var/log/mha/app/app1
master_binlog_dir=/data/3306/binlog
master_ip_failover_script=/server/scripts/master_ip_failover
password=mha
ping_interval=2
repl_password=chenliang
repl_user=rep
ssh_port=921
ssh_user=toor
user=mha
[server1]
hostname=172.16.1.11
port=3306
[server2]
hostname=172.16.1.12
port=3306
[server3]
hostname=172.16.1.13
port=3306
[server4]
hostname=172.16.1.14
no_master=1
port=3306
[root@db04 ~]# su - toor
[toor@db04 ~]$ mha
^=此处的mha应为自定义的启动命令或别名(本文未给出其定义,需自行确认),其启动的masterha_manager进程见下面的ps输出
[toor@db04 ~]$ ps -ef|grep mha
toor 9310 1 16 15:56 pts/0 00:00:00 perl /usr/bin/masterha_manager --conf=/etc/mha/app/app1/app1.cnf --remove_dead_master_conf --ignore_last_failover
toor 9322 9290 0 15:56 pts/0 00:00:00 grep --color=auto mha