一、基础环境配置
1.1 关闭防火墙和selinux
1.1.1 关闭防火墙
systemctl stop firewalld.service #停止防火墙
systemctl disable firewalld.service #禁止开机启动
1.1.2 关闭selinux
setenforce 0
sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
cat /etc/sysconfig/selinux
/usr/sbin/sestatus -v
#这个地方最好重启下
1.2 配置IP域名映射
vim /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.0.97 zj1
192.168.1.156 zj2
192.168.0.48 zj3
1.3 设置swap分区大小
echo "vm.swappiness = 0" >>/etc/sysctl.conf
sysctl -p
#通过sysctl -a 查看
PS :swappiness值在0-100之间,0尽力使用物理内存,100尽力使用swap分区。
1.4 操作系统限制
cat /etc/security/limits.conf
...
* soft nproc 65536
* hard nproc 65536
* soft nofile 65536
* hard nofile 65536
...
1.5 关闭numa
sed -i 's/GRUB_CMDLINE_LINUX.*/GRUB_CMDLINE_LINUX="crashkernel=auto rd.lvm.lv=centos\/root rd.lvm.lv=centos\/swap rhgb quiet numa=off"/g' /etc/default/grub
grub2-mkconfig -o /etc/grub2.cfg
cat /etc/grub2.cfg
reboot
cat /proc/cmdline
dmesg | grep -i numa
1.6 配置互信
#在各节点执行
ssh-keygen -t rsa
ssh-copy-id -i /root/.ssh/id_rsa.pub [email protected]
ssh-copy-id -i /root/.ssh/id_rsa.pub [email protected]
ssh-copy-id -i /root/.ssh/id_rsa.pub [email protected]
二、实例安装
2.1 创建MySQL用户组和用户
groupadd mysql
useradd -g mysql mysql -s /sbin/nologin
id mysql
2.2 创建相关目录
mkdir -pv /usr/local/mysql/
mkdir -pv /data/mysql/{data,logs}
2.3 解压安装包并设置权限
tar xf mysql-5.7.28-el7-x86_64.tar -C /usr/local/mysql/
cd /usr/local/mysql/
tar xf mysql-5.7.28-el7-x86_64.tar.gz
tar xf mysql-test-5.7.28-el7-x86_64.tar.gz
chown mysql:mysql /data/mysql -R
2.4 软链接程序目录并配置环境变量
ln -s /usr/local/mysql/mysql-5.7.28-el7-x86_64 /usr/local/mysql/mysql
export PATH=/usr/local/mysql/mysql/bin:$PATH
echo 'PATH=/usr/local/mysql/mysql/bin:$PATH' >> /etc/profile
source /etc/profile
2.5 修改配置文件
[client]
port = 3306
socket = /data/mysql/data/mysql.sock
[mysql]
prompt="\u@mysql1 \R:\m:\s [\d]> "
no-auto-rehash
[mysqld]
user = mysql
port = 3306
#软件目录
basedir = /usr/local/mysql/mysql
#数据目录
datadir = /data/mysql/data
#套接字文件
socket = /data/mysql/data/mysql.sock
#PID文件
pid-file = mysql1.pid
#字符集
character-set-server = utf8mb4
#是否跳过域名解析
skip_name_resolve = 1
#若MySQL数据库主要运行在境外,请务必根据实际情况调整本参数
default_time_zone = "+8:00"
#最大打开文件数
open_files_limit= 65535
#MySQL(处理大量请求时,由于满负荷)暂时停止回复新请求前能够堆放的请求数
back_log = 1024
#最大连接数限制
max_connections = 256
#单个用户最大连接数限制
max_user_connections = 64
#最大连接错误数,防止密码暴力破解
max_connect_errors = 1000000
table_open_cache = 1024
table_definition_cache = 1024
table_open_cache_instances = 64
thread_stack = 512K
external-locking = FALSE
max_allowed_packet = 32M
sort_buffer_size = 4M
join_buffer_size = 4M
thread_cache_size = 384
interactive_timeout = 600
wait_timeout = 600
tmp_table_size = 32M
max_heap_table_size = 32M
slow_query_log = 1
log_timestamps = SYSTEM
slow_query_log_file = /data/mysql/logs/slow.log
log-error = /data/mysql/logs/error.log
long_query_time = 0.1
log_queries_not_using_indexes =1
log_throttle_queries_not_using_indexes = 60
min_examined_row_limit = 100
log_slow_admin_statements = 1
log_slow_slave_statements = 1
server-id = 33061
log-bin = /data/mysql/logs/binlog
sync_binlog = 1
binlog_cache_size = 4M
max_binlog_cache_size = 2G
max_binlog_size = 1G
auto_increment_offset=1
auto_increment_increment=1
#注意:MySQL 8.0开始,binlog_expire_logs_seconds选项也存在的话,会忽略expire_logs_days选项
expire_logs_days = 7
master_info_repository = TABLE
relay_log_info_repository = TABLE
gtid_mode = on
enforce_gtid_consistency = 1
log_slave_updates
slave-rows-search-algorithms = 'INDEX_SCAN,HASH_SCAN'
binlog_format = row
binlog_checksum = 1
relay_log_recovery = 1
relay-log-purge = 1
key_buffer_size = 32M
read_buffer_size = 8M
read_rnd_buffer_size = 4M
bulk_insert_buffer_size = 64M
myisam_sort_buffer_size = 128M
myisam_max_sort_file_size = 10G
myisam_repair_threads = 1
lock_wait_timeout = 3600
explicit_defaults_for_timestamp = 1
innodb_thread_concurrency = 0
innodb_sync_spin_loops = 100
innodb_spin_wait_delay = 30
transaction_isolation = READ-COMMITTED
#innodb_additional_mem_pool_size = 16M
innodb_buffer_pool_size = 1434M
innodb_buffer_pool_instances = 4
innodb_buffer_pool_load_at_startup = 1
innodb_buffer_pool_dump_at_shutdown = 1
innodb_data_file_path = ibdata1:1G:autoextend
innodb_flush_log_at_trx_commit = 1
innodb_log_buffer_size = 32M
innodb_log_file_size = 2G
innodb_log_files_in_group = 2
innodb_max_undo_log_size = 4G
innodb_undo_directory = /data/mysql/data
innodb_undo_tablespaces = 2
# 根据您的服务器IOPS能力适当调整
# 一般配普通SSD盘的话,可以调整到 10000 - 20000
# 配置高端PCIe SSD卡的话,则可以调整的更高,比如 50000 - 80000
innodb_io_capacity = 4000
innodb_io_capacity_max = 8000
innodb_flush_sync = 0
innodb_flush_neighbors = 0
innodb_write_io_threads = 8
innodb_read_io_threads = 8
innodb_purge_threads = 4
innodb_page_cleaners = 4
innodb_open_files = 65535
innodb_max_dirty_pages_pct = 50
innodb_flush_method = O_DIRECT
innodb_lru_scan_depth = 4000
innodb_checksum_algorithm = crc32
innodb_lock_wait_timeout = 10
innodb_rollback_on_timeout = 1
innodb_print_all_deadlocks = 1
innodb_file_per_table = 1
innodb_online_alter_log_max_size = 4G
innodb_stats_on_metadata = 0
innodb_undo_log_truncate = 1
#注意:MySQL 8.0.16开始删除该选项
#internal_tmp_disk_storage_engine = InnoDB
# some var for MySQL 8
#log_error_verbosity = 3
#innodb_print_ddl_logs = 1
#binlog_expire_logs_seconds = 604800
#innodb_dedicated_server = 0
innodb_status_file = 1
#注意: 开启 innodb_status_output & innodb_status_output_locks 后, 可能会导致log-error文件增长较快
innodb_status_output = 0
innodb_status_output_locks = 0
#performance_schema
performance_schema = 1
performance_schema_instrument = '%memory%=on'
performance_schema_instrument = '%lock%=on'
#innodb monitor
innodb_monitor_enable="module_innodb"
innodb_monitor_enable="module_server"
innodb_monitor_enable="module_dml"
innodb_monitor_enable="module_ddl"
innodb_monitor_enable="module_trx"
innodb_monitor_enable="module_os"
innodb_monitor_enable="module_purge"
innodb_monitor_enable="module_log"
innodb_monitor_enable="module_lock"
innodb_monitor_enable="module_buffer"
innodb_monitor_enable="module_index"
innodb_monitor_enable="module_ibuf_system"
innodb_monitor_enable="module_buffer_page"
innodb_monitor_enable="module_adaptive_hash"
#组提交&并行复制
slave_parallel_type=LOGICAL_CLOCK
slave_parallel_workers=2
binlog_group_commit_sync_delay=1000000
binlog_group_commit_sync_no_delay_count=10
#半同步
plugin-load = "rpl_semi_sync_master=semisync_master.so;rpl_semi_sync_slave=semisync_slave.so"
rpl-semi-sync-master-enabled = 1
rpl-semi-sync-slave-enabled = 1
[mysqldump]
quick
max_allowed_packet = 32M
2.6 初始化数据库
mysqld --defaults-file=/etc/my.cnf --initialize-insecure &
--initialize:该选项初始化时会在错误日志中写一个随机root密码,初始化完成之后在错误日志中搜索password,紧跟其后的一串字符串就是这个随机密码,在初始化完成并启动mysqld之后,初次登录需要使用这个随机密码才能够登录。
--initialize-insecure:该选项初始化时不会产生随机密码,而是像5.7版本之前一样初始化完成之后第一次登录数据库使用空的root密码
2.7 启动数据库服务
mysqld_safe --defaults-file=/etc/my.cnf &
#扫描配置文件的顺序,也可以人为指定其他位置
/etc/my.cnf>/etc/mysql/my.cnf>$mysql_home/my.cnf>~/.my.cnf
#查看是否启动成功,若失败,则去错误日志查看原因
ps -ef | grep mysql
netstat -anultp | grep 端口号
2.8 搭建复制
#master创建复制用户
create user repl@'192.168.1.%' identified by 'repl';
grant replication slave,replication client on *.* to repl@'192.168.1.%';
#安装半同步插件
INSTALL PLUGIN rpl_semi_sync_master SONAME 'semisync_master.so';
INSTALL PLUGIN rpl_semi_sync_slave SONAME 'semisync_slave.so';
SELECT PLUGIN_NAME, PLUGIN_STATUS FROM INFORMATION_SCHEMA.PLUGINS WHERE PLUGIN_NAME LIKE '%semi%';
#建立主从复制
change master to
master_host='192.168.0.97',
master_port=3306,
master_user='repl',
master_password='repl',
master_auto_position=1;
start slave;
#查看同步状态; 查看半同步的主备状态
show slave status;
show status like 'Rpl_semi_sync_%_status';
三、搭建MHA
3.1 Ansible 配置(可选)
# manager节点安装 ansible
yum install -y ansible
# vim /etc/ansible/hosts
[mysql-mha-node]
192.168.0.97
192.168.1.156
192.168.0.48
# vim /etc/ansible/ansible.cfg
[defaults]
sudo_user=root
remote_port=22
remote_user=root
module_name=command
host_key_checking=False
command_warnings = False
inventory=/etc/ansible/hosts
roles_path=/etc/ansible/roles
log_path=/var/log/ansible.log
private_key_file=/root/.ssh/id_rsa
manager节点执行:各节点hosts绑定 ip 和 服务器名称
ansible mysql-mha-node -m shell -a "echo -e '
192.168.0.97 zj1
192.168.1.156 zj2
192.168.0.48 zj3' >> /etc/hosts"
3.2 MHA node安装
yum install -y perl-DBD-MySQL perl-ExtUtils-MakeMaker perl-CPAN
yum install -y https://github.com/yoshinorim/mha4mysql-node/releases/download/v0.58/mha4mysql-node-0.58-0.el7.centos.noarch.rpm
3.3 MHA manager节点安装
#安装centos7的epel源
yum -y install epel-release
yum install -y https://github.com/yoshinorim/mha4mysql-manager/releases/download/v0.58/mha4mysql-manager-0.58-0.el7.centos.noarch.rpm
3.4 MHA manager 配置
mkdir -pv /var/log/mha/app1/
[server default]
remote_workdir=/var/log/mha
manager_workdir=/var/log/mha
manager_log=/var/log/mha/app1/manager.log
master_binlog_dir=/data/mysql/logs/ #设置master保存binlog的位置,以便MHA可以找到master的日志
user=root
password=970125
repl_user=repl
repl_password=repl
ssh_user=root
ping_interval=2 #设置监控主库,发送ping包的时间间隔,默认是3秒,尝试三次没有回应的时候自动进行failover
#可从manager二进制包解压找到(mha4mysql-manager-0.58/samples/scripts)
report_script= /usr/bin/send_report //设置发生切换后发送的报警的脚本
#shutdown_script= /usr/bin/power_manager
master_ip_failover_script= /usr/bin/master_ip_failover #设置自动failover时候的切换脚本
master_ip_online_change_script=/usr/bin/master_ip_online_change #设置手动切换时候的切换脚本
[server1]
port=3306
hostname=192.168.0.97
candidate_master=1 #设置为候选master,如果设置该参数以后,发生主从切换以后将会将此从库提升为主库,即使这个主库不是集群中事件最新的slave
check_repl_delay=0 #默认情况下如果一个slave落后master 100M的relay logs的话,MHA将不会选择该slave作为一个新的master,因为对于这个slave的恢复需要花费很长时间,通过设置check_repl_delay=0,MHA触发切换在选择一个新的master的时候将会忽略复制延时,这个参数对于设置了candidate_master=1的主机非常有用,因为这个候选主在切换的过程中一定是新的master
[server2]
port=3306
hostname=192.168.1.156
candidate_master=1
check_repl_delay=0
[server3]
port=3306
hostname=192.168.0.48
candidate_master=1
check_repl_delay=0
3.5 检查ssh
/usr/bin/masterha_check_ssh --conf=/etc/masterha.cnf
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o7A0D8GD-1614831853969)(C:\Users\薛世杰\AppData\Roaming\Typora\typora-user-images\image-20191231143908596.png)]
3.6 检查主从复制
/usr/bin/masterha_check_repl --conf=/etc/masterha.cnf
注意:配置文件里面的账号要有响应的权限,并且可以连接。
3.7 MHA运行状态
masterha_check_status --conf=/etc/masterha.cnf
[root@zj3 bin]# /usr/bin/masterha_check_status --conf=/etc/masterha.cnf
masterha is stopped(2:NOT_RUNNING).
3.8 其他命令
# MHA master 监控(与 masterha_manager 一样)
masterha_master_monitor --conf=/etc/masterha.cnf
#启动 MHA
nohup masterha_manager --conf=/etc/masterha.cnf &> /var/log/mha/app1/manager.log &
[root@zj3 bin]# /usr/bin/masterha_check_status --conf=/etc/masterha.cnf
masterha (pid:73434) is running(0:PING_OK), master:192.168.0.97
#关闭 MHA
masterha_stop --conf=/etc/masterha.cnf
#添加或删除配置的server信息
masterha_conf_host
四、keepalived配置
4.1 安装ipvsadm、keepalived
yum -y install keepalived
4.2 配置keepalived
修改配置文件
vi/etc/keepalived/keepalived.conf
master和备选master配置文件内容相同
#ConfigurationFile for keepalived
global_defs {
notification_email { ######定义接受邮件的邮箱
[email protected]
}
notification_email_from [email protected] ######定义发送邮件的邮箱
smtp_server mail.qq.com
smtp_connect_timeout 10
}
vrrp_instance vrrptest { ######定义vrrptest实例
state BACKUP ######服务器状态
interface ens32 ######使用的接口
virtual_router_id 51 ######虚拟路由的标志,一组lvs的虚拟路由标识必须相同,这样才能切换
priority 150 ######服务启动优先级,值越大,优先级越高,BACKUP 不能大于MASTER
advert_int 1 ######服务器之间的存活检查时间
authentication {
auth_type PASS ######认证类型
auth_pass 970125 ######认证密码,一组lvs 服务器的认证密码必须一致
}
virtual_ipaddress { ######虚拟IP地址
192.168.1.1
}
}
这里master服务器的state不配置成MASTER,且配置的优先级一样,是期望在master宕机后再恢复时,不主动将MASTER状态抢过来,避免MySQL服务的波动。
这里没有配置vrrp_script,在后面会让mha实现vip的自动漂移。
五、测试
5.1 手动切换
#关闭 MHA
masterha_stop --conf=/etc/masterha.cnf
# 在线手动切换
masterha_master_switch --conf=/etc/masterha.cnf \
--master_state=alive \
--new_master_host=192.168.1.156 \
--new_master_port=3306 \
--orig_master_is_new_slave \
--running_updates_limit=10000
Tue Dec 31 14:57:00 2019 - [info] MHA::MasterRotate version 0.58.
Tue Dec 31 14:57:00 2019 - [info] Starting online master switch..
Tue Dec 31 14:57:00 2019 - [info]
Tue Dec 31 14:57:00 2019 - [info] * Phase 1: Configuration Check Phase..
Tue Dec 31 14:57:00 2019 - [info]
Tue Dec 31 14:57:00 2019 - [warning] Global configuration file /etc/masterha_default.cnf not found. Skipping.
Tue Dec 31 14:57:00 2019 - [info] Reading application default configuration from /etc/masterha.cnf..
Tue Dec 31 14:57:00 2019 - [info] Reading server configuration from /etc/masterha.cnf..
Tue Dec 31 14:57:01 2019 - [info] GTID failover mode = 1
Tue Dec 31 14:57:01 2019 - [info] Current Alive Master: 192.168.0.97(192.168.0.97:3306)
Tue Dec 31 14:57:01 2019 - [info] Alive Slaves:
Tue Dec 31 14:57:01 2019 - [info] 192.168.1.156(192.168.1.156:3306) Version=5.7.28-log (oldest major version between slaves) log-bin:enabled
Tue Dec 31 14:57:01 2019 - [info] GTID ON
Tue Dec 31 14:57:01 2019 - [info] Replicating from 192.168.0.97(192.168.0.97:3306)
Tue Dec 31 14:57:01 2019 - [info] Primary candidate for the new Master (candidate_master is set)
Tue Dec 31 14:57:01 2019 - [info] 192.168.0.48(192.168.0.48:3306) Version=5.7.28-log (oldest major version between slaves) log-bin:enabled
Tue Dec 31 14:57:01 2019 - [info] GTID ON
Tue Dec 31 14:57:01 2019 - [info] Replicating from 192.168.0.97(192.168.0.97:3306)
Tue Dec 31 14:57:01 2019 - [info] Primary candidate for the new Master (candidate_master is set)
It is better to execute FLUSH NO_WRITE_TO_BINLOG TABLES on the master before switching. Is it ok to execute on 192.168.0.97(192.168.0.97:3306)? (YES/no): YES
Tue Dec 31 14:57:27 2019 - [info] Executing FLUSH NO_WRITE_TO_BINLOG TABLES. This may take long time..
Tue Dec 31 14:57:27 2019 - [info] ok.
Tue Dec 31 14:57:27 2019 - [info] Checking MHA is not monitoring or doing failover..
Tue Dec 31 14:57:27 2019 - [info] Checking replication health on 192.168.1.156..
Tue Dec 31 14:57:27 2019 - [info] ok.
Tue Dec 31 14:57:27 2019 - [info] Checking replication health on 192.168.0.48..
Tue Dec 31 14:57:27 2019 - [info] ok.
Tue Dec 31 14:57:27 2019 - [info] 192.168.1.156 can be new master.
Tue Dec 31 14:57:27 2019 - [info]
From:
192.168.0.97(192.168.0.97:3306) (current master)
+--192.168.1.156(192.168.1.156:3306)
+--192.168.0.48(192.168.0.48:3306)
To:
192.168.1.156(192.168.1.156:3306) (new master)
+--192.168.0.48(192.168.0.48:3306)
+--192.168.0.97(192.168.0.97:3306)
Starting master switch from 192.168.0.97(192.168.0.97:3306) to 192.168.1.156(192.168.1.156:3306)? (yes/NO): yes
Tue Dec 31 14:57:30 2019 - [info] Checking whether 192.168.1.156(192.168.1.156:3306) is ok for the new master..
Tue Dec 31 14:57:30 2019 - [info] ok.
Tue Dec 31 14:57:30 2019 - [info] 192.168.0.97(192.168.0.97:3306): SHOW SLAVE STATUS returned empty result. To check replication filtering rules, temporarily executing CHANGE MASTER to a dummy host.
Tue Dec 31 14:57:30 2019 - [info] 192.168.0.97(192.168.0.97:3306): Resetting slave pointing to the dummy host.
Tue Dec 31 14:57:30 2019 - [info] ** Phase 1: Configuration Check Phase completed.
Tue Dec 31 14:57:30 2019 - [info]
Tue Dec 31 14:57:30 2019 - [info] * Phase 2: Rejecting updates Phase..
Tue Dec 31 14:57:30 2019 - [info]
master_ip_online_change_script is not defined. If you do not disable writes on the current master manually, applications keep writing on the current master. Is it ok to proceed? (yes/NO): yes
Tue Dec 31 14:57:32 2019 - [info] Locking all tables on the orig master to reject updates from everybody (including root):
Tue Dec 31 14:57:32 2019 - [info] Executing FLUSH TABLES WITH READ LOCK..
Tue Dec 31 14:57:32 2019 - [info] ok.
Tue Dec 31 14:57:32 2019 - [info] Orig master binlog:pos is binlog.000003:876.
Tue Dec 31 14:57:32 2019 - [info] Waiting to execute all relay logs on 192.168.1.156(192.168.1.156:3306)..
Tue Dec 31 14:57:32 2019 - [info] master_pos_wait(binlog.000003:876) completed on 192.168.1.156(192.168.1.156:3306). Executed 0 events.
Tue Dec 31 14:57:32 2019 - [info] done.
Tue Dec 31 14:57:32 2019 - [info] Getting new master's binlog name and position..
Tue Dec 31 14:57:32 2019 - [info] binlog.000005:876
Tue Dec 31 14:57:32 2019 - [info] All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='192.168.1.156', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='repl', MASTER_PASSWORD='xxx';
Tue Dec 31 14:57:32 2019 - [info]
Tue Dec 31 14:57:32 2019 - [info] * Switching slaves in parallel..
Tue Dec 31 14:57:32 2019 - [info]
Tue Dec 31 14:57:32 2019 - [info] -- Slave switch on host 192.168.0.48(192.168.0.48:3306) started, pid: 74104
Tue Dec 31 14:57:32 2019 - [info]
Tue Dec 31 14:57:33 2019 - [info] Log messages from 192.168.0.48 ...
Tue Dec 31 14:57:33 2019 - [info]
Tue Dec 31 14:57:32 2019 - [info] Waiting to execute all relay logs on 192.168.0.48(192.168.0.48:3306)..
Tue Dec 31 14:57:32 2019 - [info] master_pos_wait(binlog.000003:876) completed on 192.168.0.48(192.168.0.48:3306). Executed 0 events.
Tue Dec 31 14:57:32 2019 - [info] done.
Tue Dec 31 14:57:32 2019 - [info] Resetting slave 192.168.0.48(192.168.0.48:3306) and starting replication from the new master 192.168.1.156(192.168.1.156:3306)..
Tue Dec 31 14:57:32 2019 - [info] Executed CHANGE MASTER.
Tue Dec 31 14:57:32 2019 - [info] Slave started.
Tue Dec 31 14:57:33 2019 - [info] End of log messages from 192.168.0.48 ...
Tue Dec 31 14:57:33 2019 - [info]
Tue Dec 31 14:57:33 2019 - [info] -- Slave switch on host 192.168.0.48(192.168.0.48:3306) succeeded.
Tue Dec 31 14:57:33 2019 - [info] Unlocking all tables on the orig master:
Tue Dec 31 14:57:33 2019 - [info] Executing UNLOCK TABLES..
Tue Dec 31 14:57:33 2019 - [info] ok.
Tue Dec 31 14:57:33 2019 - [info] Starting orig master as a new slave..
Tue Dec 31 14:57:33 2019 - [info] Resetting slave 192.168.0.97(192.168.0.97:3306) and starting replication from the new master 192.168.1.156(192.168.1.156:3306)..
Tue Dec 31 14:57:33 2019 - [info] Executed CHANGE MASTER.
Tue Dec 31 14:57:33 2019 - [info] Slave started.
Tue Dec 31 14:57:33 2019 - [info] All new slave servers switched successfully.
Tue Dec 31 14:57:33 2019 - [info]
Tue Dec 31 14:57:33 2019 - [info] * Phase 5: New master cleanup phase..
Tue Dec 31 14:57:33 2019 - [info]
Tue Dec 31 14:57:33 2019 - [info] 192.168.1.156: Resetting slave info succeeded.
Tue Dec 31 14:57:33 2019 - [info] Switching master to 192.168.1.156(192.168.1.156:3306) completed successfully.
5.2 自动切换
master dead后,MHA当时已经开启,候选Master库(Slave)会自动failover为Master.
六、MHA日常操作
MHA Manager 端日常主要操作步骤

① 检查是否有下列文件,有则删除。
发生主从切换后,MHAmanager服务会自动停掉,且在manager_workdir目录下面生成文件app1.failover.complete,若要启动MHA,必须先确保无此文件)
# ll /var/log/mha/app1/app1.failover.complete
# ll /var/log/mha/app1/app1.failover.error
② 检查MHA当前置:
# masterha_check_repl --conf=/etc/masterha.cnf
③ 启动MHA:
# nohup masterha_manager --conf=/etc/masterha.cnf > /var/log/mha/app1/mha_manager.log 2>&1 &
当有slave 节点宕掉时,默认是启动不了的,加上 --ignore_fail_on_start 即使有节点宕掉也能启动MHA,如下:
#--last_failover_minute=(minutes):
当最近的一个failover 切换发生在last_failover_minute(默认为8小时) 之内,MHA manager 将不会在切换。因为它会认为有些问题没有得到解决。如果设置了 --ignore_last_failover 参数,参数(--last_failover_minute) 将会失效
--ignore_last_failover:
如果最近failover 失败,MHA 将不会再次开始failover机制,因为这个问题可能再次发生。常规步骤:手动清理failover 错误文件,此文件一般在manager_workdir/app_name.failover.error文件,然后在启动failover机制。如果设置此参数,MHA 将会继续failover 不管上次的failover状态
--wait_on_failover_error=(seconds):
在failover的过程,当发出错误了,masterha_manager 等待 wait_no_failover_error 的时间后,退出。如果设置为了0,直接退出。这个好处,是当后台运行master monitor 和 failover scripts的时候,masterha_manager 可以在 wait_no_failover_error 时间到达之前重启监控
--remove_dead_master_conf:
如果设置此参数,当成功failover后,MHA manager将会自动删除配置文件中关于dead master的配置选项。
# nohup masterha_manager --conf=/etc/masterha.cnf --ignore_fail_on_start >/var/log/mha/app1/mha_manager.log 2>&1 &
④ 检查状态:
# masterha_check_status --conf=/etc/masterha.cnf
⑤ 检查日志:
#tail -f /var/log/mha/app1/manager.log
⑥ 主从切换后续工作
主库切换后,把原主库修复成新从库,然后重新执行以上5步。原主库数据文件完整的情况下,可通过以下方式找出最后执行的CHANGE MASTER命令:
# grep "CHANGE MASTER TO MASTER" /var/log/mha/app1/manager.log | tail -1
CHANGE MASTER TO MASTER_HOST='10.1.5.9',MASTER_PORT=3306, MASTER_LOG_FILE='master-bin.000001', MASTER_LOG_POS=120,MASTER_USER='repl_user', MASTER_PASSWORD='xxx';
--最后启动新从库
# start slave;
# show slave status\G
purge_relay_logs -h
corntab -e
0 5 * * * /usr/bin/purge_relay_logs --user=root --password=970125 --port=3306 --disable_relay_log_purge >> /var/log/purge_relay.log 2>&1
七、附录
7.1 相关命令说明
Manager命令说明
masterha_check_ssh 检查MHA的SSH配置状况
masterha_check_repl 检查MySQL复制状况
masterha_manger 启动MHA
masterha_check_status 检测当前MHA运行状态
masterha_master_monitor 检测master是否宕机
masterha_master_switch 控制故障转移(自动或者手动)
masterha_conf_host 添加或删除配置的server信息
Node命令说明
save_binary_logs 保存和复制master的二进制日志
apply_diff_relay_logs 识别差异的中继日志事件并将其差异的事件应用于其他的slave
filter_mysqlbinlog 去除不必要的ROLLBACK事件(MHA已不再使用这个工具)
purge_relay_logs 清除中继日志(不会阻塞SQL线程)
7.2 master_ip_failover脚本代码
要想把keepalived服务引入MHA,我们只需要修改切换是触发的脚本文件master_ip_failover即可,在该脚本中添加在master发生宕机时对keepalived的处理。
在管理节点编辑脚本/usr/bin/master_ip_failover,修改后如下:
#!/usr/bin/env perl
use strict;
use warnings FATAL => 'all';
use Getopt::Long;
my (
$command, $ssh_user, $orig_master_host, $orig_master_ip,
$orig_master_port, $new_master_host, $new_master_ip, $new_master_port
);
my $vip = '192.168.1.1';
my $ssh_start_vip ="/etc/init.d/keepalived start";
my $ssh_stop_vip ="/etc/init.d/keepalived stop";
GetOptions(
'command=s' =>\$command,
'ssh_user=s' =>\$ssh_user,
'orig_master_host=s' => \$orig_master_host,
'orig_master_ip=s' =>\$orig_master_ip,
'orig_master_port=i' => \$orig_master_port,
'new_master_host=s' =>\$new_master_host,
'new_master_ip=s' =>\$new_master_ip,
'new_master_port=i' =>\$new_master_port,
);
exit &main();
sub main {
print "\n\nIN SCRIPTTEST====$ssh_stop_vip==$ssh_start_vip===\n\n";
if ( $command eq "stop" || $command eq "stopssh" ) {
my $exit_code = 1;
eval {
print "Disabling the VIP on old master: $orig_master_host \n";
&stop_vip();
$exit_code = 0;
};
if ($@) {
warn "Got Error: $@\n";
exit $exit_code;
}
exit $exit_code;
}
elsif ( $command eq "start" ) {
my $exit_code = 10;
eval {
print "Enabling the VIP - $vip on the new master - $new_master_host\n";
&start_vip();
$exit_code = 0;
};
if ($@) {
warn $@;
exit $exit_code;
}
exit $exit_code;
}
elsif ( $command eq "status" ) {
print "Checking the Status of the script.. OK \n";
exit 0;
}
else {
&usage();
exit 1;
}
}
sub start_vip() {
`ssh $ssh_user\@$new_master_host \" $ssh_start_vip \"`;
}
# A simple system call that disable the VIPon the old_master
sub stop_vip() {
`ssh $ssh_user\@$orig_master_host \" $ssh_stop_vip \"`;
}
sub usage {
print
"Usage: master_ip_failover --command=start|stop|stopssh|status--orig_master_host=host --orig_master_ip=ip --orig_master_port=port--new_master_host=host --new_master_ip=ip --new_master_port=port\n";
}
授予管理权限
chmod +x /usr/bin/master_ip_failover
** 注意:my $vip = ‘10.192.203.108’;这一行中的vip请根据情况改成自己的虚拟IP。
7.3 master_ip_online_change脚本代码
在管理节点编辑脚本/usr/bin/master_ip_online_change
#!/usr/bin/env perl
# Copyright (C) 2011 DeNA Co.,Ltd.
#
# This program is free software; you can redistribute it and/or modify
# itunder the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,
# 51Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
## Note: This is a sample script and is notcomplete. Modify the script based on your environment.
use strict;
use warnings FATAL => 'all';
use Getopt::Long;
use MHA::DBHelper;
use MHA::NodeUtil;
use Time::HiRes qw( sleep gettimeofdaytv_interval );
use Data::Dumper;
my $_tstart;
my $_running_interval = 0.1;
my (
$command, $orig_master_is_new_slave, $orig_master_host,
$orig_master_ip, $orig_master_port, $orig_master_user,
$orig_master_password, $orig_master_ssh_user, $new_master_host,
$new_master_ip, $new_master_port, $new_master_user,
$new_master_password, $new_master_ssh_user
);
my $vip = '192.168.1.1/32';
my $key = '1';
my $ssh_start_vip = "/etc/init.d/keepalived start";
my $ssh_stop_vip = "/etc/init.d/keepalived stop";
my $orig_master_ssh_port = 22;
my $new_master_ssh_port = 22;
GetOptions(
'command=s' =>\$command,
'orig_master_is_new_slave' => \$orig_master_is_new_slave,
'orig_master_host=s' =>\$orig_master_host,
'orig_master_ip=s' =>\$orig_master_ip,
'orig_master_port=i' =>\$orig_master_port,
'orig_master_user=s' =>\$orig_master_user,
'orig_master_password=s' =>\$orig_master_password,
'orig_master_ssh_user=s' =>\$orig_master_ssh_user,
'new_master_host=s' =>\$new_master_host,
'new_master_ip=s' =>\$new_master_ip,
'new_master_port=i' =>\$new_master_port,
'new_master_user=s' =>\$new_master_user,
'new_master_password=s' =>\$new_master_password,
'new_master_ssh_user=s' =>\$new_master_ssh_user,
'orig_master_ssh_port=i' =>\$orig_master_ssh_port,
'new_master_ssh_port=i' =>\$new_master_ssh_port,
);
exit &main();
sub current_time_us {
my( $sec, $microsec ) = gettimeofday();
my$curdate = localtime($sec);
return $curdate . " " . sprintf( "%06d", $microsec);
}
sub sleep_until {
my$elapsed = tv_interval($_tstart);
if( $_running_interval > $elapsed ) {
sleep( $_running_interval - $elapsed );
}
}
sub get_threads_util {
my$dbh = shift;
my$my_connection_id = shift;
my$running_time_threshold = shift;
my$type = shift;
$running_time_threshold = 0 unless ($running_time_threshold);
$type = 0 unless($type);
my@threads;
my$sth = $dbh->prepare("SHOW PROCESSLIST");
$sth->execute();
while ( my $ref = $sth->fetchrow_hashref() ) {
my $id = $ref->{Id};
my $user = $ref->{User};
my $host = $ref->{Host};
my $command =$ref->{Command};
my $state = $ref->{State};
my $query_time = $ref->{Time};
my $info = $ref->{Info};
$info =~ s/^\s*(.*?)\s*$/$1/ if defined($info);
next if ( $my_connection_id == $id );
next if ( defined($query_time) && $query_time <$running_time_threshold );
next if ( defined($command) && $command eq "Binlog Dump" );
next if ( defined($user) && $user eq "system user" );
next
if ( defined($command)
&& $command eq "Sleep"
&& defined($query_time)
&& $query_time >= 1 );
if ( $type >= 1 ) {
next if ( defined($command) && $command eq "Sleep" );
next if ( defined($command) && $command eq "Connect");
}
if ( $type >= 2 ) {
next if ( defined($info) && $info =~ m/^select/i );
next if ( defined($info) && $info =~ m/^show/i );
}
push @threads, $ref;
}
return @threads;
}
sub main {
if( $command eq "stop" ) {
## Gracefully killing connections on the current master
#1. Set read_only= 1 on the new master
#2. DROP USER so that no app user can establish new connections
#3. Set read_only= 1 on the current master
#4. Kill current queries
#* Any database access failure will result in script die.
my $exit_code = 1;
eval {
## Setting read_only=1 on the new master (to avoid accident)
my $new_master_handler = new MHA::DBHelper();
# args: hostname, port, user, password, raise_error(die_on_error)_or_not
$new_master_handler->connect( $new_master_ip, $new_master_port,
$new_master_user, $new_master_password, 1 );
print current_time_us() . " Set read_only on the new master..";
$new_master_handler->enable_read_only();
if ( $new_master_handler->is_read_only() ) {
print "ok.\n";
}
else {
die "Failed!\n";
}
$new_master_handler->disconnect();
# Connecting to the orig master, die if any database error happens
my $orig_master_handler = new MHA::DBHelper();
$orig_master_handler->connect( $orig_master_ip, $orig_master_port,
$orig_master_user, $orig_master_password, 1 );
## Drop application user so that nobody can connect. Disablingper-session binlog beforehand
$orig_master_handler->disable_log_bin_local();
print current_time_us() . " Drpping appuser on the orig master..\n";
#FIXME_xxx_drop_app_user($orig_master_handler);
## Waiting for N * 100 milliseconds so that current connections can exit
my $time_until_read_only = 15;
$_tstart = [gettimeofday];
my @threads = get_threads_util( $orig_master_handler->{dbh},
$orig_master_handler->{connection_id} );
while ( $time_until_read_only > 0 && $#threads >= 0 ) {
if ( $time_until_read_only % 5 == 0 ) {
printf
"%s Waiting all running %d threads aredisconnected.. (max %d milliseconds)\n",
current_time_us(), $#threads + 1, $time_until_read_only * 100;
if ( $#threads < 5 ) {
print Data::Dumper->new( [$_] )->Indent(0)->Terse(1)->Dump ."\n"
foreach (@threads);
}
}
sleep_until();
$_tstart = [gettimeofday];
$time_until_read_only--;
@threads = get_threads_util( $orig_master_handler->{dbh},
$orig_master_handler->{connection_id} );
}
## Setting read_only=1 on the current master so that nobody(exceptSUPER) can write
print current_time_us() . " Set read_only=1 on the orig master..";
$orig_master_handler->enable_read_only();
if ( $orig_master_handler->is_read_only() ) {
print "ok.\n";
}
else {
die "Failed!\n";
}
## Waiting for M * 100 milliseconds so that current update queries cancomplete
my $time_until_kill_threads = 5;
@threads = get_threads_util( $orig_master_handler->{dbh},
$orig_master_handler->{connection_id} );
while ( $time_until_kill_threads > 0 && $#threads >= 0 ) {
if ( $time_until_kill_threads % 5 == 0 ) {
printf
"%s Waiting all running %d queries aredisconnected.. (max %d milliseconds)\n",
current_time_us(), $#threads + 1, $time_until_kill_threads * 100;
if ( $#threads < 5 ) {
print Data::Dumper->new( [$_] )->Indent(0)->Terse(1)->Dump ."\n"
foreach (@threads);
}
}
sleep_until();
$_tstart = [gettimeofday];
$time_until_kill_threads--;
@threads = get_threads_util( $orig_master_handler->{dbh},
$orig_master_handler->{connection_id} );
}
## Terminating all threads
print current_time_us() . " Killing all applicationthreads..\n";
$orig_master_handler->kill_threads(@threads) if ( $#threads >= 0);
print current_time_us() . " done.\n";
$orig_master_handler->enable_log_bin_local();
$orig_master_handler->disconnect();
## After finishing the script, MHA executes FLUSH TABLES WITH READ LOCK
eval {
`ssh -p$orig_master_ssh_port $orig_master_ssh_user\@$orig_master_host\" $ssh_stop_vip \"`;
};
if ($@) {
warn $@;
}
$exit_code = 0;
};
if ($@) {
warn "Got Error: $@\n";
exit $exit_code;
}
exit $exit_code;
}
elsif ( $command eq "start" ) {
## Activating master ip on the new master
#1. Create app user with write privileges
#2. Moving backup script if needed
#3. Register new master's ip to the catalog database
# We don't return error even thoughactivating updatable accounts/ip failed so that we don't interrupt slaves'recovery.
# If exit code is 0 or 10, MHA does notabort
my $exit_code = 10;
eval {
my $new_master_handler = new MHA::DBHelper();
# args: hostname, port, user, password, raise_error_or_not
$new_master_handler->connect( $new_master_ip, $new_master_port,
$new_master_user, $new_master_password, 1 );
## Set read_only=0 on the new master
$new_master_handler->disable_log_bin_local();
print current_time_us() . " Setread_only=0 on the new master.\n";
$new_master_handler->disable_read_only();
## Creating an app user on the new master
print current_time_us() . " Creating app user on the newmaster..\n";
#FIXME_xxx_create_app_user($new_master_handler);
$new_master_handler->enable_log_bin_local();
$new_master_handler->disconnect();
## Update master ip on the catalog database, etc
`ssh -p$new_master_ssh_port $new_master_ssh_user\@$new_master_host\" $ssh_start_vip \"`;
$exit_code = 0;
};
if ($@) {
warn "Got Error: $@\n";
exit $exit_code;
}
exit $exit_code;
}
elsif ( $command eq "status" ) {
#do nothing
exit 0;
}
else {
&usage();
exit 1;
}
}
sub usage {
print
"Usage: master_ip_online_change--command=start|stop|status --orig_master_host=host --orig_master_ip=ip--orig_master_port=port --new_master_host=host --new_master_ip=ip--new_master_port=port\n";
die;
}
授予管理权限:
chmod +x /usr/bin/master_ip_online_change
注意:需要把my $vip = ‘10.192.203.108/32’;改成自己的vip即可。
7.4 send_report脚本代码
#!/usr/bin/perl
# Copyright (C) 2011 DeNA Co.,Ltd.
#
# This program is free software; you can redistribute it and/or modify
# itunder the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,
# 51Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
## Note: This is a sample script and is notcomplete. Modify the script based on your environment.
use strict;
use warnings FATAL => 'all';
use Mail::Sender;
use Getopt::Long;
#new_master_host and new_slave_hosts areset only when recovering master succeeded
my ( $dead_master_host, $new_master_host, $new_slave_hosts,$subject, $body );
my $smtp='smtp.163.com';
my $mail_from='[email protected]';
my $mail_user='[email protected]';
my $mail_pass='Password';
my$mail_to=['[email protected]','[email protected]'];
GetOptions(
'orig_master_host=s' => \$dead_master_host,
'new_master_host=s' =>\$new_master_host,
'new_slave_hosts=s' =>\$new_slave_hosts,
'subject=s' =>\$subject,
'body=s' => \$body,
);
mailToContacts($smtp,$mail_from,$mail_user,$mail_pass,$mail_to,$subject,$body);
sub mailToContacts {
my ( $smtp, $mail_from, $user, $passwd, $mail_to, $subject, $msg ) = @_;
open my $DEBUG, "> /tmp/monitormail.log"
or die "Can't open the debug file:$!\n";
my $sender = new Mail::Sender {
ctype => 'text/plain;charset=utf-8',
encoding => 'utf-8',
smtp => $smtp,
from => $mail_from,
auth => 'LOGIN',
TLS_allowed => '0',
authid => $user,
authpwd => $passwd,
to => $mail_to,
subject => $subject,
debug => $DEBUG
};
$sender->MailMsg(
{ msg => $msg,
debug => $DEBUG
}
)or print $Mail::Sender::Error;
return 1;
}
# Do whatever you want here
exit 0;
注意:需要修改下以下几行内容:
my $smtp='smtp.163.com';
my $mail_from='[email protected]';
my $mail_user='[email protected]';
my $mail_pass='Password';
my$mail_to=['[email protected]'];
赋予执行权限
chmod +x /usr/bin/send_report
**注意:需要确保管理节点服务器可以正常发送邮件。可以先用sendEmail命令试下。