Dynamic Routing for a TCP Proxy Server

The current solution for long-lived TCP connections is nginx (ngx_stream_core_module) with static configuration. Its built-in health check is port-based only, but the application sometimes crashes or hangs while the port stays open, which causes plenty of client errors. The new solution must therefore support dynamic routing, with an external health-check/monitoring program managing the dynamic backend-node configuration.
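
To make the gap concrete, here is a minimal sketch of the kind of application-level probe the monitoring program might run (the host/port and the SSH-banner check are illustrative assumptions; the real probe depends on the application protocol):

#!/bin/sh
# A port accepting TCP connections is not enough: a hung process can still
# hold the listener open. Require a protocol-level response (here, an SSH
# banner within 3 seconds) before calling the node healthy.
if echo | timeout 3 nc 10.40.20.201 22 | grep -q '^SSH-'; then
    echo "10.40.20.201 healthy"
else
    echo "10.40.20.201 unhealthy"
fi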

For HTTP, dynamic routing (also called dynamic upstream) options abound: there are ready-made plugins, and Lua supports it well. TCP options are much scarcer. After thorough research, the two workable solutions below stand out.

1. Nginx/OpenResty with Lua libraries and a hand-written Lua script
What the script implements:

  • On nginx startup, the server configuration is initialized into redis as a hash: {ip1:0, ip2:0, ip3:0, …} (0 = OK, 1 = unavailable; see the redis-cli sketch after this list);
  • Each request obtains a server in round-robin fashion (the server list is read from redis); redis connections are pooled;
  • If the redis connection fails, or the key holds no usable server (the hash is empty, or every value is 1), the local static configuration is used instead;
  • If the chosen server is unreachable (IP or port down, or too busy to respond), other nodes are tried; the number of tries is configurable;
  • The automatic eviction provided by the static configuration's port health check is not reimplemented, for two reasons: 1) it would conflict with the external, standalone monitoring program; 2) it would require a timer started in nginx's init phase, where the socket API needed to reach redis is unavailable, and no suitable workaround was found.
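
With this design, taking a node in or out of rotation only requires the external monitoring program to flip one value in the hash; a minimal sketch, assuming the redis endpoint (10.40.16.45:36379) and key (upstream_denghaoqi) used by the script below:

# mark 10.40.20.202 unavailable after a failed application-level check
redis-cli -h 10.40.16.45 -p 36379 HSET upstream_denghaoqi 10.40.20.202 1
# inspect the state of all nodes
redis-cli -h 10.40.16.45 -p 36379 HGETALL upstream_denghaoqi
# put the node back once it recovers
redis-cli -h 10.40.16.45 -p 36379 HSET upstream_denghaoqi 10.40.20.202 0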

Notes on the script:

Under high concurrency, pay attention to variable usage (avoid globals wherever possible) and to the control flow, in particular the round-robin selection and the failure-retry logic.

stream {
    lua_package_path "/usr/local/lib/lua/?.lua;;";
    lua_shared_dict  dict_ups 4m;
    lua_shared_dict  dict_try 10m;
    lua_shared_dict  rotate_lock 100k;
    lua_add_variable $dhq_proxypass;
    lua_add_variable $backend_server;
    lua_add_variable $try_cnt;
    lua_add_variable $dhq_conn_cnt;

    log_format  main  '$remote_addr [$time_local] $server_addr:$server_port $dhq_proxypass $dhq_conn_cnt $try_cnt $backend_server $status';
    access_log  logs/access.log  main;

    upstream mytcp_static {        
        server 10.40.20.201:22 max_fails=3 fail_timeout=5s;
        server 10.40.20.202:22 max_fails=3 fail_timeout=5s;
        server 10.40.20.203:22 max_fails=3 fail_timeout=5s;
        server 10.40.20.204:22 max_fails=3 fail_timeout=5s;
    }

    upstream mytcp_lua {
        # placeholder only; never used, since balancer_by_lua_block picks the real peer
        server 1.1.1.1:1111;
    
        balancer_by_lua_block {
            local backend_port = 22
            -- per-connection retry counter, keyed by the connection sequence number
            local try_cnt = ngx.shared.dict_try:get("conn" .. ngx.var.dhq_conn_cnt)
            if not try_cnt or try_cnt > 16 then
                return
            end
            local balancer = require "ngx.balancer"
            balancer.set_timeouts(3, 3, 3)
            balancer.set_more_tries(4)
            -- g_ups_cur_dhq / g_ups_dhq_active_table are per-worker globals set in preread
            if g_ups_cur_dhq then
                local state_name, status_code = balancer.get_last_failure()
                if state_name == nil then
                    -- first attempt: use the peer preselected in the preread phase
                    balancer.set_current_peer(g_ups_cur_dhq, backend_port)
                    ngx.var.backend_server = g_ups_cur_dhq .. ":" .. backend_port
                else
                    -- a previous attempt failed: rotate to the next active peer
                    local table_len = #g_ups_dhq_active_table
                    local ups_cur_dhq = g_ups_dhq_active_table[(try_cnt - 1) % table_len + 1]
                    balancer.set_current_peer(ups_cur_dhq, backend_port)
                    ngx.var.backend_server = ups_cur_dhq .. ":" .. backend_port
                    try_cnt = try_cnt + 1
                    ngx.shared.dict_try:set("conn" .. ngx.var.dhq_conn_cnt, try_cnt)
                end
                ngx.var.try_cnt = try_cnt
            else
                ngx.log(ngx.ERR, "[error]: no server in upstream. ")
                return
            end
        }
    }

    server {
        listen  12345;
        proxy_connect_timeout  3s;
        proxy_timeout  120s;
        proxy_next_upstream_tries 5;

        preread_by_lua_block {
            -- static server table seeded into redis on startup (0 = OK)
            ups_dhq_table = {}
            ups_dhq_table["10.40.20.201"] = 0
            ups_dhq_table["10.40.20.202"] = 0
            ups_dhq_table["10.40.20.203"] = 0
            ups_dhq_table["10.40.20.204"] = 0

            local ups_name = "mytcp"
            local redis_ups_key = "upstream_denghaoqi"
            ngx.var.dhq_proxypass = ups_name .. "_lua"

            local function func_get_redis()
                local Redis = require "resty.redis"
                local redis = Redis:new()
                local pool_options = { pool_size = 300, backlog = 20000 }
                redis:set_timeout(3000)
                local ok, err = redis:connect("10.40.16.45", 36379, pool_options)
                if not ok then
                    ngx.var.dhq_proxypass = ups_name .. "_static"
                    ngx.log(ngx.ERR, "connect to redis failed, ", err)
                    return
                end
                return redis
            end

            -- shared-dict counter: numbers every connection and drives the rotation
            local dhq_conn_cnt, err = ngx.shared.dict_ups:incr("dhq_conn_cnt", 1, 0, 0)
            ngx.var.dhq_conn_cnt = dhq_conn_cnt

            -- seed redis with the static server table on the very first connection
            if (dhq_conn_cnt == 1) then
                local redis = func_get_redis()
                if redis == nil then 
                    return 
                end
                local ok, err = redis:del(redis_ups_key)
                local ok, err = redis:hmset(redis_ups_key, ups_dhq_table)
                -- local ok, err = redis:close()
                redis:set_keepalive(30000, 300)
            end
 
            -- get a server in rotation
            local redis = func_get_redis()
            if redis == nil then 
                return
            end
            local res, err = redis:hgetall(redis_ups_key)
            if not res then
                ngx.var.dhq_proxypass = ups_name .. "_static"
                ngx.log(ngx.ERR, "get redis key failed: ", err)
                return
            end
            redis:set_keepalive(30000, 300)
            -- hgetall returns a flat array; convert it to { ip = state, ... }
            local ups_hash = redis:array_to_hash(res)
            local ups_dhq_active_table = {}
            for key, value in pairs(ups_hash) do
                if value == "0" then
                    table.insert(ups_dhq_active_table, key)
                end
            end
            if #ups_dhq_active_table == 0 then
                ngx.var.dhq_proxypass = ups_name .. "_static"
                ngx.log(ngx.ERR, "redis key has no valid server. ")
                return
            end
            -- sort so every worker rotates over the same ordering
            table.sort(ups_dhq_active_table)
            local ind = (dhq_conn_cnt - 1) % #ups_dhq_active_table + 1
            g_ups_cur_dhq = ups_dhq_active_table[ind]
            g_ups_dhq_active_table = ups_dhq_active_table
            ngx.shared.dict_try:set("conn" .. dhq_conn_cnt, 1)
        }
        proxy_pass $dhq_proxypass;
    }
}
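
A quick way to exercise the static fallback path, assuming the endpoints above (the ssh target is illustrative; any TCP client of the backend protocol works):

# take every node out of rotation; new connections should be logged with mytcp_static
for ip in 10.40.20.201 10.40.20.202 10.40.20.203 10.40.20.204; do
    redis-cli -h 10.40.16.45 -p 36379 HSET upstream_denghaoqi $ip 1
done
# open a test connection through the proxy and check logs/access.log
ssh -p 12345 user@<proxy host>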

2. HAProxy with dataplaneapi from the HAProxy ecosystem
Notes:
dataplaneapi implements a RESTful API through which servers can be added or removed via friendly endpoints. dataplaneapi is deployed on the same machine as haproxy in a 1:1 relationship, so the API of every haproxy node must be operated on separately;
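
In practice this means the monitoring program repeats every API call once per proxy node; a trivial sketch (the host list is illustrative):

# query each haproxy node's dataplaneapi separately (hosts are examples)
for host in 10.40.20.203 10.40.20.204; do
    curl -s -u admin:admin -H "Content-Type: application/json" \
        "http://$host:5555/v2/services/haproxy/configuration/servers?backend=test-proxy-srv"
done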

After hand-editing the haproxy configuration file, force dataplaneapi to refresh its cache by sending SIGUSR2 to the dataplaneapi process (e.g. kill -SIGUSR2 $(pidof dataplaneapi)), or by restarting dataplaneapi.

API usage notes:
Adding or removing a server requires opening a transaction;
after the transaction is committed, haproxy reloads automatically and the configuration file is rewritten.

Starting the dataplaneapi service:

/root/dataplaneapi/dataplaneapi-master/build/dataplaneapi --host 10.40.20.203 --port 5555 -b /usr/local/haproxy/sbin/haproxy -c /usr/local/haproxy/conf/haproxy.cfg -d 5 -r "/usr/local/haproxy/haproxy_mgr.sh restart" -s "/usr/local/haproxy/haproxy_mgr.sh reload" -u api -t /tmp/haproxy

Main steps:

Query the server list and obtain the current version
# curl -X GET -u admin:admin \
> -H "Content-Type: application/json" \
> "http://10.40.20.203:5555/v2/services/haproxy/configuration/servers?backend=test-proxy-srv"
{
    "_version":1,
    "data":[
        {"address":"10.40.20.208","check":"enabled","name":"10.40.20.208","port":222},
        {"address":"10.45.0.10","check":"enabled","name":"10.45.0.10","port":22,"weight":80},
        {"address":"10.45.0.11","check":"enabled","name":"10.45.0.11","port":22,"weight":80}
    ]
}
 
Start a transaction and obtain a transaction id
The version parameter is incremented based on the result of the previous step
# curl -X POST -u admin:admin \
> -H "Content-Type: application/json" \
> http://10.40.20.203:5555/v2/services/haproxy/transactions?version=2
{"_version":1,"id":"c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d","status":"in_progress"}
 
Delete a server
curl -X DELETE -u admin:admin \
-H "Content-Type: application/json" \
"http://10.40.20.203:5555/v2/services/haproxy/configuration/servers/10.40.20.207?backend=test-proxy-srv&transaction_id=c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d"

Add a server
curl -X POST -u admin:admin \
-H "Content-Type: application/json" \
--data '{"address": "10.45.0.11", "check": "enabled", "max-connections": 500, "name": "10.45.0.11", "port": 22, "weight": 80}' \
"http://10.40.20.203:5555/v2/services/haproxy/configuration/servers?backend=test-proxy-srv&transaction_id=c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d "

 
Commit the transaction
curl -X PUT -u admin:admin \
-H "Content-Type: application/json" \
http://10.40.20.203:5555/v2/services/haproxy/transactions/c69fa5fe-8dc7-4c85-8912-0ac86b3ad59d
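
For scripted use, the four steps can be chained together; a sketch assuming jq is installed, using the example host, backend, and credentials above, and passing the queried _version as the transaction version (adjust per the versioning note above):

#!/bin/sh
# Sketch: add one server to backend test-proxy-srv in a single transaction.
# 10.45.0.12 is a hypothetical new server.
API="http://10.40.20.203:5555/v2/services/haproxy"
AUTH="-u admin:admin"
HDR="Content-Type: application/json"

# 1. read the current configuration version
VERSION=$(curl -s $AUTH -H "$HDR" \
    "$API/configuration/servers?backend=test-proxy-srv" | jq '._version')

# 2. open a transaction against that version
TXN=$(curl -s -X POST $AUTH -H "$HDR" "$API/transactions?version=$VERSION" | jq -r '.id')

# 3. stage the change inside the transaction
curl -s -X POST $AUTH -H "$HDR" \
    --data '{"address": "10.45.0.12", "check": "enabled", "name": "10.45.0.12", "port": 22}' \
    "$API/configuration/servers?backend=test-proxy-srv&transaction_id=$TXN"

# 4. commit; haproxy reloads and the configuration file is rewritten automatically
curl -s -X PUT $AUTH -H "$HDR" "$API/transactions/$TXN"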

3. Comparing the two solutions

| | Nginx + lua | Haproxy + dataplaneapi |
| --- | --- | --- |
| Stability | Hand-written Lua; it has passed broad functional and performance testing, but stability remains to be proven in production | Built-in features, so stability is better assured |
| Demands on the monitoring program | Only one redis key to adjust, which keeps the program simple | Requires learning the API and handling each of the N nodes separately |
| Demands on operations | Introduces nontrivial Lua programming (especially failures and performance issues under high concurrency), which needs further optimization | Introduces haproxy as a new component that operations must study |
