1. 安装
1.1 安装要求
- Java8:安装请参考《centos7同时安装java8和openJdk11》、《windows同时安装java8和openJdk11》
- Python3:安装请参考《centos7同时安装Python2和Python3》
1.2 下载解压
[root@bigdata001 opt]# wget https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/20220530/datax.tar.gz
[root@bigdata001 opt]#
[root@bigdata001 opt]# tar -zxvf datax.tar.gz
[root@bigdata001 opt]#
[root@bigdata001 opt]# cd datax
[root@bigdata001 datax]#
2. 运行示例同步程序
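下面运行自带的示例同步任务job/job.json。reader(streamreader)的sliceRecordCount设置为100000(10万),模拟产生内容一样的数据,每条记录有5个字段;writer(streamwriter)的print设置为false,因此数据会被输出但不会打印到控制台。FrameWork的setting部分设置了同步错误容忍率(errorLimit)和每秒字节流量限制(speed.byte)。从下面的运行结果可以看到,本次任务共读出1000000条记录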
[root@bigdata001 datax]# bin/datax.py job/job.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
......省略部分......
2022-06-14 09:47:06.779 [main] INFO Engine -
{
"content":[
{
"reader":{
"parameter":{
"column":[
{
"type":"string",
"value":"DataX"
},
{
"type":"long",
"value":19890604
},
{
"type":"date",
"value":"1989-06-04 00:00:00"
},
{
"type":"bool",
"value":true
},
{
"type":"bytes",
"value":"test"
}
],
"sliceRecordCount":100000
},
"name":"streamreader"
},
"writer":{
"parameter":{
"print":false,
"encoding":"UTF-8"
},
"name":"streamwriter"
}
}
],
"setting":{
"errorLimit":{
"record":0,
"percentage":0.02
},
"speed":{
"byte":10485760
}
}
}
......省略部分......
2022-06-14 09:47:16.912 [job-0] INFO JobContainer -
任务启动时刻 : 2022-06-14 09:47:06
任务结束时刻 : 2022-06-14 09:47:16
任务总计耗时 : 10s
任务平均流量 : 2.48MB/s
记录写入速度 : 100000rec/s
读出记录总数 : 1000000
读写失败总数 : 0
[root@bigdata001 datax]#
3. 查看数据同步模板
如果想把一个数据源的数据同步到另一个数据源,可以通过如下命令(-r指定reader,-w指定writer),查看对应的数据同步配置模板
[root@bigdata001 datax]# bin/datax.py -r mysqlreader -w hdfswriter
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
Please refer to the mysqlreader document:
https://github.com/alibaba/DataX/blob/master/mysqlreader/doc/mysqlreader.md
Please refer to the hdfswriter document:
https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md
Please save the following configuration as a json file and use
python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json
to run the job.
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"column": [],
"connection": [
{
"jdbcUrl": [],
"table": []
}
],
"password": "",
"username": "",
"where": ""
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"column": [],
"compress": "",
"defaultFS": "",
"fieldDelimiter": "",
"fileName": "",
"fileType": "",
"path": "",
"writeMode": ""
}
}
}
],
"setting": {
"speed": {
"channel": ""
}
}
}
}
[root@bigdata001 datax]#
其中channel表示该job总的并发数
4. 同步速度控制参数
DataX提供了并发数量(channel)、字节流(byte,每秒同步字节数)、记录流(record,每秒同步记录数)三种流控模式,在setting的speed中配置
"speed": {
"channel": 5,
"byte": 1048576,
"record": 10000
}