定位问题
同事用DataX同步数据到ODPS(MaxCompute)的时候,出现了脏数据。
1,找到该条脏数据的主键,值得一提的是,有些时候配置的JSON,主键的顺序在比较靠后的位置,所以需要把主键放在前面。这样报错日志,才会把主键打印出来,至于为什么后面会讲。
2,去源端数据库查询这条记录,发现有部分字段乱码。
3,通过CAST函数、TO_CHAR函数、CONVERT函数、DUMP函数等,鉴定这个字段在存储上就是乱码,与字符集无关。而且该字段是Number(14,10)类型的,经过多次转换,发现该字段的值十六进制(DUMP(COL,'1016'))是bd,e,ed。很匪夷所思。
4,姑且认定是源端脏数据,但是为什么这条脏数据的记录会报错:源表的列个数小于目标表的列个数,源表列数是:9 目的表列数是:86 ,数目不匹配。。。。。。
5,于是定位源码问题
6,查看源码
OracleReader
/**
 * Reader task of the Oracle plugin. Every lifecycle step is forwarded to a
 * {@code CommonRdbmsReader.Task} that is created in {@link #init()}.
 */
public static class Task extends Reader.Task {

    /** Plugin job configuration of this task, loaded in {@link #init()}. */
    private Configuration readerSliceConfig;

    /** Delegate that performs the actual work. */
    private CommonRdbmsReader.Task commonRdbmsReaderTask;

    /** Loads the plugin job configuration, then creates and initialises the delegate with it. */
    @Override
    public void init() {
        this.readerSliceConfig = super.getPluginJobConf();

        CommonRdbmsReader.Task delegate = new CommonRdbmsReader.Task(
                DATABASE_TYPE, super.getTaskGroupId(), super.getTaskId());
        this.commonRdbmsReaderTask = delegate;
        delegate.init(this.readerSliceConfig);
    }

    /**
     * Reads the configured fetch size and hands the read over to the delegate.
     *
     * @param recordSender sender passed through to the delegate
     */
    @Override
    public void startRead(RecordSender recordSender) {
        int rowsPerFetch = this.readerSliceConfig.getInt(
                com.alibaba.datax.plugin.rdbms.reader.Constant.FETCH_SIZE);

        // <-- look here: the read itself happens in CommonRdbmsReader.Task#startRead
        this.commonRdbmsReaderTask.startRead(
                this.readerSliceConfig,
                recordSender,
                super.getTaskPluginCollector(),
                rowsPerFetch);
    }

    /** Forwards the post step to the delegate. */
    @Override
    public void post() {
        this.commonRdbmsReaderTask.post(this.readerSliceConfig);
    }

    /** Forwards the destroy step to the delegate. */
    @Override
    public void destroy() {
        this.commonRdbmsReaderTask.destroy(this.readerSliceConfig);
    }
}
CommonRdbmsReader.Task
/**
 * Runs the configured query and pushes every row of the result to the writer.
 *
 * <p>The connection is released in a {@code finally} block that also covers the
 * session configuration step, so a failure while configuring the session no
 * longer leaves the connection open. The result set is closed best-effort
 * before the connection.
 *
 * @param readerSliceConfig   slice configuration; {@code Key.QUERY_SQL} and {@code Key.TABLE} are read from it
 * @param recordSender        sender used to create records and send them to the writer
 * @param taskPluginCollector collector passed on to the per-row conversion
 * @param fetchSize           fetch size passed to {@code DBUtil.query}
 * @throws RuntimeException the exception built by {@code RdbmsException.asQueryException}
 *                          when the query or the row processing fails
 *                          (NOTE(review): assumed unchecked, since the method declares no
 *                          {@code throws} clause); failures of the session configuration
 *                          step propagate unchanged
 */
public void startRead(Configuration readerSliceConfig,
        RecordSender recordSender,
        TaskPluginCollector taskPluginCollector, int fetchSize) {
    String querySql = readerSliceConfig.getString(Key.QUERY_SQL);
    String table = readerSliceConfig.getString(Key.TABLE);

    PerfTrace.getInstance().addTaskDetails(taskId, table + "," + basicMsg);

    LOG.info("Begin to read record by Sql: [{}\n] {}.",
            querySql, basicMsg);
    PerfRecord queryPerfRecord = new PerfRecord(taskGroupId, taskId, PerfRecord.PHASE.SQL_QUERY);
    queryPerfRecord.start();

    Connection conn = DBUtil.getConnection(this.dataBaseType, jdbcUrl,
            username, password);

    ResultSet rs = null;
    try {
        // session config etc.; kept outside the query try/catch so that its
        // exceptions are not re-wrapped as query exceptions
        DBUtil.dealWithSessionConfig(conn, readerSliceConfig,
                this.dataBaseType, basicMsg);

        try {
            rs = DBUtil.query(conn, querySql, fetchSize);
            queryPerfRecord.end();

            ResultSetMetaData metaData = rs.getMetaData();
            int columnNumber = metaData.getColumnCount();

            // this record measures the pure ResultSet.next() time
            PerfRecord allResultPerfRecord = new PerfRecord(taskGroupId, taskId, PerfRecord.PHASE.RESULT_NEXT_ALL);
            allResultPerfRecord.start();

            long rsNextUsedTime = 0;
            long lastTime = System.nanoTime();
            while (rs.next()) {
                rsNextUsedTime += (System.nanoTime() - lastTime);
                // <-- look here: every row is converted and sent from this call
                this.transportOneRecord(recordSender, rs,
                        metaData, columnNumber, mandatoryEncoding, taskPluginCollector);
                lastTime = System.nanoTime();
            }

            allResultPerfRecord.end(rsNextUsedTime);
            // The dashboard currently depends on this log line; previously "Finish read record"
            // covered the whole time of the SQL query plus result next.
            LOG.info("Finished read record by Sql: [{}\n] {}.",
                    querySql, basicMsg);
        } catch (Exception e) {
            throw RdbmsException.asQueryException(this.dataBaseType, e, querySql, table, username);
        }
    } finally {
        if (rs != null) {
            try {
                rs.close();
            } catch (java.sql.SQLException ignored) {
                // best-effort: the connection is closed right below anyway
            }
        }
        DBUtil.closeDBResources(null, conn);
    }
}
transportOneRecord
/**
 * Builds one record from the current row of {@code rs} and sends it to the writer.
 *
 * @param recordSender        sender that creates the record and delivers it to the writer
 * @param rs                  result set positioned on the row to convert
 * @param metaData            metadata of {@code rs}
 * @param columnNumber        number of columns to read
 * @param mandatoryEncoding   encoding passed on to {@code buildRecord}
 * @param taskPluginCollector collector passed on to {@code buildRecord}
 * @return the record that was sent
 */
protected Record transportOneRecord(RecordSender recordSender, ResultSet rs,
        ResultSetMetaData metaData, int columnNumber, String mandatoryEncoding,
        TaskPluginCollector taskPluginCollector) {
    final Record built = this.buildRecord(
            recordSender, rs, metaData, columnNumber, mandatoryEncoding, taskPluginCollector);
    // whatever buildRecord returned is forwarded as-is
    recordSender.sendToWriter(built);
    return built;
}
buildRecord
/**
 * Converts the current row of {@code rs} into a record, one column at a time,
 * choosing the column wrapper from the JDBC type reported by {@code metaData}.
 *
 * <p>If reading any column throws, the record filled so far is reported through
 * {@code taskPluginCollector.collectDirtyRecord}; the remaining columns are not
 * read. A {@code DataXException} is rethrown, any other exception is not, and
 * in that case the partially filled record is returned.
 *
 * @param recordSender        used to create the empty record
 * @param rs                  result set positioned on the row to convert
 * @param metaData            metadata of {@code rs}
 * @param columnNumber        number of columns to read (1-based, inclusive)
 * @param mandatoryEncoding   when not blank, character columns are read as bytes and decoded with this encoding
 * @param taskPluginCollector receives the record when a column cannot be read
 * @return the record, possibly only partially filled (see above)
 */
protected Record buildRecord(RecordSender recordSender, ResultSet rs, ResultSetMetaData metaData, int columnNumber, String mandatoryEncoding,
        TaskPluginCollector taskPluginCollector) {
    Record record = recordSender.createRecord();

    try {
        for (int i = 1; i <= columnNumber; i++) {
            switch (metaData.getColumnType(i)) {

            case Types.CHAR:
            case Types.NCHAR:
            case Types.VARCHAR:
            case Types.LONGVARCHAR:
            case Types.NVARCHAR:
            case Types.LONGNVARCHAR:
                String rawData;
                if (StringUtils.isBlank(mandatoryEncoding)) {
                    rawData = rs.getString(i);
                } else {
                    // Read the column exactly once instead of once for the null check and
                    // once for the value. NOTE(review): a second read of a stream-backed
                    // column may fail or differ on some drivers; confirm for those in use.
                    byte[] rawBytes = rs.getBytes(i);
                    rawData = new String(rawBytes == null ? EMPTY_CHAR_ARRAY : rawBytes,
                            mandatoryEncoding);
                }
                record.addColumn(new StringColumn(rawData));
                break;

            case Types.CLOB:
            case Types.NCLOB:
                record.addColumn(new StringColumn(rs.getString(i)));
                break;

            case Types.SMALLINT:
            case Types.TINYINT:
            case Types.INTEGER:
            case Types.BIGINT:
                record.addColumn(new LongColumn(rs.getString(i)));
                break;

            case Types.NUMERIC:
            case Types.DECIMAL:
                record.addColumn(new DoubleColumn(rs.getString(i)));
                break;

            case Types.FLOAT:
            case Types.REAL:
            case Types.DOUBLE:
                record.addColumn(new DoubleColumn(rs.getString(i)));
                break;

            case Types.TIME:
                record.addColumn(new DateColumn(rs.getTime(i)));
                break;

            // for mysql bug, see http://bugs.mysql.com/bug.php?id=35115
            case Types.DATE:
                if (metaData.getColumnTypeName(i).equalsIgnoreCase("year")) {
                    record.addColumn(new LongColumn(rs.getInt(i)));
                } else {
                    record.addColumn(new DateColumn(rs.getDate(i)));
                }
                break;

            case Types.TIMESTAMP:
                record.addColumn(new DateColumn(rs.getTimestamp(i)));
                break;

            case Types.BINARY:
            case Types.VARBINARY:
            case Types.BLOB:
            case Types.LONGVARBINARY:
                record.addColumn(new BytesColumn(rs.getBytes(i)));
                break;

            // warn: bit(1) -> Types.BIT, BoolColumn can be used
            // warn: bit(>1) -> Types.VARBINARY, BytesColumn can be used
            case Types.BOOLEAN:
            case Types.BIT:
                record.addColumn(new BoolColumn(rs.getBoolean(i)));
                break;

            case Types.NULL:
                String stringData = null;
                if (rs.getObject(i) != null) {
                    stringData = rs.getObject(i).toString();
                }
                record.addColumn(new StringColumn(stringData));
                break;

            default:
                throw DataXException
                        .asDataXException(
                                DBUtilErrorCode.UNSUPPORTED_TYPE,
                                String.format(
                                        "您的配置文件中的列配置信息有误. 因为DataX 不支持数据库读取这种字段类型. 字段名:[%s], 字段名称:[%s], 字段Java类型:[%s]. 请尝试使用数据库函数将其转换datax支持的类型 或者不同步该字段 .",
                                        metaData.getColumnName(i),
                                        metaData.getColumnType(i),
                                        metaData.getColumnClassName(i)));
            }
        }
    } catch (Exception e) {
        if (IS_DEBUG) {
            LOG.debug("read data " + record.toString()
                    + " occur exception:", e);
        }
        // TODO is it reliable to classify this as dirty data here?
        // The record holds only the columns added before the failure.
        taskPluginCollector.collectDirtyRecord(record, e);
        if (e instanceof DataXException) {
            throw (DataXException) e;
        }
    }
    return record;
}
执行到这时发生异常
case Types.DECIMAL:
record.addColumn(new DoubleColumn(rs.getString(i)));
break;
于是考虑是否是getString的时候,结果集得不到这一列的值?
查看源码
T4CNumberAccessor.getString()
/**
 * Returns the text produced by the superclass, cut to {@code definedColumnSize}
 * characters when that size is positive and the text is longer than it.
 * A {@code null} result from the superclass is returned unchanged.
 */
String getString(int var1) throws SQLException {
    final String text = super.getString(var1);

    final boolean exceedsDefinedSize = text != null
            && this.definedColumnSize > 0
            && text.length() > this.definedColumnSize;

    return exceedsDefinedSize ? text.substring(0, this.definedColumnSize) : text;
}
父类NumberCommonAccessor
// Returns the value at index var1 as text, or null when the indicator entry for
// that index is -1. Decompiled code: local names (var2, var3, ...) are generated.
String getString(int var1) throws SQLException {
Object var2 = null;
if (this.rowSpaceIndicator == null) {
// No indicator array available: raise a SQLException built with error code 21.
SQLException var15 = DatabaseError.createSqlException(this.getConnectionDuringExceptionHandling(), 21);
var15.fillInStackTrace();
throw var15;
} else if (this.rowSpaceIndicator[this.indicatorIndex + var1] == -1) {
// Indicator -1: return var2, which is still null here.
return (String)var2;
} else {
byte[] var3 = this.rowSpaceByte;
// var4 is the offset of the first data byte; the byte just before it holds the length.
int var4 = this.columnIndex + this.byteLength * var1 + 1;
byte var5 = var3[var4 - 1];
// Copy the var5 data bytes of this value out of the shared buffer.
byte[] var6 = new byte[var5];
System.arraycopy(var3, var4, var6, 0, var5);
NUMBER var7 = new NUMBER(var6);
String var8 = NUMBER.toString(var6);
// var9 is the text length, not counting the leading zero of "0." / "-0.".
int var9 = var8.length();
if (var8.startsWith("0.") || var8.startsWith("-0.")) {
--var9;
}
if (var9 <= 38) {
/* ----------- here ---------*/
// Short enough: format through NUMBER.toText with 38 as first argument and trim.
return var7.toText(38, (String)null).trim();
} else {
// Otherwise format with -44 and rebuild the text as <mantissa>E<exponent>.
var8 = var7.toText(-44, (String)null);
// 69 is 'E', 43 is '+', 101 is 'e'.
int var10 = var8.indexOf(69);
int var11 = var8.indexOf(43);
if (var10 == -1) {
var10 = var8.indexOf(101);
}
int var12;
// Walk left from the character before the exponent marker, skipping '0' characters.
for(var12 = var10 - 1; var8.charAt(var12) == '0'; --var12) {
;
}
// var13: everything up to and including the last non-'0' character before the marker.
String var13 = var8.substring(0, var12 + 1);
String var14 = null;
// var14: the text after '+' when a '+' was found past index 0, else the text after the marker.
if (var11 > 0) {
var14 = var8.substring(var11 + 1);
} else {
var14 = var8.substring(var10 + 1);
}
return (var13 + "E" + var14).trim();
}
}
}
关键地方 toText函数
/**
 * Formats this value as text by handing its shared bytes and both arguments
 * to {@code lnxnuc} of the library returned by {@code _getLnxLib()}.
 */
public String toText(int var1, String var2) throws SQLException {
    final String text = _getLnxLib().lnxnuc(this.shareBytes(), var1, var2);
    return text;
}
lnxnuc该函数,将字节数组转成对应的字符
// Excerpt from lnxnuc: fills var30[var15 .. var15 + 4] with the elements of
// var4 at indices 0, 41, 10, 0 and 0.
var30[var15] = var4[0];
var30[var15 + 1] = var4[41];
var30[var15 + 2] = var4[10];
var30[var15 + 3] = var4[0];
var30[var15 + 4] = var4[0];
转换时,因为字符问题,转换异常导致数组越界抛出异常。
回到DataX,DirtyRecord打印JSON数据信息
/** Renders this record as the JSON serialisation of its {@code columns}. */
@Override
public String toString() {
    final String columnsAsJson = JSON.toJSONString(this.columns);
    return columnsAsJson;
}
该记录之后被判为脏数据,后面的列不再考虑,直接进入下一循环。
解决问题
1,脏数据的字段,先去除,然后人工集成,走merge逻辑。
2,修改CommonRdbmsReader代码,如果发现异常,抛出异常后,继续处理接下来的字段。因为是通过
// Visits the columns one by one, from 1 to columnNumber inclusive.
for (int i = 1; i <= columnNumber; i++) {
// get the type of each column
switch (metaData.getColumnType(i)) {
/* code */
}
}
遍历每一列来做,所以遇到脏数据,把该数据写成指定的DirtyData标记。接着往里写。但不建议这么做。
3,开放脏数据限制,毕竟脏数据并不需要集成,如果在可控范围内,就让它过滤掉脏数据吧。