Following up on the previous post: writing to distributed tables via the filter + SQL approach suffers from poor efficiency, so this time we dig into the source code and build a local-write scheme for ClickHouse distributed tables.
### 1. Goal
Implement two new write modes, row hash and row random, while staying compatible with the existing single-node mode and partition-random mode.
### 2. Approach
Two new parameters are added:

- write_mode: the write mode
- hash_fields: the field(s) to hash; multiple fields are supported, comma-separated
Pseudocode:

```
if ([cluster] is valid):
    if ([write_mode] is valid):
        if (write_mode == 'rowhash' and [hash_fields] is valid):
            row-hash mode
        else if (write_mode == 'rowrandom'):
            row-random mode
        else:
            partition-random mode (default)
    else:
        partition-random mode (default)
else:
    single-node mode
```
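The same decision, written as a compact standalone Scala function (a sketch for illustration only; in the patch below this logic is inlined in process, and an invalid cluster name never reaches it because prepare already throws when the cluster is not found in system.clusters):

```scala
//Sketch of the write-mode decision; parameter names are illustrative.
def resolveWriteMode(
    clusterConfigured: Boolean, //cluster parameter present in the config
    writeMode: String,          //value of write_mode, "" when absent
    hashFields: String,         //value of hash_fields, "" when absent
    dfFields: Array[String]     //columns available in the DataFrame
): String =
  if (!clusterConfigured) "single"
  else if (writeMode == "rowhash" && hashFields.nonEmpty &&
           hashFields.split(",").forall(dfFields.contains)) "rowHash"
  else if (writeMode == "rowrandom") "rowRandom"
  else "partRandom" //default in cluster mode
```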
### 3. Source changes
All changes are contained in the io.github.interestinglab.waterdrop.output.batch.Clickhouse class: mainly the process method, plus two new member variables and two small helper methods. Below is the complete modified Clickhouse class (version 1.5.1):
```scala
package io.github.interestinglab.waterdrop.output.batch
import java.text.SimpleDateFormat
import java.util
import java.util.Properties
import java.math.BigDecimal
import java.sql.ResultSet
import io.github.interestinglab.waterdrop.config.{Config, ConfigFactory}
import io.github.interestinglab.waterdrop.apis.BaseOutput
import io.github.interestinglab.waterdrop.config.ConfigRuntimeException
import io.github.interestinglab.waterdrop.config.TypesafeConfigUtils
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import ru.yandex.clickhouse.except.{ClickHouseException, ClickHouseUnknownException}
import ru.yandex.clickhouse.settings.ClickHouseProperties
import ru.yandex.clickhouse.{
BalancedClickhouseDataSource,
ClickHouseConnectionImpl,
ClickHousePreparedStatement,
ClickhouseJdbcUrlParser
}
import scala.collection.JavaConversions._
import scala.collection.immutable.HashMap
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.WrappedArray
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
class Clickhouse extends BaseOutput {
var tableSchema: Map[String, String] = new HashMap[String, String]()
var jdbcLink: String = _
var initSQL: String = _
var table: String = _
var fields: java.util.List[String] = _
var cluster: String = _
//contains cluster basic info
var clusterInfo: ArrayBuffer[(String, Int, Int, String)] = _
var retryCodes: java.util.List[Integer] = _
var config: Config = ConfigFactory.empty()
val clickhousePrefix = "clickhouse."
val properties: Properties = new Properties()
var writeMode:String = "single" //resolved write mode: single / partRandom / rowHash / rowRandom
var hashFieldsArray:Array[String] = _ //hash_fields parameter, split on commas
/**
* Set Config.
* */
override def setConfig(config: Config): Unit = {
this.config = config
}
/**
* Get Config.
* */
override def getConfig(): Config = {
this.config
}
override def checkConfig(): (Boolean, String) = {
val requiredOptions = List("host", "table", "database")
val nonExistsOptions = requiredOptions.map(optionName => (optionName, config.hasPath(optionName))).filter { p =>
val (optionName, exists) = p
!exists
}
if (TypesafeConfigUtils.hasSubConfig(config, clickhousePrefix)) {
val clickhouseConfig = TypesafeConfigUtils.extractSubConfig(config, clickhousePrefix, false)
clickhouseConfig
.entrySet()
.foreach(entry => {
val key = entry.getKey
val value = String.valueOf(entry.getValue.unwrapped())
properties.put(key, value)
})
}
if (nonExistsOptions.nonEmpty) {
//return early so that a failed check is actually reported
return (
false,
"please specify " + nonExistsOptions
.map { option =>
val (name, exists) = option
"[" + name + "]"
}
.mkString(", ") + " as non-empty string")
}
val hasUserName = config.hasPath("username")
val hasPassword = config.hasPath("password")
if (hasUserName && !hasPassword || !hasUserName && hasPassword) {
return (false, "please specify username and password at the same time")
}
if (hasPassword) {
properties.put("user", config.getString("username"))
properties.put("password", config.getString("password"))
}
(true, "")
}
override def prepare(spark: SparkSession): Unit = {
this.jdbcLink = String.format("jdbc:clickhouse://%s/%s", config.getString("host"), config.getString("database"))
val balanced: BalancedClickhouseDataSource = new BalancedClickhouseDataSource(this.jdbcLink, properties)
val conn = balanced.getConnection.asInstanceOf[ClickHouseConnectionImpl]
this.table = config.getString("table")
this.tableSchema = getClickHouseSchema(conn, table)
if (this.config.hasPath("fields")) {
this.fields = config.getStringList("fields")
val (flag, msg) = acceptedClickHouseSchema()
if (!flag) {
throw new ConfigRuntimeException(msg)
}
}
val defaultConfig = ConfigFactory.parseMap(
Map(
"bulk_size" -> 20000,
// "retry_codes" -> util.Arrays.asList(ClickHouseErrorCode.NETWORK_ERROR.code),
"retry_codes" -> util.Arrays.asList(),
"retry" -> 1
)
)
//check whether the cluster parameter is present in the config file
if (config.hasPath("cluster")) {
this.cluster = config.getString("cluster")
//fetch the cluster info from the database for later use in process(); clusterInfo is just an array of shard entries
this.clusterInfo = getClickHouseClusterInfo(conn, cluster)
if (this.clusterInfo.size == 0) {
val errorInfo = s"could not find cluster config in system.clusters, config cluster = $cluster"
logError(errorInfo)
throw new RuntimeException(errorInfo)
}
logInfo(s"get [$cluster] config from system.clusters, the replica info is [$clusterInfo].")
}
config = config.withFallback(defaultConfig)
retryCodes = config.getIntList("retry_codes")
super.prepare(spark)
}
override def process(df: Dataset[Row]): Unit = {
val dfFields = df.schema.fieldNames
val bulkSize = config.getInt("bulk_size")
val retry = config.getInt("retry")
if (!config.hasPath("fields")) {
fields = dfFields.toList
}
this.initSQL = initPrepareSQL()
logInfo(this.initSQL)
//if clusterInfo holds any entries, we are in cluster mode
if (this.clusterInfo != null && this.clusterInfo.size > 0) {
var writeMode="single"
var hashFields="" //multiple fields supported, comma-separated
//read the write_mode and hash_fields parameters
if(config.hasPath("write_mode")) {
writeMode=config.getString("write_mode")
}
if(config.hasPath("hash_fields")) {
hashFields=config.getString("hash_fields")
hashFieldsArray=hashFields.split(",")
}
//use hash mode only when write_mode is rowhash, hash_fields is non-empty, and every hash field exists in the stream; otherwise fall back to random
logInfo(s"conf's writeMode: $writeMode")
logInfo(s"conf's hashFields: $hashFields")
if("rowhash".equals(writeMode) && hashFields.nonEmpty && checkFields(hashFields,dfFields)) {
this.writeMode="rowHash"
logInfo(s"cluster rowHash mode, shard index selected per row in the iterator")
}else if("rowrandom".equals(writeMode)) {
this.writeMode="rowRandom"
logInfo(s"cluster rowRandom mode, shard index selected per row in the iterator")
}
else{
this.writeMode="partRandom"
logInfo(s"cluster partRandom mode, shard index selected once per partition in foreachPartition")
}
}else{
this.writeMode="single"
logInfo(s"single mode, the jdbc url is created in foreachPartition.")
}
logDebug(s"ready foreachPartition...")
df.foreachPartition { iter =>
var jdbcUrl = this.jdbcLink
var statementArray = new Array[ClickHousePreparedStatement](1) //start with a size-1 array; in cluster mode it is re-created with one slot per shard
var lengthArray = new Array[Int](1)
var shardIndex=0
logDebug(s"this.writeMode:" + this.writeMode)
if ("partRandom".equals(this.writeMode) || "rowHash".equals(this.writeMode) || "rowRandom".equals(this.writeMode)){
statementArray = new Array[ClickHousePreparedStatement](this.clusterInfo.size)
lengthArray = new Array[Int](this.clusterInfo.size)
//initialize one prepared statement per shard
logInfo("this.clusterInfo.size is " + this.clusterInfo.size)
for(i <- 0 until this.clusterInfo.size){
logInfo(s"create connection for shard[$i]")
val shardInfo = this.clusterInfo.get(i)
val host = shardInfo._4
val port = getJDBCPort(this.jdbcLink)
val database = config.getString("database") //the database name also comes from the config file and must be the database of the local tables
jdbcUrl = s"jdbc:clickhouse://$host:$port/$database" //rebuild jdbcUrl per shard; effectively only the host changes
statementArray(i) = new BalancedClickhouseDataSource(jdbcUrl, this.properties)
.getConnection.asInstanceOf[ClickHouseConnectionImpl]
.createClickHousePreparedStatement(this.initSQL, ResultSet.TYPE_FORWARD_ONLY)
lengthArray(i) = 0
}
if ("partRandom".equals(this.writeMode)){
shardIndex = (Math.random() * this.clusterInfo.size).asInstanceOf[Int] //core of partition-random mode: one random index in [0, clusterInfo.size) per partition
logInfo(s"cluster partRandom mode, select shard index [$shardIndex] to insert data. the jdbc url is "
+ statementArray(shardIndex).getConnection.getMetaData.getURL)
}
}
else {
statementArray(0)=new BalancedClickhouseDataSource(jdbcUrl, this.properties)
.getConnection.asInstanceOf[ClickHouseConnectionImpl]
.createClickHousePreparedStatement(this.initSQL, ResultSet.TYPE_FORWARD_ONLY)
shardIndex=0
logInfo(s"single mode, the jdbc url is [$jdbcUrl].")
}
while (iter.hasNext) {
val row = iter.next()
if("rowHash".equals(this.writeMode)){
val hashValue=getFieldsRowHash(hashFieldsArray,row)
shardIndex= hashValue.abs % this.clusterInfo.size //根据HashField,计算hash值
logTrace(s"cluster rowHash mode, select shard index [$shardIndex] to insert data. the jdbc url is "
+ statementArray(shardIndex).getConnection.getMetaData.getURL)
}else if("rowRandom".equals(this.writeMode)){
shardIndex= (Math.random() * this.clusterInfo.size).asInstanceOf[Int] //按行随机
logTrace(s"cluster rowRandom mode, select shard index [$shardIndex] to insert data. the jdbc url is "
+ statementArray(shardIndex).getConnection.getMetaData.getURL)
}
//in partRandom mode, shardIndex is the random value chosen once before iterating
//in single mode, shardIndex keeps its initial value 0
lengthArray(shardIndex) += 1
renderStatement(fields, row, dfFields, statementArray(shardIndex)) //fields: columns from the config, row: the data row, dfFields: columns of the DataFrame
statementArray(shardIndex).addBatch()
if (lengthArray(shardIndex) >= bulkSize) { //flush once the buffer reaches bulk_size (default 20000)
execute(statementArray(shardIndex), retry)
lengthArray(shardIndex) = 0
}
}
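//after the iterator is exhausted, flush whatever is still buffered on every shard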
for (i <- 0 until statementArray.length){
execute(statementArray(i), retry)
lengthArray(i) = 0
}
}
}
private def checkFields(fields:String,dfFields: Array[String]): Boolean ={
val fieldsArray=fields.split(",")
checkFields(fieldsArray,dfFields)
}
private def checkFields(fields:Array[String],dfFields: Array[String]): Boolean ={
for (field <- fields){
if (!dfFields.contains(field)){
return false
}
}
true
}
private def getFieldsRowHash(fields:Array[String],row: Row): Int ={
var rowHashCode:Int = 0
for (field <- fields){
val fieldHash=row.getAs(field).hashCode()
rowHashCode = 31 * rowHashCode + fieldHash
}
rowHashCode
}
private def getJDBCPort(jdbcUrl: String): Int = {
val clickHouseProperties: ClickHouseProperties = ClickhouseJdbcUrlParser.parse(jdbcUrl, properties)
clickHouseProperties.getPort
}
private def execute(statement: ClickHousePreparedStatement, retry: Int): Unit = {
val res = Try(statement.executeBatch())
res match {
case Success(_) => {
logInfo("Insert into ClickHouse succeed")
statement.close()
}
case Failure(e: ClickHouseException) => {
val errorCode = e.getErrorCode
if (retryCodes.contains(errorCode)) {
logError("Insert into ClickHouse failed. Reason: ", e)
if (retry > 0) {
execute(statement, retry - 1)
} else {
logError("Insert into ClickHouse failed and retry failed, drop this bulk.")
statement.close()
}
} else {
throw e
}
}
case Failure(e: ClickHouseUnknownException) => {
statement.close()
throw e
}
case Failure(e: Exception) => {
throw e
}
}
}
private def getClickHouseSchema(conn: ClickHouseConnectionImpl, table: String): Map[String, String] = {
val sql = s"desc $table"
val resultSet = conn.createStatement.executeQuery(sql)
var schema = new HashMap[String, String]()
while (resultSet.next()) {
schema += (resultSet.getString(1) -> resultSet.getString(2))
}
schema
}
private def getClickHouseClusterInfo(
conn: ClickHouseConnectionImpl,
cluster: String): ArrayBuffer[(String, Int, Int, String)] = {
val sql =
s"SELECT cluster, shard_num, shard_weight, host_address FROM system.clusters WHERE cluster = '$cluster' AND replica_num = 1"
val resultSet = conn.createStatement.executeQuery(sql)
val clusterInfo = ArrayBuffer[(String, Int, Int, String)]()
while (resultSet.next()) {
val shardWeight = resultSet.getInt("shard_weight")
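//duplicate each shard entry once per unit of shard_weight so that random/hash selection honors the configured weights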
for (_ <- 1 to shardWeight) {
val clusterName = resultSet.getString("cluster")
val shardNum = resultSet.getInt("shard_num")
val hostAddress = resultSet.getString("host_address")
val shardInfo = Tuple4(clusterName, shardNum, shardWeight, hostAddress)
clusterInfo += shardInfo
}
}
clusterInfo
}
private def initPrepareSQL(): String = {
val prepare = List.fill(fields.size)("?")
val sql = String.format(
"insert into %s (%s) values (%s)",
this.table,
this.fields.map(a => s"`$a`").mkString(","),
prepare.mkString(","))
sql
}
private def acceptedClickHouseSchema(): (Boolean, String) = {
val nonExistsFields = fields
.map(field => (field, tableSchema.contains(field)))
.filter { case (_, exist) => !exist }
if (nonExistsFields.nonEmpty) {
(
false,
"field " + nonExistsFields
.map { case (option) => "[" + option + "]" }
.mkString(", ") + " not exist in table " + this.table)
} else {
val nonSupportedType = fields
.map(field => (tableSchema(field), Clickhouse.supportOrNot(tableSchema(field))))
.filter { case (_, exist) => !exist }
if (nonSupportedType.nonEmpty) {
(
false,
"clickHouse data type " + nonSupportedType
.map { case (option) => "[" + option + "]" }
.mkString(", ") + " not support in current version.")
} else {
(true, "")
}
}
}
private def renderDefaultStatement(index: Int, fieldType: String, statement: ClickHousePreparedStatement): Unit = {
fieldType match {
case "DateTime" | "Date" | "String" =>
statement.setString(index + 1, Clickhouse.renderStringDefault(fieldType))
case "Int8" | "UInt8" | "Int16" | "Int32" | "UInt32" | "UInt16" =>
statement.setInt(index + 1, 0)
case "UInt64" | "Int64" =>
statement.setLong(index + 1, 0)
case "Float32" => statement.setFloat(index + 1, 0)
case "Float64" => statement.setDouble(index + 1, 0)
case Clickhouse.lowCardinalityPattern(lowCardinalityType) =>
renderDefaultStatement(index, lowCardinalityType, statement)
case Clickhouse.arrayPattern(_) => statement.setArray(index + 1, List())
case Clickhouse.nullablePattern(nullFieldType) => renderNullStatement(index, nullFieldType, statement)
case _ => statement.setString(index + 1, "")
}
}
private def renderNullStatement(index: Int, fieldType: String, statement: ClickHousePreparedStatement): Unit = {
fieldType match {
case "String" =>
statement.setNull(index + 1, java.sql.Types.VARCHAR)
case "DateTime" => statement.setNull(index + 1, java.sql.Types.DATE)
case "Date" => statement.setNull(index + 1, java.sql.Types.TIME)
case "Int8" | "UInt8" | "Int16" | "Int32" | "UInt32" | "UInt16" =>
statement.setNull(index + 1, java.sql.Types.INTEGER)
case "UInt64" | "Int64" =>
statement.setNull(index + 1, java.sql.Types.BIGINT)
case "Float32" => statement.setNull(index + 1, java.sql.Types.FLOAT)
case "Float64" => statement.setNull(index + 1, java.sql.Types.DOUBLE)
case "Array" => statement.setNull(index + 1, java.sql.Types.ARRAY)
case Clickhouse.decimalPattern(_) => statement.setNull(index + 1, java.sql.Types.DECIMAL)
}
}
private def renderBaseTypeStatement(
index: Int,
fieldIndex: Int,
fieldType: String,
item: Row,
statement: ClickHousePreparedStatement): Unit = {
fieldType match {
case "DateTime" | "Date" | "String" =>
statement.setString(index + 1, item.getAs[String](fieldIndex))
case "Int8" | "UInt8" | "Int16" | "UInt16" | "Int32" =>
statement.setInt(index + 1, item.getAs[Int](fieldIndex))
case "UInt32" | "UInt64" | "Int64" =>
statement.setLong(index + 1, item.getAs[Long](fieldIndex))
case "Float32" => statement.setFloat(index + 1, item.getAs[Float](fieldIndex))
case "Float64" => statement.setDouble(index + 1, item.getAs[Double](fieldIndex))
case Clickhouse.arrayPattern(_) =>
statement.setArray(index + 1, item.getAs[WrappedArray[AnyRef]](fieldIndex))
case "Decimal" => statement.setBigDecimal(index + 1, item.getAs[BigDecimal](fieldIndex))
case _ => statement.setString(index + 1, item.getAs[String](fieldIndex))
}
}
private def renderStatementEntry(
index: Int,
fieldIndex: Int,
fieldType: String,
item: Row,
statement: ClickHousePreparedStatement): Unit = {
fieldType match {
case "String" | "DateTime" | "Date" | Clickhouse.arrayPattern(_) =>
renderBaseTypeStatement(index, fieldIndex, fieldType, item, statement)
case Clickhouse.floatPattern(_) | Clickhouse.intPattern(_) | Clickhouse.uintPattern(_) =>
renderBaseTypeStatement(index, fieldIndex, fieldType, item, statement)
case Clickhouse.nullablePattern(dataType) =>
renderStatementEntry(index, fieldIndex, dataType, item, statement)
case Clickhouse.lowCardinalityPattern(dataType) =>
renderBaseTypeStatement(index, fieldIndex, dataType, item, statement)
case Clickhouse.decimalPattern(_) =>
renderBaseTypeStatement(index, fieldIndex, "Decimal", item, statement)
case _ => statement.setString(index + 1, item.getAs[String](fieldIndex))
}
}
private def renderStatement(
fields: util.List[String],
item: Row,
dsFields: Array[String],
statement: ClickHousePreparedStatement): Unit = {
for (i <- 0 until fields.size()) {
val field = fields.get(i)
val fieldType = tableSchema(field)
if (dsFields.indexOf(field) == -1) {
// specified field does not exist in the row.
renderDefaultStatement(i, fieldType, statement)
} else {
val fieldIndex = item.fieldIndex(field)
if (item.isNullAt(fieldIndex)) {
// specified field is Null in Row.
renderDefaultStatement(i, fieldType, statement)
} else {
renderStatementEntry(i, fieldIndex, fieldType, item, statement)
}
}
}
}
}
object Clickhouse {
val arrayPattern: Regex = "(Array.*)".r
val nullablePattern: Regex = "Nullable\\((.*)\\)".r
val lowCardinalityPattern: Regex = "LowCardinality\\((.*)\\)".r
val intPattern: Regex = "(Int.*)".r
val uintPattern: Regex = "(UInt.*)".r
val floatPattern: Regex = "(Float.*)".r
val decimalPattern: Regex = "(Decimal.*)".r
/**
* Waterdrop support this clickhouse data type or not.
*
* @param dataType ClickHouse Data Type
* @return Boolean
* */
private[waterdrop] def supportOrNot(dataType: String): Boolean = {
dataType match {
case "Date" | "DateTime" | "String" =>
true
case arrayPattern(_) | nullablePattern(_) | floatPattern(_) | intPattern(_) | uintPattern(_) =>
true
case lowCardinalityPattern(_) =>
true
case decimalPattern(_) =>
true
case _ =>
false
}
}
private[waterdrop] def renderStringDefault(fieldType: String): String = {
fieldType match {
case "DateTime" =>
val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
dateFormat.format(System.currentTimeMillis())
case "Date" =>
val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
dateFormat.format(System.currentTimeMillis())
case "String" =>
""
}
}
}
```
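To make the row-hash behavior concrete, here is a small standalone sketch (illustration only, not part of the plugin) that reproduces getFieldsRowHash and the shard-index step, with plain Maps standing in for Spark Rows:

```scala
//Standalone sketch: combine field hash codes with the usual 31*h + x scheme,
//then reduce modulo the shard count. Rows with identical hash_fields values
//always land on the same shard.
object RowHashDemo {
  def rowHash(fields: Seq[String], row: Map[String, Any]): Int =
    fields.foldLeft(0)((h, f) => 31 * h + row(f).hashCode)

  def main(args: Array[String]): Unit = {
    val shards = 2 //e.g. the two-shard dw_cluster used in the tests below
    val rows = Seq(
      Map("id" -> "1", "user_name" -> "aa"),
      Map("id" -> "2", "user_name" -> "bb"),
      Map("id" -> "1", "user_name" -> "aa")) //duplicate of the first row: lands on the same shard
    rows.foreach { r =>
      //floorMod keeps the index non-negative, matching the patched plugin code
      val shard = Math.floorMod(rowHash(Seq("user_name", "id"), r), shards)
      println(s"$r -> shard $shard")
    }
  }
}
```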
Waterdrop 1.x is built with sbt, which can be awkward to work with; ideally your build machine can reach the official repositories to resolve dependencies.
If you would rather not build it yourself, you can download my precompiled class files, replace the corresponding class files inside the jar, and repack the jar.
### 4. Usage

cluster:
Same as in the original version: the name of the cluster. If not configured, single-node mode is used.
write_mode:
Two effective values: rowhash and rowrandom. rowhash: hash per row; rowrandom: random per row; any other value: partition-random mode (the default).
hash_fields:
The stream fields used to compute the row hash. This parameter is only needed when write_mode is rowhash. If write_mode is rowhash but hash_fields is not configured, or the named fields cannot be found in the dataset, the plugin falls back to partition-random mode.
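For example, an output block that combines all three new parameters might look like this (host, credentials and table name are the same placeholders used in the test cases below):

```
output {
  clickhouse {
    host = "10.1.99.191:8123"
    database = "dw_local"      # database of the *local* tables
    table = "dist_test_10"     # write to the local table, not the Distributed table
    cluster = "dw_cluster"
    write_mode = "rowhash"
    hash_fields = "user_name,id"
    username = "myuser"
    password = "mypassword"
  }
}
```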
### 5. Testing
cluster | write_mode | hash_fields | resulting write mode |
---|---|---|---|
not set | - | - | single-node mode |
set but invalid | - | - | error (cluster not found in system.clusters) |
valid | not set | - | partition-random mode |
valid | invalid | - | partition-random mode |
valid | rowrandom | any / not set | row-random mode |
valid | rowhash | not set | partition-random mode |
valid | rowhash | invalid | partition-random mode |
valid | rowhash | valid | row-hash mode |
Before testing, make sure your ClickHouse distributed (cluster) configuration is already in place.
#### 5.1 Prepare the data

Create the test file with `vi /tmp/dist_test.csv`:

```
id,user_name
1,aa
2,bb
3,cc
4,dd
5,ee
6,ff
7,gg
8,hh
9,ii
10,jj
```
#### 5.2 Test cases
case id | cluster | write_mode | hash_fields | expected | config excerpt | result |
---|---|---|---|---|---|---|
1 | not set | - | - | single-node mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_01" username = "myuser" password = "mypassword" } | as expected |
2 | xxx | - | - | error | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_02" cluster = "xxx" username = "myuser" password = "mypassword" } | as expected |
3 | dw_cluster | not set | - | partition-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_03" cluster = "dw_cluster" username = "myuser" password = "mypassword" } | as expected |
4 | dw_cluster | xxx | - | partition-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_04" cluster = "dw_cluster" write_mode = "xxx" username = "myuser" password = "mypassword" } | as expected |
5 | dw_cluster | rowrandom | not set | row-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_05" cluster = "dw_cluster" write_mode = "rowrandom" username = "myuser" password = "mypassword" } | as expected |
6 | dw_cluster | rowrandom | xxx | row-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_06" cluster = "dw_cluster" write_mode = "rowrandom" hash_fields="xxx" username = "myuser" password = "mypassword" } | as expected |
7 | dw_cluster | rowhash | not set | partition-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_07" cluster = "dw_cluster" write_mode = "rowhash" username = "myuser" password = "mypassword" } | as expected |
8 | dw_cluster | rowhash | xxx | partition-random mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_08" cluster = "dw_cluster" write_mode = "rowhash" hash_fields="xxx" username = "myuser" password = "mypassword" } | as expected |
9 | dw_cluster | rowhash | user_name | row-hash mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_09" cluster = "dw_cluster" write_mode = "rowhash" hash_fields="user_name" username = "myuser" password = "mypassword" } | as expected |
10 | dw_cluster | rowhash | user_name,id | row-hash mode | clickhouse { host = "10.1.99.191:8123" database = "dw_local" table = "dist_test_10" cluster = "dw_cluster" write_mode = "rowhash" hash_fields="user_name,id" username = "myuser" password = "mypassword" } | as expected |
#### 5.3 Create the tables

```sql
-- create the local tables; run this on every node
DROP TABLE IF EXISTS dw_local.dist_test_01;
CREATE TABLE dw_local.dist_test_01(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_02;
CREATE TABLE dw_local.dist_test_02(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_03;
CREATE TABLE dw_local.dist_test_03(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_04;
CREATE TABLE dw_local.dist_test_04(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_05;
CREATE TABLE dw_local.dist_test_05(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_06;
CREATE TABLE dw_local.dist_test_06(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_07;
CREATE TABLE dw_local.dist_test_07(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_08;
CREATE TABLE dw_local.dist_test_08(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_09;
CREATE TABLE dw_local.dist_test_09(id String,user_name String) engine = MergeTree primary key (id) order by (id);
DROP TABLE IF EXISTS dw_local.dist_test_10;
CREATE TABLE dw_local.dist_test_10(id String,user_name String) engine = MergeTree primary key (id) order by (id);
-- create the Distributed tables
DROP TABLE IF EXISTS dw.dist_test_01;
CREATE TABLE dw.dist_test_01(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_01);
DROP TABLE IF EXISTS dw.dist_test_02;
CREATE TABLE dw.dist_test_02(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_02);
DROP TABLE IF EXISTS dw.dist_test_03;
CREATE TABLE dw.dist_test_03(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_03);
DROP TABLE IF EXISTS dw.dist_test_04;
CREATE TABLE dw.dist_test_04(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_04);
DROP TABLE IF EXISTS dw.dist_test_05;
CREATE TABLE dw.dist_test_05(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_05);
DROP TABLE IF EXISTS dw.dist_test_06;
CREATE TABLE dw.dist_test_06(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_06);
DROP TABLE IF EXISTS dw.dist_test_07;
CREATE TABLE dw.dist_test_07(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_07);
DROP TABLE IF EXISTS dw.dist_test_08;
CREATE TABLE dw.dist_test_08(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_08);
DROP TABLE IF EXISTS dw.dist_test_09;
CREATE TABLE dw.dist_test_09(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_09);
DROP TABLE IF EXISTS dw.dist_test_10;
CREATE TABLE dw.dist_test_10(id String,user_name String) ENGINE = Distributed(dw_cluster, dw_local, dist_test_10);
```
#### 5.4 Waterdrop configs

Generate the ten config files with a short Python 3 script:

```python
str1='''
spark {
spark.app.name = "Waterdrop"
spark.executor.instances = 1
spark.executor.cores = 1
spark.executor.memory = "1g"
spark.sql.catalogImplementation = "hive"
}
input {
file {
path = "file:///tmp/dist_test.csv"
format = "csv"
options.header = "true"
result_table_name = "dist_test"
}
}
filter {
repartition {
"注释":"对数进进行重新分区。"
num_partitions = 2
}
}
'''
filePath="/tmp/dist_test_01.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_01"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_02.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_02"
cluster = "xxx"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_03.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_03"
cluster = "dw_cluster"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_04.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_04"
cluster = "dw_cluster"
write_mode = "xxx"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_05.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_05"
cluster = "dw_cluster"
write_mode = "rowrandom"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_06.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_06"
cluster = "dw_cluster"
write_mode = "rowrandom"
hash_fields="xxx"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_07.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_07"
cluster = "dw_cluster"
write_mode = "rowhash"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_08.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_08"
cluster = "dw_cluster"
write_mode = "rowhash"
hash_fields="xxx"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_09.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_09"
cluster = "dw_cluster"
write_mode = "rowhash"
hash_fields="user_name"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
f.write(str1+str_output)
filePath="/tmp/dist_test_10.conf"
str_output='''
output {
clickhouse {
host = "10.1.99.191:8123"
database = "dw_local"
table = "dist_test_10"
cluster = "dw_cluster"
write_mode = "rowhash"
hash_fields="user_name,id"
username = "myuser"
password = "mypassword"
}
}
'''
with open(filePath,"w",encoding="utf-8") as f:
    f.write(str1+str_output)
```
#### 5.5 Run the imports

```bash
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_01.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_02.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_03.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_04.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_05.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_06.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_07.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_08.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_09.conf
bin/start-waterdrop.sh --master local[1] --deploy-mode client --config /tmp/dist_test_10.conf
```
#### 5.6 Query the data

```sql
-- on node 1 (run the same queries on the other node)
select * from (
select 'dist_test_01' as table_name ,count() from dw_local.dist_test_01 union all
select 'dist_test_02' as table_name ,count() from dw_local.dist_test_02 union all
select 'dist_test_03' as table_name ,count() from dw_local.dist_test_03 union all
select 'dist_test_04' as table_name ,count() from dw_local.dist_test_04 union all
select 'dist_test_05' as table_name ,count() from dw_local.dist_test_05 union all
select 'dist_test_06' as table_name ,count() from dw_local.dist_test_06 union all
select 'dist_test_07' as table_name ,count() from dw_local.dist_test_07 union all
select 'dist_test_08' as table_name ,count() from dw_local.dist_test_08 union all
select 'dist_test_09' as table_name ,count() from dw_local.dist_test_09 union all
select 'dist_test_10' as table_name ,count() from dw_local.dist_test_10
) t order by table_name;
select * from (
select 'dist_test_01' as table_name ,* from dw_local.dist_test_01 union all
select 'dist_test_02' as table_name ,* from dw_local.dist_test_02 union all
select 'dist_test_03' as table_name ,* from dw_local.dist_test_03 union all
select 'dist_test_04' as table_name ,* from dw_local.dist_test_04 union all
select 'dist_test_05' as table_name ,* from dw_local.dist_test_05 union all
select 'dist_test_06' as table_name ,* from dw_local.dist_test_06 union all
select 'dist_test_07' as table_name ,* from dw_local.dist_test_07 union all
select 'dist_test_08' as table_name ,* from dw_local.dist_test_08 union all
select 'dist_test_09' as table_name ,* from dw_local.dist_test_09 union all
select 'dist_test_10' as table_name ,* from dw_local.dist_test_10
) t order by table_name desc, id;
```
#### 5.7 Drop the test tables and files

```sql
DROP TABLE IF EXISTS dw_local.dist_test_01;
DROP TABLE IF EXISTS dw_local.dist_test_02;
DROP TABLE IF EXISTS dw_local.dist_test_03;
DROP TABLE IF EXISTS dw_local.dist_test_04;
DROP TABLE IF EXISTS dw_local.dist_test_05;
DROP TABLE IF EXISTS dw_local.dist_test_06;
DROP TABLE IF EXISTS dw_local.dist_test_07;
DROP TABLE IF EXISTS dw_local.dist_test_08;
DROP TABLE IF EXISTS dw_local.dist_test_09;
DROP TABLE IF EXISTS dw_local.dist_test_10;
DROP TABLE IF EXISTS dw.dist_test_01;
DROP TABLE IF EXISTS dw.dist_test_02;
DROP TABLE IF EXISTS dw.dist_test_03;
DROP TABLE IF EXISTS dw.dist_test_04;
DROP TABLE IF EXISTS dw.dist_test_05;
DROP TABLE IF EXISTS dw.dist_test_06;
DROP TABLE IF EXISTS dw.dist_test_07;
DROP TABLE IF EXISTS dw.dist_test_08;
DROP TABLE IF EXISTS dw.dist_test_09;
DROP TABLE IF EXISTS dw.dist_test_10;
```
```bash
rm -f /tmp/dist_test_01.conf
rm -f /tmp/dist_test_02.conf
rm -f /tmp/dist_test_03.conf
rm -f /tmp/dist_test_04.conf
rm -f /tmp/dist_test_05.conf
rm -f /tmp/dist_test_06.conf
rm -f /tmp/dist_test_07.conf
rm -f /tmp/dist_test_08.conf
rm -f /tmp/dist_test_09.conf
rm -f /tmp/dist_test_10.conf
```
### 6. Performance test

Test table: 68 million rows, 3.6 GB source file, 24 columns.
Nodes: 2
Per-node spec: 100 GB RAM, 8 cores / 32 threads
write mode | run 1 (s) | run 2 (s) | run 3 (s) | average (s) |
---|---|---|---|---|
single-node | 234.835144 | 242.014209 | 239.198289 | 238.68 |
partition random | 224.140249 | 230.265629 | 236.603885 | 230.34 |
row random | 238.440763 | 243.962106 | 245.21358 | 242.54 |
row hash | 228.066742 | 234.923924 | 227.100249 | 230.03 |
### 7. Summary

1. Both new write modes work per row: row-random draws a random value for every row, and row-hash computes a hash over one or more fields for every row. Compared with the default partition-random mode this should in theory cost a little extra, but the tests above show no measurable impact.
2. Because the JDBC connections are created inside each RDD partition and every shard must be reachable, each partition creates N connection objects, held in an array. On clusters with many shards the resource overhead may therefore be noticeable, but with this approach it seems unavoidable.