文章目录
Building
Git : https://github.com/delta-io/delta
build/sbt compile package
启动方式
QuickStart
通过命令行
bin/spark-shell --packages io.delta:delta-core_2.11:0.4.0
这种方式会通过Ivy到maven中央仓库下载对应的delta 包到本地,并启动
使用Delta API操作数据
Delta并非一种新的数据格式,实际数据仍然使用parquet文件存储,只是额外增加了一个存放日志和checkpoint文件的 _delta_log 目录,把所有的数据操作以日志的方式记录下来;再次读取数据的时候,通过回放日志就可以很容易地得到哪些文件才是我们真正需要的数据。
Delta Log 主要包含如下信息:
- commitInfo : 描述操作的时间,类型,参数等元数据信息
- protocol[optional] : 可选,读写的协议版本
- metaData[optional] : 可选,数据的存储格式,数据的Schema
- add / remove : 本次操作新增 / 移除的数据文件
WRITE operation
因为是第一次写数据,所以直接更新,操作结果为添加了两个新文件
// operation
scala> val data = spark.range(0, 5)
data: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> data.write.format("delta").save("/tmp/delta-table")
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 10:34 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000000.json
{
"commitInfo":{
"timestamp":1571711682244,"operation":"WRITE","operationParameters":{
"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}}
{
"protocol":{
"minReaderVersion":1,"minWriterVersion":2}}
{
"metaData":{
"id":"fd778656-8d0a-4b9c-8e56-0bdbc7407d0f","format":{
"provider":"parquet","options":{
}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{
},"createdTime":1571711680400}}
{
"add":{
"path":"part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet","partitionValues":{
},"size":437,"modificationTime":1571711682201,"dataChange":true}}
{
"add":{
"path":"part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet","partitionValues":{
},"size":442,"modificationTime":1571711682201,"dataChange":true}}
Overwrite / Append WRITE operation
Overwrite 数据覆盖写简单,直接remove原来的文件,add 新的文件即可
Append 数据更简单,直接 add 新的文件即可
测试完毕后,当前目录中实际生效的Parquet文件有4个
part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
// operation
scala> val data = spark.range(5, 10)
data: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> data.write.format("delta").mode("overwrite").save("/tmp/delta-table")
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 10:43 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000001.json
{
"commitInfo":{
"timestamp":1571712180876,"operation":"WRITE","operationParameters":{
"mode":"Overwrite","partitionBy":"[]"},"readVersion":0,"isBlindAppend":false}}
{
"add":{
"path":"part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet","partitionValues":{
},"size":437,"modificationTime":1571712179728,"dataChange":true}}
{
"add":{
"path":"part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet","partitionValues":{
},"size":442,"modificationTime":1571712179731,"dataChange":true}}
{
"remove":{
"path":"part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet","deletionTimestamp":1571712180876,"dataChange":true}}
{
"remove":{
"path":"part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet","deletionTimestamp":1571712180876,"dataChange":true}}
// operation
scala> val data = spark.range(10, 15)
data: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> data.write.format("delta").mode("append").save("/tmp/delta-table")
scala> spark.read.format("delta").load("/tmp/delta-table").show()
+---+
| id|
+---+
| 7|
| 8|
| 9|
| 12|
| 13|
| 14|
| 5|
| 6|
| 10|
| 11|
+---+
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 10:47 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000002.json
{
"commitInfo":{
"timestamp":1571712429843,"operation":"WRITE","operationParameters":{
"mode":"Append","partitionBy":"[]"},"readVersion":1,"isBlindAppend":true}}
{
"add":{
"path":"part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet","partitionValues":{
},"size":437,"modificationTime":1571712429827,"dataChange":true}}
{
"add":{
"path":"part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet","partitionValues":{
},"size":442,"modificationTime":1571712429826,"dataChange":true}}
Update operation
Update Operation 把原来的数据读取出来计算并保存为新文件,然后再删除老文件
// operation
import io.delta.tables._
import org.apache.spark.sql.functions._
val deltaTable = DeltaTable.forPath("/tmp/delta-table")
deltaTable.update(
condition = expr("id % 2 == 0"),
set = Map("id" -> expr("id + 100")))
scala> spark.read.format("delta").load("/tmp/delta-table").show()
+---+
| id|
+---+
| 7|
|108|
| 9|
|112|
| 13|
|114|
| 5|
|106|
|110|
| 11|
+---+
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 10:53 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 447 2019-10-22 10:53 /tmp/delta-table/part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000003.json
{
"commitInfo":{
"timestamp":1571712804813,"operation":"UPDATE","operationParameters":{
"predicate":"((id#536L % cast(2 as bigint)) = cast(0 as bigint))"},"readVersion":2,"isBlindAppend":false}}
{
"remove":{
"path":"part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet","deletionTimestamp":1571712804650,"dataChange":true}}
{
"remove":{
"path":"part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet","deletionTimestamp":1571712804650,"dataChange":true}}
{
"remove":{
"path":"part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet","deletionTimestamp":1571712804650,"dataChange":true}}
{
"remove":{
"path":"part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet","deletionTimestamp":1571712804650,"dataChange":true}}
{
"add":{
"path":"part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet","partitionValues":{
},"size":455,"modificationTime":1571712804805,"dataChange":true}}
{
"add":{
"path":"part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet","partitionValues":{
},"size":447,"modificationTime":1571712804807,"dataChange":true}}
Delete Operation
Delete操作厉害了,上面 Update 的计算结果存储在下面两个文件中,但是 Delete 只会重写包含被删除数据的那部分文件(例如删除 id == 106 时,只 remove 并重写了其中一个文件)
// part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
// part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
scala> spark.read.format("delta").load("/tmp/delta-table").show()
+---+
| id|
+---+
| 7|
|108|
| 9|
|112|
| 13|
|114|
| 5|
|106|
|110|
| 11|
+---+
scala> deltaTable.delete(condition = expr("id == 106"))
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 11:03 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 487 2019-10-22 11:03 /tmp/delta-table/_delta_log/00000000000000000004.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 11:03 /tmp/delta-table/part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 447 2019-10-22 10:53 /tmp/delta-table/part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000004.json
{
"commitInfo":{
"timestamp":1571713428900,"operation":"DELETE","operationParameters":{
"predicate":"[\"(`id` = CAST(106 AS BIGINT))\"]"},"readVersion":3,"isBlindAppend":false}}
{
"remove":{
"path":"part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet","deletionTimestamp":1571713428892,"dataChange":true}}
{
"add":{
"path":"part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet","partitionValues":{
},"size":442,"modificationTime":1571713428889,"dataChange":true}}
scala> deltaTable.delete(condition = expr("id % 2 == 0"))
scala> spark.read.format("delta").load("/tmp/delta-table").show()
+---+
| id|
+---+
| 7|
| 9|
| 13|
| 5|
| 11|
+---+
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 14:00 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 487 2019-10-22 11:03 /tmp/delta-table/_delta_log/00000000000000000004.json
-rw-r--r-- 1 wankun supergroup 819 2019-10-22 14:00 /tmp/delta-table/_delta_log/00000000000000000005.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 14:00 /tmp/delta-table/part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 11:03 /tmp/delta-table/part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 447 2019-10-22 10:53 /tmp/delta-table/part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 14:00 /tmp/delta-table/part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000005.json
{
"commitInfo":{
"timestamp":1571724027520,"operation":"DELETE","operationParameters":{
"predicate":"[\"((`id` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT))\"]"},"readVersion":4,"isBlindAppend":false}}
{
"remove":{
"path":"part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet","deletionTimestamp":1571724027519,"dataChange":true}}
{
"remove":{
"path":"part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet","deletionTimestamp":1571724027519,"dataChange":true}}
{
"add":{
"path":"part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet","partitionValues":{
},"size":442,"modificationTime":1571724027109,"dataChange":true}}
{
"add":{
"path":"part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet","partitionValues":{
},"size":437,"modificationTime":1571724027511,"dataChange":true}}
Merge Operation
Merge 操作会按照关联条件把新数据和表中已有数据进行匹配:匹配上的记录执行 update,没有匹配上的记录执行 insert;日志中同样是 remove 被改写的旧文件,add 新生成的文件
// Merge 之前实际生效的文件(由上一步 Delete 生成):
// part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet
// part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet
val newData = spark.range(6, 15).map(v => (v,v+100)).toDF("old_id","new_id")
// newData 是 DF,可以同时给该DF起一个名字
// deltaTable 是DeltaTable 可以给他起一个名字
deltaTable.as("oldData").
merge(newData.as("newData"), "oldData.id = newData.old_id").
whenMatched.update(Map("id" -> col("newData.new_id"))).
whenNotMatched.insert(Map("id" -> col("newData.old_id"))).
execute()
// 对有相同的key数据 +100,同时加入缺失的key数据
scala> deltaTable.toDF.show()
+---+
| id|
+---+
|109|
| 8|
|113|
| 5|
| 12|
| 14|
| 6|
|107|
| 10|
|111|
+---+
// hdfs file
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 14:04 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 487 2019-10-22 11:03 /tmp/delta-table/_delta_log/00000000000000000004.json
-rw-r--r-- 1 wankun supergroup 819 2019-10-22 14:00 /tmp/delta-table/_delta_log/00000000000000000005.json
-rw-r--r-- 1 wankun supergroup 2327 2019-10-22 14:04 /tmp/delta-table/_delta_log/00000000000000000006.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 14:00 /tmp/delta-table/part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 262 2019-10-22 14:04 /tmp/delta-table/part-00000-8a10cbc2-d87b-4b70-8d79-c29f141f2801-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 11:03 /tmp/delta-table/part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 447 2019-10-22 10:53 /tmp/delta-table/part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 14:00 /tmp/delta-table/part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00011-5fe79d9f-beb9-4bee-8d96-53ac2596bc20-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00045-39af307b-a695-4482-89c2-f04a030216bb-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00049-2d645a80-79d6-48ad-8964-c8474b6afe57-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00068-76e6f7aa-90a9-4265-9bd4-63ba31bcbc87-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00077-6bb575e2-92dc-49be-ba6c-23055e627587-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00112-246f34c0-1c27-4e54-90eb-e69454dbf38f-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00116-292a7944-bb67-4d0b-83a9-92bfdbc9d0a3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00121-1f384a9f-7adf-4672-b056-2c3a2dec7174-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00143-5113a2e9-47d9-4e34-80fc-0fbf9f17f376-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00154-69cc7569-5b61-482c-b737-600a52dbc2d4-c000.snappy.parquet
// delta log
➜ ~ hdfs dfs -cat /tmp/delta-table/_delta_log/00000000000000000006.json
{
"commitInfo":{
"timestamp":1571724253311,"operation":"MERGE","operationParameters":{
"predicate":"(oldData.`id` = newData.`old_id`)"},"readVersion":5,"isBlindAppend":false}}
{
"remove":{
"path":"part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet","deletionTimestamp":1571724253305,"dataChange":true}}
{
"remove":{
"path":"part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet","deletionTimestamp":1571724253311,"dataChange":true}}
{
"add":{
"path":"part-00000-8a10cbc2-d87b-4b70-8d79-c29f141f2801-c000.snappy.parquet","partitionValues":{
},"size":262,"modificationTime":1571724252335,"dataChange":true}}
{
"add":{
"path":"part-00011-5fe79d9f-beb9-4bee-8d96-53ac2596bc20-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252118,"dataChange":true}}
{
"add":{
"path":"part-00045-39af307b-a695-4482-89c2-f04a030216bb-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252468,"dataChange":true}}
{
"add":{
"path":"part-00049-2d645a80-79d6-48ad-8964-c8474b6afe57-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252147,"dataChange":true}}
{
"add":{
"path":"part-00068-76e6f7aa-90a9-4265-9bd4-63ba31bcbc87-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252180,"dataChange":true}}
{
"add":{
"path":"part-00077-6bb575e2-92dc-49be-ba6c-23055e627587-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252208,"dataChange":true}}
{
"add":{
"path":"part-00112-246f34c0-1c27-4e54-90eb-e69454dbf38f-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252234,"dataChange":true}}
{
"add":{
"path":"part-00116-292a7944-bb67-4d0b-83a9-92bfdbc9d0a3-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252259,"dataChange":true}}
{
"add":{
"path":"part-00121-1f384a9f-7adf-4672-b056-2c3a2dec7174-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252279,"dataChange":true}}
{
"add":{
"path":"part-00143-5113a2e9-47d9-4e34-80fc-0fbf9f17f376-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252299,"dataChange":true}}
{
"add":{
"path":"part-00154-69cc7569-5b61-482c-b737-600a52dbc2d4-c000.snappy.parquet","partitionValues":{
},"size":429,"modificationTime":1571724252319,"dataChange":true}}
Vacuum
通过查看HDFS文件,我们发现HDFS上已经有了非常多小文件,而且很多是被标记为remove的文件。
默认情况下,被remove的文件会被保留一周,一周之后,当我们执行vacuum命令的时候,就可以把没用的文件给删除掉了。
我们这里测试一下,强制删除半个小时以前的文件(vacuum 的参数单位是小时,0.5 即半个小时;由于小于默认的保留时间,需要先关闭 retentionDurationCheck 检查)。
结果是HDFS上parquet文件确实被物理删除了,但是delta_log 并没有被checkpoint。。。
scala> spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", false)
scala> deltaTable.vacuum(0.5)
Deleted 7 files and directories in a total of 1 directories.
res23: org.apache.spark.sql.DataFrame = []
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 14:04 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 487 2019-10-22 11:03 /tmp/delta-table/_delta_log/00000000000000000004.json
-rw-r--r-- 1 wankun supergroup 819 2019-10-22 14:00 /tmp/delta-table/_delta_log/00000000000000000005.json
-rw-r--r-- 1 wankun supergroup 2327 2019-10-22 14:04 /tmp/delta-table/_delta_log/00000000000000000006.json
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:47 /tmp/delta-table/part-00000-02e53950-9077-4be4-b492-782c8586c61a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 14:00 /tmp/delta-table/part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:42 /tmp/delta-table/part-00000-4dc5d3ff-8697-4cbc-a61c-47bc737eba9c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 262 2019-10-22 14:04 /tmp/delta-table/part-00000-8a10cbc2-d87b-4b70-8d79-c29f141f2801-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 10:34 /tmp/delta-table/part-00000-aab218f0-61b8-4741-91b3-49b6c2de34c3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 11:03 /tmp/delta-table/part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 447 2019-10-22 10:53 /tmp/delta-table/part-00001-30914bb7-d705-43f3-8461-4992e4431545-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:42 /tmp/delta-table/part-00001-3c5565b5-d566-4e0e-9d46-0e09de6ecf9b-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:34 /tmp/delta-table/part-00001-49ccd3f2-72cf-4e56-92c7-41be80ab5ffa-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 10:47 /tmp/delta-table/part-00001-611d3b00-89b0-4eeb-9f85-7ac0c74bfe3c-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 14:00 /tmp/delta-table/part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00011-5fe79d9f-beb9-4bee-8d96-53ac2596bc20-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00045-39af307b-a695-4482-89c2-f04a030216bb-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00049-2d645a80-79d6-48ad-8964-c8474b6afe57-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00068-76e6f7aa-90a9-4265-9bd4-63ba31bcbc87-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00077-6bb575e2-92dc-49be-ba6c-23055e627587-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00112-246f34c0-1c27-4e54-90eb-e69454dbf38f-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00116-292a7944-bb67-4d0b-83a9-92bfdbc9d0a3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00121-1f384a9f-7adf-4672-b056-2c3a2dec7174-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00143-5113a2e9-47d9-4e34-80fc-0fbf9f17f376-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00154-69cc7569-5b61-482c-b737-600a52dbc2d4-c000.snappy.parquet
➜ ~ hdfs dfs -ls -R /tmp/delta-table/
drwxr-xr-x - wankun supergroup 0 2019-10-22 14:04 /tmp/delta-table/_delta_log
-rw-r--r-- 1 wankun supergroup 842 2019-10-22 10:34 /tmp/delta-table/_delta_log/00000000000000000000.json
-rw-r--r-- 1 wankun supergroup 787 2019-10-22 10:43 /tmp/delta-table/_delta_log/00000000000000000001.json
-rw-r--r-- 1 wankun supergroup 499 2019-10-22 10:47 /tmp/delta-table/_delta_log/00000000000000000002.json
-rw-r--r-- 1 wankun supergroup 1100 2019-10-22 10:53 /tmp/delta-table/_delta_log/00000000000000000003.json
-rw-r--r-- 1 wankun supergroup 487 2019-10-22 11:03 /tmp/delta-table/_delta_log/00000000000000000004.json
-rw-r--r-- 1 wankun supergroup 819 2019-10-22 14:00 /tmp/delta-table/_delta_log/00000000000000000005.json
-rw-r--r-- 1 wankun supergroup 2327 2019-10-22 14:04 /tmp/delta-table/_delta_log/00000000000000000006.json
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 14:00 /tmp/delta-table/part-00000-354a575c-1adb-4f05-bf85-99600c46561a-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 455 2019-10-22 10:53 /tmp/delta-table/part-00000-5c227a09-5a85-418c-9199-f9768b06a067-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 262 2019-10-22 14:04 /tmp/delta-table/part-00000-8a10cbc2-d87b-4b70-8d79-c29f141f2801-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 442 2019-10-22 11:03 /tmp/delta-table/part-00000-bb58e4d3-5432-4034-a3f4-5f0532bb8f99-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 437 2019-10-22 14:00 /tmp/delta-table/part-00001-9e49fce3-e352-45d8-a3a4-973cb5ffa7be-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00011-5fe79d9f-beb9-4bee-8d96-53ac2596bc20-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00045-39af307b-a695-4482-89c2-f04a030216bb-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00049-2d645a80-79d6-48ad-8964-c8474b6afe57-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00068-76e6f7aa-90a9-4265-9bd4-63ba31bcbc87-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00077-6bb575e2-92dc-49be-ba6c-23055e627587-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00112-246f34c0-1c27-4e54-90eb-e69454dbf38f-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00116-292a7944-bb67-4d0b-83a9-92bfdbc9d0a3-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00121-1f384a9f-7adf-4672-b056-2c3a2dec7174-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00143-5113a2e9-47d9-4e34-80fc-0fbf9f17f376-c000.snappy.parquet
-rw-r--r-- 1 wankun supergroup 429 2019-10-22 14:04 /tmp/delta-table/part-00154-69cc7569-5b61-482c-b737-600a52dbc2d4-c000.snappy.parquet
Delta Log Replay 逻辑
对于Delta Log 日志的数据合并逻辑,这个和我们想的基本一致:
- AddFile 是新增文件,肯定要保留下来,如果之前删除过这个文件,取消删除
- RemoveFile 是删除文件,既然文件都删除了,就要取消前面的新增文件操作
- 对 Metadata, Protocol 如果有更改,肯定要保留最新的信息
- 对于 SetTransaction,按 appId 记录,每个 appId 只保留最新的一条 transaction
/**
 * Folds a sequence of Delta log versions into a single in-memory table state.
 *
 * Each call to `append` applies the actions of one log version on top of the
 * state accumulated so far: the latest protocol and metadata, the latest
 * transaction per application id, the set of currently active data files, and
 * the tombstones of removed files.
 *
 * @param minFileRetentionTimestamp not used in the part of the class shown here;
 *                                  presumably consumed by the elided `getTombstones`
 *                                  to drop expired tombstones -- TODO confirm against
 *                                  the full source.
 */
class InMemoryLogReplay(minFileRetentionTimestamp: Long) {
// Most recent Protocol action seen so far; null until one is replayed.
var currentProtocolVersion: Protocol = null
// Version of the last replayed commit; -1 means nothing has been replayed yet.
var currentVersion: Long = -1
// Most recent Metadata action seen so far; null until one is replayed.
var currentMetaData: Metadata = null
// Latest SetTransaction per application id (keyed by `appId`).
val transactions = new scala.collection.mutable.HashMap[String, SetTransaction]()
// Files that are currently part of the table, keyed by their path URI.
val activeFiles = new scala.collection.mutable.HashMap[URI, AddFile]()
// Files that have been removed and not re-added since, keyed by their path URI.
private val tombstones = new scala.collection.mutable.HashMap[URI, RemoveFile]()
/**
 * Applies the actions of one log version to the accumulated state.
 *
 * Versions must be replayed contiguously: apart from the very first call, `version`
 * has to be exactly `currentVersion + 1`, otherwise the assertion fails.
 *
 * @param version the log version the actions belong to
 * @param actions the actions recorded in that version
 */
def append(version: Long, actions: Iterator[Action]): Unit = {
assert(currentVersion == -1 || version == currentVersion + 1,
s"Attempted to replay version $version, but state is at $currentVersion")
currentVersion = version
actions.foreach {
// A later SetTransaction for the same appId overwrites the earlier one.
case a: SetTransaction =>
transactions(a.appId) = a
// Metadata and Protocol are last-writer-wins.
case a: Metadata =>
currentMetaData = a
case a: Protocol =>
currentProtocolVersion = a
// An added file becomes active; it is stored with dataChange reset to false.
case add: AddFile =>
activeFiles(add.pathAsUri) = add.copy(dataChange = false)
// Remove the tombstone to make sure we only output one `FileAction`.
tombstones.remove(add.pathAsUri)
// A removed file stops being active and leaves a tombstone (dataChange reset to false).
case remove: RemoveFile =>
activeFiles.remove(remove.pathAsUri)
tombstones(remove.pathAsUri) = remove.copy(dataChange = false)
case ci: CommitInfo => // do nothing
case null => // Some crazy future feature. Ignore
}
}
// Body elided in this excerpt.
private def getTombstones: Iterable[FileAction] = //...
// Body elided in this excerpt.
def checkpoint: Iterator[Action] = //...
}
时光机(Time Travel) 读取过去某个版本的数据
如果table 曾经被删除过,则version 可能回不到0了
scala> val df = spark.read.format("delta").option("versionAsOf", 0).load("/tmp/delta-table")
org.apache.spark.sql.AnalysisException: Cannot time travel Delta table to version 0. Available versions: [10, 18].;
at org.apache.spark.sql.delta.DeltaErrors$.versionNotExistException(DeltaErrors.scala:521)
at org.apache.spark.sql.delta.DeltaHistoryManager.checkVersionExists(DeltaHistoryManager.scala:146)
at org.apache.spark.sql.delta.DeltaTableUtils$.resolveTimeTravelVersion(DeltaTable.scala:222)
at org.apache.spark.sql.delta.DeltaLog$$anonfun$31.apply(DeltaLog.scala:607)
at org.apache.spark.sql.delta.DeltaLog$$anonfun$31.apply(DeltaLog.scala:606)
at scala.Option.map(Option.scala:146)
at org.apache.spark.sql.delta.DeltaLog.createRelation(DeltaLog.scala:606)
at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:205)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
... 53 elided
scala> val df = spark.read.format("delta").option("versionAsOf", 12).load("/tmp/delta-table")
df: org.apache.spark.sql.DataFrame = [id: bigint]
scala> df.show()
+---+
| id|
+---+
| 7|
| 9|
| 13|
| 14|
| 12|
| 5|
| 10|
| 11|
+---+
Spark-Sql with delta
启动Spark-sql
bin/spark-sql --packages io.delta:delta-core_2.11:0.4.0
$ hdfs dfs -text /tmp/wankun/test_json/test_json1.txt
{"id":1,"name":"liguohui","nums":[1,2,3,4,5]}
{"id":2,"name":"zhangsan","nums":[6,7,8,9,10]}
select * from json.`/tmp/wankun/test_json`;
CREATE TABLE test_user
USING delta
AS select * from json.`/tmp/wankun/test_json`;
select * from test_user;
~/Applications/spark hdfs dfs -ls /user/hive/warehouse/test_user
19/08/28 17:02:14 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x - wankun supergroup 0 2019-08-28 17:00 /user/hive/warehouse/test_user/_delta_log
-rw-r--r-- 1 wankun supergroup 1044 2019-08-28 17:00 /user/hive/warehouse/test_user/part-00000-55e61d60-5cd4-449a-8e0e-a90f16f0a903-c000.snappy.parquet