基础原理
简单而言,Calcite是一个将SQL翻译成Java代码的开源实现。因为绝大部分数据分析工作是通过SQL完成的,所以把SQL翻译成Java是非常重要的。它的初心其实很简单,就是用SQL来统一访问各种结构化数据源,比如内存里的map、一个csv文件等等。
有如下特性:
- 支持自定义SQL函数
例子
例子一
SELECT 1 as a from demo_mlt0317002
这个SQL被翻译为
/**
 * Generated entry point for {@code SELECT 1 as a from demo_mlt0317002}.
 *
 * @param root data context from which the root schema and the input table are resolved
 * @return an enumerable that yields the constant 1 once per row of the input table
 */
public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {
// Resolve table "DEMO_MLT0317002" from the root schema; its rows are typed as Object[].
final org.apache.calcite.linq4j.Enumerable _inputEnumerable = org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002").asEnumerable();
return new org.apache.calcite.linq4j.AbstractEnumerable(){
public org.apache.calcite.linq4j.Enumerator enumerator() {
return new org.apache.calcite.linq4j.Enumerator(){
// Cursor over the input table; reset/moveNext/close below simply delegate to it.
public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
public void reset() {
inputEnumerator.reset();
}
public boolean moveNext() {
return inputEnumerator.moveNext();
}
public void close() {
inputEnumerator.close();
}
public Object current() {
// The projection is a literal, so the current input row is never read.
return 1;
}
};
}
};
}
/**
 * Declares the element class of this example's output.
 *
 * @return {@code int.class}, for the single constant column
 */
public Class getElementType() {
return int.class;
}
其中,bind是最后我们实际执行查询的入口方法。第一行声明了一个输入表_inputEnumerable,它来源于我们传入的DataContext,其中包含了schema和数据。
然后就是定义了一个输出。其中最重要的就是current这个方法,它返回当前行的输出值。
例子二
SELECT 1 as a,2 as b from demo_mlt0317002
这个SQL被翻译为
/**
 * Generated entry point for {@code SELECT 1 as a,2 as b from demo_mlt0317002}.
 *
 * @param root data context from which the root schema and the input table are resolved
 * @return an enumerable that yields a two-element Object[] once per row of the input table
 */
public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {
// Resolve table "DEMO_MLT0317002" from the root schema; its rows are typed as Object[].
final org.apache.calcite.linq4j.Enumerable _inputEnumerable = org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002").asEnumerable();
return new org.apache.calcite.linq4j.AbstractEnumerable(){
public org.apache.calcite.linq4j.Enumerator enumerator() {
return new org.apache.calcite.linq4j.Enumerator(){
// Cursor over the input table; reset/moveNext/close below simply delegate to it.
public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
public void reset() {
inputEnumerator.reset();
}
public boolean moveNext() {
return inputEnumerator.moveNext();
}
public void close() {
inputEnumerator.close();
}
public Object current() {
// Two output columns are packed into one Object[]; both are literals,
// so the current input row is never read.
return new Object[] {
1,
2};
}
};
}
};
}
/**
 * Declares the element class of this example's output.
 *
 * @return {@code Object[].class}, because each output row carries more than one column
 */
public Class getElementType() {
return java.lang.Object[].class;
}
可以看到多个输出列被转换成了Object数组。
例子三
SELECT date1 as a from demo_mlt0317002
这个SQL被翻译为
/**
 * Generated entry point for {@code SELECT date1 as a from demo_mlt0317002}.
 *
 * @param root data context from which the root schema and the input table are resolved
 * @return an enumerable that yields, for each input row, the value at position 1 cast to Long
 */
public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {
// Resolve table "DEMO_MLT0317002" from the root schema; its rows are typed as Object[].
final org.apache.calcite.linq4j.Enumerable _inputEnumerable = org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002").asEnumerable();
return new org.apache.calcite.linq4j.AbstractEnumerable(){
public org.apache.calcite.linq4j.Enumerator enumerator() {
return new org.apache.calcite.linq4j.Enumerator(){
// Cursor over the input table; reset/moveNext/close below simply delegate to it.
public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
public void reset() {
inputEnumerator.reset();
}
public boolean moveNext() {
return inputEnumerator.moveNext();
}
public void close() {
inputEnumerator.close();
}
public Object current() {
// The selected column is addressed by position (index 1) in the input row,
// not by its name, and the value is cast to Long.
return (Long) ((Object[]) inputEnumerator.current())[1];
}
};
}
};
}
/**
 * Declares the element class of this example's output.
 *
 * @return {@code Long.class}, matching the cast applied in current() of this example
 */
public Class getElementType() {
return java.lang.Long.class;
}
可以看到引用输入表的字段时,使用的也是字段的顺序下标(这里是下标1),而不是字段名。
例子四
来一个聚合函数试一试(注意:下面生成的代码实际是按string1分组后对一个整型列做求和,所以SQL里select的列应当是类似sum(int1)的聚合表达式,而不是直接的date1)
SELECT date1 as a from demo_mlt0317002 group by string1
被翻译成为了
/**
 * Generated accumulator record with two public fields.
 *
 * <p>In the bind method of this example, {@code f0} holds the running int sum and
 * {@code f1} is set to true once a non-null input has been added.
 *
 * <p>equals, hashCode, compareTo and toString all cover both fields. Note that the class
 * declares compareTo but only implements Serializable, not Comparable.
 */
public static class Record2_0 implements java.io.Serializable {
// Running aggregate value.
public int f0;
// Flag written alongside f0 by the accumulator.
public boolean f1;
public Record2_0() {
}
// Two records are equal when both f0 and f1 match.
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof Record2_0)) {
return false;
}
return this.f0 == ((Record2_0) o).f0 && this.f1 == ((Record2_0) o).f1;
}
// Hash is folded over f0 then f1, the same fields equals compares.
public int hashCode() {
int h = 0;
h = org.apache.calcite.runtime.Utilities.hash(h, this.f0);
h = org.apache.calcite.runtime.Utilities.hash(h, this.f1);
return h;
}
// Orders by f0 first, then by f1 when f0 ties.
public int compareTo(Record2_0 that) {
int c;
c = org.apache.calcite.runtime.Utilities.compare(this.f0, that.f0);
if (c != 0) {
return c;
}
c = org.apache.calcite.runtime.Utilities.compare(this.f1, that.f1);
if (c != 0) {
return c;
}
return 0;
}
public String toString() {
return "{f0=" + this.f0 + ", f1=" + this.f1 + "}";
}
}
/**
 * Generated entry point for the grouped query of this example.
 *
 * <p>Groups the rows of "DEMO_MLT0317002" by the string form of column 1, sums column 0
 * (read as Integer) within each group, and yields the per-group sum.
 *
 * @param root data context from which the root schema and the input table are resolved
 * @return an enumerable of Integer sums, one per group; null for a group with no non-null input
 */
public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {
// List of accumulator "adder" functions; this query registers exactly one.
java.util.List accumulatorAdders = new java.util.LinkedList();
accumulatorAdders.add(new org.apache.calcite.linq4j.function.Function2() {
// Typed adder: folds one input row into the accumulator.
public Record2_0 apply(Record2_0 acc, Object[] in) {
final Integer inp0_ = (Integer) in[0];
// Null inputs are skipped; a non-null input marks the accumulator as populated
// and is added to the running sum.
if (inp0_ != null) {
acc.f1 = true;
acc.f0 = acc.f0 + inp0_.intValue();
}
return acc;
}
// Untyped overload: casts its arguments and delegates to the typed adder above.
public Record2_0 apply(Object acc, Object in) {
return apply(
(Record2_0) acc,
(Object[]) in);
}
}
);
// Factory combining the accumulator initializer (first argument) with the adders (second argument).
org.apache.calcite.adapter.enumerable.AggregateLambdaFactory lambdaFactory = new org.apache.calcite.adapter.enumerable.BasicAggregateLambdaFactory(
new org.apache.calcite.linq4j.function.Function0() {
// Initial accumulator state: sum 0, "seen a value" flag false.
public Object apply() {
int a0s0;
boolean a0s1;
a0s1 = false;
a0s0 = 0;
Record2_0 record0;
record0 = new Record2_0();
record0.f0 = a0s0;
record0.f1 = a0s1;
return record0;
}
}
,
accumulatorAdders);
// Input table with groupBy applied: key selector, initializer, adder and result selector.
final org.apache.calcite.linq4j.Enumerable _inputEnumerable = org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002").asEnumerable().groupBy(new org.apache.calcite.linq4j.function.Function1() {
// Group key: column 1 of the input row converted to String; null stays null.
public String apply(Object[] a0) {
return a0[1] == null ? (String) null : a0[1].toString();
}
public Object apply(Object a0) {
return apply(
(Object[]) a0);
}
}
, lambdaFactory.accumulatorInitializer(), lambdaFactory.accumulatorAdder(), lambdaFactory.resultSelector(new org.apache.calcite.linq4j.function.Function2() {
// Per-group output row: {key, sum}; the sum is null when no non-null input was added.
public Object[] apply(String key, Record2_0 acc) {
return new Object[] {
key,
acc.f1 ? Integer.valueOf(acc.f0) : (Integer) null};
}
public Object[] apply(Object key, Object acc) {
return apply(
(String) key,
(Record2_0) acc);
}
}
));
return new org.apache.calcite.linq4j.AbstractEnumerable(){
public org.apache.calcite.linq4j.Enumerator enumerator() {
return new org.apache.calcite.linq4j.Enumerator(){
// Cursor over the grouped rows; reset/moveNext/close below simply delegate to it.
public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
public void reset() {
inputEnumerator.reset();
}
public boolean moveNext() {
return inputEnumerator.moveNext();
}
public void close() {
inputEnumerator.close();
}
public Object current() {
// Keep only element 1 of the grouped row (the aggregate); the group key at element 0 is dropped.
return (Integer) ((Object[]) inputEnumerator.current())[1];
}
};
}
};
}
/**
 * Declares the element class of this example's output.
 *
 * @return {@code Integer.class}, matching the cast applied in current() of this example
 */
public Class getElementType() {
return java.lang.Integer.class;
}
事情一下子就复杂了起来。
代码的第一部分,定义了一个Record2_0,用来保存聚合的中间状态。其中f0就是聚合的累加值,我们这个例子里是一个int;f1是一个boolean,用来标记是否已经累加过非空的输入。
bind的第二行定义了一个聚合方法。写在了apply中
第三行定义了一个聚合工厂。第一个参数是初始值,第二个参数是如何进行叠加计算
/**
 * Creates the factory from an accumulator initializer and a list of adder functions.
 *
 * @param accumulatorInitializer function that produces the initial accumulator
 * @param accumulatorAdders functions that each take (accumulator, source) and return the accumulator
 */
public BasicAggregateLambdaFactory(
Function0<TAccumulate> accumulatorInitializer,
List<Function2<TAccumulate, TSource, TAccumulate>> accumulatorAdders) {
this.accumulatorInitializer = accumulatorInitializer;
// The list of adders is wrapped in a single AccumulatorAdderSeq.
this.accumulatorAdderDecorator = new AccumulatorAdderSeq(accumulatorAdders);
}
然后就是在_inputEnumerable的定义中通过groupBy加上聚合的相关定义:分组key的取值函数、累加器的初始化、累加方法以及结果的输出函数。
例子五
SELECT int1 as a,int2 as b from demo_mlt0317002 left join demo_mlt0317002_2 on string1 = string2
翻译为java后为
/**
 * Generated entry point for the left-join query of this example.
 *
 * <p>Joins "DEMO_MLT0317002" with "DEMO_MLT0317002_2" via a nested-loop join of type LEFT,
 * then projects two columns from each joined row.
 *
 * @param root data context from which the root schema and both input tables are resolved
 * @return an enumerable of two-element Object[] rows
 */
public org.apache.calcite.linq4j.Enumerable bind(final org.apache.calcite.DataContext root) {
// Arguments to nestedLoopJoin: left table, right table, join predicate, result selector, join type.
final org.apache.calcite.linq4j.Enumerable _inputEnumerable = org.apache.calcite.linq4j.EnumerableDefaults.nestedLoopJoin(org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002").asEnumerable(), org.apache.calcite.schema.Schemas.queryable(root, root.getRootSchema(), java.lang.Object[].class, "DEMO_MLT0317002_2").asEnumerable(), new org.apache.calcite.linq4j.function.Predicate2() {
// Join condition: both compared strings are non-null and equal.
// Note that both operands are read from `left` (positions 2 and 3); `right` is not referenced.
public boolean apply(Object[] left, org.apache.calcite.runtime.FlatLists.ComparableList right) {
final String inp2_ = left[2] == null ? (String) null : left[2].toString();
final String inp3_ = left[3] == null ? (String) null : left[3].toString();
return inp2_ != null && inp3_ != null && org.apache.calcite.runtime.SqlFunctions.eq(inp2_, inp3_);
}
// Untyped overload: casts its arguments and delegates to the typed predicate above.
public boolean apply(Object left, Object right) {
return apply(
(Object[]) left,
(org.apache.calcite.runtime.FlatLists.ComparableList) right);
}
}
, new org.apache.calcite.linq4j.function.Function2() {
// Result selector: builds the joined row from positions 0..5 of `left`; `right` is not referenced.
public Object[] apply(Object[] left, org.apache.calcite.runtime.FlatLists.ComparableList right) {
return new Object[] {
left[0],
left[1],
left[2],
left[3],
left[4],
left[5]};
}
public Object[] apply(Object left, Object right) {
return apply(
(Object[]) left,
(org.apache.calcite.runtime.FlatLists.ComparableList) right);
}
}
, org.apache.calcite.linq4j.JoinType.LEFT);
return new org.apache.calcite.linq4j.AbstractEnumerable(){
public org.apache.calcite.linq4j.Enumerator enumerator() {
return new org.apache.calcite.linq4j.Enumerator(){
// Cursor over the joined rows; reset/moveNext/close below simply delegate to it.
public final org.apache.calcite.linq4j.Enumerator inputEnumerator = _inputEnumerable.enumerator();
public void reset() {
inputEnumerator.reset();
}
public boolean moveNext() {
return inputEnumerator.moveNext();
}
public void close() {
inputEnumerator.close();
}
public Object current() {
// Final projection: joined-row positions 1 and 0, in that order.
final Object[] current = (Object[]) inputEnumerator.current();
return new Object[] {
current[1],
current[0]};
}
};
}
};
}
/**
 * Declares the element class of this example's output.
 *
 * @return {@code Object[].class}, because each output row carries more than one column
 */
public Class getElementType() {
return java.lang.Object[].class;
}
calcite和hive
hive一个最基本的能力就是需要把sql翻译为mapreduce的DAG执行图。
整个编译过程分为六个阶段
- Antlr定义SQL的语法规则,完成SQL词法,语法解析,将SQL转化为抽象语法树AST Tree
- 遍历AST Tree,抽象出查询的基本组成单元QueryBlock
- 遍历QueryBlock,翻译为执行操作树OperatorTree
- 逻辑层优化器进行OperatorTree变换,合并不必要的ReduceSinkOperator,减少shuffle数据量
- 遍历OperatorTree,翻译为MapReduce任务
- 物理层优化器进行MapReduce任务的变换,生成最终的执行计划