1,第一步很重要 就是依赖的问题,因为是本地执行,所以最好有hadoop配置环境,没有的话会提示错误信息,自己百度一下,自己下载个winutils.exe ,然后配置环境变量
2,为了图方便 直接贴上pom文件依赖(自己看哈,就是hive跟 hadoop的依赖):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>pijiuya</groupId>
    <artifactId>FlinkExample</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <flink.version>1.10.0</flink.version>
    </properties>

    <dependencies>
        <!-- ===== Flink core APIs =====
             Uncomment <scope>provided</scope> on these when building a jar for a
             cluster that already ships the Flink runtime; for local IDE runs
             keep them on the default (compile) scope. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Redis sink connector (Apache Bahir). Declared once only — the
             original POM listed this dependency twice. -->
        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.0</version>
        </dependency>

        <!-- State backend and Kafka source/sink -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.3</version>
        </dependency>

        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>

        <!-- ===== Flink Table / SQL ===== -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- or... -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- NOTE(review): both the blink planner and the legacy planner are on
             the classpath; the code below selects blink via
             EnvironmentSettings.useBlinkPlanner(). Keeping both matches the
             original setup, but consider dropping flink-table-planner_2.11 if
             only blink is used. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Stateful Functions -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-sdk</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-flink-harness</artifactId>
            <version>2.0.0</version>
        </dependency>

        <!-- Utilities -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- Date/time helper -->
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.9.2</version>
        </dependency>

        <!-- ===== Hive integration ===== -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.11</artifactId>
            <version>1.10.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.1.0</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- ===== Hadoop (CDH build — requires the cloudera repository below) ===== -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>
    </dependencies>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <createDependencyReducedPom>false</createDependencyReducedPom>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- Exactly ONE mainClass may be set; switch it to the
                                         entry point you want packaged. The original POM
                                         declared two <mainClass> elements, which is invalid. -->
                                    <!--<mainClass>application.StormToFlink_demo</mainClass>-->
                                    <!--<mainClass>application.SaveDataToHbase</mainClass>-->
                                    <!--<mainClass>application.CheckPointState_demo</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkTest</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkMatchTopicApplication_develop</mainClass>-->
                                    <!--<mainClass>batch.WordCount_demo</mainClass>-->
                                    <mainClass>developing_scala.kafka2RedisDemo_test</mainClass>
                                </transformer>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <!-- Strip signature files from all artifacts so the shaded
                                         jar is not rejected as tampered. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
3,因为每个人的hive版本不一样,请参考官网信息 一目了然
https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/#connecting-to-hive
4,接下来把hive-site.xml配置文件给load下来放到一个路径,我这里是本地window10环境演示的,所以随便放了一个路径。
需要注意的就是有的是CDH HDP环境,可能load下来的配置文件密码加密啊 有的属性没有,参考官网必须知道一下的属性:
https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/hive_catalog.html
<configuration>
<!-- JDBC URL of the MySQL database that backs the Hive metastore. -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
<description>metadata is stored in a MySQL server</description>
</property>
<!-- JDBC driver class used to reach the metastore database. -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>MySQL JDBC driver class</description>
</property>
<!-- Credentials for the metastore database; fill in real values.
     NOTE(review): on CDH/HDP the downloaded config may have the password
     encrypted or this property missing entirely — verify before use. -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>...</value>
<description>user name for connecting to mysql server</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>...</value>
<description>password for connecting to mysql server</description>
</property>
<!-- Thrift endpoint of the Hive metastore service; HiveCatalog connects here. -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://localhost:9083</value>
<description>IP address (or fully-qualified domain name) and port of the metastore host</description>
</property>
<!-- Enforce that the metastore schema version matches the Hive jars. -->
<property>
<name>hive.metastore.schema.verification</name>
<value>true</value>
</property>
</configuration>
5,后面就是代码了。很简单的。百度找找
package flink_sql

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog

/**
 * Demo job: registers a HiveCatalog, creates a Kafka-backed table inside it
 * via DDL, then queries the table and prints the resulting append stream.
 */
object Sql_source_kafka {

  def main(args: Array[String]): Unit = {
    import org.apache.flink.api.scala._

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Time characteristic was left at the default in the original:
    // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    val settings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Hive catalog backed by the hive-site.xml found in the given directory
    // (local Windows path in this demo).
    val hiveCatalog = new HiveCatalog(
      "rtdw",                 // catalog name
      "default",              // default database
      "G:\\Flink SQL开发文件", // directory containing hive-site.xml
      "1.1.0"                 // Hive version
    )

    // Register the catalog and make it the current one; tables created below
    // go through its metastore.
    tableEnv.registerCatalog("rtdw", hiveCatalog)
    tableEnv.useCatalog("rtdw")

    // Database creation, kept disabled as in the original:
    // val createDbSql1 = "CREATE DATABASE IF NOT EXISTS rtdw.default"
    // val createDbSql1 = "USE DATABASE default"
    // tableEnv.sqlUpdate(createDbSql1)

    // Show what catalogs and tables are currently visible.
    val catalogs: Array[String] = tableEnv.listCatalogs()
    print(catalogs.toList)
    val tableNames = tableEnv.listTables()
    println(tableNames.toList)

    // Shape of the JSON records sitting in the Kafka topic.
    val sampleKafkaJson = "{\"eventType\": \"clickBuyNow\",\"userId\": \"97470180\",\"ts\": 1585136092541}"

    // tableEnv.sqlUpdate("DROP TABLE Orders rtdw.ods.streaming_user_active_log2")

    // Kafka-backed table DDL (Flink 1.10 "connector.*" property style).
    // NOTE(review): bootstrap server 'node11:9092' looks like a typo for
    // 'node1:9092' (zookeeper.connect uses node1) — confirm against the cluster.
    val ddl =
      """CREATE TABLE flink_test_03 (
        | eventType STRING,
        | userId STRING,
        | ts STRING
        |)
        | WITH
        |(
        | 'connector.type' = 'kafka',
        | 'connector.version' = '0.11',
        | 'connector.topic' = 'flink_test_topic',
        | 'connector.startup-mode' = 'earliest-offset',
        | 'connector.properties.zookeeper.connect' = 'node1:2181,node2:2181,node3:2181',
        | 'connector.properties.bootstrap.servers' = 'node11:9092,node2:9092,node3:9092',
        | 'connector.properties.group.id' = 'flink_test_1',
        | 'format.type' = 'json',
        | 'format.derive-schema' = 'true',
        | 'update-mode' = 'append'
        |)""".stripMargin
    tableEnv.sqlUpdate(ddl)

    val query =
      """SELECT eventType,
        |userId,
        |ts
        |FROM flink_test_03 """.stripMargin
    val resultTable: Table = tableEnv.sqlQuery(query)

    print("打印元数据信息")
    val resultStream: DataStream[(String, String, String)] =
      tableEnv.toAppendStream[(String, String, String)](resultTable)
    resultStream.print()

    env.execute()
  }
}
6,执行之后我们可以在hive查看是否存在表信息,实际发现表存在,但表字段不存在;通过mysql去查询元数据也是同样的结果:
通过命令 DESCRIBE FORMATTED flink_test_03; 发现可以打印这个表的信息,但mysql元数据库里是不存在表字段信息的。
7,如果是这种情况,我们只能是打印table的schema信息,手动去分析和存储,后续再研究怎么做元数据的管理。