Flink Real-Time Data Warehouse --- Part 1: Reading Kafka with Flink SQL in Practice, and the Problems Encountered So Far... (to be updated)

1. The first step matters: dependencies. Since the job runs locally, you should have a Hadoop environment configured; without one you will get an error about winutils. Download winutils.exe and set the HADOOP_HOME environment variable to point at it.
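If you would rather not touch system environment variables, a common workaround is to point hadoop.home.dir at the winutils directory programmatically, at the top of main before anything Hadoop-related initializes. A minimal sketch, assuming winutils.exe sits in C:\hadoop\bin (the path is just an example, adjust to your machine):

    // assumption: winutils.exe is located at C:\hadoop\bin\winutils.exe
    // setting this system property early has the same effect as HADOOP_HOME
    System.setProperty("hadoop.home.dir", "C:\\hadoop")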

2. For convenience, here is the full pom.xml (note the Hive and Hadoop dependencies in particular):

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>pijiuya</groupId>
    <artifactId>FlinkExample</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <flink.version>1.10.0</flink.version>
    </properties>


    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <!-- "provided" means this dependency is only used at compile time, not at runtime or when packaging -->
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.3</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- or... -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!--<dependency>-->
        <!--<groupId>org.apache.flink</groupId>-->
        <!--<artifactId>flink-table-api-scala_${scala.binary.version}</artifactId>-->
        <!--<version>${flink.version}</version>-->
        <!--</dependency>-->

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-sdk</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>statefun-flink-harness</artifactId>
            <version>2.0.0</version>
        </dependency>


        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <!-- date/time utility -->
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.9.2</version>
        </dependency>

        <!-- Hive dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.11</artifactId>
            <version>1.10.0</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.1.0</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.16.1</version>
        </dependency>


    </dependencies>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <createDependencyReducedPom>false</createDependencyReducedPom>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>

                        <configuration>
                            <transformers>

                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- When packaging a jar, set this to the main class you want to run
                                         (only one mainClass element is honored) -->
                                    <!--<mainClass>application.StormToFlink_demo</mainClass>-->
                                    <!--<mainClass>application.SaveDataToHbase</mainClass>-->
                                    <!--<mainClass>application.CheckPointState_demo</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkTest</mainClass>-->
                                    <!--<mainClass>application.storm.FlinkMatchTopicApplication_develop</mainClass>-->
                                    <!--<mainClass>batch.WordCount_demo</mainClass>-->
                                    <mainClass>developing_scala.kafka2RedisDemo_test</mainClass>
                                </transformer>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*:*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>


</project>

3. Hive versions differ from cluster to cluster, so consult the dependency table in the official docs; it makes the matching versions clear at a glance:

https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/#connecting-to-hive

4. Next, download the hive-site.xml config file and put it in some directory. I am demoing on a local Windows 10 machine here, so any path will do.

Note that on CDH or HDP clusters the file you download may have an encrypted password or be missing some properties. Check the official docs for the properties you must have:

https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/hive/hive_catalog.html

<configuration>
   <property>
      <name>javax.jdo.option.ConnectionURL</name>
      <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
      <description>metadata is stored in a MySQL server</description>
   </property>

   <property>
      <name>javax.jdo.option.ConnectionDriverName</name>
      <value>com.mysql.jdbc.Driver</value>
      <description>MySQL JDBC driver class</description>
   </property>

   <property>
      <name>javax.jdo.option.ConnectionUserName</name>
      <value>...</value>
      <description>user name for connecting to mysql server</description>
   </property>

   <property>
      <name>javax.jdo.option.ConnectionPassword</name>
      <value>...</value>
      <description>password for connecting to mysql server</description>
   </property>

   <property>
       <name>hive.metastore.uris</name>
       <value>thrift://localhost:9083</value>
       <description>IP address (or fully-qualified domain name) and port of the metastore host</description>
   </property>

   <property>
       <name>hive.metastore.schema.verification</name>
       <value>true</value>
   </property>

</configuration>
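Before wiring this into the Table API code, it can save time to verify that Flink can actually reach the metastore through this config directory. A minimal sketch, using the same catalog name, config path, and Hive version as the job in step 5 (adjust the path to wherever you put hive-site.xml):

import org.apache.flink.table.catalog.hive.HiveCatalog

object MetastoreCheck {
  def main(args: Array[String]): Unit = {
    // catalog name, default database, directory containing hive-site.xml, Hive version
    val catalog = new HiveCatalog("rtdw", "default", "G:\\Flink SQL开发文件", "1.1.0")
    catalog.open()                    // connects to the metastore configured above
    println(catalog.listDatabases())  // should list the existing Hive databases
    catalog.close()
  }
}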

5. Then comes the code. It is quite simple, and plenty of examples can be found online.

package flink_sql

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog


/**
  * todo read data from Kafka and register the table in a Hive catalog
  */
object Sql_source_kafka {
  def main(args: Array[String]): Unit = {

    import org.apache.flink.api.scala._

    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    //    streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    //    streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
    val tableEnvSettings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(streamEnv, tableEnvSettings)

    val catalog = new HiveCatalog(
      "rtdw", // catalog name
      "default", // default database
      "G:\\Flink SQL开发文件", // Hive config (hive-site.xml) directory
      "1.1.0" // Hive version
    )

    //todo register the catalog
    tableEnv.registerCatalog("rtdw", catalog)

    //todo make it the current catalog
    tableEnv.useCatalog("rtdw")

    //todo create a database (optional)
    //    val createDbSql1 = "CREATE DATABASE IF NOT EXISTS rtdw.default"
    //    val createDbSql1 = "USE DATABASE default"
    //    tableEnv.sqlUpdate(createDbSql1)

    //todo list the registered catalogs
    val catalogs: Array[String] = tableEnv.listCatalogs()
    println(catalogs.toList)

    //todo list the tables in the current database
    val tables = tableEnv.listTables()
    println(tables.toList)


    //todo a sample record from the Kafka topic
    val kafkaLogStr = "{\"eventType\": \"clickBuyNow\",\"userId\": \"97470180\",\"ts\": 1585136092541}"
    //    tableEnv.sqlUpdate("DROP TABLE rtdw.ods.streaming_user_active_log2")

    val createTableSql_new =
      """CREATE TABLE flink_test_03 (
        |  eventType STRING,
        |  userId STRING,
        |  ts STRING
        |)
        | WITH
        |(
        |  'connector.type' = 'kafka',
        |  'connector.version' = '0.11',
        |  'connector.topic' = 'flink_test_topic',
        |  'connector.startup-mode' = 'earliest-offset',
        |  'connector.properties.zookeeper.connect' = 'node1:2181,node2:2181,node3:2181',
        |  'connector.properties.bootstrap.servers' = 'node1:9092,node2:9092,node3:9092',
        |  'connector.properties.group.id' = 'flink_test_1',
        |  'format.type' = 'json',
        |  'format.derive-schema' = 'true',
        |  'update-mode' = 'append'
        |)""".stripMargin

    tableEnv.sqlUpdate(createTableSql_new)

    val querySql =
      """SELECT eventType,
        |userId,
        |ts
        |FROM flink_test_03
      """.stripMargin
    val result: Table = tableEnv.sqlQuery(querySql)
    println("query result schema:")
    result.printSchema()

    val resultStream: DataStream[(String, String, String)] = tableEnv.toAppendStream[(String, String, String)](result)
    resultStream.print()

    
    streamEnv.execute()

  }

}

6. After the job runs, we can check in Hive whether the table exists. In practice the table itself does exist, but its columns do not show up; querying the MySQL metastore directly gives the same result.

Running DESCRIBE FORMATTED flink_test_03; in Hive does print the table's information, but the column details are not stored in the MySQL metastore tables.

7. If that is the case, for now we can only print the table's schema from the Flink side and record and analyze it by hand; I will look into proper metadata management later.
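A minimal sketch of that manual inspection, reusing the catalog instance and table name from the job above: fetch the table definition back through the catalog API and print what the metastore actually holds. (A likely explanation for step 6, hedged: for generic, non-Hive tables Flink serializes the schema into table properties in the metastore, which is why Hive and MySQL cannot render the columns directly.)

    //todo inspect the table's schema and stored properties through the catalog API
    import org.apache.flink.table.catalog.ObjectPath

    val tablePath = new ObjectPath("default", "flink_test_03")
    val catalogTable = catalog.getTable(tablePath)
    println(catalogTable.getSchema)      // the columns Flink registered (eventType, userId, ts)
    println(catalogTable.getProperties)  // connector + schema properties as stored in the metastore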

Reposted from blog.csdn.net/qq_31866793/article/details/106232109