Hive复杂类型的导入

  Hive是大数据体系下ETL、数据预处理、数仓等领域比较重要的组件,应用广泛。博主空闲时间研究一下Hive的数据类型。

1. Hive数据类型组成

  关于Hive的数据类型,官方文档展示的比较全面,关于数据类型的详细说明Hive数据类型,这里总结如下

大类 类型 具体类型
基本类型 数值型 tinyint,smallint,int,bigint,float,double,decimal,numeric
基本类型 字符型 string,varchar,char
基本类型 日期型 timestamp,date,interval
基本类型 其他 boolean,binary
复杂类型 数组 array
复杂类型 映射 map
复杂类型 结构 struct
复杂类型 联合 uniontype

2. 数据导入举例

  这里主要研究具体建表、产生测试数据、导入数据等内容。

2.1. 建表语句

create table test_hive_meta(
    name string,
    age int,
    score float,
    insert_time string,-- 刚开始定义的是date类型,后面修改为string或timestamp
    students struct<sname:string,sage:int>,
    infos map<int,string>,
    scores array<float>
)  comment "测试数据表"
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'  
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
STORED as TEXTFILE ;

2.2. 产生测试数据

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

public class TestMain {

    private static final String[] names = new String[] { "Laird", "莱尔德", "Lambert", "兰伯特", "Lamont", "拉蒙特", "Lance",
            "兰斯", "Lang", "兰格", "Lange", "兰格", "Langston", "兰斯顿", "Lanny", "兰尼", "Larkin", "拉金", "Larry", "拉里",
            "Clementina", "克莱门蒂娜", "Clementine", "克莱门廷", "Clemmie", "克莱米", "Cleo", "克利奥", "Cleopatra", "克利奥帕特拉",
            "Colette", "科莱特", "Colleen", "科琳", "Conchita", "康奇塔", "Connie", "康妮,康尼", "Constance", "康斯坦斯", "Alvina",
            "阿尔文娜", "Alvira", "阿尔薇拉", "Amabel", "阿玛贝尔", "Amanda", "阿曼达", "Amber", "安伯", "Amelia", "阿米莉亚", "Amity",
            "阿米蒂", "Amor", "埃默", "Amy", "艾米", "Ana", "安娜", "Ware", "韦尔", "Warner", "沃纳", "Warren", "沃伦", "Washburn",
            "沃什伯恩", "Washington", "华盛顿", "Watkins", "沃特金斯", "Watt", "瓦特", "Watts", "沃茨", "Wayne", "韦恩", "Webb", "韦布",
            "Lina", "莉娜", "Linda", "琳达", "Lindy", "琳迪", "Linn", "林", "Linsey", "林赛", "Lisa", "莉萨", "Lisbeth", "莉斯贝思",
            "Lise", "莉萨", "Lisette", "莉塞特", "Liz", "莉兹" };
    private static Random random = new Random();
    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-mm-dd HH:MM:ss");
    private static int size = names.length;
    private static char[] a = { ',' };// fields termination
    private static char[] b = { '-' };// collection item termination
    private static char[] c = { ':' };// map key termination

    public static void main(String[] args) {

        try {
            File file = new File("./data.txt");
            if (!file.exists()) {
                file.createNewFile();
            }
            FileWriter fw = new FileWriter(file.getAbsoluteFile());
            BufferedWriter bw = new BufferedWriter(fw);
            for (int i = 0; i < 100; i++) {
                bw.write(gen1Line());
            }
            bw.close();
            System.out.println("Done");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static String gen1Line() {

        StringBuffer line = new StringBuffer();
        line.append(names[random.nextInt(size)]);
        line.append(a);
        line.append(random.nextInt(100));
        line.append(a);
        line.append(random.nextDouble() * 10);
        line.append(a);
        line.append(sdf.format(new Date()));
        line.append(a);

        line.append(names[random.nextInt(size)]);
        line.append(b);
        line.append(random.nextInt(100));
        line.append(a);

        for (int i = 0; i < 10; i++) {
            line.append(random.nextInt(100));
            line.append(c);
            line.append(names[random.nextInt(size)]);
            line.append(b);
        }
        line.append(random.nextInt(100));
        line.append(c);
        line.append(names[random.nextInt(size)]);
        line.append(a);

        for (int i = 0; i < 10; i++) {
            line.append(random.nextDouble() * 10);
            line.append(b);
        }
        line.append(random.nextDouble() * 10);
        line.append("\n");

        return line.toString();
    }
}

2.3. 导入数据测试

-- 导入数据
load data local inpath "/home/data.txt" into table test_hive_meta;

-- 为了便于查看导入数据结果,打开列显示
set hive.cli.print.header=true;
set hive.cli.print.row.to.vertical=true;
set hive.cli.print.row.to.vertical.num=1;

2.4. 查看导入结果

-- 查询
select * from test_hive_meta limit 1 ;

-- 结果
name    age score   insert_time students    infos   scores
Cleopatra   11  0.28206065  2018-14-11 17:08:15 {"sname":"阿玛贝尔","sage":15}  {41:"Linda",82:"康斯坦斯",94:"艾米",81:"Washington",23:"兰尼",93:"Lise",36:"沃纳",70:"Lise",39:"克利奥帕特拉",35:"Lambert",67:"Colleen"}    [1.8265022,6.058134,7.794176,4.096524,8.195735,5.866253,0.75852406,6.835354,2.7134678,8.078223,6.275408]

3. 采坑

  1. 关于date数据类型
    发现使用date数据类型在导入的时候存在问题,刚开始使用date类型,导入类型为long,结果显示为null;后面导入数据修改为”yyyy-MM-dd HH:mm:ss”之后,结果仍为null。后面修改为string类型,导入数据为格式化的日期类型,或者修改为timestamp类型,导入类型为long。

4. 参考文章

  1. Apache Hive Document
  2. hive collection data type
  3. Hive中导入时间格式的数据显示为null

猜你喜欢

转载自blog.csdn.net/aWDac/article/details/81589339