Hive Transform Implementation

Requirement: given the following data, write a function that returns the province name.

1367775,10
1363426,10
1371235,10
1371237,10
1371236,10
1376888,10
1382132,10

1367775   beijing    10
1363426   beijing    10
1371235   shanghai   10
1371237   shanghai   10
1371236   shanghai   10
1376888   shanghai   10
1382132   shenzhen   10

Example 1: Hive custom function (UDF: user-defined function)

1. Develop a Java class that extends UDF (aggregate functions extend UDAF) and overload the evaluate method

package bigdata.udf;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

// extend the UDF class
public class ToLowerCase extends UDF {
    // load a dictionary table: phone-number prefix -> province
    public static HashMap<String, String> provinceMap = new HashMap<String, String>();
    static {
        provinceMap.put("136", "beijing");
        provinceMap.put("137", "shanghai");
        provinceMap.put("138", "shenzhen");
    }

    // must be public; evaluate is overloaded so Hive picks the version matching the input type
    public String evaluate(String field) {
        String result = field.toLowerCase();
        return result;
    }

    // return value: province name; input: phone number
    public String evaluate(int phonenbr) {
        String pnb = String.valueOf(phonenbr);
        return provinceMap.get(pnb.substring(0, 3)) == null
                ? "huoxin"
                : provinceMap.get(pnb.substring(0, 3));
    }
}
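
Before packaging, the two evaluate overloads can be sanity-checked locally with a plain main method (a minimal sketch; this test class is not part of the original steps and only needs hive-exec on the local classpath to compile):

package bigdata.udf;

// quick local check of both evaluate() overloads before building the jar
public class ToLowerCaseTest {
    public static void main(String[] args) {
        ToLowerCase udf = new ToLowerCase();
        System.out.println(udf.evaluate("HELLO"));   // hello
        System.out.println(udf.evaluate(1367775));   // beijing
        System.out.println(udf.evaluate(1371235));   // shanghai
        System.out.println(udf.evaluate(1999999));   // huoxin (no prefix match)
    }
}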

2. Package it into a jar and upload it to the server
3. Add the jar to Hive's classpath
add JAR /home/hadoop/udf.jar;
4. Create a temporary function and associate it with the Java class you developed
create temporary function getprovince as 'bigdata.udf.ToLowerCase';
5. Use it in HQL

create table t_flow(phonenbr int,flow int)
row format delimited  -- uses the built-in SerDe; SerDe is short for Serializer/Deserializer and handles serializing/deserializing rows and mapping data to the table's columns
fields terminated by ',';
load data local inpath '/home/hadoop/flow.dat' into table t_flow;

select phonenbr,getprovince(phonenbr),flow from t_flow;

Example 2:

create table t_json(line string)
row format delimited;
load data local inpath '' into table t_json;
select * from t_json limit 10;

JsonParser

package bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

// Ctrl+Shift+O organizes imports; Alt+/ completes code
// (Window - Preferences - Java - Editor - Templates lists all Eclipse templates)
public class JsonParser extends UDF {
    // input: one JSON line; output: the parsed fields joined into a string
    public String evaluate(String jsonline) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonline, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // ignore malformed lines and return an empty string
        }
        return "";
    }
}
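
The parser can be checked the same way before it goes into the jar (a minimal sketch; the sample JSON line is illustrative and simply mirrors the movie/rate/timeStamp/uid fields used below, and the parquet/jackson jar from the import must be on the classpath):

package bigdata.udf;

// feed one sample JSON line through the UDF and print the tab-separated result
public class JsonParserTest {
    public static void main(String[] args) {
        String sample = "{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}";
        // expected: the four field values joined by tabs
        System.out.println(new JsonParser().evaluate(sample));
    }
}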

MovieRateBean

package bigdata.udf;

public class MovieRateBean {
    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    // Alt+Shift+S generates getters and setters in Eclipse
    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public String getRate() {
        return rate;
    }
    public void setRate(String rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    public String toString() {
        return this.movie + "\t" + this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }
}

JavaBean: first, the class is public and has a no-argument constructor. Second, its properties are private and must be accessed through getters and setters. Third, it supports events, e.g. addXXXListener(XXXEvent e), so it can respond to mouse clicks, keyboard input, and so on. Fourth, it supports reflection. Fifth, it can be serialized/deserialized, so it is easy to store and transfer.
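
A minimal sketch of those conventions (a hypothetical PersonBean, not part of this example's code):

package bigdata.udf;

import java.io.Serializable;

// hypothetical bean illustrating the JavaBean conventions listed above
public class PersonBean implements Serializable {
    private static final long serialVersionUID = 1L;

    private String name;              // private property

    public PersonBean() {             // public no-argument constructor
    }

    public String getName() {         // accessed only through getter/setter
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}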

bin/beeline -u jdbc:hive2://localhost:10000 -n hadoop
add JAR /home/hadoop/udf.jar;
create temporary function parsejson as 'bigdata.udf.JsonParser';
select parsejson(line) from t_json limit 10;

But the result is a single field. How do we split it into four fields?

-- insert overwrite table t_rating
create table t_rating as
select split(parsejson(line),'\t')[0] as movieid,
split(parsejson(line),'\t')[1] as rate,
split(parsejson(line),'\t')[2] as timestring,
split(parsejson(line),'\t')[3] as uid
from t_json;

Built-in JSON function
select get_json_object(line,'$.movie') as movie,
get_json_object(line,'$.rate') as rate from t_json limit 10;

Transform Implementation

Transform lets you call your own scripts (Python or shell) from SQL. It suits cases where Hive lacks the functionality you need and you don't want to write a UDF.
1. Load the rating.json file into a raw Hive table

create table t_json(line string)
row format delimited;
load data local inpath '' into table t_json;
select * from t_json limit 10;

2. Parse the JSON data into four fields and insert them into a new table t_rating
Using the built-in JSON function:

set hive.support.sql11.reserved.keywords=false; -- otherwise the timeStamp column name is not recognized (reserved keyword)
hive> create table t_rating as
    > select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate,get_json_object(line,'$.timeStamp') as timeStamp,get_json_object(line,'$.uid') as uid from t_json;

3. Use transform + Python to convert the unix timestamp to a weekday
First write a Python script, then add the file to Hive's classpath:

vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()  # strip surrounding whitespace
    movieid, rate, timestring, uid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(timestring)).isoweekday()
    print '\t'.join([movieid, rate, str(weekday), uid])  # re-join the fields with tabs

add file weekday_mapper.py;
create table u_data_new(
    movieid int,
    rating int,
    weekday int,
    userid int)
row format delimited
fields terminated by '\t';

insert overwrite table u_data_new
-- create table u_data_new as
select
    transform(movieid,rate,timestring,uid)
    using 'python weekday_mapper.py'
    as (movieid,rating,weekday,userid)
from t_rating;

Error (I've lost the will to live):
ERROR : Ended Job = job_local1691136619_0009 with errors
Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)

select distinct(weekday) from u_data_new limit 10;


Reposted from blog.csdn.net/lixinkuan328/article/details/104486972