Requirement: given the following data (phone number, flow), write a function that derives the province name.
Input:
1367775,10
1363426,10
1371235,10
1371237,10
1371236,10
1376888,10
1382132,10
Expected output (prefix 136 → beijing, 137 → shanghai, 138 → shenzhen):
1367775 beijing 10
1363426 beijing 10
1371235 shanghai 10
1371237 shanghai 10
1371236 shanghai 10
1376888 shanghai 10
1382132 shenzhen 10
Example 1: Hive user-defined function (UDF: user-defined function)
1. Write a Java class that extends UDF (aggregate functions extend UDAF instead) and overloads the evaluate method.
package bigdata.udf;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

// Extend the UDF class
public class ToLowerCase extends UDF {
    // Load a dictionary table mapping phone-number prefixes to provinces
    public static HashMap<String, String> provinceMap = new HashMap<String, String>();
    static {
        provinceMap.put("136", "beijing");
        provinceMap.put("137", "shanghai");
        provinceMap.put("138", "shenzhen");
    }

    // Must be public. evaluate is overloaded; Hive picks the version that matches the argument type.
    public String evaluate(String field) {
        return field.toLowerCase();
    }

    // Input: phone number; returns: province name
    public String evaluate(int phonenbr) {
        String pnb = String.valueOf(phonenbr);
        String province = provinceMap.get(pnb.substring(0, 3));
        return province == null ? "huoxin" : province;
    }
}
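A quick local sanity check of the UDF (a minimal sketch; the test class and sample numbers are illustrative additions, not part of the original steps):

package bigdata.udf;

// Hypothetical local test: call evaluate() directly, outside Hive
public class ToLowerCaseTest {
    public static void main(String[] args) {
        ToLowerCase udf = new ToLowerCase();
        System.out.println(udf.evaluate(1367775));  // beijing
        System.out.println(udf.evaluate(1371235));  // shanghai
        System.out.println(udf.evaluate(1999999));  // huoxin (no matching prefix)
        System.out.println(udf.evaluate("HELLO"));  // hello
    }
}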
2. Package the class into a jar and upload it to the server.
3. Add the jar to Hive's classpath:
add JAR /home/hadoop/udf.jar;
4. Create a temporary function bound to the Java class you developed:
create temporary function getprovince as 'bigdata.udf.ToLowerCase';
5. Use it in HQL:
create table t_flow(phonenbr int,flow int)
row format delimited -- uses the built-in SerDe. SerDe is short for Serialize/Deserialize; a SerDe specifies the table's columns and maps the corresponding data onto them.
fields terminated by ',';
load data local inpath '/home/hadoop/flow.dat' into table t_flow;
select phonenbr,getprovince(phonenbr),flow from t_flow;
Example 2:
create table t_json(line string)
row format delimited;
load data local inpath '' into table t_json;
select * from t_json limit 10;
The JsonParser class:
package bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

// Eclipse tips: Ctrl+Shift+O organizes imports, Alt+/ autocompletes;
// see Window - Preferences - Java - Editor - Templates for all shortcuts.
public class JsonParser extends UDF {
    // Input: one JSON line; returns: the bean's fields as a tab-separated string
    public String evaluate(String jsonline) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonline, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // Malformed JSON: fall through and return an empty string
        }
        return "";
    }
}
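A quick local test (a sketch; the sample line's exact layout is an assumption based on the fields queried later with get_json_object: $.movie, $.rate, $.timeStamp, $.uid):

package bigdata.udf;

// Hypothetical local test for JsonParser
public class JsonParserTest {
    public static void main(String[] args) {
        String line = "{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}";
        // Expected output: 1193, 5, 978300760, 1 joined by tabs
        System.out.println(new JsonParser().evaluate(line));
    }
}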
The MovieRateBean class:
package bigdata.udf;

public class MovieRateBean {
    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    // Getters and setters (Eclipse: Alt+Shift+S generates them)
    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public String getRate() {
        return rate;
    }
    public void setRate(String rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" + this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }
}
A JavaBean: first, the class is public and has a no-argument constructor. Second, its properties are private and must be accessed through get and set methods. Third, it supports "events", e.g. addXXXListener(XXXEvent e), so it can handle things like mouse clicks and keyboard input. Fourth, it is accessible via reflection. Fifth, it can be serialized/deserialized, so it can be conveniently stored and transferred.
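A minimal sketch of those conventions (the class and field names here are invented for illustration):

import java.io.Serializable;

// Hypothetical minimal JavaBean
public class User implements Serializable {
    private String name;       // private property

    public User() {}           // public no-argument constructor

    public String getName() {  // access only through get/set methods
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
}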
bin/beeline -u jdbc:hive2://localhost:10000 -n hadoop
add JAR /home/hadoop/udf.jar;
create temporary function parsejson as 'bigdata.udf.JsonParser';
select parsejson(line) from t_json limit 10;
But that yields only one field; how do we split it into four?
-- alternatively: insert overwrite table t_rating
create table t_rating as
select split(parsejson(line),'\t')[0] as movieid,
split(parsejson(line),'\t')[1] as rate,
split(parsejson(line),'\t')[2] as timestring,
split(parsejson(line),'\t')[3] as uid
from t_json;
Built-in JSON function:
select get_json_object(line,'$.movie') as movie,
get_json_object(line,'$.rate') as rate from t_json limit 10;
Transform implementation
TRANSFORM provides a way to call your own scripts (Python or shell) from SQL; it suits cases where Hive lacks the functionality and you don't want to write a UDF.
1. Load the rating.json file into a raw Hive table.
create table t_json(line string)
row format delimited;
load data local inpath '' into table t_json;
select * from t_json limit 10;
2. Parse the JSON data into four fields and insert them into a new table, t_rating.
Using the built-in JSON function:
set hive.support.sql11.reserved.keywords=false; -- otherwise timeStamp is not recognized (it is a reserved keyword)
hive> create table t_rating as
> select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate,get_json_object(line,'$.timeStamp') as timeStamp,get_json_object(line,'$.uid') as uid from t_json;
3. Use transform + a Python script to convert the unix timestamp to a weekday.
First write a Python script, then add the file to Hive's classpath:
vi weekday_mapper.py
#!/usr/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()  # strip surrounding whitespace
    movieid, rate, timestring, uid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(timestring)).isoweekday()
    print '\t'.join([movieid, rate, str(weekday), uid])  # join the fields back with tabs
add file weekday_mapper.py;
create table u_data_new(
movieid int,
rating int,
weekday int,
userid int)
row format delimited
fields terminated by '\t';
insert overwrite table u_data_new
-- alternatively: create table u_data_new as
select
transform(movieid,rate,timestring,uid)
using 'python weekday_mapper.py'
as (movieid,rating,weekday,userid)
from t_rating;
Error (despair):
ERROR : Ended Job = job_local1691136619_0009 with errors
Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)
(Return code 2 from MapRedTask usually means the streaming script itself crashed; check hive.log, make sure the field separator really is '\t', and that every variable name in the script is defined.)
select distinct(weekday) from u_data_new limit 10;