一、hive 数据仓库,在线分析处理
------------------------------------------------
1.数据仓库 OLAP:在线分析处理,批量计算,实时性不好,延迟高
数据库 OLTP:在线事务处理,面向少量记录的事务性读写(非批量计算),实时性好,延迟低
二、MR作业实现关联查询join select
---------------------------------------------------
1.准备数据
[customers.txt]
1,tom1,12
2,tom2,13
3,tom3,14
4,tom4,15
[orders.txt]
1,no001,12.23,1
2,no001,12.23,1
3,no001,12.23,2
4,no001,12.23,2
5,no001,12.23,2
6,no001,12.23,3
7,no001,12.23,3
8,no001,12.23,3
9,no001,12.23,3
2.准备自定义key
/**
 * Composite map-output key used by the customer/order join job.
 *
 * <p>A key describes either a customer record ({@code type == 0}) or an order
 * record ({@code type == 1}). Keys sort by customer id first; within one
 * customer id the customer record sorts before the orders, and orders sort by
 * order id.
 */
public class ComboKey implements WritableComparable<ComboKey> {
    private int type; // 0 = customer, 1 = order
    private int cid;
    private int oid;
    private String cusInfo = "";
    private String orderInfo = "";

    /** No-argument constructor; all ints are 0 and both info strings are empty. */
    public ComboKey() {
    }

    /**
     * Creates a fully populated key.
     *
     * @param type      0 for a customer record, 1 for an order record
     * @param cid       customer id
     * @param oid       order id
     * @param cusInfo   raw customer line; {@code null} is stored as ""
     * @param orderInfo raw order line; {@code null} is stored as ""
     */
    public ComboKey(int type, int cid, int oid, String cusInfo, String orderInfo) {
        this.type = type;
        this.cid = cid;
        this.oid = oid;
        this.cusInfo = nullToEmpty(cusInfo);
        this.orderInfo = nullToEmpty(orderInfo);
    }

    /** Maps {@code null} to "" because {@code writeUTF} rejects null strings. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    public int getType() {
        return type;
    }

    public void setType(int type) {
        this.type = type;
    }

    public int getCid() {
        return cid;
    }

    public void setCid(int cid) {
        this.cid = cid;
    }

    public int getOid() {
        return oid;
    }

    public void setOid(int oid) {
        this.oid = oid;
    }

    public String getCusInfo() {
        return cusInfo;
    }

    public void setCusInfo(String cusInfo) {
        this.cusInfo = nullToEmpty(cusInfo);
    }

    public String getOrderInfo() {
        return orderInfo;
    }

    public void setOrderInfo(String orderInfo) {
        this.orderInfo = nullToEmpty(orderInfo);
    }

    /**
     * Orders keys by customer id, then record type (customer before order),
     * then order id, all ascending.
     *
     * <p>Uses {@link Integer#compare(int, int)} instead of subtraction, which
     * overflows and yields the wrong sign for values that are far apart.
     *
     * @param o the key to compare against
     * @return a negative, zero, or positive value as this key sorts before,
     *         equal to, or after {@code o}
     */
    public int compareTo(ComboKey o) {
        System.out.println("===> compare" );
        // Different customers: customer id decides.
        if (this.cid != o.cid) {
            return Integer.compare(this.cid, o.cid);
        }
        // Same customer, one customer record and one order: customer first.
        if (this.type != o.type) {
            return Integer.compare(this.type, o.type);
        }
        // Two records of the same type for the same customer: order id decides.
        return Integer.compare(this.oid, o.oid);
    }

    /** Serializes the fields in the order type, cid, oid, cusInfo, orderInfo. */
    public void write(DataOutput out) throws IOException {
        out.writeInt(type);
        out.writeInt(cid);
        out.writeInt(oid);
        out.writeUTF(cusInfo);
        out.writeUTF(orderInfo);
    }

    /** Reads the fields back in the same order that {@code write} emits them. */
    public void readFields(DataInput in) throws IOException {
        type = in.readInt();
        cid = in.readInt();
        oid = in.readInt();
        cusInfo = in.readUTF();
        orderInfo = in.readUTF();
    }

    @Override
    public String toString() {
        return "ComboKey{" +
                "type=" + type +
                ", cid=" + cid +
                ", oid=" + oid +
                ", cusInfo='" + cusInfo + '\'' +
                ", orderInfo='" + orderInfo + '\'' +
                '}';
    }
}
3.准备Map函数
/**
 * Mapper for the customer/order join: turns every input line into a
 * {@code ComboKey} and emits it with a {@code NullWritable} value.
 *
 * <p>A line with exactly four comma-separated fields is treated as an order
 * (order id in field 0, customer id in field 3); any other non-blank line is
 * treated as a customer (customer id in field 0).
 */
public class MyMapper extends Mapper<LongWritable, Text,ComboKey, NullWritable> {

    /**
     * Prints a trace marker when the map task starts; no other initialization.
     *
     * @param context task context (unused)
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        System.out.println("====> map");
    }

    /**
     * Parses one input line into a customer or order key and writes it out.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of customers.txt or orders.txt
     * @param context used to emit the composite key
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Alternative: inspect the input split's path to tell customers from orders:
        //   FileSplit sp = (FileSplit) context.getInputSplit(); sp.getPath();
        String line = value.toString();
        // A blank line (e.g. a trailing newline in the input file) would otherwise
        // reach Integer.parseInt("") and fail the whole task.
        if (line.trim().isEmpty()) {
            return;
        }
        String[] strs = line.split(",");
        ComboKey cob = new ComboKey();
        if (strs.length == 4) {
            // Order record: id,orderNo,price,cid
            cob.setOid(Integer.parseInt(strs[0]));
            cob.setCid(Integer.parseInt(strs[3]));
            cob.setType(1);
            cob.setOrderInfo(line);
        } else {
            // Customer record: the first field is the customer id
            cob.setCid(Integer.parseInt(strs[0]));
            cob.setType(0);
            cob.setCusInfo(line);
        }
        context.write(cob,NullWritable.get());
    }
}
4.准备排序对比器
/**
 * Sort comparator for the join job: orders {@code ComboKey} instances by
 * delegating to {@code ComboKey.compareTo}.
 */
public class MySort extends WritableComparator {

    protected MySort() {
        super(ComboKey.class, true);
    }

    /**
     * Compares two keys using the natural ordering of {@code ComboKey}.
     *
     * @param a first key, must be a {@code ComboKey}
     * @param b second key, must be a {@code ComboKey}
     * @return the result of {@code a.compareTo(b)}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        System.out.println("===> sort");
        ComboKey left = (ComboKey) a;
        ComboKey right = (ComboKey) b;
        int order = left.compareTo(right);
        return order;
    }
}
5.准备分组对比器
/**
 * Grouping comparator for the join job: keys with the same customer id
 * compare as equal, so a customer record and that customer's orders are
 * handed to a single {@code reduce} call.
 */
public class MyGroup extends WritableComparator {

    protected MyGroup() {
        super(ComboKey.class, true);
    }

    /**
     * Compares two keys by customer id only.
     *
     * <p>Returns 0 when both keys belong to the same customer (same group) and
     * a non-zero value otherwise. {@link Integer#compare(int, int)} is used
     * instead of subtraction so the sign stays correct for ids that are far
     * apart.
     *
     * @param a first key, must be a {@code ComboKey}
     * @param b second key, must be a {@code ComboKey}
     * @return negative, zero, or positive according to the customer ids
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        System.out.println("===> group");
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        return Integer.compare(k1.getCid(), k2.getCid());
    }
}
6.准备reduce函数
/**
 * Reducer for the customer/order join. Each call receives the records of one
 * customer id, with the customer record expected first and its orders after
 * it, and emits one "customerLine,orderLine" text row per order.
 *
 * <p>The code reads a different order line from {@code key} on every
 * iteration, so it relies on the framework updating the key object as the
 * value iterator advances (standard Hadoop behaviour - see the Reducer docs).
 */
public class MyReducer extends Reducer<ComboKey, NullWritable, Text,NullWritable> {

    /**
     * Joins the customer at the head of the group with every following order.
     *
     * @param key     composite key; reflects the record the iterator is currently on
     * @param values  placeholder values, one per record in the group
     * @param context used to emit the joined rows
     */
    @Override
    protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<NullWritable> it = values.iterator();
        if (!it.hasNext()) {
            return;
        }
        // Move onto the first record of the group.
        it.next();
        // If the first record is not a customer (type 0), the group consists of
        // orders whose customer is missing. Previously the first such order was
        // silently consumed as "the customer" and the rest were emitted with an
        // empty customer part; skip the group instead (inner-join semantics).
        if (key.getType() != 0) {
            return;
        }
        String cInfo = key.getCusInfo();
        while (it.hasNext()) {
            it.next();
            String oInfo = key.getOrderInfo();
            context.write(new Text(cInfo + "," + oInfo),NullWritable.get());
        }
    }
}
7.准备app
/**
 * Driver for the customer/order reduce-side join job.
 *
 * <p>Usage: {@code MyApp <input path> <output path>}. An existing output path
 * is deleted before the job is submitted.
 */
public class MyApp {

    /**
     * Configures and runs the join job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String [] args)
    {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: MyApp <input path> <output path>");
            System.exit(2);
        }
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);

            Path inputPath = new Path(args[0]);
            Path outputPath = new Path(args[1]);

            // Remove stale output. Resolve the file system from the path itself so
            // a fully qualified URI works, and use the non-deprecated recursive delete.
            FileSystem fs = outputPath.getFileSystem(conf);
            fs.delete(outputPath, true);

            // Job identity and I/O
            job.setJobName("ts.join_2");
            job.setJarByClass(MyApp.class);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, outputPath);
            job.setInputFormatClass(TextInputFormat.class);

            // Key/value types
            job.setMapOutputKeyClass(ComboKey.class);        // map output key
            job.setMapOutputValueClass(NullWritable.class);  // map output value
            job.setOutputKeyClass(Text.class);               // reduce output key
            job.setOutputValueClass(NullWritable.class);     // reduce output value

            // Map, reduce, and the shuffle plumbing
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // NOTE(review): MyParttitioner is not defined in this section; confirm it
            // partitions on the customer id only, so that a customer and its orders
            // reach the same reducer.
            job.setPartitionerClass(MyParttitioner.class);
            job.setSortComparatorClass(MySort.class);
            job.setGroupingComparatorClass(MyGroup.class);

            job.setNumReduceTasks(2);

            // Surface job failure through the process exit status.
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
三、FileInputFormat的一些设置
------------------------------------------------
1.设置输入路径递归读取
FileInputFormat.setInputDirRecursive(job,inputDirRecursive);
四、自定义文件输入格式
--------------------------------------------------------------------
1.实现将整个文件作为一条记录处理,不对文件进行切片
2.创建自定义文件输入格式类 MyInputFormat
/**
 * Input format that presents each input file as a single record: files are
 * never split, and records are produced by {@code MyRecordReader}.
 */
public class MyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    /**
     * Declares every file as non-splittable.
     *
     * @param context  job context (unused)
     * @param filename file being considered (unused)
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Creates a {@code MyRecordReader} and initializes it with the given split
     * and task context before returning it.
     */
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        RecordReader<NullWritable, BytesWritable> wholeFileReader = new MyRecordReader();
        wholeFileReader.initialize(split, context);
        return wholeFileReader;
    }
}
3.创建自定义文件记录阅读器 MyRecordReader
/**
 * Record reader that reads an entire file in one go and exposes it as a
 * single record: the key is {@code NullWritable} and the value holds the
 * file's bytes.
 */
public class MyRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable value = new BytesWritable();
    // Becomes true once the single record has been produced.
    private boolean processed = false;

    /**
     * Stores the split and the job configuration for later use by
     * {@link #nextKeyValue()}.
     */
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit)split;
        this.conf = context.getConfiguration();
    }

    /**
     * Loads the whole file into the value on the first call.
     *
     * @return {@code true} on the first call (a record is available),
     *         {@code false} on every later call
     * @throws IOException if the file cannot be read, or if it is larger than
     *                     {@code Integer.MAX_VALUE} bytes and therefore cannot
     *                     fit into a single byte array
     */
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (processed) {
            return false;
        }
        Path file = fileSplit.getPath();
        long length = fileSplit.getLength();
        // A plain (int) cast would silently truncate the length for huge files.
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large to read as a single record ("
                    + length + " bytes): " + file);
        }
        byte[] contents = new byte[(int) length];
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            IOUtils.readFully(in, contents, 0, contents.length);
            value.set(contents, 0, contents.length);
        } finally {
            IOUtils.closeStream(in);
        }
        processed = true;
        return true;
    }

    /** Returns the key of the current record, which is always the {@code NullWritable} singleton. */
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /** Returns the value holder; it contains the file bytes once the record has been read. */
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** Returns the read progress: 1.0 once the record has been read, otherwise 0.0. */
    public float getProgress() throws IOException, InterruptedException {
        return processed? 1f:0f;
    }

    /** Nothing to release here; the input stream is closed inside {@link #nextKeyValue()}. */
    public void close() throws IOException {
    }
}
4.创建map类,实现将多个小文件合并成一个顺序文件。key为文件路径,value为文件字节数组
/**
 * Mapper that emits each whole-file record keyed by the path of the file it
 * came from, so that many small files can be packed into one sequence file.
 */
public class MyMapper extends Mapper<NullWritable, BytesWritable,Text, BytesWritable> {

    // Path of the file behind the current input split, as text.
    private Text fileNameKey;

    /**
     * Captures the path of the current input split for use as the output key.
     *
     * @param context task context providing the input split
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit currentSplit = (FileSplit) context.getInputSplit();
        String location = currentSplit.getPath().toString();
        fileNameKey = new Text(location);
    }

    /**
     * Writes the file contents under the file-path key.
     *
     * @param key     unused placeholder key
     * @param value   bytes of the input record
     * @param context used to emit the (path, bytes) pair
     */
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        context.write(fileNameKey, value);
    }
}
5.创建mr的app类
/**
 * Driver for the job that packs input files into a sequence file, using
 * {@code MyInputFormat} for input and {@code SequenceFileOutputFormat} for
 * output.
 *
 * <p>Usage: {@code MyApp <input path> <output path>}. An existing output path
 * is deleted before the job is submitted.
 */
public class MyApp {

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String [] args)
    {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: MyApp <input path> <output path>");
            System.exit(2);
        }
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);

            Path inputPath = new Path(args[0]);
            Path outputPath = new Path(args[1]);

            // Remove stale output. Resolve the file system from the path itself so
            // a fully qualified URI works, and use the non-deprecated recursive delete.
            FileSystem fs = outputPath.getFileSystem(conf);
            fs.delete(outputPath, true);

            // Job identity
            job.setJobName("ts.myInputformat");
            job.setJarByClass(MyApp.class);

            // Input: one record per file
            job.setInputFormatClass(MyInputFormat.class);
            MyInputFormat.addInputPath(job, inputPath);

            // Output: sequence file of (file path, file bytes)
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            SequenceFileOutputFormat.setOutputPath(job, outputPath);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(BytesWritable.class);

            job.setMapperClass(MyMapper.class);

            // Surface job failure through the process exit status.
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}