MapReduce programs for web log stream processing -- two versions (one using Collections sorting, one using MapReduce's own sorting)

Source code for the two projects:
Collections sort version:
https://gitee.com/tanghongping/web_click_mr_hve
MapReduce sort version:
https://gitee.com/tanghongping/MapReduceTest

Both projects also contain some unrelated test/scratch code, which can be ignored.

Sorting with Collections.sort

WeblogBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Layer mapped to the external data; the bean structure should match the external data source as closely as possible
 * @author 汤小萌
 *
 */
public class WeblogBean implements Writable {

	private boolean valid = true;		// whether this record is valid
	private String remote_addr;			// client IP address
	private String remote_user;			// client user name; "-" means not available
	private String time_local;			// access time and time zone
	private String request;				// requested URL and HTTP protocol
	private String status;				// response status code; 200 means success
	private String body_bytes_sent;		// size of the response body sent to the client
	private String http_referer;		// the page the user came from
	private String http_user_agent;		// client browser information
	
	public void set(boolean valid, String remote_addr, String remote_user, String time_local, String request,
			String status, String body_bytes_sent, String http_referer, String http_user_agent) {
		this.valid = valid;
		this.remote_addr = remote_addr;
		this.remote_user = remote_user;
		this.time_local = time_local;
		this.request = request;
		this.status = status;
		this.body_bytes_sent = body_bytes_sent;
		this.http_referer = http_referer;
		this.http_user_agent = http_user_agent;
	}

	public boolean isValid() {
		return valid;
	}

	public void setValid(boolean valid) {
		this.valid = valid;
	}

	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getRemote_user() {
		return remote_user;
	}

	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	public String getTime_local() {
		return time_local;
	}

	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	public String getRequest() {
		return request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public String getStatus() {
		return status;
	}

	public void setStatus(String status) {
		this.status = status;
	}

	public String getBody_bytes_sent() {
		return body_bytes_sent;
	}

	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	public String getHttp_referer() {
		return http_referer;
	}

	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	public String getHttp_user_agent() {
		return http_user_agent;
	}

	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeBoolean(this.valid);
		out.writeUTF(null==remote_addr?"":remote_addr);
		out.writeUTF(null==remote_user?"":remote_user);
		out.writeUTF(null==time_local?"":time_local);
		out.writeUTF(null==request?"":request);
		out.writeUTF(null==status?"":status);
		out.writeUTF(null==body_bytes_sent?"":body_bytes_sent);
		out.writeUTF(null==http_referer?"":http_referer);
		out.writeUTF(null==http_user_agent?"":http_user_agent);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.valid = in.readBoolean();
		this.remote_addr = in.readUTF();
		this.remote_user = in.readUTF();
		this.time_local = in.readUTF();
		this.request = in.readUTF();
		this.status = in.readUTF();
		this.body_bytes_sent = in.readUTF();
		this.http_referer = in.readUTF();
		this.http_user_agent = in.readUTF();
	}
	
	
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(this.valid);
		sb.append("\001").append(this.getRemote_addr());
		sb.append("\001").append(this.getRemote_user());
		sb.append("\001").append(this.getTime_local());
		sb.append("\001").append(this.getRequest());
		sb.append("\001").append(this.getStatus());
		sb.append("\001").append(this.getBody_bytes_sent());
		sb.append("\001").append(this.getHttp_referer());
		sb.append("\001").append(this.getHttp_user_agent());
		return sb.toString();
	}
	
	
}

PageViewsBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * 
 * @author 汤小萌
 *
 */
public class PageViewsBean implements Writable {
	
	private String session;			// session id
	private String remote_addr;		// client IP address
	private String timeStr;			// access time
	private String request;			// requested URL
	private int step;				// position of this page within the visit
	private String staylong;		// time spent on the page
	private String referal;			// the page the user came from
	private String useragent;		// browser information
	private String bytes_send;		// bytes sent
	private String status;			// status of this request
	
	
	
	public void set(String session, String remote_addr, String useragent, String timeStr, String request, int step, String staylong, String referal, String bytes_send, String status) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.useragent = useragent;
		this.timeStr = timeStr;
		this.request = request;
		this.step = step;
		this.staylong = staylong;
		this.referal = referal;
		this.bytes_send = bytes_send;
		this.status = status;
	}
	
	public String getSession() {
		return session;
	}
	public void setSession(String session) {
		this.session = session;
	}
	public String getRemote_addr() {
		return remote_addr;
	}
	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}
	public String getTimeStr() {
		return timeStr;
	}
	public void setTimeStr(String timeStr) {
		this.timeStr = timeStr;
	}
	public String getRequest() {
		return request;
	}
	public void setRequest(String request) {
		this.request = request;
	}
	public int getStep() {
		return step;
	}
	public void setStep(int step) {
		this.step = step;
	}
	public String getStaylong() {
		return staylong;
	}
	public void setStaylong(String staylong) {
		this.staylong = staylong;
	}
	public String getReferal() {
		return referal;
	}
	public void setReferal(String referal) {
		this.referal = referal;
	}
	public String getUseragent() {
		return useragent;
	}
	public void setUseragent(String useragent) {
		this.useragent = useragent;
	}
	public String getBytes_send() {
		return bytes_send;
	}
	public void setBytes_send(String bytes_send) {
		this.bytes_send = bytes_send;
	}
	public String getStatus() {
		return status;
	}
	public void setStatus(String status) {
		this.status = status;
	}
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(session);
		out.writeUTF(remote_addr);
		out.writeUTF(timeStr);
		out.writeUTF(request);
		out.writeInt(step);
		out.writeUTF(staylong);
		out.writeUTF(referal);
		out.writeUTF(useragent);
		out.writeUTF(bytes_send);
		out.writeUTF(status);
	}
	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.timeStr = in.readUTF();
		this.request = in.readUTF();
		this.step = in.readInt();
		this.staylong = in.readUTF();
		this.referal = in.readUTF();
		this.useragent = in.readUTF();
		this.bytes_send = in.readUTF();
		this.status = in.readUTF();
	}
	
	
	
	
	
}

VisitBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * 
 * @author 汤小萌
 *
 */
public class VisitBean implements Writable {
	private String session;
	private String remote_addr;
	private String inTime;
	private String outTime;
	private String inPage;
	private String outPage;
	private String referal;
	private int pageVisits;
	
	public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.inTime = inTime;
		this.outTime = outTime;
		this.inPage = inPage;
		this.outPage = outPage;
		this.referal = referal;
		this.pageVisits = pageVisits;
	}

	public String getSession() {
		return session;
	}

	public void setSession(String session) {
		this.session = session;
	}

	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getInTime() {
		return inTime;
	}

	public void setInTime(String inTime) {
		this.inTime = inTime;
	}

	public String getOutTime() {
		return outTime;
	}

	public void setOutTime(String outTime) {
		this.outTime = outTime;
	}

	public String getInPage() {
		return inPage;
	}

	public void setInPage(String inPage) {
		this.inPage = inPage;
	}

	public String getOutPage() {
		return outPage;
	}

	public void setOutPage(String outPage) {
		this.outPage = outPage;
	}

	public String getReferal() {
		return referal;
	}

	public void setReferal(String referal) {
		this.referal = referal;
	}

	public int getPageVisits() {
		return pageVisits;
	}

	public void setPageVisits(int pageVisits) {
		this.pageVisits = pageVisits;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(session);
		out.writeUTF(remote_addr);
		out.writeUTF(inTime);
		out.writeUTF(outTime);
		out.writeUTF(inPage);
		out.writeUTF(outPage);
		out.writeUTF(referal);
		out.writeInt(pageVisits);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.inTime = in.readUTF();
		this.outTime = in.readUTF();
		this.inPage = in.readUTF();
		this.outPage = in.readUTF();
		this.referal = in.readUTF();
		this.pageVisits = in.readInt();
	}
	
	@Override
	public String toString() {
		return session + "\001" + remote_addr + "\001" + inTime + "\001" +
				outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits;
	}

	
	
	
}

The pre-processing parser class

package com.thp.bigdata.webClick.mrBean;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import org.junit.Test;

/**
 * Parses each loaded log line into a WeblogBean
 * @author 汤小萌
 *
 */
public class WeblogParser {
	
	
	
	/**
	 *  0 ) 194.237.142.21
		1 ) -
		2 ) -
		3 ) [18/Sep/2013:06:49:18
		4 ) +0000]
		5 ) "GET
		6 ) /wp-content/uploads/2013/07/rstudio-git3.png
		7 ) HTTP/1.1"
		8 ) 304
		9 ) 0
		10 ) "-"
		11 ) "Mozilla/4.0
		12 ) (compatible;)"
	 * @param line
	 * @return
	 */
	public static WeblogBean parser(String line) {
		WeblogBean weblogBean = new WeblogBean();
		String[] arr = line.split(" ");
		if(arr.length >11) {
			weblogBean.setRemote_addr(arr[0]);
			weblogBean.setRemote_user(arr[1]);
			String time_local = formatDate(arr[3].substring(1));
			if(null == time_local) time_local = "-invalid_time-";
			weblogBean.setTime_local(time_local);
			weblogBean.setRequest(arr[6]);
			
			weblogBean.setStatus(arr[8]);
			weblogBean.setBody_bytes_sent(arr[9]);
			weblogBean.setHttp_referer(arr[10]);
			
			// if the user agent was split into multiple tokens, concatenate them
			
			if(arr.length > 12) {
				StringBuffer sb = new StringBuffer();
				for(int i = 11; i < arr.length; i++) {
					sb.append(arr[i]);
				}
				weblogBean.setHttp_user_agent(sb.toString());
			} else {
				weblogBean.setHttp_user_agent(arr[11]);
			}
			
			if(Integer.parseInt(weblogBean.getStatus()) >= 400) {  // status code >= 400 indicates a failed request
				weblogBean.setValid(false);
			}
			
			if("-invalid_time-".equals(weblogBean.getTime_local())) {
				weblogBean.setValid(false);
			}
			
		} else {
			weblogBean.setValid(false);
		}
		return weblogBean;
	}
	
	
	/**
	 * Filter static resources
	 */
	public static void filterStaticResource(WeblogBean bean, Set<String> pages) {
		if(!pages.contains(bean.getRequest())) {
			bean.setValid(false);   // anything outside the configured URL set is treated as a static resource
		}
	}
	
	
	
	public static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US);
	public static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.US);
	/**
	 * Time format conversion
	 * @param time_local
	 * @return
	 */
	public static String formatDate(String time_local) {
		try {
			return sdf2.format(sdf1.parse(time_local));
		} catch (ParseException e) {
			e.printStackTrace();
		}
		return null;
	}
	
	@Test
	public void testSpilt() {
		String str = "194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] \"GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/4.0 (compatible;)\"";
		String[] arr = str.split(" ");
		int i = 1;
		for(String s : arr) {
			System.out.println(i + " ) " + s);
			i++;
		}
	}
	
	
	@Test
	public void testProp() throws IOException {
		
	}
	
	public static void main(String[] args) throws IOException {
		Properties pop = new Properties();
		InputStream is = WeblogParser.class.getClassLoader().getResourceAsStream("com/thp/bigdata/webClick/mrBean/url_1.propeties");
		
		pop.load(is);
		String str = (String) pop.get("url");
		System.out.println(str);
		
	}
	
}
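
To make the field mapping concrete: running parser() on the sample line used in testSpilt() would produce a bean whose toString() output (fields joined by the "\001" character; spaces are shown around the separators here only for readability) looks roughly like the line below. Note that the two user-agent tokens end up concatenated without a space, because the join loop above does not re-insert one:

true \001 194.237.142.21 \001 - \001 2013-09-18 06:49:18 \001 /wp-content/uploads/2013/07/rstudio-git3.png \001 304 \001 0 \001 "-" \001 "Mozilla/4.0(compatible;)"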


The MapReduce jobs

1. Log pre-processing:

package com.thp.bigdata.webClick.mr.pre;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.thp.bigdata.webClick.mrBean.WeblogBean;
import com.thp.bigdata.webClick.mrBean.WeblogParser;

/**
 * Processes the raw log and keeps only genuine page views
 * 1) converts the time format
 * 2) fills default values for missing fields
 * 3) marks each record as valid or invalid
 * @author 汤小萌
 *
 */
public class WeblogPreProcess {

	
	static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		
		Set<String> pages = new HashSet<String>();
		Text k = new Text();
		NullWritable v = NullWritable.get();
		/**
		 * Load the site's URL whitelist from an external properties file
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			/*pages.add("/about/");
			pages.add("/black-ip-list/");
			pages.add("/cassandra-clustor/");
			pages.add("/finance-rhive-repurchase/");
			pages.add("/hadoop-family-roadmap/");
			pages.add("/hadoop-hive-intro/");
			pages.add("/hadoop-zookeeper-intro/");
			pages.add("/hadoop-mahout-roadmap/");*/
			
			Properties pop = new Properties();
			InputStream in = WeblogPreProcessMapper.class.getClassLoader().getResourceAsStream("url.propeties");
			pop.load(in);
			String urlStr = pop.getProperty("url");
			String[] urls = urlStr.split(",");
			for(String url : urls) {
				pages.add(url);
			}
			
		}
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			WeblogBean weblogBean = WeblogParser.parser(line);
			// pluggable step: filter static resources such as js/images/css
			WeblogParser.filterStaticResource(weblogBean, pages);
			
			if(weblogBean.isValid()) {  // invalid records are filtered out here
				k.set(weblogBean.toString());
				context.write(k, v);
			}
			
		}
		
		
	}
	
	
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(WeblogPreProcess.class);

		job.setMapperClass(WeblogPreProcessMapper.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

//		 FileInputFormat.setInputPaths(job, new Path(args[0]));
//		 FileOutputFormat.setOutputPath(job, new Path(args[1]));
		FileInputFormat.setInputPaths(job, new Path("f:/weblog/input"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog/output"));

		job.setNumReduceTasks(0);
		
		job.waitForCompletion(true);

	}
	
	
	
	
}
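
The post does not include the url.propeties file that setup() loads from the classpath. Based on how it is parsed (a single "url" key whose value is split on commas), an assumed minimal version of that file, using the page paths from the commented-out list above, could look like:

url=/about/,/black-ip-list/,/cassandra-clustor/,/finance-rhive-repurchase/,/hadoop-family-roadmap/,/hadoop-hive-intro/,/hadoop-zookeeper-intro/,/hadoop-mahout-roadmap/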

2. Deriving the clickstream (pageviews model):

package com.thp.bigdata.webClick.mr;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Locale;
import java.util.UUID;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.thp.bigdata.webClick.mrBean.WeblogBean;

/**
 * Turns the cleaned log into clickstream pageViews model data
 * The input is the output of the pre-processing job
 * 
 * Splits the records into sessions and assigns each visit (session) a session-id (random UUID)
 * For every page visited within a session it records the request time, url, stay time, and the page's step number within the session
 * Keeps referral_url, body_bytes_sent and useragent
 * @author 汤小萌
 *
 */
public class ClickStream {
	
	static class ClickStreamMapper extends Mapper<LongWritable, Text, Text, WeblogBean> {
		Text k = new Text();
		WeblogBean v = new WeblogBean();
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			System.out.println(line);
			String[] fields = line.split("\001");
			if(fields.length < 9) return;
			v.set("true".equals(fields[0]) ? true : false, fields[1], fields[2], fields[3], fields[4], fields[5], 
					fields[6], fields[7], fields[8]);
			// only valid records proceed to further processing
			if(v.isValid()) {
				k.set(v.getRemote_addr());
				context.write(k, v);
			}
		}
	}
	
	static class ClickStreamReducer extends Reducer<Text, WeblogBean, NullWritable, Text> {
		Text v = new Text();
		@Override
		protected void reduce(Text key, Iterable<WeblogBean> values, Context context) throws IOException, InterruptedException {
			ArrayList<WeblogBean> beans = new ArrayList<WeblogBean>();
			// copy each of this user's records into a list (Hadoop reuses the value object) so they can be sorted by time
			for(WeblogBean bean : values) {
				WeblogBean weblogBean = new WeblogBean();
				try {
					BeanUtils.copyProperties(weblogBean, bean);
				} catch (IllegalAccessException | InvocationTargetException e) {
					e.printStackTrace();
				}
				beans.add(weblogBean);
			}
			
			
			// sort the beans by access time
			Collections.sort(beans, new Comparator<WeblogBean>() {
				@Override
				public int compare(WeblogBean o1, WeblogBean o2) {
					try {
						Date d1 = toDate(o1.getTime_local());
						Date d2 = toDate(o2.getTime_local());
						if(d1 == null || d2 == null) return 0;
						return d1.compareTo(d2);
					} catch (ParseException e) {
						e.printStackTrace();
					}
					return 0;
				}
				
			});
			
			int step = 1;
			String session = UUID.randomUUID().toString();
			for(int i = 0; i < beans.size(); i++) {
				WeblogBean bean = beans.get(i);
				// if there is only one record, emit it directly
				if(1 == beans.size()) {
					// use a default stay time of 60s
					v.set(session+"\001"+key.toString()+"\001"+bean.getRemote_user() + "\001" +
					bean.getTime_local() + "\001" + bean.getRequest() + "\001" + step + "\001" + (60) + 
					"\001" + bean.getHttp_referer() + "\001" + bean.getHttp_user_agent() + "\001" + 
					bean.getBody_bytes_sent() + "\001"
							+ bean.getStatus());
					context.write(NullWritable.get(), v);
					// generate a new session id
					session = UUID.randomUUID().toString();
					break;
				} 
				if(i == 0) {  // with more than one record, skip the first one, because beans.get(i-1) would be out of range
					continue;
				}
				
				try {
					long timeDiff = timeDiff(bean.getTime_local(), beans.get(i - 1).getTime_local());
					if(timeDiff < 30*60*1000) {
						// if the gap between this record and the previous one is < 30 min, emit the previous page view
						v.set(session+"\001"+key.toString()+"\001"+beans.get(i - 1).getRemote_user() + 
								"\001" + beans.get(i - 1).getTime_local() + "\001" + beans.get(i - 1).getRequest() + 
								"\001" + step + "\001" + (timeDiff / 1000) + "\001" + beans.get(i - 1).getHttp_referer() + "\001"
								+ beans.get(i - 1).getHttp_user_agent() + "\001" + beans.get(i - 1).getBody_bytes_sent() + "\001"
								+ beans.get(i - 1).getStatus());
						context.write(NullWritable.get(), v);
						step++;
					} else {
						// if the gap is >= 30 min, emit the previous page view, then reset step to 1 to start a new visit
						v.set(session+"\001"+key.toString()+"\001"+beans.get(i - 1).getRemote_user() + "\001" +
						beans.get(i - 1).getTime_local() + "\001" + beans.get(i - 1).getRequest() + "\001" + 
								(step) + "\001" + (60) + "\001" + beans.get(i - 1).getHttp_referer() + "\001"
								+ beans.get(i - 1).getHttp_user_agent() + "\001" + 
								beans.get(i - 1).getBody_bytes_sent() + "\001" + beans.get(i - 1).getStatus());
						context.write(NullWritable.get(), v);
						// after emitting the previous record, reset the step counter
						step = 1;
						// and generate a new session id
						session = UUID.randomUUID().toString();
					}
				} catch (ParseException e) {
					e.printStackTrace();
				}
				
				// if this iteration is on the last record, emit it as well (the session id has already been handled above)
				if(i == beans.size() - 1) {
					// use a default stay time of 60s
					v.set(session+"\001"+key.toString()+"\001"+bean.getRemote_user() + "\001" + bean.getTime_local() + "\001" + bean.getRequest() + "\001" + step + "\001" + (60) + "\001" + bean.getHttp_referer() + "\001" + bean.getHttp_user_agent() + "\001" + bean.getBody_bytes_sent() + "\001" + bean.getStatus());
					context.write(NullWritable.get(), v);
				}
				
				
				
			}
			
			
			
			
		}
		
		
		private String toStr(Date date) {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.US);
			return sdf.format(date);
		}
		
		private Date toDate(String timeStr) throws ParseException {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.UK);
			return sdf.parse(timeStr);
		}
		
		// compute the time difference
		private long timeDiff(String time1, String time2) throws ParseException {
			Date d1 = toDate(time1);
			Date d2 = toDate(time2);
			return d1.getTime() - d2.getTime();
		}
		
	}
	
	public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(ClickStream.class);

		job.setMapperClass(ClickStreamMapper.class);
		job.setReducerClass(ClickStreamReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(WeblogBean.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

//		 FileInputFormat.setInputPaths(job, new Path(args[0]));
//		 FileOutputFormat.setOutputPath(job, new Path(args[1]));

		FileInputFormat.setInputPaths(job, new Path("f:/weblog_1/output"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog_1/pageviews"));
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path("f:/weblog_1/pageviews"))) {
			fs.delete(new Path("f:/weblog_1/pageviews"), true);
		}

		job.waitForCompletion(true);
	}
	
	
	
	
}
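
Each line written by this job is one pageviews record, with fields joined by the "\001" separator (Hive's default field delimiter). From the v.set(...) calls above, the field order is:

session \001 ip \001 remote_user \001 time_local \001 request \001 step \001 staylong \001 http_referer \001 http_user_agent \001 body_bytes_sent \001 status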

3. Deriving the visit model:

package com.thp.bigdata.webClick.mr;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.thp.bigdata.webClick.mrBean.PageViewsBean;
import com.thp.bigdata.webClick.mrBean.VisitBean;

/**
 * Derives the visit model from the PageViews model output
 * 
 * Output format after this job:
 * sessionid   start-time   out-time   start-page   out-page   pagecounts  ...
 * 
 * @author 汤小萌
 *
 */
public class ClickStreamVisit {
	
	static class ClickStreamVisitMapper extends Mapper<LongWritable, Text, Text, PageViewsBean> {
		PageViewsBean pvBean = new PageViewsBean();
		Text k = new Text();
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String[] fields = line.split("\001");
			int step = Integer.parseInt(fields[5]);   // step within the visit
			pvBean.set(fields[0], fields[1], fields[2], fields[3],fields[4], step, fields[6], fields[7], fields[8], fields[9]);
			k.set(pvBean.getSession());
			context.write(k, pvBean);
		}
	}
	
	
	static class ClickStreamVisitReducer extends Reducer<Text, PageViewsBean, NullWritable, VisitBean> {
		@Override
		protected void reduce(Text session, Iterable<PageViewsBean> pvBeans, Context context)
				throws IOException, InterruptedException {
			// collect the pvBeans so they can be sorted by step
			ArrayList<PageViewsBean> pvBeanList = new ArrayList<PageViewsBean>();
			for(PageViewsBean pvBean : pvBeans) {
				PageViewsBean bean = new PageViewsBean();
				try {
					BeanUtils.copyProperties(bean, pvBean);
					pvBeanList.add(bean);
				} catch (IllegalAccessException | InvocationTargetException e) {
					e.printStackTrace();
				}
			}
			Collections.sort(pvBeanList, new Comparator<PageViewsBean>() {
				@Override
				public int compare(PageViewsBean o1, PageViewsBean o2) {
					return o1.getStep() > o2.getStep() ? 1 : -1;
				}
			});
			
			// take the first and last pageViews records of this visit and put them into a VisitBean
			VisitBean visitBean = new VisitBean();
			// first record of the visit
			visitBean.setInPage(pvBeanList.get(0).getRequest());
			visitBean.setInTime(pvBeanList.get(0).getTimeStr());
			// last record of the visit
			visitBean.setOutPage(pvBeanList.get(pvBeanList.size() - 1).getRequest());
			visitBean.setOutTime(pvBeanList.get(pvBeanList.size() - 1).getTimeStr());
			// number of pages visited during this visit
			visitBean.setPageVisits(pvBeanList.size());
			// visitor's IP
			visitBean.setRemote_addr(pvBeanList.get(0).getRemote_addr());
			// referal of this visit
			visitBean.setReferal(pvBeanList.get(0).getReferal());
			visitBean.setSession(session.toString());
			
			context.write(NullWritable.get(), visitBean);
		}
	}
	
	public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(ClickStreamVisit.class);

		job.setMapperClass(ClickStreamVisitMapper.class);
		job.setReducerClass(ClickStreamVisitReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(PageViewsBean.class);

		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(VisitBean.class);
		
		
//		FileInputFormat.setInputPaths(job, new Path(args[0]));
//		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		FileInputFormat.setInputPaths(job, new Path("f:/weblog_1/pageviews"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog_1/visitout"));
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path("f:/weblog_1/visitout"))) {
			fs.delete(new Path("f:/weblog_1/visitout"), true);
		}
		
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	}
	
	
}


Sorting with MapReduce's own sort

In this version the sorting is pushed into the MapReduce framework itself: the beans are made WritableComparable and used as map output keys, so the shuffle sorts the records, and custom GroupingComparators decide which sorted keys are fed into the same reduce() call (the secondary-sort pattern). WeblogBean sorts by IP and then by access time; PageViewsBean sorts by sessionId and then by step.

WeblogBean

package mr.flow.weblog.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

/**
 * Layer mapped to the external data; the bean structure should match the external data source as closely as possible
 * @author 汤小萌
 *
 */
public class WeblogBean implements WritableComparable<WeblogBean> {

	private boolean valid = true;		// whether this record is valid
	private String remote_addr;			// client IP address
	private String remote_user;			// client user name; "-" means not available
	private String time_local;			// access time and time zone
	private String request;				// requested URL and HTTP protocol
	private String status;				// response status code; 200 means success
	private String body_bytes_sent;		// size of the response body sent to the client
	private String http_referer;		// the page the user came from
	private String http_user_agent;		// client browser information
	
	public void set(boolean valid, String remote_addr, String remote_user, String time_local, String request,
			String status, String body_bytes_sent, String http_referer, String http_user_agent) {
		this.valid = valid;
		this.remote_addr = remote_addr;
		this.remote_user = remote_user;
		this.time_local = time_local;
		this.request = request;
		this.status = status;
		this.body_bytes_sent = body_bytes_sent;
		this.http_referer = http_referer;
		this.http_user_agent = http_user_agent;
	}

	public boolean isValid() {
		return valid;
	}

	public void setValid(boolean valid) {
		this.valid = valid;
	}

	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getRemote_user() {
		return remote_user;
	}

	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	public String getTime_local() {
		return time_local;
	}

	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	public String getRequest() {
		return request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public String getStatus() {
		return status;
	}

	public void setStatus(String status) {
		this.status = status;
	}

	public String getBody_bytes_sent() {
		return body_bytes_sent;
	}

	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	public String getHttp_referer() {
		return http_referer;
	}

	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	public String getHttp_user_agent() {
		return http_user_agent;
	}

	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeBoolean(this.valid);
		out.writeUTF(null==remote_addr?"":remote_addr);
		out.writeUTF(null==remote_user?"":remote_user);
		out.writeUTF(null==time_local?"":time_local);
		out.writeUTF(null==request?"":request);
		out.writeUTF(null==status?"":status);
		out.writeUTF(null==body_bytes_sent?"":body_bytes_sent);
		out.writeUTF(null==http_referer?"":http_referer);
		out.writeUTF(null==http_user_agent?"":http_user_agent);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.valid = in.readBoolean();
		this.remote_addr = in.readUTF();
		this.remote_user = in.readUTF();
		this.time_local = in.readUTF();
		this.request = in.readUTF();
		this.status = in.readUTF();
		this.body_bytes_sent = in.readUTF();
		this.http_referer = in.readUTF();
		this.http_user_agent = in.readUTF();
	}
	
	
	@Override
	public String toString() {
// System.out.println("=========================");
		StringBuilder sb = new StringBuilder();
		sb.append(this.valid);
		sb.append("\001").append(this.getRemote_addr());
		sb.append("\001").append(this.getRemote_user());
		sb.append("\001").append(this.getTime_local());
		sb.append("\001").append(this.getRequest());
		sb.append("\001").append(this.getStatus());
		sb.append("\001").append(this.getBody_bytes_sent());
		sb.append("\001").append(this.getHttp_referer());
		sb.append("\001").append(this.getHttp_user_agent());
		return sb.toString();
	}

	/**
	 * Sort by IP address first, then by access time
	 */
	@Override
	public int compareTo(WeblogBean o) {
		/*System.out.println("++++++++++++++++++++++++++++++++++++");
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.UK);
		try {
			Date d1 = sdf.parse(this.getTime_local());
			Date d2 = sdf.parse(o.getTime_local());
			if(d1 == null || d2 == null) return 0;
			System.out.println(d1.compareTo(d2));
			return d1.compareTo(d2);
		} catch (ParseException e) {
			e.printStackTrace();
		}*/
		
		
		// compare the IP addresses first -- NOTE: the IPs must be compared first; only when they are equal does comparing the dates make sense. Comparing only the dates without the IP (as in the commented-out code above) causes problems
		int ipCompareResult = this.getRemote_addr().compareTo(o.getRemote_addr());
		if(ipCompareResult == 0) { // same IP, so compare the access times
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.UK);
			try {
				Date d1 = sdf.parse(this.getTime_local());
				Date d2 = sdf.parse(o.getTime_local());
				if(d1 == null || d2 == null) return 0;
				// System.out.println(d1.compareTo(d2));
				return d1.compareTo(d2);
			} catch (ParseException e) {
				e.printStackTrace();
			}
		} else {
			return ipCompareResult;
		}
		return 0;
	}
	
	
	
	
	
}

PageViewsBean

package mr.flow.weblog.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.hadoop.io.WritableComparable;
import org.junit.Test;

/**
 * sessionId 
 * @author 汤小萌
 *
 */
public class PageViewsBean implements WritableComparable<PageViewsBean> {
	
	private String session;			// session id
	private String remote_addr;		// client IP address
	private String timeStr;			// access time
	private String request;			// requested URL
	private int step;				// position of this page within the visit
	private String staylong;		// time spent on the page
	private String referal;			// the page the user came from
	private String useragent;		// browser information
	private String bytes_send;		// bytes sent
	private String status;			// status of this request
	
	public void set(String session, String remote_addr, String useragent, String timeStr, String request, int step, String staylong, String referal, String bytes_send, String status) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.useragent = useragent;
		this.timeStr = timeStr;
		this.request = request;
		this.step = step;
		this.staylong = staylong;
		this.referal = referal;
		this.bytes_send = bytes_send;
		this.status = status;
	}
	
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(session);
		out.writeUTF(remote_addr);
		out.writeUTF(timeStr);
		out.writeUTF(request);
		out.writeInt(step);
		out.writeUTF(staylong);
		out.writeUTF(referal);
		out.writeUTF(useragent);
		out.writeUTF(bytes_send);
		out.writeUTF(status);
	}
	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.timeStr = in.readUTF();
		this.request = in.readUTF();
		this.step = in.readInt();
		this.staylong = in.readUTF();
		this.referal = in.readUTF();
		this.useragent = in.readUTF();
		this.bytes_send = in.readUTF();
		this.status = in.readUTF();
	}
	
	@Override
	public int compareTo(PageViewsBean o) {
		// NOTE: the session must be compared first; only when the sessions are equal does comparing the steps make sense
		int sessionCompareResult = this.session.compareTo(o.getSession());
		if(sessionCompareResult == 0) {  // same session, so compare the steps
			return this.step - o.getStep() > 0 ? 1 : -1;   // ascending order (smaller step first)
		} 
		return sessionCompareResult;
	}

	public String getSession() {
		return session;
	}

	public void setSession(String session) {
		this.session = session;
	}

	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getTimeStr() {
		return timeStr;
	}

	public void setTimeStr(String timeStr) {
		this.timeStr = timeStr;
	}

	public String getRequest() {
		return request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public int getStep() {
		return step;
	}

	public void setStep(int step) {
		this.step = step;
	}

	public String getStaylong() {
		return staylong;
	}

	public void setStaylong(String staylong) {
		this.staylong = staylong;
	}

	public String getReferal() {
		return referal;
	}

	public void setReferal(String referal) {
		this.referal = referal;
	}

	public String getUseragent() {
		return useragent;
	}

	public void setUseragent(String useragent) {
		this.useragent = useragent;
	}

	public String getBytes_send() {
		return bytes_send;
	}

	public void setBytes_send(String bytes_send) {
		this.bytes_send = bytes_send;
	}

	public String getStatus() {
		return status;
	}

	public void setStatus(String status) {
		this.status = status;
	}
	
	
	@Override
	public String toString() {
		return this.session + " " + this.step + "";
	}
	
	
	@Test
	public void testCompareTo() {
		// use the same session id for all beans so compareTo falls through to the step comparison
		PageViewsBean pvb1 = new PageViewsBean();
		pvb1.set("s", null, null, null, null, 2, null, null, null, null);
		PageViewsBean pvb2 = new PageViewsBean();
		pvb2.set("s", null, null, null, null, 1, null, null, null, null);
		PageViewsBean pvb3 = new PageViewsBean();
		pvb3.set("s", null, null, null, null, 4, null, null, null, null);
		PageViewsBean pvb4 = new PageViewsBean();
		pvb4.set("s", null, null, null, null, 3, null, null, null, null);
		
		ArrayList<PageViewsBean> list = new ArrayList<PageViewsBean>();
		list.add(pvb1);
		list.add(pvb2);
		list.add(pvb3);
		list.add(pvb4);
		
		
		System.out.println(list);
		Collections.sort(list);
		System.out.println(list);
		
	}
	
	
	
	
	
}

VisitBean

package mr.flow.weblog.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Records, for one visit (session): IP address, entry time, exit time, entry page, exit page, referring page, and how many pages were viewed in total
 * @author 汤小萌
 * @date 2018-11-28 21:01:17
 */
public class VisitBean implements Writable {
	private String session;
	private String remote_addr;
	private String inTime;
	private String outTime;
	private String inPage;
	private String outPage;
	private String referal;
	private int pageVisits;
	
	
	public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.inTime = inTime;
		this.outTime = outTime;
		this.inPage = inPage;
		this.outPage = outPage;
		this.referal = referal;
		this.pageVisits = pageVisits;
	}


	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(session);
		out.writeUTF(remote_addr);
		out.writeUTF(inTime);
		out.writeUTF(outTime);
		out.writeUTF(inPage);
		out.writeUTF(outPage);
		out.writeUTF(referal);
		out.writeInt(pageVisits);
	}


	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.inTime = in.readUTF();
		this.outTime = in.readUTF();
		this.inPage = in.readUTF();
		this.outPage = in.readUTF();
		this.referal = in.readUTF();
		this.pageVisits = in.readInt();
	}
	
	@Override
	public String toString() {
		return session + "\001" + remote_addr + "\001" + inTime + "\001" +
				outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits;
	}


	public String getSession() {
		return session;
	}


	public void setSession(String session) {
		this.session = session;
	}


	public String getRemote_addr() {
		return remote_addr;
	}


	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}


	public String getInTime() {
		return inTime;
	}


	public void setInTime(String inTime) {
		this.inTime = inTime;
	}


	public String getOutTime() {
		return outTime;
	}


	public void setOutTime(String outTime) {
		this.outTime = outTime;
	}


	public String getInPage() {
		return inPage;
	}


	public void setInPage(String inPage) {
		this.inPage = inPage;
	}


	public String getOutPage() {
		return outPage;
	}


	public void setOutPage(String outPage) {
		this.outPage = outPage;
	}


	public String getReferal() {
		return referal;
	}


	public void setReferal(String referal) {
		this.referal = referal;
	}


	public int getPageVisits() {
		return pageVisits;
	}


	public void setPageVisits(int pageVisits) {
		this.pageVisits = pageVisits;
	}
	
	
	
}

WeblogParser

package mr.flow.weblog.bean;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import org.junit.Test;

/**
 * Parses each loaded log line into a WeblogBean
 * @author 汤小萌
 *
 */
public class WeblogParser {
	
	
	
	/**
	 *  0 ) 194.237.142.21
		1 ) -
		2 ) -
		3 ) [18/Sep/2013:06:49:18
		4 ) +0000]
		5 ) "GET
		6 ) /wp-content/uploads/2013/07/rstudio-git3.png
		7 ) HTTP/1.1"
		8 ) 304
		9 ) 0
		10 ) "-"
		11 ) "Mozilla/4.0
		12 ) (compatible;)"
	 * @param line
	 * @return
	 */
	public static WeblogBean parser(String line) {
		WeblogBean weblogBean = new WeblogBean();
		String[] arr = line.split(" ");
		if(arr.length >11) {
			weblogBean.setRemote_addr(arr[0]);
			weblogBean.setRemote_user(arr[1]);
			String time_local = formatDate(arr[3].substring(1));
			if(null == time_local) time_local = "-invalid_time-";
			weblogBean.setTime_local(time_local);
			weblogBean.setRequest(arr[6]);
			
			weblogBean.setStatus(arr[8]);
			weblogBean.setBody_bytes_sent(arr[9]);
			weblogBean.setHttp_referer(arr[10]);
			
			// if the user agent was split into multiple tokens, concatenate them
			
			if(arr.length > 12) {
				StringBuffer sb = new StringBuffer();
				for(int i = 11; i < arr.length; i++) {
					sb.append(arr[i]);
				}
				weblogBean.setHttp_user_agent(sb.toString());
			} else {
				weblogBean.setHttp_user_agent(arr[11]);
			}
			
			if(Integer.parseInt(weblogBean.getStatus()) >= 400) {  // status code >= 400 indicates a failed request
				weblogBean.setValid(false);
			}
			
			if("-invalid_time-".equals(weblogBean.getTime_local())) {
				weblogBean.setValid(false);
			}
			
		} else {
			weblogBean.setValid(false);
		}
		return weblogBean;
	}
	
	
	/**
	 * Filter static resources
	 */
	public static void filterStaticResource(WeblogBean bean, Set<String> pages) {
		if(!pages.contains(bean.getRequest())) {
			bean.setValid(false);   // anything outside the configured URL set is treated as a static resource
		}
	}
	
	
	
	public static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US);
	public static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.US);
	/**
	 * Time format conversion
	 * @param time_local
	 * @return
	 */
	public static String formatDate(String time_local) {
		try {
			return sdf2.format(sdf1.parse(time_local));
		} catch (ParseException e) {
			e.printStackTrace();
		}
		return null;
	}
	
	@Test
	public void testSpilt() {
		String str = "194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] \"GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/4.0 (compatible;)\"";
		String[] arr = str.split(" ");
		int i = 1;
		for(String s : arr) {
			System.out.println(i + " ) " + s);
			i++;
		}
	}
	
	
	@Test
	public void testProp() throws IOException {
		
	}
	
	public static void main(String[] args) throws IOException {
		Properties pop = new Properties();
		InputStream is = WeblogParser.class.getClassLoader().getResourceAsStream("com/thp/bigdata/webClick/mrBean/url_1.propeties");
		
		pop.load(is);
		String str = (String) pop.get("url");
		System.out.println(str);
		
	}
	
}

Comparators
IpGroupingComparator

package mr.flow.weblog.bean;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom grouping rule
 * Keys with the same IP are grouped into the same reduce() call
 * @author 汤小萌
 *
 */
public class IpGroupingComparator extends WritableComparator {

	
	public IpGroupingComparator() {
		super(WeblogBean.class, true);
	}
	
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		WeblogBean aBean = (WeblogBean) a;
		WeblogBean bBean = (WeblogBean) b;
		return aBean.getRemote_addr().compareTo(bBean.getRemote_addr());
	}
	
}

SessionIdGroupingComparator

package mr.flow.weblog.bean;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom grouping rule
 * Keys with the same sessionId are grouped into the same reduce() call
 * @author 汤小萌
 * @date 2018-11-28 20:55:13
 */
public class SessionIdGroupingComparator extends WritableComparator {
	
	public SessionIdGroupingComparator() {
		super(PageViewsBean.class, true);
	}
	
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		PageViewsBean aBean = (PageViewsBean) a;
		PageViewsBean bBean = (PageViewsBean) b;
		// System.out.println(aBean.getSession()  + " -- " + bBean.getSession());
		// System.out.println(aBean.getSession().compareTo(bBean.getSession()));
		return aBean.getSession().compareTo(bBean.getSession());
	}
}
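
These comparators only take effect once a driver registers them. A minimal sketch of the relevant wiring for the ClickStream job (the full drivers appear below):

		// the map output key is the WritableComparable bean, so the shuffle sorts
		// records by WeblogBean.compareTo(), i.e. by remote_addr and then time_local
		job.setMapOutputKeyClass(WeblogBean.class);
		// group every key with the same remote_addr into one reduce() call,
		// so the reducer iterates one IP's records already ordered by time
		job.setGroupingComparatorClass(IpGroupingComparator.class);

ClickStreamVisit does the same with PageViewsBean and SessionIdGroupingComparator, grouping by sessionId and sorting by step.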

1. WeblogPreProcess: log pre-processing

package mr.flow.weblog.pre;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import mr.flow.weblog.bean.WeblogBean;
import mr.flow.weblog.bean.WeblogParser;


/**
 * Processes the raw log and keeps only genuine page views
 * 1) converts the time format
 * 2) fills default values for missing fields
 * 3) marks each record as valid or invalid
 * @author 汤小萌
 *
 */
public class WeblogPreProcess {

	
	static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		
		Set<String> pages = new HashSet<String>();
		Text k = new Text();
		NullWritable v = NullWritable.get();
		/**
		 * Load the site's URL whitelist from an external properties file
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			/*pages.add("/about/");
			pages.add("/black-ip-list/");
			pages.add("/cassandra-clustor/");
			pages.add("/finance-rhive-repurchase/");
			pages.add("/hadoop-family-roadmap/");
			pages.add("/hadoop-hive-intro/");
			pages.add("/hadoop-zookeeper-intro/");
			pages.add("/hadoop-mahout-roadmap/");*/
			
			Properties pop = new Properties();
			InputStream in = WeblogPreProcessMapper.class.getClassLoader().getResourceAsStream("url.propeties");
			pop.load(in);
			String urlStr = pop.getProperty("url");
			String[] urls = urlStr.split(",");
			for(String url : urls) {
				pages.add(url);
			}
			
		}
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			WeblogBean weblogBean = WeblogParser.parser(line);
			// pluggable step: filter static resources such as js/images/css
			WeblogParser.filterStaticResource(weblogBean, pages);
			
			if(weblogBean.isValid()) {  // invalid records are filtered out here
				k.set(weblogBean.toString());
				context.write(k, v);
			}
			
		}
		
		
	}
	
	
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(WeblogPreProcess.class);

		job.setMapperClass(WeblogPreProcessMapper.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

//		 FileInputFormat.setInputPaths(job, new Path(args[0]));
//		 FileOutputFormat.setOutputPath(job, new Path(args[1]));
		FileInputFormat.setInputPaths(job, new Path("f:/weblog_2/input/access.log.fensi"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog_2/output"));
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path("f:/weblog_2/output"))) {
			fs.delete(new Path("f:/weblog_2/output"), true);
		}
		
		job.setNumReduceTasks(0);
		
		job.waitForCompletion(true);

	}
	
	
	
	
}

2. ClickStream

package mr.flow.weblog.mr;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import mr.flow.weblog.bean.IpGroupingComparator;
import mr.flow.weblog.bean.WeblogBean;

/**
 * Turns the cleaned data into clickstream pageViews model data
 * The input is the output of the pre-processing job
 * 
 * Splits the records into sessions and assigns each session a sessionId
 * For every page visited within a session it records the request time, url, stay time, and the page's step number within the session
 * Keeps http_referer, body_bytes_sent and http_user_agent
 * 
 * @author 汤小萌
 *
 */
public class ClickStream {

	
	static class ClickStreamMapper extends Mapper<LongWritable, Text, WeblogBean, Text> {
		WeblogBean k = new WeblogBean();
		Text v = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split("\001");
			if(fields.length < 9) return;
			k.set("true".equals(fields[0]) ? true : false, fields[1], fields[2], fields[3], fields[4], fields[5], 
					fields[6], fields[7], fields[8]);
			if(k.isValid()) {  // only valid records proceed to further processing
				context.write(k, v);
			}
			
		}
	}
	
	
	/**
	 * Data to produce, one record per line:
	 * 
	 *  sessionId  ip  remote_user  request  time_local  step  staylong  http_referer  http_user_agent  body_bytes_sent  status
	 * 
	 * @author 汤小萌
	 *
	 */
	static class ClickStreamReducer extends Reducer<WeblogBean, Text, Text, NullWritable> {
		
		Text k = new Text();
		NullWritable v = NullWritable.get();
		@Override
		protected void reduce(WeblogBean beanKey, Iterable<Text> values, Context context) 
				throws IOException, InterruptedException {
			
			// System.out.println(beanKey);
			
			System.out.println("---------------");
			
			int step = 1;	// step number of the current page within the session
			String sessionId = UUID.randomUUID().toString();	// generate a sessionId
			
			
			String lastTimeStr = null;
			String lastSaveStr = null;   // trailing fields of the previous record that must be kept
			String lastIpAndUser = null; // IP address and user of the previous record
			String lastUrl = null;		 // URL requested by the previous record
			Long stayTime = 0L;		 // time gap between two consecutive requests
			
			for(Text value : values) {
				// System.out.println(beanKey);
				
				/*k.set(sessionId+"\001"+beanKey.toString()+"\001"+beanKey.getRemote_user() + "\001" +
						beanKey.getTime_local() + "\001" + beanKey.getRequest() + "\001" + step + "\001" + (60) + 
						"\001" + beanKey.getHttp_referer() + "\001" + beanKey.getHttp_user_agent() + "\001" + 
						beanKey.getBody_bytes_sent() + "\001"
								+ beanKey.getStatus());
				context.write(k, v);*/
				
				if(lastTimeStr != null) {
					try {
						// beanKey now holds the current record; lastTimeStr still holds the previous record's access time
						// stayTime = toDate(beanKey.getTime_local()).getTime() - toDate(lastTimeStr).getTime();
						stayTime = timeDiff(beanKey.getTime_local(), lastTimeStr);
					} catch (ParseException e) {
						e.printStackTrace();
					}
					
					if(stayTime < 30*60*1000) {  // a gap of < 30 min between requests from the same IP is treated as the same session
						k.set(sessionId + "\001" + lastIpAndUser + "\001" + lastUrl + "\001" + lastTimeStr + "\001" + 
								step + "\001" + (stayTime/1000) + "\001" + lastSaveStr);
						// emit the previous record
						context.write(k, v);
						step++;
					} else {   // a gap of >= 30 min from the same IP starts a new session; the previous record gets the default stay time of 60s
						k.set(sessionId + "\001" + lastIpAndUser + "\001" + lastUrl + "\001" + lastTimeStr + "\001" + 
								step + "\001" + (60) + "\001" + lastSaveStr);
						context.write(k, v);   // the last record of this IP is not emitted here; it is written after the loop
						// after emitting the previous record, reset the step counter
						step = 1;
						// and generate a new sessionId
						sessionId = UUID.randomUUID().toString();
					}
					
					
				}
				// remember the current record for the next iteration
				lastTimeStr = beanKey.getTime_local();
				lastSaveStr = beanKey.getHttp_referer() + "\001" + beanKey.getHttp_user_agent() + "\001" + 
						beanKey.getBody_bytes_sent() + "\001" + beanKey.getStatus();
				lastUrl = beanKey.getRequest();
				lastIpAndUser = beanKey.getRemote_addr() + "\001" + beanKey.getRemote_user() ;
			}
			
			// the record written below is the last one of this group, with the default stay time
			k.set(sessionId + "\001" + lastIpAndUser + "\001" + lastUrl + "\001" + lastTimeStr + "\001" + 
					step + "\001" + (60) + "\001" + lastSaveStr);
			context.write(k, v);
			System.out.println("---------------");
			
		}
		
		//  ********************** helper methods ************************
		private String toStr(Date date) {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.US);
			return sdf.format(date);
		}
		
		private Date toDate(String timeStr) throws ParseException {
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss",Locale.UK);
			return sdf.parse(timeStr);
		}
		
		// compute the time difference
		private long timeDiff(String time1, String time2) throws ParseException {
			Date d1 = toDate(time1);
			Date d2 = toDate(time2);
			return d1.getTime() - d2.getTime();
		}
	}
	
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(ClickStream.class);
		
		job.setMapperClass(ClickStreamMapper.class);
		job.setReducerClass(ClickStreamReducer.class);
		
		job.setMapOutputKeyClass(WeblogBean.class);
		job.setMapOutputValueClass(Text.class);
		
		// out
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		
		job.setGroupingComparatorClass(IpGroupingComparator.class);
		
		FileInputFormat.setInputPaths(job, new Path("f:/weblog_2/output"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog_2/pageviews"));
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path("f:/weblog_2/pageviews"))) {
			fs.delete(new Path("f:/weblog_2/pageviews"), true);
		}

		job.waitForCompletion(true);
		
		
		
		
	}

}

3. ClickStreamVisit

package mr.flow.weblog.mr;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import mr.flow.weblog.bean.PageViewsBean;
import mr.flow.weblog.bean.SessionIdGroupingComparator;
import mr.flow.weblog.bean.VisitBean;

/**
 * Aggregates the PageViews model records by sessionId into per-visit information
 * 
 *  Output format after this job:
 *  sessionId   visit start time   visit end time   entry page   exit page   total pages visited	
 * 
 * @author 汤小萌
 * @date 2018-11-28 20:42:26
 */
public class ClickStreamVisit {
	
	
	static class ClickStreamVisitMapper extends Mapper<LongWritable, Text, PageViewsBean, Text> {
		
		/**
		 * The value output of this Mapper could be NullWritable; it was set to Text while testing and never changed
		 */
		PageViewsBean beanKey = new PageViewsBean();
		// NullWritable v = NullWritable.get();
		Text v = new Text();
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split("\001");
			int step = Integer.parseInt(fields[5]);  // parse the step (this PageViewsBean's position within the session) as an int
			beanKey.set(fields[0], fields[1], fields[2], fields[3],fields[4], step, fields[6], fields[7], fields[8], fields[9]);
			v.set(beanKey.getSession() + " " + step);
			context.write(beanKey, v);
		}
	}
	
	static class ClickStreamVisitReducer extends Reducer<PageViewsBean, Text, NullWritable, VisitBean> {
		
		NullWritable k = NullWritable.get();
		// the first and last pageViews records of each visit are put into this VisitBean
		VisitBean visitBean = new VisitBean();
		@Override
		
		protected void reduce(PageViewsBean beanKey, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
// System.out.println("----------------");
			
			ArrayList<PageViewsBean> pvBeanList = new ArrayList<PageViewsBean>();
			
			for(Text str : values) {
				// System.out.println(beanKey + " || " + str);
				// beanKey cannot be added directly because it is a reused reference (Hadoop updates the same key object on each iteration)
				// pvBeanList.add(beanKey);
				PageViewsBean pvBean = new PageViewsBean();
				try {
					BeanUtils.copyProperties(pvBean, beanKey);
					pvBeanList.add(pvBean);
				} catch (IllegalAccessException | InvocationTargetException e) {
					e.printStackTrace();
				}
			}
// System.out.println(pvBeanList);
// System.out.println("----------------");
			
			
			// first record of the visit
			visitBean.setInPage(pvBeanList.get(0).getRequest());
			visitBean.setInTime(pvBeanList.get(0).getTimeStr());
			// last record of the visit
			visitBean.setOutPage(pvBeanList.get(pvBeanList.size() - 1).getRequest());
			visitBean.setOutTime(pvBeanList.get(pvBeanList.size() - 1).getTimeStr());
			// number of pages visited during this visit
			visitBean.setPageVisits(pvBeanList.size());
			// visitor's IP
			visitBean.setRemote_addr(pvBeanList.get(0).getRemote_addr());
			// referal of this visit
			visitBean.setReferal(pvBeanList.get(0).getReferal());
			visitBean.setSession(pvBeanList.get(0).getSession());
			
			context.write(k, visitBean);
		}
	}
	
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(ClickStreamVisit.class);
		
		job.setMapperClass(ClickStreamVisitMapper.class);
		job.setReducerClass(ClickStreamVisitReducer.class);
		
		
		job.setMapOutputKeyClass(PageViewsBean.class);
		// job.setMapOutputValueClass(NullWritable.class);
		job.setMapOutputValueClass(Text.class);
		
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(VisitBean.class);
		
		job.setGroupingComparatorClass(SessionIdGroupingComparator.class);
		
		
		FileInputFormat.setInputPaths(job, new Path("f:/weblog_2/pageviews/testLog.txt"));
		FileOutputFormat.setOutputPath(job, new Path("f:/weblog_2/visitout"));
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path("f:/weblog_2/visitout"))) {
			fs.delete(new Path("f:/weblog_2/visitout"), true);
		}
		
		
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
		
		
		
	}
	
}
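
The final output of the pipeline is one line per visit, formatted by VisitBean.toString() with fields joined by "\001":

session \001 remote_addr \001 inTime \001 outTime \001 inPage \001 outPage \001 referal \001 pageVisits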

Reposted from blog.csdn.net/qq_38200548/article/details/84612664