WEB日志清洗 - java代码

参考学习的博客：
https://blog.csdn.net/a331685690/article/details/80281448?utm_source=blogxgwz3

需求：从访问日志中梳理出每一个 session——同一用户（此处以 IP 区分）相邻两次请求的时间差不超过 30 min（≤ 30 min）时，这两次请求属于同一个 session，否则分属于不同的 session；并且为每个 session 内的历次请求按时间先后打上序号。

模拟日志：

194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /1.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.21 - - [18/Sep/2013:06:50:18 +0000] "GET /2.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.21 - - [18/Sep/2013:06:51:18 +0000] "GET /3.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.21 - - [18/Sep/2013:08:49:18 +0000] "GET /4.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.21 - - [18/Sep/2013:08:50:18 +0000] "GET /5.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.21 - - [18/Sep/2013:10:49:18 +0000] "GET /6.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"


194.237.142.22 - - [18/Sep/2013:06:49:18 +0000] "GET /1.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.22 - - [18/Sep/2013:06:50:18 +0000] "GET /2.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.22 - - [18/Sep/2013:06:51:18 +0000] "GET /3.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.22 - - [18/Sep/2013:08:49:18 +0000] "GET /4.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.22 - - [18/Sep/2013:08:50:18 +0000] "GET /5.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.22 - - [18/Sep/2013:10:49:18 +0000] "GET /6.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"


194.237.142.23 - - [18/Sep/2013:06:49:18 +0000] "GET /1.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.23 - - [18/Sep/2013:06:50:18 +0000] "GET /2.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.23 - - [18/Sep/2013:06:51:18 +0000] "GET /3.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.23 - - [18/Sep/2013:08:49:18 +0000] "GET /4.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.23 - - [18/Sep/2013:08:50:18 +0000] "GET /5.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.23 - - [18/Sep/2013:10:49:18 +0000] "GET /6.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"


194.237.142.24 - - [18/Sep/2013:06:49:18 +0000] "GET /1.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.24 - - [18/Sep/2013:06:50:18 +0000] "GET /2.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.24 - - [18/Sep/2013:06:52:18 +0000] "GET /3.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.24 - - [18/Sep/2013:08:49:18 +0000] "GET /4.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.24 - - [18/Sep/2013:08:50:18 +0000] "GET /5.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
194.237.142.24 - - [18/Sep/2013:10:49:18 +0000] "GET /6.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"

代码

地址：
https://gitee.com/tanghongping/hadoopMapReduce/tree/master/src/com/thp/bigdata/webLog

package com.thp.bigdata.webLog;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;
/**
 * Simplified web access-log cleaner that groups requests into sessions.
 *
 * <p>A single log line looks like this:
 * <pre>
 * 194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /1.html HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
 * </pre>
 * It carries the client IP, the access time, the HTTP method, the requested
 * page and the response status. Only the IP, the time and the
 * "method + page" fragment are extracted here.
 *
 * <p>The log contains no session id, so one is generated by these rules:
 * <ul>
 *   <li>requests from different IPs never share a session;</li>
 *   <li>for one IP, a request that arrives more than 30 minutes after the
 *       previous request of that IP starts a new session.</li>
 * </ul>
 *
 * <p>Processing steps: read the log and bucket the requests by IP, sort every
 * bucket by time, assign session ids and in-session order numbers, regroup
 * the requests by session id, and print one summary line per session.
 *
 * @author 汤小萌
 *
 */
public class WashWebLog {

	/** Largest gap, in milliseconds, between two consecutive requests of one session (30 minutes). */
	private static final long SESSION_TIMEOUT_MILLIS = 1000L * 60 * 30;

	/** Dotted IP: three "digits + dot" groups followed by a final digit run. */
	private static final Pattern IP_PATTERN = Pattern.compile("(\\d+\\.){3}\\d+");

	/** Bracketed timestamp: '[' + any characters + one or more digits + ']'. */
	private static final Pattern DATE_PATTERN = Pattern.compile("\\[.+\\d+\\]");

	/** "GET" or "POST", one whitespace, a run of non-whitespace characters, one whitespace. */
	private static final Pattern URL_PATTERN = Pattern.compile("(POST|GET){1}\\s(\\S)*\\s");

	/**
	 * Time component used by the most recently generated session id. Kept so
	 * that two ids generated within the same clock tick still differ. Not
	 * synchronized: the class is only used from a single thread.
	 */
	private static long lastSessionStamp = Long.MIN_VALUE;

	/**
	 * Runs the whole pipeline and prints one tab-separated line per session:
	 * session id, IP, first request time, last request time, first url
	 * fragment, last url fragment, and the session duration in seconds.
	 *
	 * @param args unused
	 * @throws IOException declared for callers; read errors are handled inside
	 *                     {@link #getIPSessionBeanMap()}
	 */
	public static void main(String[] args) throws IOException {

		// IP -> every request made from that IP.
		Map<String, List<SessionBean>> IPListMap = getIPSessionBeanMap();

		// Session detection compares neighbouring requests, so each list must
		// be in chronological order first.
		sortByDate(IPListMap);

		// Up to here the requests are only separated by IP; now give each one
		// its session id and order number.
		makeSessionId(IPListMap);

		// Regroup: the key is now the session id, NOT the IP.
		Map<String, List<SessionBean>> sessionIdListMap = new HashMap<String, List<SessionBean>>();
		for (List<SessionBean> sessionBeans : IPListMap.values()) {
			for (SessionBean sessionBean : sessionBeans) {
				String sessionId = sessionBean.getSessionId();
				List<SessionBean> list = sessionIdListMap.get(sessionId);
				if (list == null) {
					list = new ArrayList<SessionBean>();
					sessionIdListMap.put(sessionId, list);
				}
				list.add(sessionBean);
			}
		}

		// Print the formatted result. The lists were filled in chronological
		// order, so the first/last elements are the session's first/last request.
		for (Entry<String, List<SessionBean>> entrySet : sessionIdListMap.entrySet()) {
			String sessionId = entrySet.getKey();
			List<SessionBean> sessionBeans = entrySet.getValue();
			SessionBean start = sessionBeans.get(0);
			SessionBean end = sessionBeans.get(sessionBeans.size() - 1);
			long differ = end.getDate().getTime() - start.getDate().getTime();

			String res = sessionId + "\t" + start.getIp() + "\t" + start.getDate() + "\t" + end.getDate() + "\t"
					+ start.getUrl() + "\t" + end.getUrl() + "\t" + (differ / 1000);
			System.out.println(res);
		}
	}

	/**
	 * Assigns a session id and a 1-based order number to every bean.
	 *
	 * <p>Each list is walked once. The first request of a list, and every
	 * request that is not in the same session as its predecessor (see
	 * {@link #isSameSession(SessionBean, SessionBean)}), opens a new session
	 * with order 1. Every other request inherits the predecessor's session id
	 * and gets the predecessor's order plus one.
	 *
	 * @param IPListSortedMap IP -> requests of that IP, each list already
	 *                        sorted by time; the beans are modified in place
	 */
	private static void makeSessionId(Map<String, List<SessionBean>> IPListSortedMap) {
		for (List<SessionBean> sessionBeans : IPListSortedMap.values()) {
			SessionBean previous = null;
			for (SessionBean current : sessionBeans) {
				if (previous != null && isSameSession(previous, current)) {
					current.setSessionId(previous.getSessionId());
					current.setOrder(previous.getOrder() + 1);
				} else {
					current.setSessionId(getSessionId(current.getIp()));
					current.setOrder(1);
				}
				previous = current;
			}
		}
	}

	/**
	 * Tells whether two requests belong to the same session, judged by time only.
	 *
	 * @param s1 the earlier request
	 * @param s2 the later request
	 * @return {@code true} when {@code s2} is between 0 and 30 minutes
	 *         (inclusive) after {@code s1}
	 */
	private static boolean isSameSession(SessionBean s1, SessionBean s2) {
		long differ = s2.getDate().getTime() - s1.getDate().getTime();
		return differ >= 0 && differ <= SESSION_TIMEOUT_MILLIS;
	}

	/**
	 * Generates a new session id for the given IP.
	 *
	 * <p>The id is the IP's digit groups concatenated and read as a number,
	 * followed by a {@link System#nanoTime()} based value. That value is bumped
	 * when needed so it is strictly greater than the one used by the previous
	 * call; two ids generated back to back for the same IP therefore never
	 * coincide, even if the clock did not advance in between.
	 *
	 * @param ip dotted IP string
	 * @return the generated session id
	 * @throws NumberFormatException if the concatenated digits do not fit in a {@code long}
	 */
	private static String getSessionId(String ip) {
		StringBuilder digits = new StringBuilder();
		for (String field : ip.split("\\.")) {
			digits.append(field);
		}
		long longIP = Long.parseLong(digits.toString());

		long stamp = System.nanoTime();
		if (stamp <= lastSessionStamp) {
			stamp = lastSessionStamp + 1;
		}
		lastSessionStamp = stamp;

		return "" + longIP + stamp;
	}

	/**
	 * Sorts the request list of every IP by ascending request time.
	 *
	 * <p>The comparator delegates to {@link Date#compareTo(Date)}, so equal
	 * timestamps compare as 0 as the {@link Comparator} contract requires.
	 *
	 * @param IPListMap IP -> requests of that IP; the lists are sorted in place
	 */
	private static void sortByDate(Map<String, List<SessionBean>> IPListMap) {
		Comparator<SessionBean> byDate = new Comparator<SessionBean>() {
			@Override
			public int compare(SessionBean o1, SessionBean o2) {
				return o1.getDate().compareTo(o2.getDate());
			}
		};
		for (List<SessionBean> list : IPListMap.values()) {
			Collections.sort(list, byDate);
		}
	}

	/**
	 * Reads the access log line by line and buckets the parsed requests by IP.
	 *
	 * <p>For each line the IP, the bracketed timestamp and the "method + page"
	 * fragment are extracted with regular expressions. Lines where any of the
	 * three is missing, or whose timestamp cannot be parsed, are skipped.
	 * A missing log resource and read errors are reported with a stack trace;
	 * whatever was collected up to that point is returned.
	 *
	 * @return IP -> requests made from that IP, in file order (never {@code null})
	 */
	private static Map<String, List<SessionBean>> getIPSessionBeanMap() {
		Map<String, List<SessionBean>> map1 = new HashMap<String, List<SessionBean>>();
		BufferedReader bufferedReader = null;
		try {
			java.net.URL resource = DemoTest.class.getResource("access.log.fensi");
			if (resource == null) {
				// getResource signals "not found" with null; report it like any other missing file.
				throw new FileNotFoundException("classpath resource not found: access.log.fensi");
			}
			bufferedReader = new BufferedReader(new FileReader(resource.getPath()));

			String line;
			while ((line = bufferedReader.readLine()) != null) {
				String ip = firstMatch(IP_PATTERN, line);
				String date = firstMatch(DATE_PATTERN, line);
				String url = firstMatch(URL_PATTERN, line);
				if (ip == null || date == null || url == null) {
					continue;
				}

				Date parsedDate = parDateFromStr(date);
				if (parsedDate == null) {
					// Already reported by parDateFromStr; a bean without a date
					// would break sorting and session detection later on.
					continue;
				}

				SessionBean sessionBean = new SessionBean();
				sessionBean.setIp(ip);
				sessionBean.setDate(parsedDate);
				sessionBean.setUrl(url);

				List<SessionBean> list = map1.get(ip);
				if (list == null) {
					// First request seen for this IP: create its list.
					list = new ArrayList<SessionBean>();
					map1.put(ip, list);
				}
				list.add(sessionBean);
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (bufferedReader != null) {
				try {
					bufferedReader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return map1;
	}

	/**
	 * Converts a bracketed log timestamp such as
	 * {@code [19/Sep/2013:04:38:11 +0000]} into a {@link Date}.
	 *
	 * <p>The text is first parsed together with its zone offset so the result
	 * is the real instant of the request. If that fails (for example because
	 * the offset is absent) the text is parsed without an offset, which
	 * interprets it in the JVM's default time zone.
	 *
	 * @param dateStr timestamp including the surrounding brackets
	 * @return the parsed date, or {@code null} if the text cannot be parsed
	 */
	private static Date parDateFromStr(String dateStr) {
		// Strip the surrounding '[' and ']'.
		String substring = dateStr.substring(1, dateStr.length() - 1);

		// "MMM" is the abbreviated month name; Locale.US is required so that
		// "Sep" is recognised regardless of the platform locale.
		try {
			return new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.US).parse(substring);
		} catch (ParseException ignored) {
			// No usable zone offset: fall back to the offset-less pattern below.
		}
		try {
			return new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US).parse(substring);
		} catch (ParseException e) {
			e.printStackTrace();
		}
		System.err.println("< 日期解析异常 >");
		return null;
	}

	/**
	 * Returns the first substring of {@code line} matching {@code regex}.
	 *
	 * @param line  text to search
	 * @param regex regular expression to look for
	 * @return the first match, or {@code null} if there is none
	 */
	private static String getConByRegex(String line, String regex) {
		return firstMatch(Pattern.compile(regex), line);
	}

	/**
	 * Returns the first substring of {@code line} matching the compiled pattern.
	 *
	 * @param pattern compiled pattern to look for
	 * @param line    text to search
	 * @return the first match, or {@code null} if there is none
	 */
	private static String firstMatch(Pattern pattern, String line) {
		Matcher matcher = pattern.matcher(line);
		return matcher.find() ? matcher.group() : null;
	}

	/**
	 * Manual check of the url pattern: prints the first match in a sample
	 * request line, or a "no match" message.
	 */
	@Test
	public void testRegex() {

		String line = "GETGETPOST /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1";

		Matcher matcher = URL_PATTERN.matcher(line);

		if (matcher.find()) {
			System.out.println(matcher.group());
			return;
		}

		System.out.println("没有匹配");
	}

	/**
	 * Manual check that prints the space-separated fields of a sample log line.
	 */
	@Test
	public void testLineData() {
		String lineData = "194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] \"GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/4.0 (compatible;)\"";
		String[] fields = lineData.split(" ");
		for (String field : fields) {
			System.out.println(field);
		}

	}

}

package com.thp.bigdata.webLog;

import java.util.Date;

/**
 * Mutable value holder for one request taken from the access log, together
 * with the session id and order number assigned to it.
 *
 * @author 汤小萌
 *
 */
public class SessionBean {

	/** Session id; {@code null} until set. */
	private String sessionId;
	/** Client IP as text. */
	private String ip;
	/** Time of the request. */
	private Date date;
	/** Url text extracted from the log line. */
	private String url;
	/** Order number of the request; 0 until set. */
	private int order;

	// ---- sessionId ----

	/** @return the session id, or {@code null} if none has been set */
	public String getSessionId() {
		return this.sessionId;
	}

	/** @param sessionId the session id to store */
	public void setSessionId(String sessionId) {
		this.sessionId = sessionId;
	}

	// ---- ip ----

	/** @return the client IP */
	public String getIp() {
		return this.ip;
	}

	/** @param ip the client IP to store */
	public void setIp(String ip) {
		this.ip = ip;
	}

	// ---- date ----

	/** @return the stored request time (the same instance that was set) */
	public Date getDate() {
		return this.date;
	}

	/** @param date the request time to store */
	public void setDate(Date date) {
		this.date = date;
	}

	// ---- order ----

	/** @return the order number */
	public int getOrder() {
		return this.order;
	}

	/** @param order the order number to store */
	public void setOrder(int order) {
		this.order = order;
	}

	// ---- url ----

	/** @return the url text */
	public String getUrl() {
		return this.url;
	}

	/** @param url the url text to store */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * Renders all five properties in the form
	 * {@code SessionBean [sessionId=..., ip=..., date=..., url=..., order=...]}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("SessionBean [");
		text.append("sessionId=").append(sessionId);
		text.append(", ip=").append(ip);
		text.append(", date=").append(date);
		text.append(", url=").append(url);
		text.append(", order=").append(order);
		text.append("]");
		return text.toString();
	}

}

WEB日志清洗 - java代码

代码

猜你喜欢