商品比价分析

一、开发环境

(一)、开发环境

Windows+JDK1.8+Eclipse+linux+mysql+maven

(二)、软件架构

Hadoop,Zookeeper,hbase,redis,solr,echarts

(三)、开发时间

2019.7

二、项目思路

(一)、思路:

1、首先爬取网站上的数据,
2、然后对网页进行解析,
3、把解析出的数据存储到数据库中,
4、再对数据进行分析处理,
5、最后将数据展现到Web界面,对提高客户购物效率和客户满意度能够起到较好的作用。

(二)、实现:

获取页面内容,下载网页,网页解析,对数据分析处理,web展现
1、使用Maven进行项目构建,实现前、后台项目的统一管理。
2、获取网页内容
3、使用HttpClient下载网页;
4、网页解析:HtmlCleaner把商品标题,图片,价格,规格参数等解析出来;
解析的时候不仅要解析当前页,也要解析下一页
5、把解析到的商品数据存储到hbase、索引数据存储到redis数据库中
6、使用solr查询出来,在web界面展现

三、具体实现

(一)、使用Maven进行项目构建

1、使用Maven进行项目构建,
实现前、后台项目的统一管理。

(二)、获取网页内容和打印页面下载日志

2、获取网页内容和打印页面下载日志

public class PageUtils {
	private static Logger logger = LoggerFactory.getLogger(PageUtils.class);
	/**
	 * Downloads the page at the given URL and returns its body as a string.
	 *
	 * The request is sent with a fixed desktop-Chrome User-Agent. Any failure
	 * (network error, unreadable body, ...) is logged and an empty string is
	 * returned instead of propagating the exception.
	 *
	 * @param url address of the page to download
	 * @return the response body, or "" when the download failed
	 */
	public static String getContent(String url){
		String content = "";
		HttpClientBuilder builder = HttpClients.custom();
		builder.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36");

		HttpGet request = new HttpGet(url);
		// try-with-resources: the client and the response each hold a connection,
		// so both must be closed even when reading the body fails.
		try (CloseableHttpClient client = builder.build()) {
			long startTime = System.currentTimeMillis();
			try (CloseableHttpResponse response = client.execute(request)) {
				HttpEntity entity = response.getEntity();
				content = EntityUtils.toString(entity);
				logger.info("页面下载成功:{},消耗时间:{}",url,System.currentTimeMillis()-startTime);
			}
		} catch (Exception e) {
			// Passing the exception as the last argument lets the logger record the stack trace.
			logger.error("页面下载失败:{}",url,e);
		}
		return content;
	}

(三)、下载网页HttpClient

3、下载网页HttpClient

/**
 * Fetches the given URL and wraps the result in a {@code Page}.
 *
 * @param url address of the page to download
 * @return a new page carrying the downloaded content and the source URL
 */
public Page download(String url) {
		Page result = new Page();
		// Content is fetched first, then stored together with the URL it came from.
		result.setContent(PageUtils.getContent(url));
		result.setUrl(url);
		return result;
	}

(四)、网页解析:HtmlCleaner

1、把商品标题,图片,价格,规格参数等解析出来;

4、网页解析:HtmlCleaner把商品标题,图片,价格,规格参数等解析出来;

public class JdProcess implements Processable {

	/**
	 * Parses a downloaded page.
	 *
	 * For a JD list page (URL starting with {@code http://list.jd.com/list.html})
	 * the next-page link and every product link found on the page are added to
	 * the page's URL collection. Any other page is treated as a product detail
	 * page and handed to {@link #parseProduct(Page, TagNode)}.
	 *
	 * @param page the downloaded page; discovered URLs and fields are written back to it
	 */
	@Override
	public void process(Page page) {
		String content = page.getContent();
		HtmlCleaner htmlCleaner = new HtmlCleaner();
		TagNode rootNode = htmlCleaner.clean(content);
		if(page.getUrl().startsWith("http://list.jd.com/list.html")){
			String nexturl = HtmlUtils.getAttributeByName(rootNode, "href", "//*[@id=\"J_topPage\"]/a[2]");
			// No next-page link means nothing to enqueue; without this guard the
			// bare host "http://list.jd.com" would be queued (or an NPE thrown).
			if(nexturl != null && !nexturl.isEmpty()){
				// The href is HTML-escaped in the markup; turn "&amp;" back into "&".
				page.addUrl("http://list.jd.com"+nexturl.replace("&amp;", "&"));
			}

			try {
				Object[] productLinks = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a");
				for (Object object : productLinks) {
					TagNode tagNode = (TagNode)object;
					String href = tagNode.getAttributeByName("href");
					// Anchors without an href carry no URL to crawl.
					if(href != null && !href.isEmpty()){
						page.addUrl(href);
					}
				}
			} catch (XPatherException e) {
				e.printStackTrace();
			}
		}else{
			parseProduct(page, rootNode);
		}
	}

	/**
	 * Parses a product detail page and stores the extracted values on the page.
	 *
	 * Fields written: {@code title}, {@code picurl}, {@code price} (when the
	 * price lookup returned data) and {@code spec} (a JSON array of
	 * name/value pairs). The goods id is taken from the page URL and stored
	 * with a {@code jd_} prefix.
	 *
	 * @param page     page being processed; receives the parsed fields
	 * @param rootNode cleaned DOM root of the page content
	 */
	public void parseProduct(Page page, TagNode rootNode) {
		try {
			// Title
			String title = HtmlUtils.getText(rootNode, "//*[@id=\"name\"]/h1");
			page.addField("title", title);

			// Picture URL
			String picurl = HtmlUtils.getAttributeByName(rootNode, "src", "//*[@id=\"spec-n1\"]/img");
			page.addField("picurl", picurl);

			// Price: requested from the price endpoint using the goods id found in the page URL.
			String url = page.getUrl();
			// Dots are escaped so they only match a literal '.'.
			Pattern compile = Pattern.compile("http://item\\.jd\\.com/([0-9]+)\\.html");
			Matcher matcher = compile.matcher(url);
			String goodsId = "";
			if(matcher.find()){
				goodsId = matcher.group(1);
			}
			page.setGoodsid("jd_"+goodsId);
			String priceJson = PageUtils.getContent("http://p.3.cn/prices/get?skuid=J_"+goodsId);
			// getContent returns "" when the download fails; parsing that would throw
			// and abort the spec extraction below, so skip the price instead.
			if(priceJson != null && !priceJson.trim().isEmpty()){
				JSONArray jsonArray = new JSONArray(priceJson);
				if(jsonArray.length()>0){
					JSONObject object = (JSONObject)jsonArray.get(0);
					page.addField("price", object.getString("p"));
				}
			}

			// Specification table: a row with a <th> is a group heading (empty name),
			// any other row is a name/value pair taken from its first two <td> cells.
			Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
			JSONArray specjsonArray = new JSONArray();
			for (Object tagobject : evaluateXPath) {
				TagNode tagNode = (TagNode)tagobject;
				if("".equals(tagNode.getText().toString().trim())){
					continue;
				}
				Object[] thevaluateXPath = tagNode.evaluateXPath("/th");
				JSONObject jsonObject = new JSONObject();
				if(thevaluateXPath.length>0){
					TagNode thtagnode = (TagNode)thevaluateXPath[0];
					jsonObject.put("name", "");
					jsonObject.put("value", thtagnode.getText().toString());
				}else{
					Object[] tdevaluateXPath = tagNode.evaluateXPath("/td");
					// A row with fewer than two cells has no name/value pair to record.
					if(tdevaluateXPath.length<2){
						continue;
					}
					TagNode tdtagnode1 = (TagNode)tdevaluateXPath[0];
					TagNode tdtagnode2 = (TagNode)tdevaluateXPath[1];
					jsonObject.put("name", tdtagnode1.getText().toString());
					jsonObject.put("value", tdtagnode2.getText().toString());
				}
				specjsonArray.put(jsonObject);
			}
			page.addField("spec", specjsonArray.toString());
		} catch (XPatherException e) {
			e.printStackTrace();
		}
	}

2、解析当前页和下一页

/**
 * Helpers for reading values out of an HtmlCleaner DOM via XPath.
 *
 * Both public methods return an empty string, never {@code null}, when
 * nothing usable is found.
 */
public class HtmlUtils {

	/**
	 * Returns the text of the first node matched by the XPath expression.
	 *
	 * @param rootNode node the expression is evaluated against
	 * @param xpath    XPath expression selecting the target node
	 * @return the node's text, or "" when no node matches or evaluation fails
	 */
	public static String getText(TagNode rootNode,String xpath){
		TagNode tagNode = firstNode(rootNode, xpath);
		return tagNode == null ? "" : tagNode.getText().toString();
	}

	/**
	 * Returns an attribute value of the first node matched by the XPath expression.
	 *
	 * @param rootNode node the expression is evaluated against
	 * @param attr     name of the attribute to read
	 * @param xpath    XPath expression selecting the target node
	 * @return the attribute value, or "" when no node matches, the node lacks
	 *         the attribute, or evaluation fails
	 */
	public static String getAttributeByName(TagNode rootNode,String attr,String xpath){
		TagNode tagNode = firstNode(rootNode, xpath);
		if(tagNode == null){
			return "";
		}
		// A missing attribute yields null; normalise to "" so callers get one "absent" value.
		String value = tagNode.getAttributeByName(attr);
		return value == null ? "" : value;
	}

	/**
	 * Evaluates the expression and returns the first result if it is a tag node.
	 *
	 * @return the first matched {@code TagNode}, or {@code null} when there is
	 *         no match, the first match is not a tag node, or evaluation fails
	 */
	private static TagNode firstNode(TagNode rootNode,String xpath){
		try {
			Object[] matches = rootNode.evaluateXPath(xpath);
			// The result array holds Objects; only a TagNode can be cast safely.
			if(matches.length>0 && matches[0] instanceof TagNode){
				return (TagNode)matches[0];
			}
		} catch (XPatherException e) {
			e.printStackTrace();
		}
		return null;
	}
}

3、把当前网页的商品和下一页的链接存储起来,再遍历出来,把解析的数据存储到优先级队列

/**
 * In-memory URL repository backed by two queues: a high-priority queue
 * that is always drained first, and a low-priority queue used as fallback.
 */
public class QueueRepository implements Repository {
	Queue<String> lowqueue = new ConcurrentLinkedQueue<String>();
	Queue<String> highqueue = new ConcurrentLinkedQueue<String>();

	/**
	 * Takes the next URL, preferring the high-priority queue.
	 *
	 * @return the next URL, or {@code null} when both queues are empty
	 */
	@Override
	public String poll() {
		String next = highqueue.poll();
		return next != null ? next : lowqueue.poll();
	}

	/**
	 * Appends a URL to the low-priority queue.
	 *
	 * @param nexturl URL to enqueue
	 */
	@Override
	public void add(String nexturl) {
		lowqueue.add(nexturl);
	}

	/**
	 * Appends a URL to the high-priority queue.
	 *
	 * @param nexturl URL to enqueue
	 */
	@Override
	public void addHigh(String nexturl) {
		highqueue.add(nexturl);
	}
}

5、存储到分布式数据库redis,redis存储的是商品的索引id

/**
 * Thin wrapper around a Jedis connection pool exposing the list operations
 * used by the crawler (range read, push on the left, pop on the right).
 *
 * Every operation borrows a connection from the pool and always hands it
 * back, even when the Redis command throws.
 */
public class RedisUtils {
	public static String start_url = "start_url";

	public static String heightkey = "spider.todo.height";
	public static String lowkey = "spider.todo.low";


	JedisPool jedisPool = null;

	/**
	 * Creates a pool connected to the default server (192.168.56.30:6379).
	 */
	public RedisUtils(){
		this("192.168.56.30", 6379);
	}

	/**
	 * Creates a pool connected to the given Redis server.
	 *
	 * @param host Redis host name or address
	 * @param port Redis port
	 */
	public RedisUtils(String host, int port){
		JedisPoolConfig poolConfig = new JedisPoolConfig();
		poolConfig.setMaxIdle(10);
		poolConfig.setMaxTotal(100);
		poolConfig.setMaxWaitMillis(10000);
		poolConfig.setTestOnBorrow(true);
		jedisPool = new JedisPool(poolConfig, host, port);
	}

	/**
	 * Reads a range of elements from the list stored at {@code key}.
	 *
	 * @param key   list key
	 * @param start index of the first element
	 * @param end   index of the last element
	 * @return the elements in the requested range
	 */
	public List<String> lrange(String key,int start,int end){
		Jedis resource = jedisPool.getResource();
		try {
			return resource.lrange(key, start, end);
		} finally {
			// Return the connection even on failure so the pool is not drained.
			jedisPool.returnResourceObject(resource);
		}
	}

	/**
	 * Pushes a URL onto the left end of the list stored at {@code lowKey}.
	 *
	 * @param lowKey list key
	 * @param url    value to push
	 */
	public void add(String lowKey, String url) {
		Jedis resource = jedisPool.getResource();
		try {
			resource.lpush(lowKey, url);
		} finally {
			jedisPool.returnResourceObject(resource);
		}
	}

	/**
	 * Pops a value from the right end of the list stored at {@code key}.
	 *
	 * @param key list key
	 * @return the value returned by the Redis RPOP command
	 */
	public String poll(String key) {
		Jedis resource = jedisPool.getResource();
		try {
			return resource.rpop(key);
		} finally {
			jedisPool.returnResourceObject(resource);
		}
	}
}

6、把数据存储到hbase中,hbase存储的是商品的详细信息

/**
 * Convenience wrapper around the HBase client for the crawler's table:
 * table administration, single-row put/delete and diagnostic scans that
 * print to standard output.
 *
 * Every table handle and scanner opened by a method is closed before the
 * method returns. Strings are converted to and from bytes with
 * {@code Bytes} so reads use the same encoding as {@link #put}.
 */
public class HbaseUtils {

	/**
	 * HBase table name.
	 */
	public static final String TABLE_NAME = "spider";
	/**
	 * Column family 1: goods information.
	 */
	public static final String COLUMNFAMILY_1 = "goodsinfo";
	/**
	 * Columns of column family 1.
	 */
	public static final String COLUMNFAMILY_1_DATA_URL = "data_url";
	public static final String COLUMNFAMILY_1_PIC_URL = "pic_url";
	public static final String COLUMNFAMILY_1_TITLE = "title";
	public static final String COLUMNFAMILY_1_PRICE = "price";
	/**
	 * Column family 2: goods specification.
	 */
	public static final String COLUMNFAMILY_2 = "spec";
	public static final String COLUMNFAMILY_2_PARAM = "param";


	HBaseAdmin admin=null;
	Configuration conf=null;

	/**
	 * Builds the client configuration and opens the admin connection.
	 * If the admin connection cannot be opened the error is printed and
	 * {@code admin} stays {@code null}.
	 */
	public HbaseUtils(){
		conf = new Configuration();
		conf.set("hbase.zookeeper.quorum", "192.168.56.30:2181");
		conf.set("hbase.rootdir", "hdfs://192.168.56.30:9000/hbase");
		try {
			admin = new HBaseAdmin(conf);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Manual smoke test: lists all table names.
	 */
	public static void main(String[] args) throws Exception {
		HbaseUtils hbase = new HbaseUtils();
		hbase.getALLTable();
	}

	/**
	 * Scans a table with a row filter and prints the matching row keys.
	 * The filter uses {@code NOT_EQUAL}, so rows whose key does NOT match
	 * the regular expression are printed.
	 *
	 * @param tableName table to scan
	 * @param reg       regular expression compared against each row key
	 * @throws Exception if the table cannot be opened or scanned
	 */
	public void getRowFilter(String tableName, String reg) throws Exception {
		HTable hTable = new HTable(conf, tableName);
		try {
			Scan scan = new Scan();
			scan.setFilter(new RowFilter(CompareOp.NOT_EQUAL, new RegexStringComparator(reg)));
			ResultScanner scanner = hTable.getScanner(scan);
			try {
				for (Result result : scanner) {
					System.out.println(Bytes.toString(result.getRow()));
				}
			} finally {
				scanner.close();
			}
		} finally {
			hTable.close();
		}
	}

	/**
	 * Scans a single column of a table and prints every key/value found.
	 *
	 * @param tableName table to scan
	 * @param family    column family
	 * @param qualifier column qualifier
	 * @throws Exception if the table cannot be opened or scanned
	 */
	public void getScanData(String tableName, String family, String qualifier) throws Exception {
		HTable hTable = new HTable(conf, tableName);
		try {
			Scan scan = new Scan();
			scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
			printScan(hTable, scan, tableName);
		} finally {
			hTable.close();
		}
	}

	/**
	 * Disables and deletes a table if it exists; prints the outcome.
	 *
	 * @param tableName table to delete
	 */
	private void deleteTable(String tableName) {
		try {
			if (admin.tableExists(tableName)) {
				admin.disableTable(tableName);
				admin.deleteTable(tableName);
				System.out.println(tableName+"表删除成功!");
			}
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println(tableName+"表删除失败!");
		}

	}

	/**
	 * Deletes one row; prints the outcome instead of throwing.
	 *
	 * @param tableName table holding the row
	 * @param rowKey    key of the row to delete
	 */
	public void deleteOneRecord(String tableName, String rowKey) {
		HTable table = null;
		try {
			table = new HTable(conf, tableName);
			table.delete(new Delete(Bytes.toBytes(rowKey)));
			System.out.println(rowKey+"记录删除成功!");
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println(rowKey+"记录删除失败!");
		} finally {
			closeQuietly(table);
		}
	}

	/**
	 * Scans a whole table and prints every key/value found. I/O errors are
	 * printed, not thrown.
	 *
	 * @param tableName table to scan
	 */
	public void getALLData(String tableName) {
		HTable hTable = null;
		try {
			hTable = new HTable(conf, tableName);
			printScan(hTable, new Scan(), tableName);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(hTable);
		}

	}

	/**
	 * Writes one cell.
	 *
	 * @param tableName    target table
	 * @param row          row key
	 * @param columnFamily column family
	 * @param column       column qualifier
	 * @param data         cell value
	 * @throws IOException if the table cannot be opened or the write fails
	 */
	public  void put(String tableName, String row, String columnFamily,
			String column, String data) throws IOException {
		HTable table = new HTable(conf, tableName);
		try {
			Put p1 = new Put(Bytes.toBytes(row));
			p1.add(Bytes.toBytes(columnFamily), Bytes.toBytes(column),
					Bytes.toBytes(data));
			table.put(p1);
		} finally {
			table.close();
		}
		System.out.println("put'" + row + "'," + columnFamily + ":" + column
				+ "','" + data + "'");
	}


	/**
	 * Lists the names of all tables and prints each one.
	 *
	 * @return the table names; empty when the admin connection is unavailable
	 * @throws Exception if the listing fails
	 */
	public List<String> getALLTable() throws Exception {
		ArrayList<String> tables = new ArrayList<String>();
		if(admin!=null){
			HTableDescriptor[] listTables = admin.listTables();
			if (listTables.length>0) {
				for (HTableDescriptor tableDesc : listTables) {
					tables.add(tableDesc.getNameAsString());
					System.out.println(tableDesc.getNameAsString());
				}
			}
		}
		return tables;
	}

	/**
	 * Creates a table with a single column family unless it already exists.
	 *
	 * @param tableName name of the table to create
	 * @param column    name of the column family
	 * @throws Exception if the existence check or the creation fails
	 */
	public void createTable(String tableName, String column) throws Exception {
		if(admin.tableExists(tableName)){
			System.out.println(tableName+"表已经存在!");
		}else{
			HTableDescriptor tableDesc = new HTableDescriptor(tableName);
			tableDesc.addFamily(new HColumnDescriptor(Bytes.toBytes(column)));
			admin.createTable(tableDesc);
			System.out.println(tableName+"表创建成功!");
		}
	}

	/**
	 * Runs the scan and prints each cell as key TAB value; the scanner is
	 * always closed. The caller keeps ownership of the table handle.
	 */
	private void printScan(HTable hTable, Scan scan, String tableName) throws IOException {
		ResultScanner scanner = hTable.getScanner(scan);
		try {
			for (Result result : scanner) {
				if(result.raw().length==0){
					System.out.println(tableName+" 表数据为空!");
				}else{
					for (KeyValue kv: result.raw()){
						System.out.println(Bytes.toString(kv.getKey())+"\t"+Bytes.toString(kv.getValue()));
					}
				}
			}
		} finally {
			scanner.close();
		}
	}

	/**
	 * Closes a table handle, printing (not throwing) any close failure.
	 * Accepts {@code null} so it can be used from a finally block.
	 */
	private void closeQuietly(HTable table) {
		if(table==null){
			return;
		}
		try {
			table.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

7、使用solr根据redis中的索引进行查询,查询到索引后再到hbase中查询商品详细信息,最后在web界面展示出来

在这里插入图片描述
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/qq_41919792/article/details/106932306