一、开发环境
(一)、开发环境
Windows+JDK1.8+Eclipse+linux+mysql+maven
(二)、软件架构
Hadoop,Zookeeper,hbase,redis,solr,echarts
(三)、开发时间
2019.7
二、项目思路
(一)、思路:
1、通过对网站上数据爬取后,
2、对网页解析,
3、把解析出的数据存储到数据库中,
4、然后对数据分析处理,
最后将数据展现到web界面,这对提高客户购物效率和客户满意度能够起到较好的作用。
(二)、实现:
获取页面内容,下载网页,网页解析,对数据分析处理,web展现
1、使用Maven进行项目构建,实现前、后台项目的统一管理。
2、获取网页内容
3、使用HttpClient下载网页;
4、网页解析:HtmlCleaner把商品标题,图片,价格,规格参数等解析出来;
解析的时候不仅要解析当前页,也要解析下一页
5、把解析到的商品数据存储到hbase、索引数据存储到redis数据库中
7、使用solr查询出来,在web界面展现
三、具体实现
(一)、使用Maven进行项目构建
1、使用Maven进行项目构建,
实现前、后台项目的统一管理。
(二)、获取网页内容和打印页面下载日志
2、获取网页内容和打印页面下载日志
public class PageUtils {
private static Logger logger = LoggerFactory.getLogger(PageUtils.class);
/**
 * Downloads the body of the given URL and returns it as a string.
 *
 * <p>The request is sent with a desktop Chrome User-Agent. Both the HTTP
 * client and the response are closed before returning, so no connection is
 * leaked regardless of whether the download succeeds.
 *
 * @param url address of the page to download
 * @return the response body, or an empty string if the download failed
 */
public static String getContent(String url){
    String content = "";
    HttpClientBuilder builder = HttpClients.custom();
    builder.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36");
    HttpGet request = new HttpGet(url);
    try (CloseableHttpClient client = builder.build()) {
        long startTime = System.currentTimeMillis();
        try (CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            content = EntityUtils.toString(entity);
            logger.info("页面下载成功:{},消耗时间:{}",url,System.currentTimeMillis()-startTime);
        }
    } catch (Exception e) {
        // Best effort: a failed download is reported and yields "".
        // Passing the exception as the last argument makes SLF4J log the stack trace.
        logger.error("页面下载失败:{}",url,e);
    }
    return content;
}
(三)、下载网页HttpClient
3、下载网页HttpClient
/**
 * Fetches the given URL and wraps the result in a {@code Page}.
 *
 * @param url address of the page to download
 * @return a new page holding the downloaded content and the source URL
 */
public Page download(String url) {
    final Page result = new Page();
    // The content is fetched first, then both properties are set in the original order.
    result.setContent(PageUtils.getContent(url));
    result.setUrl(url);
    return result;
}
(四)、网页解析:HtmlCleaner
1、把商品标题,图片,价格,规格参数等解析出来;
4、网页解析:HtmlCleaner把商品标题,图片,价格,规格参数等解析出来;
public class JdProcess implements Processable {
/**
 * Parses a downloaded page.
 *
 * <p>For a list page (URL starting with {@code http://list.jd.com/list.html})
 * the next-page link and every product link found on the page are added to
 * the page's URL collection. Any other page is handed to
 * {@link #parseProduct(Page, TagNode)}.
 *
 * @param page the downloaded page; its content and URL are read
 */
@Override
public void process(Page page) {
    String content = page.getContent();
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode rootNode = htmlCleaner.clean(content);
    if (page.getUrl().startsWith("http://list.jd.com/list.html")) {
        String nexturl = HtmlUtils.getAttributeByName(rootNode, "href", "//*[@id=\"J_topPage\"]/a[2]");
        // Only enqueue a next page when a link was actually found; otherwise
        // the bare host would be queued (or a null href would raise an NPE).
        if (nexturl != null && !nexturl.isEmpty()) {
            // The href may carry HTML-escaped ampersands; decode them so the
            // query string is usable as a URL.
            page.addUrl("http://list.jd.com" + nexturl.replace("&amp;", "&"));
        }
        try {
            Object[] productLinks = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a");
            for (Object object : productLinks) {
                TagNode tagNode = (TagNode) object;
                String href = tagNode.getAttributeByName("href");
                // Skip anchors without an href instead of queueing null.
                if (href != null) {
                    page.addUrl(href);
                }
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    } else {
        parseProduct(page, rootNode);
    }
}
/**
 * Parses the product detail data of a product page and stores it on the page.
 *
 * <p>Fields written: {@code title}, {@code picurl}, {@code price} (only when
 * the price lookup returned data) and {@code spec} (a JSON array of
 * {@code {name, value}} objects). The goods id is taken from the page URL and
 * stored with a {@code jd_} prefix.
 *
 * @param page     the page being processed; receives the parsed fields
 * @param rootNode root of the cleaned HTML tree of the page
 */
public void parseProduct(Page page, TagNode rootNode) {
    try {
        // Title
        String title = HtmlUtils.getText(rootNode, "//*[@id=\"name\"]/h1");
        page.addField("title", title);
        // Picture URL
        String picurl = HtmlUtils.getAttributeByName(rootNode, "src", "//*[@id=\"spec-n1\"]/img");
        page.addField("picurl", picurl);

        // Goods id: the numeric part of the item URL (dots escaped so they match literally).
        String url = page.getUrl();
        Pattern itemUrlPattern = Pattern.compile("http://item\\.jd\\.com/([0-9]+)\\.html");
        Matcher matcher = itemUrlPattern.matcher(url);
        String goodsId = "";
        if (matcher.find()) {
            goodsId = matcher.group(1);
        }
        page.setGoodsid("jd_" + goodsId);

        // Price: fetched from the separate price endpoint rather than read from the page markup.
        // Skipped when there is no goods id, and tolerated when the download
        // failed (PageUtils.getContent returns "" in that case), so the rest
        // of the product is still parsed.
        if (!goodsId.isEmpty()) {
            String priceJson = PageUtils.getContent("http://p.3.cn/prices/get?skuid=J_" + goodsId);
            if (priceJson != null && !priceJson.trim().isEmpty()) {
                JSONArray jsonArray = new JSONArray(priceJson);
                if (jsonArray.length() > 0) {
                    JSONObject object = (JSONObject) jsonArray.get(0);
                    page.addField("price", object.getString("p"));
                }
            }
        }

        // Specification parameters: one JSON object per non-empty table row.
        Object[] rows = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
        JSONArray specjsonArray = new JSONArray();
        for (Object rowObject : rows) {
            TagNode row = (TagNode) rowObject;
            if ("".equals(row.getText().toString().trim())) {
                continue;
            }
            JSONObject jsonObject = new JSONObject();
            Object[] headerCells = row.evaluateXPath("/th");
            if (headerCells.length > 0) {
                // Header row: stored with an empty name and the header text as value.
                TagNode headerCell = (TagNode) headerCells[0];
                jsonObject.put("name", "");
                jsonObject.put("value", headerCell.getText().toString());
            } else {
                Object[] dataCells = row.evaluateXPath("/td");
                if (dataCells.length < 2) {
                    // Not a name/value row; skip it instead of failing on a missing cell.
                    continue;
                }
                TagNode nameCell = (TagNode) dataCells[0];
                TagNode valueCell = (TagNode) dataCells[1];
                jsonObject.put("name", nameCell.getText().toString());
                jsonObject.put("value", valueCell.getText().toString());
            }
            specjsonArray.put(jsonObject);
        }
        page.addField("spec", specjsonArray.toString());
    } catch (XPatherException e) {
        e.printStackTrace();
    }
}
2、解析当前页和下一页
/**
 * Small XPath helpers on top of HtmlCleaner's {@code TagNode}.
 */
public class HtmlUtils {
    /**
     * Returns the text of the first node matched by the XPath expression.
     *
     * @param rootNode node the expression is evaluated against
     * @param xpath    XPath expression selecting the wanted tag
     * @return the text of the first match, or an empty string when nothing
     *         matches or the expression cannot be evaluated
     */
    public static String getText(TagNode rootNode, String xpath) {
        TagNode match = firstMatch(rootNode, xpath);
        return match == null ? "" : match.getText().toString();
    }

    /**
     * Returns the value of an attribute of the first node matched by the
     * XPath expression.
     *
     * @param rootNode node the expression is evaluated against
     * @param attr     name of the attribute to read
     * @param xpath    XPath expression selecting the wanted tag
     * @return the attribute value, or an empty string when nothing matches,
     *         the expression cannot be evaluated, or the matched tag does
     *         not carry the attribute
     */
    public static String getAttributeByName(TagNode rootNode, String attr, String xpath) {
        TagNode match = firstMatch(rootNode, xpath);
        if (match == null) {
            return "";
        }
        // Normalise a missing attribute to "" so callers get the same
        // "not found" value whether the tag or only the attribute is absent.
        String value = match.getAttributeByName(attr);
        return value == null ? "" : value;
    }

    /** Evaluates the XPath and returns its first result, or null when there is none. */
    private static TagNode firstMatch(TagNode rootNode, String xpath) {
        try {
            Object[] matches = rootNode.evaluateXPath(xpath);
            if (matches.length > 0) {
                return (TagNode) matches[0];
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
        return null;
    }
}
3、把当前网页的商品和下一页的链接存储起来,再遍历出来,把解析的数据存储到优先级队列
/**
 * In-memory URL repository backed by two queues: URLs added through
 * {@link #addHigh(String)} are handed out before URLs added through
 * {@link #add(String)}.
 */
public class QueueRepository implements Repository {
    Queue<String> lowqueue = new ConcurrentLinkedQueue<>();
    Queue<String> highqueue = new ConcurrentLinkedQueue<>();

    /**
     * Takes the next URL, draining the high queue before the low queue.
     *
     * @return the next URL, or {@code null} when both queues are empty
     */
    @Override
    public String poll() {
        final String preferred = highqueue.poll();
        return preferred != null ? preferred : lowqueue.poll();
    }

    /** Appends a URL to the low queue. */
    @Override
    public void add(String nexturl) {
        lowqueue.add(nexturl);
    }

    /** Appends a URL to the high queue. */
    @Override
    public void addHigh(String nexturl) {
        highqueue.add(nexturl);
    }
}
5、存储到分布式数据库redis,redis存储的是商品的索引id
/**
 * Thin wrapper around a Jedis connection pool exposing the list operations
 * used by the crawler.
 *
 * <p>Every operation borrows a connection from the pool and hands it back in
 * a {@code finally} block, so a failing Redis command no longer leaks the
 * connection.
 */
public class RedisUtils {
    public static String start_url = "start_url";
    public static String heightkey = "spider.todo.height";
    public static String lowkey = "spider.todo.low";
    JedisPool jedisPool = null;

    /** Creates the connection pool for the Redis server at 192.168.56.30:6379. */
    public RedisUtils(){
        JedisPoolConfig poolConfig = new JedisPoolConfig();
        poolConfig.setMaxIdle(10);
        poolConfig.setMaxTotal(100);
        poolConfig.setMaxWaitMillis(10000);
        // Connections are validated when borrowed.
        poolConfig.setTestOnBorrow(true);
        jedisPool = new JedisPool(poolConfig, "192.168.56.30", 6379);
    }

    /**
     * Reads a range of a Redis list.
     *
     * @param key   list key
     * @param start start index passed to LRANGE
     * @param end   end index passed to LRANGE
     * @return the elements in the requested range
     */
    public List<String> lrange(String key,int start,int end){
        Jedis resource = jedisPool.getResource();
        try {
            return resource.lrange(key, start, end);
        } finally {
            jedisPool.returnResourceObject(resource);
        }
    }

    /**
     * Pushes a URL onto the head of the given list (LPUSH).
     *
     * @param lowKey list key
     * @param url    value to push
     */
    public void add(String lowKey, String url) {
        Jedis resource = jedisPool.getResource();
        try {
            resource.lpush(lowKey, url);
        } finally {
            jedisPool.returnResourceObject(resource);
        }
    }

    /**
     * Pops a value from the tail of the given list (RPOP).
     *
     * @param key list key
     * @return the value returned by RPOP
     */
    public String poll(String key) {
        Jedis resource = jedisPool.getResource();
        try {
            return resource.rpop(key);
        } finally {
            jedisPool.returnResourceObject(resource);
        }
    }
}
6、把数据存储到hbase中,hbase存储的是商品的详细信息
/**
 * Helper for the HBase table that stores the crawled goods.
 *
 * <p>Tables, scanners and table pools opened by a method are closed before
 * the method returns. Row keys and cell contents are converted with
 * {@code Bytes} (as {@link #put} already did) instead of the platform
 * default charset, so reads, deletes and writes agree on the encoding.
 */
public class HbaseUtils {
    /**
     * HBase table name.
     */
    public static final String TABLE_NAME = "spider";
    /**
     * Column family 1: goods information.
     */
    public static final String COLUMNFAMILY_1 = "goodsinfo";
    /**
     * Columns of column family 1.
     */
    public static final String COLUMNFAMILY_1_DATA_URL = "data_url";
    public static final String COLUMNFAMILY_1_PIC_URL = "pic_url";
    public static final String COLUMNFAMILY_1_TITLE = "title";
    public static final String COLUMNFAMILY_1_PRICE = "price";
    /**
     * Column family 2: goods specification.
     */
    public static final String COLUMNFAMILY_2 = "spec";
    public static final String COLUMNFAMILY_2_PARAM = "param";
    HBaseAdmin admin=null;
    Configuration conf=null;

    /**
     * Loads the configuration and creates the admin client.
     *
     * <p>If the admin client cannot be created the exception is printed and
     * {@code admin} stays {@code null}.
     */
    public HbaseUtils(){
        conf = new Configuration();
        conf.set("hbase.zookeeper.quorum", "192.168.56.30:2181");
        conf.set("hbase.rootdir", "hdfs://192.168.56.30:9000/hbase");
        try {
            admin = new HBaseAdmin(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        HbaseUtils hbase = new HbaseUtils();
        // Create a table
        // hbase.createTable("stu","cf");
        // List all table names
        hbase.getALLTable();
    }

    /**
     * Prints the row keys of all rows whose key does NOT match the regular
     * expression (the filter uses {@code CompareOp.NOT_EQUAL}).
     *
     * @param tableName table to scan
     * @param reg       regular expression applied to the row key
     * @throws Exception if the table cannot be opened or scanned
     */
    public void getRowFilter(String tableName, String reg) throws Exception {
        try (HTable hTable = new HTable(conf, tableName)) {
            Scan scan = new Scan();
            RowFilter rowFilter = new RowFilter(CompareOp.NOT_EQUAL, new RegexStringComparator(reg));
            scan.setFilter(rowFilter);
            try (ResultScanner scanner = hTable.getScanner(scan)) {
                for (Result result : scanner) {
                    System.out.println(Bytes.toString(result.getRow()));
                }
            }
        }
    }

    /**
     * Prints the key and value of every cell of one column.
     *
     * @param tableName table to scan
     * @param family    column family
     * @param qualifier column qualifier
     * @throws Exception if the table cannot be opened or scanned
     */
    public void getScanData(String tableName, String family, String qualifier) throws Exception {
        try (HTable hTable = new HTable(conf, tableName)) {
            Scan scan = new Scan();
            scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
            try (ResultScanner scanner = hTable.getScanner(scan)) {
                printResults(tableName, scanner);
            }
        }
    }

    /** Disables and deletes the table if it exists, printing the outcome. */
    private void deleteTable(String tableName) {
        try {
            if (admin.tableExists(tableName)) {
                admin.disableTable(tableName);
                admin.deleteTable(tableName);
                System.out.println(tableName+"表删除成功!");
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(tableName+"表删除失败!");
        }
    }

    /**
     * Deletes one record.
     *
     * @param tableName table holding the record
     * @param rowKey    row key of the record to delete
     */
    public void deleteOneRecord(String tableName, String rowKey) {
        try (HTablePool hTablePool = new HTablePool(conf, 1000);
             HTableInterface table = hTablePool.getTable(tableName)) {
            table.delete(new Delete(Bytes.toBytes(rowKey)));
            System.out.println(rowKey+"记录删除成功!");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(rowKey+"记录删除失败!");
        }
    }

    /**
     * Prints the key and value of every cell in the table.
     *
     * @param tableName table to scan
     */
    public void getALLData(String tableName) {
        try (HTable hTable = new HTable(conf, tableName);
             ResultScanner scanner = hTable.getScanner(new Scan())) {
            printResults(tableName, scanner);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints each cell of each scanned row as "key TAB value". */
    private void printResults(String tableName, ResultScanner scanner) {
        for (Result result : scanner) {
            KeyValue[] cells = result.raw();
            if (cells.length == 0) {
                System.out.println(tableName+" 表数据为空!");
            } else {
                for (KeyValue kv : cells) {
                    System.out.println(Bytes.toString(kv.getKey())+"\t"+Bytes.toString(kv.getValue()));
                }
            }
        }
    }

    // Read a single record (disabled)
    /*@SuppressWarnings({ "deprecation", "resource" })
    public Article get(String tableName, String row) {
    HTablePool hTablePool = new HTablePool(conf, 1000);
    HTableInterface table = hTablePool.getTable(tableName);
    Get get = new Get(row.getBytes());
    Article article = null;
    try {
    Result result = table.get(get);
    KeyValue[] raw = result.raw();
    if (raw.length == 4) {
    article = new Article();
    article.setId(row);
    article.setTitle(new String(raw[3].getValue()));
    article.setAuthor(new String(raw[0].getValue()));
    article.setContent(new String(raw[1].getValue()));
    article.setDescribe(new String(raw[2].getValue()));
    }
    } catch (IOException e) {
    e.printStackTrace();
    }
    return article;
    }*/

    /**
     * Adds one cell to a row.
     *
     * @param tableName    table to write to
     * @param row          row key
     * @param columnFamily column family
     * @param column       column qualifier
     * @param data         cell value
     * @throws IOException if the write or closing the table fails
     */
    public void put(String tableName, String row, String columnFamily,
            String column, String data) throws IOException {
        try (HTablePool hTablePool = new HTablePool(conf, 1000);
             HTableInterface table = hTablePool.getTable(tableName)) {
            Put p1 = new Put(Bytes.toBytes(row));
            p1.add(Bytes.toBytes(columnFamily), Bytes.toBytes(column),
                    Bytes.toBytes(data));
            table.put(p1);
            System.out.println("put'" + row + "'," + columnFamily + ":" + column
                    + "','" + data + "'");
        }
    }

    /**
     * Lists and prints all table names.
     *
     * @return the table names; empty when the admin client is unavailable
     * @throws Exception if the tables cannot be listed
     */
    public List<String> getALLTable() throws Exception {
        ArrayList<String> tables = new ArrayList<String>();
        if (admin != null) {
            for (HTableDescriptor tableDesc : admin.listTables()) {
                tables.add(tableDesc.getNameAsString());
                System.out.println(tableDesc.getNameAsString());
            }
        }
        return tables;
    }

    /**
     * Creates a table with a single column family unless it already exists.
     *
     * @param tableName name of the table to create
     * @param column    name of the column family
     * @throws Exception if the existence check or the creation fails
     */
    public void createTable(String tableName, String column) throws Exception {
        if (admin.tableExists(tableName)) {
            System.out.println(tableName+"表已经存在!");
        } else {
            HTableDescriptor tableDesc = new HTableDescriptor(tableName);
            tableDesc.addFamily(new HColumnDescriptor(Bytes.toBytes(column)));
            admin.createTable(tableDesc);
            System.out.println(tableName+"表创建成功!");
        }
    }
}
7、使用solr根据redis中的索引进行查询,查询到索引后再到数据库(hbase)中查询商品详细信息,最后在web界面展示出来