WebMagic入门实战:爬取CSDN博客,20行代码实现文章标题抓取
spiderimport java.util.List; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; public class CsdnSpider implements PageProcessor{ Site site=Site.me().setRetryTimes(5).setTimeOut(5000).setSleepTime(200).addHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"); @Override public void process(Page page) { List<String> list= page.getHtml().xpath("//div[@class='article-list']/div/@data-articleid").all(); if(!list.isEmpty()) { for (String string : list) { page.addTargetRequest("https://blog.csdn.net/qq_36783371/article/details/"+string); } } if(page.getRequest().getUrl().matches("https://blog\\.csdn\\.net/qq_36783371/article/details/\\d+")) { page.putField("title", page.getHtml().xpath("//h6[@class='title-article']/text()").toString()); //page.putField("text", page.getHtml().xpath("").toString()); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new CsdnSpider()).addUrl("https://blog.csdn.net/qq_36783371","https://blog.csdn.net/qq_36783371/article/list/2?").addPipeline(new CsdnPipline()).thread(5).runAsync(); } }
pipeline
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that prints the {@code "title"} field of each crawl result to
 * standard output.
 */
public class CsdnPipline implements Pipeline {

    /**
     * Prints the result's title, if it has one.
     *
     * <p>Results without a {@code "title"} field are ignored via an explicit
     * null check instead of relying on a swallowed exception.
     *
     * @param resultItems the fields extracted for one page
     * @param task        the crawl task that produced the result (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object title = resultItems.get("title");
        if (title != null) {
            System.out.println(title);
        }
    }
}
改造下变成刷访问量
import java.util.List; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; public class CsdnSpider implements PageProcessor{ Site site=Site.me().setRetryTimes(5).setTimeOut(5000).setSleepTime(200).addHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"); @Override public void process(Page page) { List<String> list= page.getHtml().xpath("//div[@class='article-list']/div/@data-articleid").all(); if(!list.isEmpty()) { for (String string : list) { page.addTargetRequest("https://blog.csdn.net/qq_36783371/article/details/"+string); } } if(page.getRequest().getUrl().matches("https://blog\\.csdn\\.net/qq_36783371/article/details/\\d+")) { //page.putField("title", page.getHtml().xpath("//h6[@class='title-article']/text()").toString()); //page.putField("text", page.getHtml().xpath("").toString()); } } @Override public Site getSite() { return site; } public static void main(String[] args) throws Exception { for (int i = 0; i < 100; i++) { Thread.sleep(5000); Spider.create(new CsdnSpider()).addUrl("https://blog.csdn.net/qq_36783371","https://blog.csdn.net/qq_36783371/article/list/2?").thread(5).runAsync(); } } }