Spring Boot Crawler Demo
Preface: There are many multi-threaded crawlers for scraping data from web pages. This post introduces two such crawler frameworks: crawler4j and WebMagic.
一、crawler4j
(1) Dependency
<!-- crawler4j crawler -->
<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.4.0</version>
</dependency>
(2) Implementation
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.http.Header;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png)$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        // Ignore the URL if it has an extension that matches our defined set of image extensions.
        if (IMAGE_EXTENSIONS.matcher(href).matches()) {
            return false;
        }
        // Only accept the URL if it is under the "blog.csdn.net" domain and the protocol is "https".
        //return href.startsWith("http://www.ics.uci.edu/");
        return href.startsWith("https://blog.csdn.net/");
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();
        logger.debug("Docid: {}", docid);
        logger.info("URL: {}", url);
        logger.debug("Domain: '{}'", domain);
        logger.debug("Sub-domain: '{}'", subDomain);
        logger.debug("Path: '{}'", path);
        logger.debug("Parent page: {}", parentUrl);
        logger.debug("Anchor text: {}", anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            logger.debug("Text length: {}", text.length());
            logger.debug("Text=========>{}", text);
            logger.debug("Html length: {}", html.length());
            logger.debug("Number of outgoing links: {}", links.size());
        }

        Header[] responseHeaders = page.getFetchResponseHeaders();
        if (responseHeaders != null) {
            logger.debug("Response headers:");
            for (Header header : responseHeaders) {
                logger.debug("\t{}: {}", header.getName(), header.getValue());
            }
        }
        logger.debug("=============");
    }
}
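A note on the two callbacks: shouldVisit is invoked before a discovered URL is scheduled, so it filters the crawl frontier, while visit runs only after a page has been fetched and parsed. The logger calls above resolve to the protected logger the class inherits from WebCrawler, which is why BasicCrawler compiles without declaring one.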
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class BasicCrawlController {

    private static final Logger logger = LoggerFactory.getLogger(BasicCrawlController.class);

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            logger.info("Needed parameters: ");
            logger.info("\t rootFolder (it will contain intermediate crawl data)");
            logger.info("\t numberOfCrawlers (number of concurrent threads)");
            return;
        }

        /*
         * crawlStorageFolder is a folder where intermediate crawl data is
         * stored.
         */
        String crawlStorageFolder = args[0];

        /*
         * numberOfCrawlers shows the number of concurrent threads that should
         * be initiated for crawling.
         */
        int numberOfCrawlers = Integer.parseInt(args[1]);

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Be polite: make sure that we don't send more than one request per
         * second (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);

        /*
         * You can set the maximum crawl depth here. The default value is -1
         * for unlimited depth.
         */
        config.setMaxDepthOfCrawling(2);

        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for an unlimited number of pages.
         */
        config.setMaxPagesToFetch(1000);

        /*
         * Do you want crawler4j to also crawl binary data? For example: the
         * contents of PDFs, or the metadata of images, etc.
         */
        config.setIncludeBinaryContentInCrawling(false);

        /*
         * Do you need to set a proxy? If so, you can use:
         * config.setProxyHost("proxyserver.example.com");
         * config.setProxyPort(8080);
         *
         * If your proxy also needs authentication:
         * config.setProxyUsername(username); config.setProxyPassword(password);
         */

        /*
         * This config parameter can be used to make your crawl resumable
         * (meaning that you can resume the crawl from a previously
         * interrupted/crashed crawl). Note: if you enable resuming and want
         * to start a fresh crawl, you need to delete the contents of
         * rootFolder manually.
         */
        config.setResumableCrawling(false);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs that are fetched, and then the crawler starts following links
         * found in these pages.
         */
        //controller.addSeed("http://www.ics.uci.edu/");
        //controller.addSeed("http://www.ics.uci.edu/~lopes/");
        //controller.addSeed("http://www.ics.uci.edu/~welling/");
        controller.addSeed("https://blog.csdn.net/zjm131421/article/details/13093869");

        /*
         * Start the crawl. This is a blocking operation, meaning that your
         * code will reach the line after this only when crawling is finished.
         */
        controller.start(BasicCrawler.class, numberOfCrawlers);
    }
}
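Since the post's title mentions Spring Boot, it is worth showing how this controller could be wired into an application instead of a standalone main. Below is a minimal sketch, assuming the BasicCrawler above is on the classpath; the class name CrawlerRunner, the storage path, and the thread count are illustrative, not part of the original demo.

import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

@Component
public class CrawlerRunner implements CommandLineRunner {

    @Override
    public void run(String... args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j"); // illustrative path, adjust as needed
        config.setPolitenessDelay(1000);
        config.setMaxDepthOfCrawling(2);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://blog.csdn.net/zjm131421/article/details/13093869");

        // startNonBlocking returns immediately instead of blocking like start(),
        // so a web application keeps serving requests while the crawl runs.
        controller.startNonBlocking(BasicCrawler.class, 5);
    }
}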
二、WebMagic
(1) Dependencies
<!-- WebMagic crawler -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
(2) Implementation
import java.util.concurrent.atomic.AtomicInteger;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    // AtomicInteger instead of a plain static int: process() is called
    // concurrently by the spider's worker threads.
    private static final AtomicInteger count = new AtomicInteger(0);

    @Override
    public void process(Page page) {
        // Check whether the link matches the pattern
        // http://www.cnblogs.com/<letters-digits->/p/<7 digits>.html
        //if (page.getUrl().toString().startsWith("https://blog.csdn.net/spencer_tseng/article/details/")) {
        //    // Queue the links that satisfy the condition
        //    page.addTargetRequests(page.getHtml().xpath("//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href").all());
        //} else {
        // Extract the content we need from the page
        System.out.println("Scraped content=======>" + page.getHtml().xpath("//div[@class='htmledit_views']/p/a/text()").all());
        count.incrementAndGet();
        //}
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        System.out.println("Starting crawl...");
        long startTime = System.currentTimeMillis();
        Spider.create(new GithubRepoPageProcessor())
              .addUrl("https://blog.csdn.net/spencer_tseng/article/details/79106266")
              .thread(5)
              .run();
        long endTime = System.currentTimeMillis();
        System.out.println("Crawl finished in about " + ((endTime - startTime) / 1000)
                + " seconds; scraped " + count.get() + " records");
    }
}
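Printing from process() is fine for a demo, but WebMagic's idiomatic way to hand off results is a Pipeline: process() stores values with page.putField(), and the spider passes them to each registered pipeline. Below is a minimal sketch, assuming process() were changed to call page.putField("links", page.getHtml().xpath(...).all()); the key name "links" and the class name ConsoleCountPipeline are illustrative.

import java.util.List;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class ConsoleCountPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // "links" is whatever key process() used with page.putField()
        List<String> links = resultItems.get("links");
        if (links != null) {
            System.out.println("Got " + links.size() + " items from "
                    + resultItems.getRequest().getUrl());
        }
    }
}

Register it when building the spider, e.g. Spider.create(new GithubRepoPageProcessor()).addUrl(...).addPipeline(new ConsoleCountPipeline()).thread(5).run(). This keeps extraction (the processor) separate from output (the pipeline), so swapping console printing for a database writer touches only one class.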