之前做过一些网站的数据抓取功能,不过都是手动触发抓取;这次做的是基于定时任务的数据爬取:每天7点到23点,后台每逢整点自动执行一次,先清空数据表中的原有数据,再将新爬取的数据写入数据库.
1.数据库建表
-- News items collected by the crawler: one row per scraped headline.
CREATE TABLE news (
    id         INT(10)      NOT NULL AUTO_INCREMENT,      -- surrogate key
    title      VARCHAR(200) NOT NULL,                     -- headline text
    url        VARCHAR(150) NOT NULL,                     -- link to the article
    imageUrl   VARCHAR(150) DEFAULT NULL,                 -- optional thumbnail address
    describes  VARCHAR(500) DEFAULT NULL,                 -- optional summary text
    readCount  INT(10)      DEFAULT 0,                    -- starts at zero
    createTime TIMESTAMP    NULL DEFAULT CURRENT_TIMESTAMP, -- filled in by the database on insert
    PRIMARY KEY (id)
);
2.配置Task环境
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring configuration: MVC controllers, JSP view resolution and @Scheduled task support -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:mvc="http://www.springframework.org/schema/mvc"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
http://www.springframework.org/schema/mvc
http://www.springframework.org/schema/mvc/spring-mvc-4.2.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.2.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task-4.2.xsd">
<!-- Component-scan this package; Spring MVC registers the classes annotated with @Controller found here as controllers -->
<context:component-scan base-package="com.ytdx.action"/>
<!-- Enable the default annotation-driven Spring MVC configuration -->
<mvc:annotation-driven/>
<!-- Let the container's default servlet respond to requests for static files -->
<mvc:default-servlet-handler/>
<!-- View resolver: a logical view name "x" is resolved to /jsp/x.jsp -->
<bean id="viewResolver"
class="org.springframework.web.servlet.view.InternalResourceViewResolver">
<!-- Prefix prepended to the view name -->
<property name="prefix">
<value>/jsp/</value>
</property>
<!-- Suffix appended to the view name -->
<property name="suffix">
<value>.jsp</value>
</property>
</bean>
<!-- Scheduled-task support: turn on annotation-based configuration -->
<context:annotation-config></context:annotation-config>
<!-- Component-scan the package that holds the scheduled task classes -->
<context:component-scan base-package="com.ytdx.task">
</context:component-scan>
<!-- Required so that Spring recognizes @Scheduled annotations; tasks run on the scheduler declared below -->
<task:annotation-driven scheduler="qbScheduler" mode="proxy"/>
<!-- Scheduler referenced above, configured with a pool size of 10 -->
<task:scheduler id="qbScheduler" pool-size="10"/>
</beans>
3.代码
package com.ytdx.dao;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Update;
import com.ytdx.entity.News;
/**
 * MyBatis mapper for the {@code news} table, defined entirely through SQL annotations.
 */
public interface NewsDao {
/**
 * Inserts one row into {@code news}.
 *
 * <p>Only {@code title}, {@code url}, {@code imageUrl} and {@code describes} are bound from
 * the given object; the statement sets no other column.
 *
 * @param news the item whose properties are bound to the insert statement
 */
@Insert("insert into news(title,url,imageUrl,describes) values(#{title},#{url},#{imageUrl},#{describes})")
void AddNews(News news);
/**
 * Removes every row from {@code news} by issuing {@code TRUNCATE TABLE}.
 *
 * <p>NOTE(review): on MySQL, TRUNCATE is a DDL statement that commits implicitly and cannot
 * be rolled back, so it would not take part in a surrounding transaction. Confirm the target
 * database and whether that matters to callers.
 */
@Update("TRUNCATE TABLE news")
void truncateNews();
}
package com.ytdx.service;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.ytdx.dao.NewsDao;
import com.ytdx.entity.News;
/**
 * Service bean (registered as {@code "newsBiz"}) that exposes the news persistence
 * operations by delegating each call to {@link NewsDao}.
 */
@Service("newsBiz")
public class NewsBiz {
// Injected by Spring; every method below forwards to this mapper.
@Autowired
private NewsDao newsDao;
/**
 * Stores a single news item by forwarding it to {@link NewsDao#AddNews(News)}.
 *
 * @param news the item to persist
 */
public void AddNews(News news){
newsDao.AddNews(news);
}
/**
 * Clears the stored news by forwarding to {@link NewsDao#truncateNews()}.
 */
public void truncateNews(){
newsDao.truncateNews();
}
}
package com.ytdx.task;
import com.ytdx.entity.News;
import com.ytdx.service.NewsBiz;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled job that scrapes the Sina tech news list page and replaces the stored news
 * with the freshly scraped items.
 *
 * <p>The page is downloaded and parsed completely before the database is touched, so a
 * network failure, a changed page layout or a malformed list entry can no longer leave the
 * news table empty or half-filled.
 */
@Component("NewsJob")
public class NewsTask {
    /** Page that carries the news list. */
    private static final String NEWS_PAGE_URL = "http://tech.sina.com.cn/it/";

    /** CSS class of the element whose children are the individual news entries. */
    private static final String NEWS_LIST_CLASS = "seo_data_list";

    @Autowired
    @Qualifier("newsBiz")
    private NewsBiz newsBiz;

    /**
     * Scheduler entry point: fires at minute 0, second 0 of every hour from 07:00 to 23:00
     * and refreshes the stored news.
     *
     * @throws Exception if downloading, parsing or storing the news fails; in that case the
     *     success message is not printed
     */
    @Scheduled(cron = "0 0 7-23 * * ?")
    public void TimeTask() throws Exception {
        AddNews();
        System.out.println("系统抓取新闻数据成功!");
    }

    /**
     * Downloads the news page, extracts all news entries and then swaps them into storage:
     * the existing records are cleared and every parsed entry is inserted.
     *
     * <p>Storage is only modified after at least one entry has been parsed successfully.
     *
     * @throws IllegalStateException if the page has no news list element or no usable entry
     * @throws Exception if the page cannot be fetched or a storage call fails
     */
    public void AddNews() throws Exception {
        Document page = Jsoup.connect(NEWS_PAGE_URL).get();
        Element newsList = page.getElementsByClass(NEWS_LIST_CLASS).first();
        if (newsList == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "No element with class '" + NEWS_LIST_CLASS + "' found at " + NEWS_PAGE_URL);
        }

        // Parse everything first; nothing is deleted until we know we have replacements.
        List<News> scraped = new ArrayList<News>();
        for (Element entry : newsList.children()) {
            News news = parseEntry(entry);
            if (news != null) {
                scraped.add(news);
            }
        }
        if (scraped.isEmpty()) {
            throw new IllegalStateException("No news entries could be parsed from " + NEWS_PAGE_URL);
        }

        newsBiz.truncateNews();
        for (News news : scraped) {
            newsBiz.AddNews(news);
        }
    }

    /**
     * Converts one list entry into a {@link News} object.
     *
     * <p>The first link supplies title and address, the image inside the second link supplies
     * the picture address, and the third link supplies the description. When the second or
     * third link is absent the corresponding value is the empty string, which is also what
     * jsoup yields for a missing attribute.
     *
     * @param entry one child element of the news list
     * @return the parsed item, or {@code null} when the entry contains no link at all
     */
    private News parseEntry(Element entry) {
        Elements anchors = entry.getElementsByTag("a");
        if (anchors.isEmpty()) {
            return null;
        }
        Element headline = anchors.first();
        String imageUrl = anchors.size() > 1
                ? anchors.get(1).getElementsByTag("img").attr("src")
                : "";
        String describe = anchors.size() > 2 ? anchors.get(2).text() : "";

        News news = new News();
        news.setTitle(headline.text());
        news.setUrl(headline.attr("href"));
        news.setImageUrl(imageUrl);
        news.setDescribes(describe);
        return news;
    }
}
4.结果
5.接下来只需在前端读取这些数据,即可实现一个简单的即时新闻客户端,这里不再展开.