SSM整点定时抓取新浪网IT新闻数据

     之前做过一些网站的数据抓取功能,不过都是手动触发抓取;这次实现的是基于定时任务的数据爬取:每天7点到23点的每个整点,后台自动先清空数据表中的原有数据,再将新爬取的数据写入数据库。

1.数据库建表

-- Storage for the scraped news entries.
-- The character set is declared explicitly because the stored titles and
-- descriptions are Chinese text; without it the table silently inherits the
-- schema default, which may not be able to represent them.
CREATE TABLE news (
  id int(10) NOT NULL AUTO_INCREMENT PRIMARY KEY,     -- surrogate key
  title varchar(200) NOT NULL,                        -- headline text
  url varchar(150) NOT NULL,                          -- link to the article
  imageUrl varchar(150) DEFAULT NULL,                 -- thumbnail address, optional
  describes varchar(500) DEFAULT NULL,                -- short summary, optional
  readCount int(10) DEFAULT 0,                        -- starts at zero; not set by the insert statement
  createTime timestamp NULL DEFAULT CURRENT_TIMESTAMP -- set by the database at insert time
) DEFAULT CHARSET=utf8mb4;

2.配置Task环境

<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring context definition: MVC setup, JSP view resolution and annotation-driven scheduling. -->
<beans xmlns="http://www.springframework.org/schema/beans"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:context="http://www.springframework.org/schema/context"
    xmlns:mvc="http://www.springframework.org/schema/mvc"
    xmlns:task="http://www.springframework.org/schema/task"
    xsi:schemaLocation="
        http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
        http://www.springframework.org/schema/context
        http://www.springframework.org/schema/context/spring-context-4.2.xsd
        http://www.springframework.org/schema/mvc
        http://www.springframework.org/schema/mvc/spring-mvc-4.2.xsd
        http://www.springframework.org/schema/task
        http://www.springframework.org/schema/task/spring-task-4.2.xsd">

    <!-- Scan com.ytdx.action so that annotated classes there (the controllers) are registered as beans. -->
    <context:component-scan base-package="com.ytdx.action"/>

    <!-- Enable the default annotation-driven Spring MVC configuration. -->
    <mvc:annotation-driven/>

    <!-- Let the container's default servlet answer requests for static files. -->
    <mvc:default-servlet-handler/>

    <!-- Resolve a logical view name to /jsp/<name>.jsp. -->
    <bean id="viewResolver"
          class="org.springframework.web.servlet.view.InternalResourceViewResolver">
        <property name="prefix" value="/jsp/"/>
        <property name="suffix" value=".jsp"/>
    </bean>

    <!-- Scan com.ytdx.task for the scheduled job component. Component scanning already
         activates annotation processing such as @Autowired, so a separate
         annotation-config element is not required. -->
    <context:component-scan base-package="com.ytdx.task"/>

    <!-- Required for Spring to honour @Scheduled; jobs run on the scheduler declared below. -->
    <task:annotation-driven scheduler="qbScheduler" mode="proxy"/>

    <!-- Scheduler backing the @Scheduled methods, with a pool of 10 threads. -->
    <task:scheduler id="qbScheduler" pool-size="10"/>
</beans>

3.代码

package com.ytdx.dao;

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Update;

import com.ytdx.entity.News;

/**
 * MyBatis annotation-based mapper for the {@code news} table.
 */
public interface NewsDao {
	/**
	 * Inserts one news row using the title, url, imageUrl and describes
	 * properties of the given object. No other columns are named in the
	 * statement, so they are left to whatever the table definition provides.
	 *
	 * @param news the entry whose properties are bound into the insert statement
	 */
	@Insert("insert into news(title,url,imageUrl,describes) values(#{title},#{url},#{imageUrl},#{describes})")
	void AddNews(News news);

	/**
	 * Empties the {@code news} table by issuing {@code TRUNCATE TABLE}.
	 */
	@Update("TRUNCATE TABLE news")
	void truncateNews();
}
package com.ytdx.service;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.ytdx.dao.NewsDao;
import com.ytdx.entity.News;

/**
 * Service bean (registered as {@code newsBiz}) that forwards news persistence
 * calls to {@link NewsDao} without adding logic of its own.
 */
@Service("newsBiz")
public class NewsBiz {
	/** Mapper injected by Spring; every operation below delegates to it. */
	@Autowired
	private NewsDao newsDao;

	/**
	 * Clears the news table by delegating to {@link NewsDao#truncateNews()}.
	 */
	public void truncateNews(){
		this.newsDao.truncateNews();
	}

	/**
	 * Persists a single news entry by delegating to {@link NewsDao#AddNews(News)}.
	 *
	 * @param news the entry to store
	 */
	public void AddNews(News news){
		this.newsDao.AddNews(news);
	}
}
package com.ytdx.task;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import com.ytdx.entity.News;
import com.ytdx.service.NewsBiz;

/**
 * Scheduled job that scrapes the Sina IT news listing page and replaces the
 * contents of the news table with the freshly parsed entries.
 *
 * <p>The page is downloaded and parsed completely before the table is
 * touched, so a network failure or a change in the page layout leaves the
 * previously stored rows in place instead of an empty or half-filled table.
 */
@Component("NewsJob")
public class NewsTask {
	/** Listing page that is scraped. */
	private static final String NEWS_URL = "http://tech.sina.com.cn/it/";

	/** CSS class of the element whose children are the individual news entries. */
	private static final String LIST_CLASS = "seo_data_list";

	@Autowired
	@Qualifier("newsBiz")
	private NewsBiz newsBiz;

	/**
	 * Scheduler entry point: fires on the hour, every hour from 07:00 to 23:00.
	 *
	 * @throws Exception if downloading, parsing or storing the news fails
	 */
	@Scheduled(cron = "0 0 7-23 * * ?")
	public void TimeTask() throws Exception{
		AddNews();
		System.out.println("系统抓取新闻数据成功!");
	}

	/**
	 * Scrapes the listing page and swaps the stored news for the new entries.
	 *
	 * @throws IllegalStateException if no entry could be parsed; the table is
	 *         left untouched in that case
	 * @throws Exception if the page cannot be downloaded or a row cannot be stored
	 */
	public void AddNews() throws Exception{
		List<News> fresh = fetchNews();
		if (fresh.isEmpty()) {
			throw new IllegalStateException(
					"No news entries could be parsed from " + NEWS_URL + "; existing rows were kept");
		}

		// Only clear the old rows once replacement data is known to exist.
		newsBiz.truncateNews();
		for (News news : fresh) {
			newsBiz.AddNews(news);
		}
	}

	/** Downloads the listing page and converts every usable entry into a {@link News}. */
	private List<News> fetchNews() throws Exception {
		Document doc = Jsoup.connect(NEWS_URL).get();
		List<News> parsed = new ArrayList<News>();

		Element container = doc.getElementsByClass(LIST_CLASS).first();
		if (container == null) {
			// The expected container is not on the page; the caller treats an empty result as a failure.
			return parsed;
		}

		for (Element entry : container.children()) {
			News news = parseEntry(entry);
			if (news != null) {
				parsed.add(news);
			}
		}
		return parsed;
	}

	/** Builds a {@link News} from one list entry, or returns null when it has no usable link. */
	private News parseEntry(Element entry) {
		Elements links = entry.getElementsByTag("a");
		if (links.isEmpty()) {
			return null;
		}

		// The first anchor carries both the headline text and the article address.
		Element headline = links.first();
		String url = headline.attr("href");
		if (url.isEmpty()) {
			return null;
		}

		// The second anchor wraps the thumbnail and the third holds the summary;
		// either may be absent, in which case an empty string is stored.
		String imageUrl = links.size() > 1 ? links.get(1).getElementsByTag("img").attr("src") : "";
		String describe = links.size() > 2 ? links.get(2).text() : "";

		News news = new News();
		news.setTitle(headline.text());
		news.setUrl(url);
		news.setImageUrl(imageUrl);
		news.setDescribes(describe);
		return news;
	}
}

4.结果


5.之后在前端读取这些数据,即可实现一个即时的新闻客户端,这里不再展开。



猜你喜欢

转载自blog.csdn.net/linhaiyun_ytdx/article/details/80146642
今日推荐