Java爬虫--爬取xx娱乐网数据,让你的程序不单薄(一)

本人是一名程序小白,此次程序可能有许多小问题,望各位大神勿喷,仅供学习参考!

功能:用Java语言来爬取xx娱乐网的文章图片、文章标题、文章作者,以及文章大致内容,功能基本实现,没有GUI界面,通过eclipse控制台可以看到结果。

使用工具:eclipse,mysql,navicat(轻量级mysql可视化界面),都是程序员的必备品。

编码语言:Java(编码格式一定要是UTF-8哦)

过程:

1.新建Java项目(名字不重要,只需要一个英文名,首字母一定要大写哦)

2.在src下新建package(结构如下)


3.整体结构:


4.详细解说:

  • com.dyb.dao包下新建一个Interface,名为XiaoGaoDao,内容为:
package com.dyb.dao;

import java.util.List;
import com.dyb.po.XiaoGao;

/**
 * Data-access contract for scraped article records.
 */
public interface XiaoGaoDao {

	/**
	 * Stores one article record.
	 *
	 * @param xiaoGao the record to store
	 * @return the value reported by the implementation for the insert
	 */
	int insertXGInfo(XiaoGao xiaoGao);

	/**
	 * Counts the stored records that carry the given title.
	 *
	 * @param title the article title to look up
	 * @return the number of matching records
	 */
	int isExistXGInfo(String title);

	/**
	 * Counts all stored records.
	 *
	 * @return the total number of records
	 */
	int selectGeShu();

	/**
	 * Loads every stored record.
	 *
	 * @return the stored records
	 */
	List<XiaoGao> selectInfo();
}
  • com.dyb.dao.impl包下新建一个class,名为XiaoGaoDaoImpl,内容为:
package com.dyb.dao.impl;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.apache.commons.dbutils.handlers.ScalarHandler;
import com.dyb.dao.XiaoGaoDao;
import com.dyb.po.XiaoGao;
import com.dyb.util.JdbcUtils;

public class XiaoGaoDaoImpl implements XiaoGaoDao{

	//往数据库插入数据
	public int insertXGInfo(XiaoGao xiaoGao) {
		int result=0;
                //sql语句
                String sql="INSERT INTO xiaogao VALUES(?,?,?,?,?,?)";
                //sql语句中?的内容
                 Object[] params= {xiaoGao.getId(),xiaoGao.getImgUrl(),xiaoGao.getHref(),xiaoGao.getTitle(),xiaoGao.getAuthor(),xiaoGao.getInfo()};
		try {
			QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
			result=queryRunner.update(sql, params);
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return result;
	}
	//判断内容是否重复
	public int isExistXGInfo(String title) {
		int isResluts=0;
		String sql="SELECT count(*) FROM xiaogao where title=?";
		QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource());
		try {
			Object obj=queryRunner.query(sql, new ScalarHandler<>(1), title);
			Number n = (Number)obj; 
			isResluts=n.intValue();
		} catch (SQLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return isResluts;
	}
	//查询数据库内容多少
	public int selectGeShu() {
		int isResluts=0;
		String sql="SELECT count(*) FROM xiaogao";
		QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource());
		try {
			Object obj=queryRunner.query(sql, new ScalarHandler<>(1));
			Number n = (Number)obj; 
			isResluts=n.intValue();
		} catch (SQLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return isResluts;
	}
	
	public List<XiaoGao> selectInfo() {
		List<XiaoGao> isResluts=null;
		String sql="SELECT * FROM xiaogao";
		QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource());
		try {
			isResluts=new ArrayList<>();
			isResluts=queryRunner.query(sql,new BeanListHandler<>(XiaoGao.class));
		} catch (SQLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return isResluts;
	}
	
}
  • com.dyb.po包下新建一个class,名为XiaoGao,内容为:
package com.dyb.po;

/**
 * Mutable bean describing one scraped article entry.
 *
 * <p>The {@code baiDu*} and {@code lanZou*} properties are plain string
 * holders; their intended content is not shown in this class.
 */
public class XiaoGao {

	// --- core article data ---------------------------------------------

	private int id;
	/** Image link. */
	private String imgUrl;
	/** Article link. */
	private String href;
	/** Article title. */
	private String title;
	/** Article author information. */
	private String author;
	/** Short summary of the article. */
	private String info;
	private int pages;

	// --- additional link holders ---------------------------------------

	private String baiDuWP;
	private String baiDuHref;
	private String lanZouWP;
	private String lanZouHref;

	/** Creates an empty record; every property keeps its default value. */
	public XiaoGao() {
	}

	/**
	 * Creates a record with the article data filled in.
	 *
	 * @param imgUrl image link
	 * @param href   article link
	 * @param title  article title
	 * @param author author information
	 * @param info   short summary
	 */
	public XiaoGao(String imgUrl, String href, String title, String author, String info) {
		this.imgUrl = imgUrl;
		this.href = href;
		this.title = title;
		this.author = author;
		this.info = info;
	}

	public int getId() {
		return this.id;
	}

	public void setId(int id) {
		this.id = id;
	}

	public String getImgUrl() {
		return this.imgUrl;
	}

	public void setImgUrl(String imgUrl) {
		this.imgUrl = imgUrl;
	}

	public String getHref() {
		return this.href;
	}

	public void setHref(String href) {
		this.href = href;
	}

	public String getTitle() {
		return this.title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String getAuthor() {
		return this.author;
	}

	public void setAuthor(String author) {
		this.author = author;
	}

	public String getInfo() {
		return this.info;
	}

	public void setInfo(String info) {
		this.info = info;
	}

	public int getPages() {
		return this.pages;
	}

	public void setPages(int pages) {
		this.pages = pages;
	}

	public String getBaiDuWP() {
		return this.baiDuWP;
	}

	public void setBaiDuWP(String baiDuWP) {
		this.baiDuWP = baiDuWP;
	}

	public String getBaiDuHref() {
		return this.baiDuHref;
	}

	public void setBaiDuHref(String baiDuHref) {
		this.baiDuHref = baiDuHref;
	}

	public String getLanZouWP() {
		return this.lanZouWP;
	}

	public void setLanZouWP(String lanZouWP) {
		this.lanZouWP = lanZouWP;
	}

	public String getLanZouHref() {
		return this.lanZouHref;
	}

	public void setLanZouHref(String lanZouHref) {
		this.lanZouHref = lanZouHref;
	}

	/**
	 * Renders the id, the five article properties and the page count; the
	 * additional link holders are not part of the text.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("XiaoGao [");
		text.append("id=").append(this.id);
		text.append(", imgUrl=").append(this.imgUrl);
		text.append(", href=").append(this.href);
		text.append(", title=").append(this.title);
		text.append(", author=").append(this.author);
		text.append(", info=").append(this.info);
		text.append(", pages=").append(this.pages);
		return text.append("]").toString();
	}
}
  • com.dyb.service包下新建一个interface,名为XiaoGaoService,内容为:
package com.dyb.service;

import java.util.List;

import com.dyb.po.XiaoGao;

/**
 * Service-layer contract for scraped article records.
 */
public interface XiaoGaoService {

	/**
	 * Stores one article record.
	 *
	 * @param xiaoGao the record to store
	 * @return the value reported by the implementation for the insert
	 */
	int insertXGInfo(XiaoGao xiaoGao);

	/**
	 * Counts the stored records that carry the given title.
	 *
	 * @param title the article title to look up
	 * @return the number of matching records
	 */
	int isExistXGInfo(String title);

	/**
	 * Counts all stored records.
	 *
	 * @return the total number of records
	 */
	int selectGeShu();

	/**
	 * Loads every stored record.
	 *
	 * @return the stored records
	 */
	List<XiaoGao> selectInfo();
}

  • com.dyb.service.impl包下新建一个class,名为XiaoGaoServiceImpl,内容为:
package com.dyb.service.impl;

import java.util.List;

import com.dyb.dao.XiaoGaoDao;
import com.dyb.dao.impl.XiaoGaoDaoImpl;
import com.dyb.po.XiaoGao;
import com.dyb.service.XiaoGaoService;

public class XiaoGaoServiceImpl implements XiaoGaoService{
	private static XiaoGaoDao xGaoDao=new XiaoGaoDaoImpl();
	
	public int insertXGInfo(XiaoGao xiaoGao) {
		return xGaoDao.insertXGInfo(xiaoGao);
	}
	
	public int isExistXGInfo(String title) {
		return xGaoDao.isExistXGInfo(title);
	}
	
	public int selectGeShu() {
		return xGaoDao.selectGeShu();
	}
	
	public List<XiaoGao> selectInfo() {
		return xGaoDao.selectInfo();
	}
}

  • com.dyb.util包下新建一个class,名为:JdbcUtils,内容为:
package com.dyb.util;

import java.sql.Connection;
import java.sql.SQLException;
import javax.sql.DataSource;
import com.mchange.v2.c3p0.ComboPooledDataSource;


/**
 * JDBC helper built on a c3p0 connection pool, with a simple thread-bound
 * transaction: {@link #beginTranscation()} binds one connection to the current
 * thread, and {@link #commitTranscation()} / {@link #rollbackTranscation()}
 * finish it and release the connection.
 */
public class JdbcUtils {

	// Pooled data source, created from the c3p0 configuration named "mysql-config".
	private static final ComboPooledDataSource dataSource = new ComboPooledDataSource("mysql-config");
	// Connection of the transaction running on the current thread, if any.
	private static final ThreadLocal<Connection> tl = new ThreadLocal<Connection>();

	/**
	 * Returns the shared pooled data source.
	 *
	 * @return the data source
	 */
	public static DataSource getDataSource() {
		return dataSource;
	}

	/**
	 * Returns the connection bound to the current thread's transaction, or a
	 * fresh connection from the pool when no transaction is active.
	 *
	 * <p>A fresh connection is not bound to the thread; the caller owns it.
	 *
	 * @return a connection
	 * @throws SQLException if the pool cannot supply a connection
	 */
	public static Connection getConnection() throws SQLException {
		Connection conn = tl.get();
		if (conn == null) {
			// No transaction on this thread: take a connection from the pool.
			conn = dataSource.getConnection();
		}
		return conn;
	}

	/**
	 * Starts a transaction on the current thread.
	 *
	 * @throws SQLException if a transaction is already active on this thread,
	 *                      or the connection cannot be obtained or configured
	 */
	public static void beginTranscation() throws SQLException {
		if (tl.get() != null) {
			throw new SQLException("不能重复开启事务");
		}
		Connection conn = dataSource.getConnection();
		try {
			// Turn off auto-commit so statements are grouped until commit/rollback.
			conn.setAutoCommit(false);
		} catch (SQLException e) {
			// Do not leak the pooled connection when it cannot be configured.
			try {
				conn.close();
			} catch (SQLException closeFailure) {
				e.addSuppressed(closeFailure);
			}
			throw e;
		}
		// Bind only a fully configured connection to the thread.
		tl.set(conn);
	}

	/**
	 * Commits the current thread's transaction and releases its connection.
	 *
	 * <p>The connection is closed and unbound even when the commit fails, so
	 * the thread can start a new transaction afterwards.
	 *
	 * @throws SQLException if no transaction is active, or commit/close fails
	 */
	public static void commitTranscation() throws SQLException {
		Connection conn = tl.get();
		if (conn == null) {
			throw new SQLException("没有开启事务 不能提交");
		}
		// try-with-resources closes the connection on every path and keeps the
		// commit failure as the primary exception.
		try (Connection tx = conn) {
			tx.commit();
		} finally {
			tl.remove();
		}
	}

	/**
	 * Rolls back the current thread's transaction and releases its connection.
	 *
	 * <p>The connection is closed and unbound even when the rollback fails.
	 *
	 * @throws SQLException if no transaction is active, or rollback/close fails
	 */
	public static void rollbackTranscation() throws SQLException {
		Connection conn = tl.get();
		if (conn == null) {
			throw new SQLException("没有开启事务,不能回滚");
		}
		try (Connection tx = conn) {
			tx.rollback();
		} finally {
			tl.remove();
		}
	}
}

  • com.dyb.util包下新建一个class,名为PaQuInfo,内容为:
package com.dyb.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.dyb.po.XiaoGao;
import com.dyb.service.XiaoGaoService;
import com.dyb.service.impl.XiaoGaoServiceImpl;

/**
 * Scrapes one listing page and stores every article entry that is not yet in
 * the database.
 */
public class PaQuInfo {

	private static final XiaoGaoService xGaoService = new XiaoGaoServiceImpl();

	/**
	 * Downloads the listing page at {@code url}, reads each
	 * {@code ul.articleCon li} entry and inserts it unless a record with the
	 * same title already exists.
	 *
	 * <p>Every entry gets its own {@link XiaoGao} instance, so no state is
	 * shared between entries or calls. The method pauses three seconds before
	 * handling each entry. If the thread is interrupted during that pause, the
	 * interrupt flag is restored and the remaining entries are skipped.
	 *
	 * @param url address of the listing page
	 * @return the last entry that was handled (inserted or found to be a
	 *         duplicate); an empty {@link XiaoGao} when none was handled
	 * @throws IOException if the page cannot be fetched
	 */
	public XiaoGao zhuaQu(String url) throws IOException {
		// Connection is local: a static one would be shared by concurrent calls.
		Connection conn = Jsoup.connect(url);
		conn.header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:32.0) Gecko/    20100101 Firefox/32.0");
		Document doc = conn.timeout(5000).get();

		XiaoGao last = new XiaoGao();
		Elements elements = doc.select("ul.articleCon").select("li");
		for (Element element : elements) {
			String biaoTi = element.select("h3").text();

			XiaoGao item = new XiaoGao();
			item.setImgUrl(element.select("img").attr("src").trim());
			item.setHref(element.select("h3>a").attr("abs:href").trim());
			item.setTitle(biaoTi);
			item.setAuthor(element.select("p.icogroup").text());
			item.setInfo(element.select("p.info").text());

			try {
				Thread.sleep(3000);
			} catch (InterruptedException e) {
				// Keep the interrupt visible to the caller and stop working.
				Thread.currentThread().interrupt();
				return last;
			}

			if (xGaoService.isExistXGInfo(biaoTi) >= 1) {
				System.out.println("标题:"+biaoTi);
				System.out.println("已重复,不用插入!");
			} else if (xGaoService.insertXGInfo(item) > 0) {
				System.out.println("添加成功");
			}
			last = item;
		}
		return last;
	}

}
  • com.dyb.util包下新建一个class,名为PaQuPages,内容为:
package com.dyb.util;

import java.io.IOException;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.dyb.po.XiaoGao;

/**
 * Determines how many listing pages exist by reading the pagination links of
 * the first configured listing URL.
 */
public class PaQuPages {

	private static final PaQuPerson pQuPerson = new PaQuPerson();
	private final List<String> alinks = pQuPerson.htmls();

	/**
	 * Fetches the first configured listing URL and returns the highest page
	 * number found among the {@code div.pagination a} links.
	 *
	 * <p>Links whose last path segment is not a number (for example a
	 * "next" link or an empty href) are ignored. When no usable link is
	 * found, the result is {@code 1}, because the fetched page itself exists.
	 *
	 * @return the highest page number found, at least {@code 1}
	 * @throws IOException           if the page cannot be fetched
	 * @throws IllegalStateException if no listing URL is configured
	 */
	public int pages() throws IOException {
		if (alinks == null || alinks.isEmpty()) {
			throw new IllegalStateException("no listing URL configured");
		}
		Connection conn = Jsoup.connect(alinks.get(0));
		conn.header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:32.0) Gecko/    20100101 Firefox/32.0");
		Document doc = conn.get();

		Elements links = doc.select("div.pagination").select("a");
		int zongYeShu = 1;
		for (int i = 0; i < links.size(); i++) {
			int number = trailingPageNumber(links.get(i).attr("abs:href"));
			if (number > zongYeShu) {
				zongYeShu = number;
			}
		}
		return zongYeShu;
	}

	/** Parses the last path segment of {@code href} as an int; returns -1 when it is not a number. */
	private static int trailingPageNumber(String href) {
		if (href == null) {
			return -1;
		}
		String path = href.trim();
		// Ignore trailing slashes so ".../page/5/" still yields 5.
		int end = path.length();
		while (end > 0 && path.charAt(end - 1) == '/') {
			end--;
		}
		if (end == 0) {
			return -1;
		}
		String segment = path.substring(path.lastIndexOf('/', end - 1) + 1, end);
		try {
			return Integer.parseInt(segment);
		} catch (NumberFormatException notANumber) {
			return -1;
		}
	}

}
  • com.dyb.util包下新建一个class,名为PaQuPerson,内容为:
package com.dyb.util;

import java.util.ArrayList;
import java.util.List;

/**
 * Supplies the start URLs for the crawler.
 */
public class PaQuPerson {

	/** The single listing URL handed out by {@link #htmls()}. */
	private static final String START_URL = "https://www.12580sky.com/sort/ruanjian/page/1";

	/**
	 * Builds a new, modifiable list holding the start URL.
	 *
	 * @return a fresh list with one entry
	 */
	public List<String> htmls() {
		List<String> seeds = new ArrayList<>();
		seeds.add(START_URL);
		return seeds;
	}
}
  • com.dyb.view包下新建一个class(此class为main函数哦)名为HaiXinView,内容为:
package com.dyb.view;

import com.dyb.po.XiaoGao;
import java.awt.*;
import javax.swing.*;
import javax.swing.JButton;
import com.dyb.util.PaQuInfo;
import com.dyb.util.PaQuPages;
import java.awt.FlowLayout;
import java.io.IOException;
import javax.swing.JFrame;
import javax.swing.JPanel;

/**
 * Program entry point: finds out how many listing pages exist and scrapes
 * each of them in turn.
 *
 * <p>The class extends {@link JFrame}, but the code shown here builds no
 * window; results are printed to the console by the scraper.
 */
public class HaiXinView extends JFrame {

	private static final long serialVersionUID = 1L;

	private static final PaQuInfo pInfo = new PaQuInfo();
	private static final PaQuPages pQuPages = new PaQuPages();

	/**
	 * Starts the crawl.
	 *
	 * @param args unused
	 * @throws IOException if a page cannot be fetched
	 */
	public static void main(String[] args) throws IOException {
		HaiXinView hXinView = new HaiXinView();
		hXinView.yunXing();
	}

	/**
	 * Scrapes listing pages 1 through the reported page count, inclusive.
	 * Page 1 is always scraped, even when the reported count is below 1.
	 *
	 * @throws IOException if a page cannot be fetched
	 */
	public void yunXing() throws IOException {
		int lastPage = Math.max(1, pQuPages.pages());
		// Inclusive upper bound: a strict "<" would never visit the last page.
		for (int page = 1; page <= lastPage; page++) {
			String url = "https://www.12580sky.com/sort/ruanjian/page/" + page;
			pInfo.zhuaQu(url);
		}
	}

}

基本功能已经完成,还有几个jar包,下次为大家上传,希望对大家有所帮助。


请帮顶 / 评论点赞!因为你的鼓励是我写作的最大动力!











猜你喜欢

转载自blog.csdn.net/qq_36359797/article/details/80221431
今日推荐