本人是一名程序小白,此次程序可能有许多小问题,望各位大神勿喷,仅供学习参考!
功能:用Java爬取xx娱乐网文章列表中的文章图片链接、文章链接、文章标题、作者信息以及文章大致内容,按标题去重后存入MySQL数据库。功能已基本实现,没有GUI界面,运行结果可以在eclipse控制台看到。
使用工具:eclipse,mysql,navicat(轻量级mysql可视化界面)程序员的必备品
编码语言:Java(编码格式一定要是UTF-8哦)
过程:
1.新建Java项目(名字不重要,只需要一个英文名,首字母一定要大写哦)
2.在src下新建package(结构如下)
3.整体结构:
4.详细解说:
- com.dyb.dao包下新建一个Interface,名为XiaoGaoDao,内容为:
package com.dyb.dao;

import java.util.List;

import com.dyb.po.XiaoGao;

/**
 * Data-access contract for crawled {@link XiaoGao} article records.
 */
public interface XiaoGaoDao {

    /**
     * Stores one article record.
     *
     * @param xiaoGao the article to store
     * @return an int result code supplied by the implementation
     */
    int insertXGInfo(XiaoGao xiaoGao);

    /**
     * Looks up stored articles by title.
     *
     * @param title the title to look for
     * @return an int result supplied by the implementation
     */
    int isExistXGInfo(String title);

    /**
     * Reports how many articles are stored.
     *
     * @return an int count supplied by the implementation
     */
    int selectGeShu();

    /**
     * Loads stored articles.
     *
     * @return the stored articles as a list
     */
    List<XiaoGao> selectInfo();
}
- com.dyb.dao.impl包下新建一个class,名为XiaoGaoDaoImpl,内容为:
package com.dyb.dao.impl; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import org.apache.commons.dbutils.QueryRunner; import org.apache.commons.dbutils.handlers.BeanHandler; import org.apache.commons.dbutils.handlers.BeanListHandler; import org.apache.commons.dbutils.handlers.ScalarHandler; import com.dyb.dao.XiaoGaoDao; import com.dyb.po.XiaoGao; import com.dyb.util.JdbcUtils; public class XiaoGaoDaoImpl implements XiaoGaoDao{ //往数据库插入数据 public int insertXGInfo(XiaoGao xiaoGao) { int result=0; //sql语句 String sql="INSERT INTO xiaogao VALUES(?,?,?,?,?,?)"; //sql语句中?的内容 Object[] params= {xiaoGao.getId(),xiaoGao.getImgUrl(),xiaoGao.getHref(),xiaoGao.getTitle(),xiaoGao.getAuthor(),xiaoGao.getInfo()}; try { QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource()); result=queryRunner.update(sql, params); } catch (SQLException e) { e.printStackTrace(); } return result; } //判断内容是否重复 public int isExistXGInfo(String title) { int isResluts=0; String sql="SELECT count(*) FROM xiaogao where title=?"; QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource()); try { Object obj=queryRunner.query(sql, new ScalarHandler<>(1), title); Number n = (Number)obj; isResluts=n.intValue(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } return isResluts; } //查询数据库内容多少 public int selectGeShu() { int isResluts=0; String sql="SELECT count(*) FROM xiaogao"; QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource()); try { Object obj=queryRunner.query(sql, new ScalarHandler<>(1)); Number n = (Number)obj; isResluts=n.intValue(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } return isResluts; } public List<XiaoGao> selectInfo() { List<XiaoGao> isResluts=null; String sql="SELECT * FROM xiaogao"; QueryRunner queryRunner=new QueryRunner(JdbcUtils.getDataSource()); try { isResluts=new ArrayList<>(); isResluts=queryRunner.query(sql,new 
BeanListHandler<>(XiaoGao.class)); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } return isResluts; } }
- com.dyb.po包下新建一个class,名为XiaoGao,内容为:
package com.dyb.po; public class XiaoGao { private int id; //图片链接 private String imgUrl; //文章链接 private String href; //文章标题 private String title; //文章作者信息 private String author; //文章大致信息 private String info; private int pages; private String baiDuWP; private String baiDuHref; private String lanZouWP; private String lanZouHref; public XiaoGao(String imgUrl, String href, String title, String author, String info) { super(); this.imgUrl = imgUrl; this.href = href; this.title = title; this.author = author; this.info = info; } public XiaoGao() { super(); } public String getBaiDuWP() { return baiDuWP; } public void setBaiDuWP(String baiDuWP) { this.baiDuWP = baiDuWP; } public String getBaiDuHref() { return baiDuHref; } public void setBaiDuHref(String baiDuHref) { this.baiDuHref = baiDuHref; } public String getLanZouWP() { return lanZouWP; } public void setLanZouWP(String lanZouWP) { this.lanZouWP = lanZouWP; } public String getLanZouHref() { return lanZouHref; } public void setLanZouHref(String lanZouHref) { this.lanZouHref = lanZouHref; } public int getId() { return id; } public void setId(int id) { this.id = id; } public int getPages() { return pages; } public void setPages(int pages) { this.pages = pages; } public String getImgUrl() { return imgUrl; } public void setImgUrl(String imgUrl) { this.imgUrl = imgUrl; } public String getHref() { return href; } public void setHref(String href) { this.href = href; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getInfo() { return info; } public void setInfo(String info) { this.info = info; } @Override public String toString() { return "XiaoGao [id=" + id + ", imgUrl=" + imgUrl + ", href=" + href + ", title=" + title + ", author=" + author + ", info=" + info + ", pages=" + pages + "]"; } }
- com.dyb.service包下新建一个interface,名为XiaoGaoService,内容为:
package com.dyb.service;

import java.util.List;

import com.dyb.po.XiaoGao;

/**
 * Service-layer contract for working with crawled {@link XiaoGao} article records.
 */
public interface XiaoGaoService {

    /**
     * Stores one article record.
     *
     * @param xiaoGao the article to store
     * @return an int result code supplied by the implementation
     */
    int insertXGInfo(XiaoGao xiaoGao);

    /**
     * Looks up stored articles by title.
     *
     * @param title the title to look for
     * @return an int result supplied by the implementation
     */
    int isExistXGInfo(String title);

    /**
     * Reports how many articles are stored.
     *
     * @return an int count supplied by the implementation
     */
    int selectGeShu();

    /**
     * Loads stored articles.
     *
     * @return the stored articles as a list
     */
    List<XiaoGao> selectInfo();
}
- com.dyb.service.impl包下新建一个class,名为XiaoGaoServiceImpl,内容为:
package com.dyb.service.impl; import java.util.List; import com.dyb.dao.XiaoGaoDao; import com.dyb.dao.impl.XiaoGaoDaoImpl; import com.dyb.po.XiaoGao; import com.dyb.service.XiaoGaoService; public class XiaoGaoServiceImpl implements XiaoGaoService{ private static XiaoGaoDao xGaoDao=new XiaoGaoDaoImpl(); public int insertXGInfo(XiaoGao xiaoGao) { return xGaoDao.insertXGInfo(xiaoGao); } public int isExistXGInfo(String title) { return xGaoDao.isExistXGInfo(title); } public int selectGeShu() { return xGaoDao.selectGeShu(); } public List<XiaoGao> selectInfo() { return xGaoDao.selectInfo(); } }
- com.dyb.util包下新建一个class,名为:JdbcUtils,内容为:
package com.dyb.util; import java.sql.Connection; import java.sql.SQLException; import javax.sql.DataSource; import com.mchange.v2.c3p0.ComboPooledDataSource; public class JdbcUtils { // 设置数据源(使用C3P0数据库连接池) private static ComboPooledDataSource dataSource=new ComboPooledDataSource("mysql-config"); // 创建一个与事务相关的局部线程变量 private static ThreadLocal<Connection> tl=new ThreadLocal<Connection>(); /** * 方法描述:获取数据源 * @return */ public static DataSource getDataSource(){ return dataSource; } /** * 方法描述:获取连接 * @return * @throws SQLException */ public static Connection getConnection() throws SQLException{ // 获取线程局部变量的值 connection Connection conn=tl.get(); if(conn==null){ // 如果连接为空,则从数据库连接词池中获取连接 conn=dataSource.getConnection(); } return conn; } /** * 方法描述:开启事务 * @throws SQLException */ public static void beginTranscation()throws SQLException{ Connection conn=tl.get(); if(conn!=null){ throw new SQLException("不能重复开启事务"); } // 获取连接 conn=getConnection(); // 关闭自动提交事务 conn.setAutoCommit(false); // 将线程变量的值设置为connection tl.set(conn); } /** * 方法描述:提交事务 * @throws SQLException */ public static void commitTranscation() throws SQLException{ Connection conn=tl.get(); if(conn==null){ throw new SQLException("没有开启事务 不能提交"); } // 提交事务 conn.commit(); // 关闭连接 conn.close(); // 移除此线程局部变量的值 tl.remove(); } /** * 方法描述:回滚事务 * @throws SQLException */ public static void rollbackTranscation() throws SQLException{ Connection conn=tl.get(); if(conn==null){ throw new SQLException("没有开启事务,不能回滚"); } // 回滚事务 conn.rollback(); // 关闭连接 conn.close(); // 移除此线程局部变量的值 tl.remove(); } }
- com.dyb.util包下新建一个class,名为PaQuInfo,内容为:
package com.dyb.util; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.dyb.po.XiaoGao; import com.dyb.service.XiaoGaoService; import com.dyb.service.impl.XiaoGaoServiceImpl; public class PaQuInfo { private static Connection conn=null; private static XiaoGao xiaoGao=new XiaoGao(); private static int i=0; private static XiaoGaoService xGaoService=new XiaoGaoServiceImpl(); public XiaoGao zhuaQu(String url) throws IOException { conn = Jsoup.connect(url); // conn.header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:32.0) Gecko/ 20100101 Firefox/32.0"); Document doc = conn.timeout(5000).get(); String aLink=null; // Elements elements = doc.select("ul.articleCon").select("li"); for (Element element : elements) { String imgs=element.select("img").attr("src").trim(); aLink = element.select("h3>a").attr("abs:href").trim(); String biaoTi = element.select("h3").text(); String xinXi = element.select("p.icogroup").text(); String info = element.select("p.info").text(); try { Thread.sleep(3000); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } xiaoGao.setImgUrl(imgs); xiaoGao.setTitle(biaoTi); xiaoGao.setAuthor(xinXi); xiaoGao.setInfo(info); xiaoGao.setHref(aLink); int isResult=xGaoService.isExistXGInfo(xiaoGao.getTitle()); if (isResult>=1) { System.out.println("标题:"+biaoTi); System.out.println("已重复,不用插入!"); }else { int result=xGaoService.insertXGInfo(xiaoGao); if (result>0) { System.out.println("添加成功"); } } } return xiaoGao; } }
- com.dyb.util包下新建一个class,名为PaQuPages,内容为:
package com.dyb.util; import java.io.IOException; import java.util.List; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import com.dyb.po.XiaoGao; public class PaQuPages { private int zongYeShu=0; private static Connection conn=null; private static XiaoGao xGao=new XiaoGao(); private static PaQuPerson pQuPerson=new PaQuPerson(); private List<String> alinks=pQuPerson.htmls(); private String jiQu = null; public int pages() throws IOException{ for (int i=0;i<alinks.size();i++) { conn= Jsoup.connect(alinks.get(0)); } // conn.header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:32.0) Gecko/ 20100101 Firefox/32.0"); Document doc = conn.get(); // Elements pages = doc.select("div.pagination").select("a"); for (int i = 0; i < pages.size(); i++) { String page = pages.get(i).attr("abs:href"); jiQu = page.substring(page.lastIndexOf("/")).replace("/", ""); } zongYeShu=Integer.parseInt(jiQu); xGao.setPages(zongYeShu); return zongYeShu; } }
- com.dyb.util包下新建一个class,名为PaQuPerson,内容为:
package com.dyb.util; import java.util.ArrayList; import java.util.List; public class PaQuPerson { public List<String> htmls(){ List<String> urls=new ArrayList<>(); urls.add("https://www.12580sky.com/sort/ruanjian/page/1"); return urls; } }
- com.dyb.view包下新建一个class(此class包含main方法,是程序的入口),名为HaiXinView,内容为:
package com.dyb.view; import com.dyb.po.XiaoGao; import java.awt.*; import javax.swing.*; import javax.swing.JButton; import com.dyb.util.PaQuInfo; import com.dyb.util.PaQuPages; import java.awt.FlowLayout; import java.io.IOException; import javax.swing.JFrame; import javax.swing.JPanel; public class HaiXinView extends JFrame { /** * */ private static final long serialVersionUID = 1L; private static PaQuInfo pInfo = new PaQuInfo(); private static XiaoGao xGao = null; private static PaQuPages pQuPages = new PaQuPages(); private int yeShues = 0; public static void main(String[] args) throws IOException { HaiXinView hXinView = new HaiXinView(); // XiaoGaoView xGaoView=new XiaoGaoView(); // xGaoView.initComponent(); hXinView.yunXing(); } public void yunXing() throws IOException { yeShues = pQuPages.pages(); int i = 1; do { String url = "https://www.12580sky.com/sort/ruanjian/page/" + i; xGao = pInfo.zhuaQu(url); i++; } while (i < yeShues); } }
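基本功能已经完成。运行时还需要几个jar包(jsoup、commons-dbutils、c3p0以及MySQL的JDBC驱动),并且要在src下提供c3p0-config.xml,在其中定义名为mysql-config的命名配置。这些jar包下次为大家上传,希望对大家有所帮助。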