webmagic爬虫自学(五)网络爬虫模拟登陆[策略一:获取cookie]

版权声明:本文为博主原创文章,请尊重原创,未经博主允许禁止转载,保留追究权 https://blog.csdn.net/qq_29914837/article/details/89322575

一、搭建webmagic项目环境部分代码,请参考

https://blog.csdn.net/qq_29914837/article/details/89309298

二、网络爬虫模拟登陆[策略一:获取cookie]

在这里插入图片描述

在使用爬虫的过程中,有些网站的信息必须登录后才能查看,比如CSDN网站中的
在这里插入图片描述
“管理博客”入口,必须登录后才会显示。

如果想要爬取这些信息,就必须先登录;这里采用的做法是获取登录后的cookie信息,让爬虫带着这些cookie去访问页面。

进入CSDN博客并登录后,按F12打开DevTools页面,即可查看到cookie信息。
在这里插入图片描述
最简单的做法是将cookie信息保存下来。

三、获取cookie模拟登录

package demo.blog.csdn.net3.model;

import java.util.Date;
import java.util.List;

import us.codecraft.webmagic.model.annotation.ExtractByUrl;

/**
 * Mutable data holder describing one CSDN blog article.
 *
 * <p>Every property is exposed through a plain getter/setter pair. All
 * {@code String} properties except {@code time} start out as the empty
 * string; {@code time}, the two lists and {@code collect_time} start out as
 * {@code null}, and {@code read_count} starts out as {@code 0}.
 */
public class CsdnBlog {

	/** Article title. */
	private String article = "";

	/** Publication date, kept as text. */
	private String time;

	/** Author nickname. */
	private String nick_name = "";

	/** Number of reads. */
	private int read_count;

	/** Tags as a list. */
	private List<String> labelList;

	/** Tags as a single string. */
	private String label = "";

	/** Categories as a list. */
	private List<String> categoryList;

	/** Categories as a single string. */
	private String category = "";

	/** Article body. */
	private String content = "";

	/** Article link; carries WebMagic's {@code @ExtractByUrl} annotation. */
	@ExtractByUrl
	private String url = "";

	/** Time at which the article was collected. */
	private Date collect_time;

	/** @return the article title */
	public String getArticle() {
		return this.article;
	}

	/** @param article the article title */
	public void setArticle(String article) {
		this.article = article;
	}

	/** @return the publication date text */
	public String getTime() {
		return this.time;
	}

	/** @param time the publication date text */
	public void setTime(String time) {
		this.time = time;
	}

	/** @return the author nickname */
	public String getNick_name() {
		return this.nick_name;
	}

	/** @param nick_name the author nickname */
	public void setNick_name(String nick_name) {
		this.nick_name = nick_name;
	}

	/** @return the number of reads */
	public int getRead_count() {
		return this.read_count;
	}

	/** @param read_count the number of reads */
	public void setRead_count(int read_count) {
		this.read_count = read_count;
	}

	/** @return the tag list (the same list instance that was set, not a copy) */
	public List<String> getLabelList() {
		return this.labelList;
	}

	/** @param labelList the tag list (stored as-is, not copied) */
	public void setLabelList(List<String> labelList) {
		this.labelList = labelList;
	}

	/** @return the tags as a single string */
	public String getLabel() {
		return this.label;
	}

	/** @param label the tags as a single string */
	public void setLabel(String label) {
		this.label = label;
	}

	/** @return the category list (the same list instance that was set, not a copy) */
	public List<String> getCategoryList() {
		return this.categoryList;
	}

	/** @param categoryList the category list (stored as-is, not copied) */
	public void setCategoryList(List<String> categoryList) {
		this.categoryList = categoryList;
	}

	/** @return the categories as a single string */
	public String getCategory() {
		return this.category;
	}

	/** @param category the categories as a single string */
	public void setCategory(String category) {
		this.category = category;
	}

	/** @return the article body */
	public String getContent() {
		return this.content;
	}

	/** @param content the article body */
	public void setContent(String content) {
		this.content = content;
	}

	/** @return the article link */
	public String getUrl() {
		return this.url;
	}

	/** @param url the article link */
	public void setUrl(String url) {
		this.url = url;
	}

	/** @return the collection time */
	public Date getCollect_time() {
		return this.collect_time;
	}

	/** @param collect_time the collection time */
	public void setCollect_time(Date collect_time) {
		this.collect_time = collect_time;
	}

}

package demo.blog.csdn.net3;

import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Demo crawler that simulates a logged-in CSDN session by replaying browser
 * cookies (login strategy 1: obtain the cookies).
 *
 * <p>{@link #main(String[])} starts a single-threaded spider on
 * {@code "http://blog.csdn.net/" + csdn_name}. {@link #process(Page)} prints
 * the text of the first link found inside the page's option box; according to
 * the original note, seeing the "manage blog" link text there means the
 * simulated login worked.
 *
 * <p>NOTE(review): an earlier version of this comment named
 * {@code https://mp.csdn.net/postlist} as the crawled address, but the code
 * only ever requests the blog home page built in {@code main}.
 *
 * <p>NOTE(review): the cookie values below are session credentials copied
 * from a browser. They expire, and they should not live in source control;
 * consider loading them from configuration instead.
 *
 * @author yl
 */
public class CsdnBlogCrawler2 implements PageProcessor{

	/** CSDN user name whose blog home page is crawled. */
	public static final String csdn_name = "qq_29914837";

	private static final Logger LOGGER = Logger.getLogger(CsdnBlogCrawler2.class);

	/** Selects the text of the links inside the option box of the blog page. */
	private static final String OPTION_LINK_XPATH =
			"//div[@class='opt-box d-flex justify-content-end']//a/text()";

	// The addCookie lines below appear to have been generated from a
	// spreadsheet of cookie name/value pairs with a formula such as:
	//   =".addCookie("""&A2&""","""&B2&""")"
	// NOTE(review): values like "1.55435758015543E+39" look like numbers that
	// the spreadsheet reformatted into scientific notation, so they are
	// presumably not the real cookie values - confirm against the browser.
	private final Site site = Site.me().setDomain("blog.csdn.net").setSleepTime(3000).setUserAgent(
			"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36")
			// IMPORTANT: the cookies below simulate the login; all of them were copied from the browser.
			.addCookie("ADHOC_MEMBERSHIP_CLIENT_ID1.0","aa471269-79de-8522-560a-0a252c67c602")
			.addCookie("ARK_ID","JSe3a24bd0201fd5540926eb969f751da5e3a2")
			.addCookie("AU","522")
			.addCookie("BAIDUID","CB366337BFAF861341D00A8BD9C10D92:FG=1")
			.addCookie("BAIDU_SSP_lcr","https://graph.qq.com/oauth2.0/show?which=Login&display=pc&client_id=100270989&response_type=code&redirect_uri=https%3A%2F%2Fpassport.csdn.net%2Faccount%2Flogin%3FpcAuthType%3Dqq%26state%3Dtest")
			.addCookie("BDORZ","B490B5EBF6F3CD402E515D22BCDA1598")
			.addCookie("BIDUPSID","3A63C0C9EF7F0594394D19B30119BF80")
			.addCookie("BT","1554359603673")
			.addCookie("CloudGuest","gWZ1tVdz/I9ISdDDIcZAJ2ok+osCgdixwMtOINtrVDiBpmeXweFyO0LAfiT6xA+jmpnyqqyDqJyukq5YUFGSmOItZgEffYovFmxs6rN5Adh3GvTJJdxVr7rNA9KyR8QmYLwlMCwLtA5cHWrPbvKZj6HsYSSNwLzJmisTdq8a5dL9VwPrtTba4Nxoa0j/NdRc")
			.addCookie("HMACCOUNT","2F0945C300F7BCF8")
			.addCookie("HMVT","6bcd52f51e9b3dce32bec4a3997715ac|1554359627|")
			.addCookie("H_PS_PSSID","1450_211031_28768_28724_28558_28585_28603_28625_28605")
			.addCookie("Hm_ct_6bcd52f151e9b3dce32bec4a3997715ac","1788*1*PC_VC!5744*1*qq_29914837!6525*1*10_28867322920-1540868724025-839757")
			.addCookie("Hm_lpvt2_6bcd52f51e9b3dce32bec4a3997715ac","1554359624")
			.addCookie("Hm_lpvt_e193a8b00cf63f716d774540875007664","1554284443")
			.addCookie("Hm_lvt_20bba81dc5fa07f97ba1779a51ed918a","1535625612")
			.addCookie("Hm_lvt_6bcd452f51e9b3dce32bec4a3997715ac","1.55435758015543E+39")
			.addCookie("Hm_lvt_e159a8b00cf63f716d774540875007664","1.55420144215542E+29")
			.addCookie("PSTM","15541604765")
			.addCookie("SESSION","f4fa5caa-c193a-486e-94c9-c7c34d06d5f6")
			.addCookie("TINGYUN_DATA","%7B2%22id%22%3A%22-sf2Cni530g%23HL5wvli0FZI%22%2C%22n%22%3A%22WebAction%2FCI%2FarticleList%252Flist%22%2C%22tid%22%3A%226e6120ef5abb6c%22%2C%22q%22%3A0%2C%22a%22%3A53%7D")
			.addCookie("UM_distinctid","1661c314ab153d2-0e71d4a46e8aba-5b163f13-100200-166c314ab1626a")
			.addCookie("UN","qq_29914837")
			.addCookie("UserInfo","5c847850d7194e9e94aeba95ee66e2fc")
			.addCookie("UserName","qq_29914837")
			.addCookie("UserNick","%E7%BD%91%E7%95%8C%E5%85%AD%E5%85%AD%E5%B1%85%E5%A3%AB")
			.addCookie("UserToken","5c847850d7194e9e94aeba95ee66e2fc")
			.addCookie("__cfduid","db6ad5fcdbbbab25d6c4ecfc6e9e739a01536029112")
			.addCookie("__utma","17226283.271695300.1537435940.1537435940.1540455490.2")
			.addCookie("__yadk_uid","6eUT39xr0udoIWIg5eO5F68Va2RpVKJP")
			// "_ga" and "_gid" were each listed twice in the copied cookie set. A
			// later addCookie call with the same name replaces the earlier value,
			// so only the value that actually took effect is kept here.
			.addCookie("_ga","GA1.2.708462049.1544146269")
			.addCookie("_gid","GA1.2.622036880.1554201444")
			.addCookie("_javaeye3_session_","BAh7BjoPc2Vzc2lvbl9pZCIlMzFhNjk2OThkZjcwNjhkN2EzNGNiM2VhN2U2NWQ1MjA%3D--bf5b5979573e5d40d105a8c446f44dcd9f8be422")
			.addCookie("_javaeye_cookie_id_","1523948857395170")
			.addCookie("aliyun_UAToken","115#1EB8U51O1TN2wLPsTCZE1CsoE562CpA11g2mOCXw81OEDUlCKOm6ICtuKXRhyzFGvSfyQ+T8y5jLi/JJhUU4AkNca8pAurPQOSfyetT8ukZQgQkRhEPCOSgaCY9XuzFZASAyeKT8ukNQiFMJhUU4AWNcadyXyzFQOSRlTRDv53bK1CX5H/Gsz1wQSFAF5U6xPs/xw07PLxsuRkxndqe5m/o1cdCsQXS1Z1KagrHPhOFE9HQo2yTHCghsvbQadvn0u5OVLV5yX/66eKcuCwfctxVOk1AEqpKEX+BeHfBOQPb4J5D8FDtoUs9iu78xnJd8xtZvOg9spt62hfQOmfGRIn9kUZye8bGQmXaEXFWgHYx5Xc2czQToq+Mcjx2fLknAnWM41YENsnAdxlaZ2yrbB5wJnQ07tUG87eLcaIKARnolBg4sMBbegVt8lg2NmpoPbCBhEB/sRqmXz0oZpuRkKkS0ddZcBP7kVZBXu3Lmuxt7pIDqSH0c0pe7FCfBWwM5BtSf+uH8/eQK7WxxhRsR1inqB525JuFw+b8ig9kJR4T1HLlaIk+dWZWZa9O0s6PCKoeamo2Ogh8F2Kn8mpfsgiLNr3VVy+gkyBnPakA5UzNF/Ec6cFLj+RVxycWQ/schVL0dYPWwiOLfK7LxmkxO2f==")
			.addCookie("aliyun_webUmidToken","TB638344B44800204EB5241A1AD83D46B2CDC42BE26B2D28D4209FF2C8B")
			.addCookie("blogTipShow","TRUE")
			.addCookie("dc_session_id","10_1554111375664.355014")
			.addCookie("dc_tos","ppf2bk7")
			.addCookie("smidV2","201806212304243cff744f8b1cb4ab13015660c60ccf38e00d97e7e0f187fbb0")
			.addCookie("uuid_tt_dd","10_288673222920-1540868724025-839757")
			.addCookie("yidun_tocken","9ca17a2e2e6ffcda170e2e6eed2aa4ebcb7a2abd252a29a8fa7c15e928b9faff268f6908cb9e521ac9ae5b5ae2af0feaec3b92abb9abc87e45394efbca5f64a878f9fa7d15ba8ebb892d35998b59abaef5eaef0ee9e");

	/**
	 * Prints the text of the first option-box link of the downloaded page.
	 *
	 * <p>When the XPath matches nothing, a warning is logged instead of
	 * printing the literal {@code "null"}, so a failed login is not mistaken
	 * for page content.
	 *
	 * @param page the downloaded page handed over by the spider
	 */
	@Override
	public void process(Page page) {
		// toString() on the selection yields the first match, or null when there is none.
		String optionLinkText = page.getHtml().xpath(OPTION_LINK_XPATH).toString();
		if (optionLinkText == null) {
			LOGGER.warn("No option-box link found on " + page.getUrl()
					+ "; the page was probably served as logged out (cookies expired or rejected)");
			return;
		}
		// Seeing the "manage blog" link text here means the simulated login succeeded.
		System.out.println(optionLinkText);
	}

	/**
	 * @return the site configuration (domain, sleep time, user agent and login cookies)
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Runs the crawler on the blog home page of {@link #csdn_name} with one thread.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Spider.create(new CsdnBlogCrawler2())
				.addUrl("http://blog.csdn.net/" + csdn_name)
				.thread(1)
				.run();
	}

}

运行上面的程序后,如果控制台输出【管理博客】四个字,则代表携带cookie模拟登录成功。

猜你喜欢

转载自blog.csdn.net/qq_29914837/article/details/89322575