import re
import time

import requests


class Spyder:
    """Session-based client for the thesis pages on bysj.cqu.edu.cn.

    Attributes:
        spyder: The ``requests`` session shared by every request this object
            makes, so cookies set by ``login`` are sent by ``get_paper``.
        user_name: Value posted as the ``id`` form field on login.
        pass_word: Value posted as the ``pwd`` form field on login.
    """

    _BASE_URL = "http://bysj.cqu.edu.cn"
    _LOGIN_URL = "http://bysj.cqu.edu.cn/bysj/index.jsp"
    _MATERIALS_URL = (
        "http://bysj.cqu.edu.cn/bysj/thesiswork/student/manageThesisMaterials.htm"
    )

    # One row per downloadable document:
    # (link regex, message when the link is missing, message before the
    #  download starts, suffix appended to the student name for the file name).
    _DOCUMENTS = (
        # Proposal report
        (
            r'href="(.*?)">查看开题报告</a></td>',
            "获取开题报告失败",
            "开始下载开题报告",
            "开题报告.doc",
        ),
        # Literature translation
        (
            r'href="(.*?)">查看译文</a></td>',
            "获取论文翻译失败",
            "开始下载文献翻译",
            "文献翻译.doc",
        ),
        # Thesis
        (
            r'href="(.*?)">查看论文</a></td>',
            "获取论文失败",
            "开始下载论文",
            "论文.doc",
        ),
    )

    def __init__(self, user_name, pass_word):
        """Create a fresh session and remember the credentials.

        Args:
            user_name: Account identifier posted on login.
            pass_word: Password posted on login.
        """
        self.spyder = requests.session()
        self.user_name = user_name
        self.pass_word = pass_word

    def login(self):
        """POST the stored credentials to the login page.

        The response is not inspected, so this method gives no indication of
        whether the login was accepted; ``get_paper`` reports a failure when
        the expected page content is missing.
        """
        data = {
            "id": self.user_name,
            "pwd": self.pass_word,
            "type": "student",
            "btlogin": "登陆",
        }
        self.spyder.post(self._LOGIN_URL, data=data)

    def get_paper(self):
        """Fetch the materials page and save every linked document found.

        Files are written to the current working directory, named with the
        text extracted from the page title cell followed by a per-document
        suffix.

        Returns:
            ``0`` when the title cell is not found on the page; ``None``
            otherwise (kept as in the original for backward compatibility).
        """
        text = self.spyder.get(self._MATERIALS_URL).text

        name = re.findall(
            r'<td class="title" colspan="7" height="30">(.*?)</td>', text
        )
        if not name:
            print("获取失败")
            return 0

        # Drop the first three and last two characters of the title cell;
        # presumably fixed decoration around the student's name - confirm
        # against the live page.
        stu_name = name[0][3:-2]
        print("正在获取" + stu_name)

        for pattern, missing_msg, start_msg, suffix in self._DOCUMENTS:
            self._download(text, pattern, missing_msg, start_msg, stu_name + suffix)

    def _download(self, text, pattern, missing_msg, start_msg, filename):
        """Download the first link in *text* matching *pattern* to *filename*.

        Prints *missing_msg* and does nothing else when no link matches;
        otherwise prints *start_msg*, fetches the link (relative to the site
        root) and writes the raw response bytes to *filename*.
        """
        links = re.findall(pattern, text)
        if not links:
            print(missing_msg)
            return

        print(start_msg)
        response = self.spyder.get(self._BASE_URL + links[0])
        with open(filename, "wb") as f:
            f.write(response.content)


def main():
    """Log in and download materials for each account ID in a fixed range."""
    # NOTE(review): this loop tries one fixed password against a range of
    # account IDs. Only run it against accounts you are authorized to access.
    for i in range(6300, 6500):
        user_name = "2014" + str(i)
        pass_word = "000000"
        spyder = Spyder(user_name, pass_word)
        try:
            spyder.login()
            spyder.get_paper()
        finally:
            # Release the session's pooled connections; one session is
            # created per iteration and was previously never closed.
            spyder.spyder.close()
        # Pause between accounts.
        time.sleep(1)


if __name__ == '__main__':
    main()
CQU毕设爬虫
猜你喜欢
转载自blog.csdn.net/wwxy1995/article/details/80303038
今日推荐
周排行