import io
import sys
import time
import unittest

from bs4 import BeautifulSoup
from selenium import webdriver

# Force UTF-8 console output so Chinese room names can be printed on consoles
# whose default encoding is not UTF-8.  reconfigure() (Python 3.7+) switches
# the encoding in place and keeps the stream's buffering mode; the fallback
# re-wraps the underlying byte stream with line buffering so progress messages
# still appear as they are printed.  Streams that expose neither attribute
# (for example an IDE's replacement stdout) are left untouched instead of
# raising AttributeError at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", line_buffering=True
    )
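# The Windows 7 console may not default to UTF-8 output, so stdout is forced to UTF-8 here. gbk and gb18030 were also tried, but neither could handle the rare characters, so rows that fail to encode are simply skipped and counted.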
class Spider_Douyu(unittest.TestCase):
    """Douyu crawler: record each live room's name, viewer count and category.

    The crawl is written as a ``unittest.TestCase`` so that ``unittest.main()``
    drives it through the usual ``setUp`` -> ``test*`` -> ``tearDown`` cycle.
    Each listing page is appended to its own ``第<page>douyu.txt`` file, one
    ``name,viewers,category`` line per room.
    """

    START_URL = "https://www.douyu.com/directory/all"
    NEXT_PAGE_XPATH = "//div[@id='J-pager']/a[@class='shark-pager-next']"
    # Appears in the page source once the "next page" control is disabled.
    LAST_PAGE_MARKER = "shark-pager-disable-next"
    # Number of leading name/category spans dropped on every page (the
    # viewer-count spans are not trimmed).
    # NOTE(review): presumably these belong to a block outside the main room
    # list - confirm against the live page markup.
    LEADING_SKIP = 10

    def setUp(self):
        """Start a Chrome session and reset the page and skip counters."""
        self.page_num = 1  # 1-based number of the listing page being saved
        self.code = 1  # ordinal of the next row skipped for an encoding error
        self.driver = webdriver.Chrome()
        time.sleep(3)  # give the browser time to start

    def testDouyu(self):
        """Walk the listing pages and save the rooms found on each one.

        Stops after saving the page whose source contains
        ``LAST_PAGE_MARKER``; otherwise clicks the "next page" link and
        repeats.
        """
        self.driver.get(self.START_URL)
        time.sleep(2)
        print("进行时间为>>>" + time.ctime() + ">>>的斗鱼直播爬取...")
        while True:
            soup = BeautifulSoup(self.driver.page_source, "lxml")
            print("准备写入第" + str(self.page_num) + "页内容...")
            time.sleep(1)
            self._save_rows(self._extract_rows(soup))
            self.page_num += 1
            if self.LAST_PAGE_MARKER in self.driver.page_source:
                break
            time.sleep(2)
            print("点击进入下一页...")
            # find_element(by, value) exists in every Selenium release, whereas
            # the find_element_by_xpath shortcut was removed in Selenium 4.3.
            self.driver.find_element("xpath", self.NEXT_PAGE_XPATH).click()
            time.sleep(2)  # wait for the next page to render

    def _extract_rows(self, soup):
        """Return ``(name, viewers, category)`` string tuples for one page.

        The three span lists are paired positionally with ``zip``, so the
        result is as long as the shortest of them.
        """
        names = soup.find_all(
            "span", {"class": "dy-name ellipsis fl"}
        )[self.LEADING_SKIP:]
        counts = soup.find_all("span", {"class": "dy-num fr"})
        tags = soup.find_all(
            "span", {"class": "tag ellipsis"}
        )[self.LEADING_SKIP:]
        return [
            (
                name.get_text(),
                self._viewer_count(count.get_text()),
                tag.get_text(),
            )
            for name, count, tag in zip(names, counts, tags)
        ]

    @staticmethod
    def _viewer_count(text):
        """Expand a count such as ``"1.2万"`` to the string ``"12000"``.

        Text without the 万 (x10,000) suffix - including empty text - and
        text whose numeric part cannot be parsed is returned unchanged, so a
        single odd value never aborts the crawl.
        """
        stripped = text.strip()
        if not stripped.endswith("万"):
            return text
        try:
            # round() removes binary float artefacts (1.1 * 10000 would
            # otherwise be written as 11000.000000000002).
            return str(int(round(float(stripped[:-1]) * 10000)))
        except (ValueError, OverflowError):
            return text

    def _save_rows(self, rows):
        """Append *rows* to the current page's file as comma-joined lines.

        The file is written as UTF-8 so room names are not limited to the
        platform's default code page.  A row that still cannot be encoded is
        skipped and counted in ``self.code``.
        """
        if not rows:
            return  # nothing scraped: do not create an empty file
        path = "第" + str(self.page_num) + "douyu.txt"
        with open(path, "a", encoding="utf-8") as out:
            for row in rows:
                try:
                    out.write(",".join(row) + "\n")
                except UnicodeEncodeError:
                    print("忽略" + str(self.code) + "处生僻字符...")
                    self.code += 1

    def tearDown(self):
        """Report completion and close the browser session."""
        print("运行结束...")
        self.driver.quit()


if __name__ == "__main__":
    unittest.main()
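# The crawl is launched through unittest's main() entry point (an experiment with a new approach).
# For a good explanation of why rare characters fail to print, see https://blog.csdn.net/jim7424994/article/details/22675759#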