本来想爬一波无聊图,唉,没想到图片地址竟然加密了……
还好只是 base64 编码。
不说了,代码献上(2018.12.14 测试有效)。
import base64
import os
import time

import requests
from bs4 import BeautifulSoup

# Script overview:
#   1. Fetch listing pages 50..78 of jandan.net/ooxx and collect the text of
#      the first <p> in every <li> of the "commentlist" <ol>.
#   2. Base64-decode each collected text and prefix it with "http:" to form
#      a download URL.
#   3. Download every URL into jdimg/<index>.jpg.

# Browser-like User-Agent sent with every listing-page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# Upper bound (seconds) for any single HTTP request, so that one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 10
# Folder the downloaded images are written to.
IMAGE_DIR = 'jdimg'

# ---- Step 1: collect the encoded strings from the listing pages ----------
base64_list = []
print('====开始爬取=====')
starttime = time.time()
for i in range(50, 79):
    url = 'http://jandan.net/ooxx/page-{}#comments'.format(i)
    try:
        r = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Best effort: a page that cannot be fetched is skipped.
        print('第{}页请求失败,已跳过: {}'.format(i, exc))
        continue
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    ol = soup.find(name='ol', attrs={"class": "commentlist"})
    if ol is None:
        # No comment list on this page (layout change, error page, ...).
        continue
    for li in ol.find_all(name='li'):
        p = li.find(name='p')
        if p is None:
            continue
        text = p.text
        # Entries whose text spans several lines are skipped; presumably
        # these are not bare encoded strings -- verify against the live page.
        if '\n' in text:
            continue
        base64_list.append(text)
endtime = time.time()
inttime = endtime - starttime
print('=====爬取结束====\n用时{}秒'.format(inttime))

# ---- Step 2: decode the collected strings into URLs ----------------------
print('=====开始解析====')
full_url = []  # fully decoded download URLs
for www in base64_list:
    try:
        # Both base64 errors (binascii.Error) and UTF-8 decoding errors
        # (UnicodeDecodeError) are subclasses of ValueError.
        decoded = base64.b64decode(www).decode()
    except ValueError:
        # Marker printed once per entry that could not be decoded.
        print(1)
        continue
    # The decoded text is prefixed with the scheme; presumably it is a
    # protocol-relative address ("//host/path") -- verify.
    full_url.append('http:' + decoded)
print('=====解析完毕=====')

# ---- Step 3: download every URL into the local folder --------------------
print('===载入本地文件夹===')
# The target folder must exist before the first open(..., 'wb').
os.makedirs(IMAGE_DIR, exist_ok=True)
image_start_time = time.time()
for index, item in enumerate(full_url):
    full_path = os.path.join(IMAGE_DIR, str(index) + '.jpg')
    try:
        content = requests.get(item, timeout=REQUEST_TIMEOUT).content
    except requests.RequestException as exc:
        # Best effort: one broken link must not abort the remaining downloads.
        print('第{}张图片下载失败,已跳过: {}'.format(index, exc))
        continue
    with open(full_path, 'wb') as f:
        f.write(content)
end_time_time = time.time()
# %.2f prints seconds with two decimals; the previous %.2s truncated the
# number to its first two characters.
print('用时%.2f秒' % (end_time_time - image_start_time))