This post is reposted from https://blog.csdn.net/baidu_35085676/article/details/68958267
I re-ran the code from that post. It parses pages mainly with BeautifulSoup, but while it runs you may hit: TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or the host failed to respond. Personally, I think this is the site's anti-scraper mechanism; you can try switching IP addresses.
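Since the suggested fix is switching IPs, here is a minimal sketch of what that might look like with requests. The addresses in PROXIES are placeholders you would replace with real proxies, and the explicit timeout is my own addition, not part of the original code.

import random
import requests

# Placeholder proxy pool -- substitute real proxy addresses here.
PROXIES = [
    'http://127.0.0.1:8080',
    'http://127.0.0.1:8081',
]

def get_with_proxy(url, headers, timeout=10):
    # Route the request through a randomly chosen proxy; the explicit
    # timeout turns a silent hang into a quick, catchable exception.
    proxy = random.choice(PROXIES)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=timeout)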
import requests
from bs4 import BeautifulSoup
import os
import time

all_url = 'http://www.mzitu.com'
# HTTP request headers
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# Headers that defeat the site's hotlink protection
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
# Save directory
path = 'E:/pythonFile/meititu/mei/'
# Progress log file
data = 'E:/pythonFile/meititu/mei/.data'

# Read the saved progress record
def get_log(file):
    page = 1
    line = 0
    try:
        with open(file, 'r') as f:
            l = f.readline()
            page, line = [int(i) for i in l.split('|')]
    except Exception as e:
        print(e)
        print('Failed to read the record, starting from the beginning')
    return page, line

# Save the progress record
def put_log(file, page, line):
    try:
        with open(file, 'w') as f:
            f.write('{}|{}'.format(page, line))
    except Exception as e:
        print('Failed to save the record: [{}]'.format(e))

# Find the maximum number of list pages
def find_max_page():
    start_html = requests.get(all_url, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    max_page = int(max_page)
    return max_page

if __name__ == "__main__":
    same_url = 'http://www.mzitu.com/page/'
    max_page = find_max_page()
    page, line = get_log(data)
    print('Resuming from page {}, entry {}'.format(page, line))
    for n in range(page, int(max_page) + 1):
        ul = same_url + str(n)
        start_html = requests.get(ul, headers=Hostreferer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for lines in range(line, len(all_a)):
            a = all_a[lines]
            title = a.get_text()  # extract the album title
            if title != '':
                print("About to scrape: " + title)
                # Windows cannot create a directory whose name contains '?'
                if os.path.exists(path + title.strip().replace('?', '')):
                    # print('Directory already exists')
                    flag = 1
                else:
                    os.makedirs(path + title.strip().replace('?', ''))
                    flag = 0
                os.chdir(path + title.strip().replace('?', ''))
                href = a['href']
                html = requests.get(href, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                # The maximum picture count sits in a span of the div with class='pagenavi'
                pic_max = mess.find("div", class_='pagenavi').find_all('span')
                print(pic_max)
                print(len(pic_max))  # check which span holds the count; the page layout may change
                pic_max = pic_max[6].text  # maximum picture count
                print(pic_max)
                if flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max):
                    print('Already saved, skipping')
                    continue
                for num in range(1, int(pic_max) + 1):
                    while True:
                        pic = href + '/' + str(num)
                        html = requests.get(pic, headers=Hostreferer)
                        mess = BeautifulSoup(html.text, "html.parser")
                        pic_url = mess.find('img', alt=title)
                        if pic_url:
                            break
                    # print(pic_url['src'])
                    html = requests.get(pic_url['src'], headers=Picreferer)
                    file_name = pic_url['src'].split('/')[-1]
                    f = open(file_name, 'wb')
                    f.write(html.content)
                    f.close()
                put_log(data, n, lines)
                time.sleep(0.5)
        print('Page', n, 'done')
        line = 0
        time.sleep(10)