Crawling Meizitu (sister) images

Reposted from: https://blog.csdn.net/baidu_35085676/article/details/68958267

I ran the code from that post again. It parses the pages mainly with BeautifulSoup, but while running it can fail with `TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond`. Personally I think this is the site's anti-crawler mechanism kicking in; you can try switching to a different IP address (for example, by going through a proxy).
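If the timeouts really are caused by the site throttling your IP, one workaround is to route the requests through a proxy and add an explicit timeout plus a few retries. The sketch below is only an illustration of that idea, not part of the original script: the proxy address 127.0.0.1:8888 and the helper name get_with_retry are hypothetical placeholders, and it uses only the requests library that the script already depends on.

import requests
import time

# Hypothetical proxy address; replace it with a proxy you actually control.
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}

def get_with_retry(url, headers, retries=3, timeout=10):
    # Fetch a URL with an explicit timeout and a few retries,
    # optionally going through the proxy defined above.
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print('Attempt {} failed: {}'.format(attempt, e))
            time.sleep(2)
    return None

Calls such as `requests.get(ul, headers=Hostreferer)` in the script below could then be swapped for `get_with_retry(ul, Hostreferer)`, with a check that the result is not None before parsing.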

 

import requests
from bs4 import BeautifulSoup
import os
import time

all_url = 'http://www.mzitu.com'
# HTTP request headers; the Referer header bypasses the site's hotlink protection
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
# storage directory
path = 'E:/pythonFile/meititu/mei/'
# progress log file
data = 'E:/pythonFile/meititu/mei/.data'

# read the saved progress record
def get_log(file):
    page = 1
    line = 0
    try:
        with open(file, 'r') as f:
            l = f.readline()
            page, line = [int(i) for i in l.split('|')]
    except Exception as e:
        print(e)
        print('Failed to read the record, starting from the beginning')
    return page, line

# save the progress record
def put_log(file, page, line):
    try:
        with open(file, "w") as f:
            f.write('{}|{}'.format(page, line))
    except Exception as e:
        print('Failed to save the record: [{}]'.format(e))

# find the maximum page number of the list pages
def find_max_page():
    start_html = requests.get(all_url, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    max_page = int(max_page)
    return max_page

if __name__ == "__main__":
    same_url = 'http://www.mzitu.com/page/'
    max_page = find_max_page()
    page, line = get_log(data)
    print('Resuming from page {}, line {}'.format(page, line))
    for n in range(page, int(max_page)+1):
        ul = same_url + str(n)
        start_html = requests.get(ul, headers=Hostreferer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for lines in range(line, len(all_a)):
            a = all_a[lines]
            title = a.get_text()  # extract the album title text
            if(title != ''):
                print("Ready to crawl: " + title)
                # Windows cannot create a directory whose name contains '?'
                if(os.path.exists(path + title.strip().replace('?', ''))):
                    # print('Directory already exists')
                    flag = 1
                else:
                    os.makedirs(path + title.strip().replace('?', ''))
                    flag = 0
                os.chdir(path + title.strip().replace('?', ''))
                href = a['href']
                html = requests.get(href, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                # the maximum picture count is in the sixth span of the div with class='pagenavi'
                pic_max = mess.find("div", class_='pagenavi').find_all('span')
                print(pic_max)
                print(len(pic_max))  # check which span holds the max count; the page layout may change
                pic_max = pic_max[6].text  # maximum number of pictures
                print(pic_max)
                if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
                    print('Already saved, skipping')
                    continue
                for num in range(1, int(pic_max)+1):
                    while True:
                        pic = href + '/' + str(num)
                        html = requests.get(pic, headers=Hostreferer)
                        mess = BeautifulSoup(html.text, "html.parser")
                        pic_url = mess.find('img', alt=title)
                        if(pic_url):
                            break
                    # print(pic_url['src'])
                    html = requests.get(pic_url['src'], headers=Picreferer)
                    file_name = pic_url['src'].split(r'/')[-1]
                    f = open(file_name, 'wb')
                    f.write(html.content)
                    f.close()
                put_log(data, n, lines)
                time.sleep(0.5)
        print('Page', n, 'done')
        line = 0
        time.sleep(10)

 
