Python web scraping practice: xpath / bs4 / re

# Crawl pictures from Qiushibaike (first 5 pages) — regular-expression variant.
import requests              # fetch the page HTML
from urllib import request   # urlretrieve makes saving images easy
import re                    # regex-based extraction

k = 0  # running image counter across all pages
for i in range(1, 6):
    # Each page of the image ranking lives under /imgrank/page/<n>/
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # Send a browser User-Agent so the request is not rejected as a bot
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object
    res = requests.get(url, headers=headers)
    # findall returns a list of the captured src values; re.S lets '.'
    # match newlines so the pattern can span the whole <div> block.
    # NOTE(review): pattern reconstructed from a garbled original — confirm
    # against the site's current markup.
    img_urls = re.findall(r'<div class="thumb">.*?<img src="(.*?)" height="auto">.*?</div>', res.text, re.S)
    for img_url in img_urls:
        k += 1
        # src values are protocol-relative (//pic...), so prepend the scheme
        img_url = 'https:' + img_url
        # Destination path encodes page number and running counter
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)

## bs4 documentation: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

# Crawl pictures from Qiushibaike (first 5 pages) — BeautifulSoup variant.
import requests                    # fetch the page HTML
from bs4 import BeautifulSoup      # HTML parsing
from urllib import request         # urlretrieve makes saving images easy

k = 0  # running image counter across all pages
for i in range(1, 6):
    # BUG FIX: the original hardcoded page/1/ and downloaded the first
    # page five times; use the loop variable to walk pages 1-5.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # Send a browser User-Agent so the request is not rejected as a bot
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object
    res = requests.get(url, headers=headers)
    text = res.text
    # Parse the HTML with the lxml backend
    soup = BeautifulSoup(text, "lxml")
    # Every picture <img> on this page carries class="illustration"
    img_tags = soup.find_all(class_="illustration")
    # Download each image
    for img_tag in img_tags:
        k += 1
        # src values are protocol-relative (//pic...), so prepend the scheme
        img_url = "https:" + img_tag.get("src")
        # Destination path encodes page number and running counter
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)

## lxml / xpath

# Crawl pictures from Qiushibaike (first 5 pages) — lxml/xpath variant.
import requests              # fetch the page HTML
from lxml import etree       # HTML parsing / xpath queries
from urllib import request   # urlretrieve makes saving images easy

k = 0  # running image counter across all pages
for i in range(1, 6):
    # BUG FIX: the original hardcoded page/1/ and downloaded the first
    # page five times; use the loop variable to walk pages 1-5.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # Send a browser User-Agent so the request is not rejected as a bot
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object
    res = requests.get(url, headers=headers)
    text = res.text
    # Parse the HTML string into an element tree for xpath queries
    html = etree.HTML(text)
    # Grab the src attribute of every <img> inside the thumb containers
    img_urls = html.xpath("//div[@class='thumb']//img/@src")
    for img_url in img_urls:
        # src values are protocol-relative (//pic...), so prepend the scheme
        img_url = "https:" + img_url
        k += 1
        # Destination path encodes page number and running counter
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
        print("正在下载ing:%s" % img_url)

 

Guess you like

Origin www.cnblogs.com/helloboke/p/11494671.html