# Crawl Qiushibaike image pages (first 5 pages) using regular expressions.
# For each page: fetch the HTML with requests, extract every image URL with
# re.findall, then download each image to ./imges/ via urllib.
import requests              # fetch page HTML
from urllib import request   # request.urlretrieve makes downloading images easy
import re                    # regular-expression parsing of the HTML

# Running image counter across all pages, used in the output filename.
k = 0
for i in range(1, 6):
    # Plain GET request, one page of the image ranking per iteration.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # UA header so the request looks like a normal browser and is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object.
    res = requests.get(url, headers=headers)
    # findall returns a list of captured src values; re.S lets '.' match
    # newlines so the pattern can span the multi-line <div> markup.
    # NOTE(review): pattern reconstructed from a garbled source — confirm it
    # matches the site's current <div class="thumb"> markup.
    img_urls = re.findall('<div class="thumb">.*?<img src="(.*?)".*?</div>', res.text, re.S)
    for img_url in img_urls:
        k += 1
        # The scraped src is protocol-relative (//...), so prepend the scheme.
        img_url = 'https:' + img_url
        # Destination path: page number + running counter as the filename.
        # NOTE(review): assumes the ./imges/ directory already exists.
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
## bs4 documentation: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
# Crawl Qiushibaike image pages (first 5 pages) using BeautifulSoup (bs4).
# Same flow as the regex version: fetch HTML, parse out <img> tags by class,
# then download each image to ./imges/ via urllib.
import requests                # fetch page HTML
from bs4 import BeautifulSoup  # HTML parsing / data extraction
from urllib import request     # request.urlretrieve makes downloading images easy

# Running image counter across all pages, used in the output filename.
k = 0
for i in range(1, 6):
    # Bug fix: the URL was hard-coded to page/1/, so the loop fetched the
    # same page five times; use the page index like the other versions do.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # UA header so the request looks like a normal browser and is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object.
    res = requests.get(url, headers=headers)
    text = res.text
    # Instantiate a BeautifulSoup object over the page, parsed with lxml.
    soup = BeautifulSoup(text, "lxml")
    # Find every tag carrying the image CSS class.
    # NOTE(review): class name reconstructed from a garbled source — confirm
    # the site uses class="illustration" on its <img> tags.
    img_tags = soup.find_all(class_="illustration")
    # Walk the matched tags and download each image.
    for img_tag in img_tags:
        k += 1
        # The src attribute is protocol-relative (//...), so prepend the scheme.
        img_url = "https:" + img_tag.get("src")
        # Destination path: page number + running counter as the filename.
        # NOTE(review): assumes the ./imges/ directory already exists.
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
## lxml (XPath) version
# Crawl Qiushibaike image pages (first 5 pages) using lxml + XPath.
# Same flow as the other versions: fetch HTML, select image src attributes
# with an XPath expression, then download each image to ./imges/ via urllib.
import requests              # fetch page HTML
from lxml import etree       # HTML parsing / XPath data extraction
from urllib import request   # request.urlretrieve makes downloading images easy

# Running image counter across all pages, used in the output filename.
k = 0
for i in range(1, 6):
    # Bug fix: the URL was hard-coded to page/1/, so the loop fetched the
    # same page five times; use the page index like the regex version does.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # UA header so the request looks like a normal browser and is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Fetch the response object.
    res = requests.get(url, headers=headers)
    text = res.text
    # Convert the HTML string into an element tree for XPath queries.
    html = etree.HTML(text)
    # Select the src of every <img> inside a <div class="thumb">.
    img_urls = html.xpath("//div[@class='thumb']//img/@src")
    for img_url in img_urls:
        # The scraped src is protocol-relative (//...), so prepend the scheme.
        img_url = "https:" + img_url
        k += 1
        # Destination path: page number + running counter as the filename.
        # NOTE(review): assumes the ./imges/ directory already exists.
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
        print("正在下载ing:%s" % img_url)