今天实现一个使用python requests模块爬取http://www.mzitu.com/xinggan/网站的妹子图片,并保存到本地,效果如下:
先说下思路:
- 获取所有的url页
- 获取当前页的所有url和标题,根据标题创建文件夹
- 解析指定url,下载图片到该url创建的文件夹中
获取所有的url
可以看到,这里一共有81页,所有的url格式固定,后面跟的就是页数了
http://www.mzitu.com/xinggan/page/2/
def get_all_urls(self):
    """Return the URLs of all 81 listing pages of the "xinggan" section."""
    urls_list = []
    for i in range(1, 82):
        # Listing pages are simply .../page/1 .. .../page/81
        url = "http://www.mzitu.com/xinggan/page/{0}".format(i)
        urls_list.append(url)
    return urls_list
获取当前页的所有url和标题
scrapy shell http://www.mzitu.com/xinggan/page/1
# Fetch one listing page and return {gallery_url: title} for every gallery on it.
def get_title_urls(self, url):
    """Scrape a listing page and map each gallery's detail URL to its title.

    Downloads *url* with the shared request headers, then extracts the
    href and text of every ``#pins span a`` anchor.
    """
    map_title_url = {}
    response = requests.get(url, headers=headers)
    selector = Selector(text=response.text)
    # Don't shadow the *url* parameter with the extracted href list.
    hrefs = selector.css("#pins span a::attr(href)").extract()
    titles = selector.css("#pins span a::text").extract()
    for href, title in zip(hrefs, titles):
        map_title_url[href] = title
    return map_title_url
获取指定url的所有图片链接
- 获取图片url
response.css(".main-image img::attr(src)").extract()[0]
- 获取当前url所有图片链接
可以看到,其实不用访问所有的url页面来获取图片地址,因为关于一个主题的url图片存放是有规律的,比如后面都是页数,因此,我们可以用代码构造出这些图片的url
获取当前的总页数
可以看到,获取当前总页数,只需要获取 'pagenavi' 下的倒数第二个span的内容即可
获取当前主题下的所有图片url
# Collect every image URL belonging to one gallery (theme).
def get_image_urls(self, url):
    """Return the list of all image URLs of the gallery at *url*.

    The second-to-last <span> inside ``.pagenavi`` holds the gallery's
    total page count; the remaining image URLs follow a predictable
    numbering scheme derived from the first page's image.
    """
    selector = self.get_selector(url)
    # Total page count of this gallery.
    pages_nav = selector.css(".pagenavi a span").extract()
    total_page = pages_nav[len(pages_nav) - 2]
    match_obj = re.match(r"<span>(\d+)</span>", total_page)
    if match_obj:
        total_page = match_obj.group(1)
    # Image shown on the gallery's first page.
    origin_img_url = selector.css(".main-image img::attr(src)").extract()[0]
    return self.get_image_urls_step(origin_img_url, total_page)
# Derive every image URL in a gallery from the first image's URL.
def get_image_urls_step(self, image_url, page_number):
    """Build the image URLs for pages 1..page_number of a gallery.

    Image files share a common prefix and end in a two-digit page
    number, e.g. .../24b01.jpg, .../24b02.jpg, ... so each URL can be
    constructed by swapping the trailing two digits of the file stem.
    Returns an empty list when *image_url* does not look like a jpg URL.
    """
    import re
    image_url_list = []
    # File-name stem, e.g. "24b01"; its last two chars are the page number.
    match_obj = re.match(r'.*/(.*).jpg', image_url)
    if match_obj:
        match_result = match_obj.group(1)
        # Guard the loop with the match so a non-matching URL cannot
        # reference an unbound match_result.
        for i in range(1, int(page_number) + 1):
            page_suffix = "0" + str(i) if i < 10 else str(i)
            next_stem = match_result.replace(match_result[-2:], page_suffix, 1)
            image_url_list.append(image_url.replace(match_result, next_stem, 1))
    return image_url_list
可以看到,已经正确打印出当前的url了
完整代码
在阅读完整代码之前,我还是先捋一下思路:
- 获取当前sex模块的所有页面的url
- 获取当前页的所有图片url和标题,标题用来创建文件夹
- 下载给定url的图片到本地
下面是完整代码
# -*- coding: utf-8 -*-
import requests
import re
import os
from scrapy.selector import Selector
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'
}
class GetSexGirl(object):
    """Scraper for the mzitu.com "xinggan" section.

    Workflow: enumerate the listing pages, map each gallery URL to its
    title, derive every image URL of a gallery from the first image's
    URL, then download the images with the shared request headers.
    """

    # Build the list of all 81 listing-page URLs.
    def get_all_urls(self):
        """Return the URLs of listing pages 1-81."""
        urls_list = []
        for i in range(1, 82):
            urls_list.append("http://www.mzitu.com/xinggan/page/{0}".format(i))
        return urls_list

    # Collect every gallery URL and its title from one listing page.
    def get_title_urls(self, url):
        """Return {gallery_detail_url: title} for the listing page at *url*."""
        map_title_url = {}
        selector = self.get_selector(url)
        hrefs = selector.css("#pins span a::attr(href)").extract()
        titles = selector.css("#pins span a::text").extract()
        for href, title in zip(hrefs, titles):
            map_title_url[href] = title
        return map_title_url

    # Collect every image URL belonging to one gallery (theme).
    def get_image_urls(self, url):
        """Return all image URLs of the gallery at *url*."""
        print('get_image_urls url is :'+url)
        selector = self.get_selector(url)
        # The second-to-last <span> in .pagenavi holds the total page count.
        pages_nav = selector.css(".pagenavi a span").extract()
        total_page = pages_nav[len(pages_nav) - 2]
        match_obj = re.match(r"<span>(\d+)</span>", total_page)
        if match_obj:
            total_page = match_obj.group(1)
        # Image of the gallery's first page; the rest are derived from it.
        origin_img_url = selector.css(".main-image img::attr(src)").extract()[0]
        return self.get_image_urls_step(origin_img_url, total_page)

    # Derive every image URL in a gallery from the first image's URL.
    def get_image_urls_step(self, image_url, page_number):
        """Return image URLs for pages 1..page_number.

        File names share a prefix and end with a two-digit page number
        (e.g. .../24b01.jpg, .../24b02.jpg). Returns [] when *image_url*
        does not match the expected jpg pattern.
        """
        image_url_list = []
        # File-name stem, e.g. "24b01"; last two chars are the page number.
        match_obj = re.match(r'.*/(.*).jpg', image_url)
        if match_obj:
            match_result = match_obj.group(1)
            for i in range(1, int(page_number) + 1):
                page_suffix = "0" + str(i) if i < 10 else str(i)
                next_stem = match_result.replace(match_result[-2:], page_suffix, 1)
                image_url_list.append(image_url.replace(match_result, next_stem, 1))
        return image_url_list

    # Download one image into base_dir.
    def download_image(self, base_dir, image_url):
        """Download *image_url* and save it as <base_dir><stem>.jpg.

        Download failures are skipped silently (best-effort crawl).
        """
        print('basedir is :'+base_dir+' image_url is :'+image_url)
        # image_url looks like http://i.meizitu.net/2017/04/24b01.jpg -> "24b01"
        image_name = image_url[-9:-4]
        # BUG FIX: the original concatenated image_name twice, producing
        # names like "24b0124b01.jpg".
        file_name = base_dir + "{}.jpg".format(image_name)
        print('file_name is :'+file_name)
        try:
            img_response = requests.get(image_url, headers=headers)
        except requests.RequestException:
            # Narrowed from a bare except; keep the best-effort skip.
            return
        # 'wb' (not 'ab'): re-running must overwrite, not append to, the jpg.
        with open(file_name, 'wb') as f:
            # Binary payload must come from response.content, not .text.
            f.write(img_response.content)

    def get_selector(self, url):
        """GET *url* with the shared headers and wrap the HTML in a Selector."""
        response = requests.get(url, headers=headers)
        return Selector(text=response.text)
if __name__ == '__main__':
    get_sex_girl = GetSexGirl()
    # Every listing-page URL of the "xinggan" section.
    total_url_list = get_sex_girl.get_all_urls()
    for page_url in total_url_list:
        # BUG FIX: the original always scraped the hard-coded page 2;
        # use each listing page's own URL. map_title_url: {gallery_url: title}
        map_title_url = get_sex_girl.get_title_urls(page_url)
        for url in map_title_url.keys():
            # One directory per gallery, named after its title.
            base_dir = "/home/liuhang/code/sexgirl/{0}/".format(map_title_url.get(url))
            if not os.path.exists(base_dir):
                print(map_title_url.get(url)+' 主题的文件夹不存在, 创建它...base_dir is:' + base_dir)
                os.makedirs(base_dir)
            # All image URLs of this gallery, then download each one.
            image_url_list = get_sex_girl.get_image_urls(url)
            for image_url in image_url_list:
                get_sex_girl.download_image(base_dir, image_url)