# "Meizitu" image crawler -- a beginner's web-scraping exercise.
#
# Copyright notice: original article by the author; do not repost without
# permission. Source: https://blog.csdn.net/Kompany4/article/details/72988188
import re
import urllib.request


def open_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser User-Agent header is sent because the target site rejects
    requests that use urllib's default User-Agent.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # Use the response as a context manager so the connection is closed
    # deterministically (the original leaked the response object).
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def get_img(html):
    """Find every ``.jpg`` image URL in *html* and download each one.

    Files are saved into the current working directory, named after the
    last path component of the image URL.
    """
    # Capture protocol-relative <img> sources that end in .jpg.
    imglist = re.findall(r'<img src="([^"]+\.jpg)', html)
    # The last matched address is unrelated to the gallery and was often
    # broken, so drop it -- but only when something was actually found
    # (the original unconditionally called pop() and raised IndexError
    # on a page with no matches).
    if imglist:
        imglist.pop()
    for each in imglist:
        # Sources are protocol-relative ("//host/path"); prepend a scheme.
        each = 'https:' + each
        print(each)
        filename = each.split("/")[-1]
        urllib.request.urlretrieve(each, filename, None)
    print('图片下载完成!!!!')
def get_Ye(html):
    """Return the current page number (as a string) parsed from *html*.

    The site renders it as ``<span class="current-comment-page">[N]``.
    """
    pattern = re.compile(r'<span class="current-comment-page">\[(.+)]')
    matches = pattern.findall(html)
    return matches[0]
if __name__ == '__main__':

    FirstUrl = 'http://jandan.net/ooxx'
    # The front page shows the newest (highest) page number.
    NowYe = int(get_Ye(open_url(FirstUrl)))
    print('当前页数为:%d'%NowYe)

    while True:
        # Re-prompt on non-numeric input instead of crashing with
        # ValueError (the original passed input() straight to int()).
        try:
            Ywant = int(input('请输入你想下载的页数:'))
        except ValueError:
            print('请重新输入页数:范围在【%d,0)中'%NowYe)
            continue
        if Ywant<=NowYe and Ywant >0:
            # Walk backwards from the newest page, Ywant pages in total.
            # (A stray bare `print` expression -- a Python 2 leftover
            # that does nothing in Python 3 -- was removed here.)
            for i in range(Ywant):
                url = 'http://jandan.net/ooxx/page-'+str(NowYe-i)+'#comments'
                get_img(open_url(url))
            break
        else:
            print('请重新输入页数:范围在【%d,0)中'%NowYe)

# Adapted from blog.csdn.net/Kompany4/article/details/72988188