Learning Python: getting started with web scraping.

Step 1: scrape the URLs and titles

# coding=utf-8
import requests
import csv
from bs4 import BeautifulSoup

url = 'http://www.mzitu.com/'  # homepage of the site
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html = requests.get(url, headers=header)
# Parse with the built-in html.parser: slower than lxml, but universally available
soup = BeautifulSoup(html.text, 'html.parser')
total_page = soup.find('div', class_='nav-links').find_all('a')[3].text  # read the total page count
# The entries we actually want are all the <a> tags inside the first div with class='postlist'
print(total_page)  # total number of pages
'''
Generate the URLs of the listing pages
'''
def make_url(page):
    print("Generating URLs from the total page count")
    a = 2  # page 1 is the homepage itself, so start from page 2
    print("type of page:", type(page))
    url_list = []
    while True:
        if a <= int(page):
            mk_url = url + "page" + "/" + str(a) + "/"
            url_list.append(mk_url)
        else:
            break
        a += 1
        # print(a)
    return url_list
def get_url_title(url_list):
    csv_file = open("url_title.csv", "w", newline="", encoding="utf-8")
    writer = csv.writer(csv_file)
    writer.writerow(["URL", "Title"])
    list_a = []  # entries collected from the listing pages
    b = 0
    for i in url_list:
        page_url = i
        print("Fetching URL:", page_url)
        html = requests.get(page_url, headers=header)
        try:
            soup = BeautifulSoup(html.text, 'html.parser')
            all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        except AttributeError:
            print("Bad page")  # find() returned None: unexpected page layout
        else:
            print("Entering the loop")
            for a in all_a:
                b = b + 1
                # Each entry carries two <a> tags (thumbnail and title); keep every second one
                if b % 2 == 0:
                    link = a.get('href')
                    title = a.get_text()  # extract the title text
                    url_title = link + "," + title
                    list_a.append(url_title)
                    writer.writerow([link, title])
    csv_file.close()
    return list_a
all_page = make_url(total_page)
print(len(all_page))
print(get_url_title(all_page))
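
The script above works, but it will hang on a slow connection and crashes outright if any request fails. As a possible next refinement (my own sketch, not part of the original post), the version below adds request timeouts, HTTP status checks via raise_for_status(), a range()-based page generator, and a with block so the CSV file is closed even on errors. It assumes the same page structure as above (the div with class='nav-links' for the page count, the div with class='postlist' for the entries), and additionally assumes that only the title <a> tag carries text while the thumbnail <a> wraps just an <img>.

# A hardened sketch (my own refinement, not from the original post).
# Assumes the same mzitu.com layout as the script above.
import csv
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.mzitu.com/'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

def fetch(page_url):
    # Fail fast on network stalls and on non-2xx responses
    resp = requests.get(page_url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')

def page_urls(total):
    # Pages 2..total; page 1 is the homepage itself
    return [BASE_URL + "page/{}/".format(n) for n in range(2, int(total) + 1)]

def scrape(urls, out_path="url_title.csv"):
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL", "Title"])
        for page_url in urls:
            postlist = fetch(page_url).find('div', class_='postlist')
            if postlist is None:
                continue  # unexpected layout: skip the page instead of crashing
            for a in postlist.find_all('a', target='_blank'):
                title = a.get_text(strip=True)
                if title:  # assumption: the thumbnail <a> has no text, so this keeps only title links
                    writer.writerow([a.get('href'), title])

With this layout the b % 2 counter disappears: filtering on a.get_text() keeps only the title links, which also stays correct if the page ever reorders the two tags within an entry.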


Reposted from www.cnblogs.com/armyz6666666/p/9088307.html