Learning Python: getting started with web scraping.

Step 1: scrape the URLs and titles

# coding=utf-8
import requests
import csv
from bs4 import BeautifulSoup

url = 'http://www.mzitu.com/'  # homepage of the site
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html = requests.get(url, headers=header)
# Parse with the built-in html.parser: slower than lxml, but universally available
soup = BeautifulSoup(html.text, 'html.parser')
total_page = soup.find('div', class_='nav-links').find_all('a')[3].text  # read the total page count
# The entries we actually want are all the <a> tags inside the first div with class='postlist'
print(total_page)  # total number of pages
'''
Generate the URLs of the listing pages
'''
def make_url(page):
    print("Generating URLs from the total page count")
    a = 2  # page 1 is the homepage itself, so start from page 2
    print("type of page:", type(page))
    url_list = []
    while True:
        if a <= int(page):
            mk_url = url + "page" + "/" + str(a) + "/"
            url_list.append(mk_url)
        else:
            break
        a += 1
        # print(a)
    return url_list
def get_url_title(url_list):
    csv_file = open("url_title.csv", "w", newline="", encoding="utf-8")
    writer = csv.writer(csv_file)
    writer.writerow(["URL", "Title"])
    list_a = []  # entries collected from the listing pages
    b = 0
    for i in url_list:
        page_url = i
        print("Fetching URL:", page_url)
        html = requests.get(page_url, headers=header)
        try:
            soup = BeautifulSoup(html.text, 'html.parser')
            all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        except AttributeError:
            print("Bad page")  # find() returned None: unexpected page layout
        else:
            print("Entering the loop")
            for a in all_a:
                b = b + 1
                # Each entry carries two <a> tags (thumbnail and title); keep every second one
                if b % 2 == 0:
                    link = a.get('href')
                    title = a.get_text()  # extract the title text
                    url_title = link + "," + title
                    list_a.append(url_title)
                    writer.writerow([link, title])
    csv_file.close()
    return list_a
all_page = make_url(total_page)
print(len(all_page))
print(get_url_title(all_page))
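
The script above works, but it will hang on a slow connection and crashes outright if any request fails. As a possible next refinement (my own sketch, not part of the original post), the version below adds request timeouts, HTTP status checks via raise_for_status(), a range()-based page generator, and a with block so the CSV file is closed even on errors. It assumes the same page structure as above (the div with class='nav-links' for the page count, the div with class='postlist' for the entries), and additionally assumes that only the title <a> tag carries text while the thumbnail <a> wraps just an <img>.

# A hardened sketch (my own refinement, not from the original post).
# Assumes the same mzitu.com layout as the script above.
import csv
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.mzitu.com/'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

def fetch(page_url):
    # Fail fast on network stalls and on non-2xx responses
    resp = requests.get(page_url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')

def page_urls(total):
    # Pages 2..total; page 1 is the homepage itself
    return [BASE_URL + "page/{}/".format(n) for n in range(2, int(total) + 1)]

def scrape(urls, out_path="url_title.csv"):
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL", "Title"])
        for page_url in urls:
            postlist = fetch(page_url).find('div', class_='postlist')
            if postlist is None:
                continue  # unexpected layout: skip the page instead of crashing
            for a in postlist.find_all('a', target='_blank'):
                title = a.get_text(strip=True)
                if title:  # assumption: the thumbnail <a> has no text, so this keeps only title links
                    writer.writerow([a.get('href'), title])

With this layout the b % 2 counter disappears: filtering on a.get_text() keeps only the title links, which also stays correct if the page ever reorders the two tags within an entry.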


Reposted from www.cnblogs.com/armyz6666666/p/9088307.html