抓取特定类名的标签,分别打印出来,并输出到 txt 文件中
#CrowTaobaoPrice.py
import requests
from bs4 import BeautifulSoup
import bs4
import re
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any request failure.

    Uses apparent_encoding so pages that send a wrong or missing charset
    header still decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx status codes into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C and programming errors
        # are no longer silently swallowed; network/HTTP failures yield "".
        return ""
def parsePage(ilt, html):
    """Parse one listing page and append [price, title] pairs to *ilt*.

    Prices come from <em class="sale-price"> tags and titles from the <a>
    inside <div class="goods-name"> tags; pairs are matched positionally.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        prices = soup.find_all('em', class_="sale-price")
        titles = soup.find_all('div', class_="goods-name")
        # zip stops at the shorter list, so one missing title no longer
        # aborts the whole page with an IndexError.
        for price_tag, title_tag in zip(prices, titles):
            ilt.append([price_tag.contents[0], title_tag.a.contents[0]])
    except (AttributeError, IndexError) as err:
        # Was a bare `except:` that printed an empty line, hiding every
        # failure; report what went wrong instead.
        print("parsePage error:", err)
def printGoodsList(ilt):
    r"""Print the numbered goods list and append the same rows to H:\a\a.txt.

    *ilt* is a list of [price, title] pairs as built by parsePage.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    # `with` guarantees the file is closed even if formatting a row raises;
    # the original left the handle open on any exception.
    with open(r'H:\a\a.txt', 'a', encoding='utf-8') as f:
        for count, g in enumerate(ilt, start=1):
            print(tplt.format(count, g[0], g[1]))
            print("\n")
            f.write(tplt.format(count, g[0], g[1]))
            f.write("\n")
def main():
    """Crawl listing pages 1..depth-1 and print/save the collected goods."""
    depth = 3
    start_url = 'http://www.51mkf.com/shop/cate-1034-0-0-0-0-0-0-0-0-'
    infoList = []
    # Pages are numbered from 1: range(1, depth) replaces the original
    # `if i == 0: continue` workaround.
    for page in range(1, depth):
        try:
            html = getHTMLText(start_url + str(page) + '.html')
            parsePage(infoList, html)
        except Exception:
            # Best-effort: skip a broken page and keep crawling the rest.
            # Narrowed from a bare `except:` so Ctrl-C still interrupts.
            continue
    printGoodsList(infoList)


main()
输出到 Excel 上
首先安装依赖: pip3 install xlwt
然后建立一个 Excel 文档
#CrowTaobaoPrice.py
import requests
import re
from bs4 import BeautifulSoup
import bs4
import xlwt
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any request failure.

    Uses apparent_encoding so pages that send a wrong or missing charset
    header still decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx status codes into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C and programming errors
        # are no longer silently swallowed; network/HTTP failures yield "".
        return ""
def parsePage(ilt, html):
    """Parse one listing page and append [price, title] pairs to *ilt*.

    Prices come from <em class="sale-price"> tags and titles from the <a>
    inside <div class="goods-name"> tags; pairs are matched positionally.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        prices = soup.find_all('em', class_="sale-price")
        titles = soup.find_all('div', class_="goods-name")
        # zip stops at the shorter list, so one missing title no longer
        # aborts the whole page with an IndexError.
        for price_tag, title_tag in zip(prices, titles):
            ilt.append([price_tag.contents[0], title_tag.a.contents[0]])
    except (AttributeError, IndexError) as err:
        # Was a bare `except:` that printed an empty line, hiding every
        # failure; report what went wrong instead.
        print("parsePage error:", err)
def printGoodsList(ilt):
    """Write the goods list to H:/a/a.xls: a header row, then one row per item.

    *ilt* is a list of [price, title] pairs as built by parsePage; column 0
    carries a 1-based sequence number.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    # Renamed from `list`, which shadowed the builtin.
    headers = ["序号", "价格", "名称"]
    for col, caption in enumerate(headers):
        worksheet.write(0, col, label=str(caption))
    # Data rows start at row 1, directly below the header.
    for row, g in enumerate(ilt, start=1):
        worksheet.write(row, 0, label=row)
        for col in range(2):
            worksheet.write(row, col + 1, label=g[col])
    workbook.save('H:/a/a.xls')
def main():
    """Crawl listing pages 1..depth-1 and save the collected goods to Excel."""
    depth = 3
    start_url = 'http://www.51mkf.com/shop/cate-1034-0-0-0-0-0-0-0-0-'
    infoList = []
    # Pages are numbered from 1: range(1, depth) replaces the original
    # `if i == 0: continue` workaround.
    for page in range(1, depth):
        try:
            html = getHTMLText(start_url + str(page) + '.html')
            parsePage(infoList, html)
        except Exception:
            # Best-effort: skip a broken page and keep crawling the rest.
            # Narrowed from a bare `except:` so Ctrl-C still interrupts.
            continue
    printGoodsList(infoList)


main()