一个完整的python抓取程序(静态页面)

抓取特定类名的标签,并分别打印出来

输出到txt文件上

#CrowTaobaoPrice.py
import requests
from bs4 import BeautifulSoup
import bs4
import re

def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any request failure.

    Args:
        url: page URL to download.

    Returns:
        The response body as text (encoding guessed from content), or an
        empty string when the request fails or returns an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the body, which is more reliable than the
        # header-declared encoding for these Chinese-language pages
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only swallow network/HTTP errors; a bare `except:` would also hide
        # KeyboardInterrupt and programming bugs.
        return ""
def parsePage(ilt, html):
    """Extract [price, title] pairs from a listing page and append them to *ilt*.

    Prices come from <em class="sale-price"> tags, titles from the <a> inside
    <div class="goods-name"> tags.

    Args:
        ilt: result list, mutated in place.
        html: raw HTML of one listing page.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        pricelist = soup.find_all('em', class_="sale-price")
        titlelist = soup.find_all('div', class_="goods-name")
        # zip stops at the shorter list, so a page with mismatched counts
        # cannot raise IndexError as the original index loop could
        for price_tag, title_tag in zip(pricelist, titlelist):
            price = price_tag.contents[0]
            title = title_tag.a.contents[0]
            ilt.append([price, title])
    except (AttributeError, IndexError) as exc:
        # A malformed entry (e.g. goods-name div without an <a>) aborts this
        # page; report it instead of silently printing an empty line.
        print("parsePage: skipped malformed page (%s)" % exc)
def printGoodsList(ilt, path=r'H:\a\a.txt'):
    """Print the goods list to stdout and append it to a text file.

    Args:
        ilt: list of [price, title] pairs.
        path: output text file (opened in append mode). Defaults to the
              original hard-coded location for backward compatibility.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    # `with` guarantees the file is closed even if a row raises mid-loop;
    # the original left the handle open on any exception.
    with open(path, 'a', encoding='utf-8') as f:
        for count, g in enumerate(ilt, start=1):
            line = tplt.format(count, g[0], g[1])
            print(line)
            print("\n")
            f.write(line)
            f.write("\n")
def main():
    """Crawl listing pages 1..depth-1 of the category and print/save results."""
    depth = 3
    start_url = 'http://www.51mkf.com/shop/cate-1034-0-0-0-0-0-0-0-0-'
    infoList = []
    # The original looped range(depth) and skipped i == 0 with `continue`;
    # range(1, depth) states the intent directly (pages start at 1).
    for i in range(1, depth):
        try:
            url = start_url + str(i) + '.html'
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: one failed page must not abort the rest.
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()

输出到excel上

首先安装依赖:pip3 install xlwt

然后建立一个excel文档

#CrowTaobaoPrice.py
import requests
import re
from bs4 import BeautifulSoup
import bs4
import xlwt



def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any request failure.

    Args:
        url: page URL to download.

    Returns:
        The response body as text (encoding guessed from content), or an
        empty string when the request fails or returns an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the body, which is more reliable than the
        # header-declared encoding for these Chinese-language pages
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only swallow network/HTTP errors; a bare `except:` would also hide
        # KeyboardInterrupt and programming bugs.
        return ""
def parsePage(ilt, html):
    """Extract [price, title] pairs from a listing page and append them to *ilt*.

    Prices come from <em class="sale-price"> tags, titles from the <a> inside
    <div class="goods-name"> tags.

    Args:
        ilt: result list, mutated in place.
        html: raw HTML of one listing page.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        pricelist = soup.find_all('em', class_="sale-price")
        titlelist = soup.find_all('div', class_="goods-name")
        # zip stops at the shorter list, so a page with mismatched counts
        # cannot raise IndexError as the original index loop could
        for price_tag, title_tag in zip(pricelist, titlelist):
            price = price_tag.contents[0]
            title = title_tag.a.contents[0]
            ilt.append([price, title])
    except (AttributeError, IndexError) as exc:
        # A malformed entry (e.g. goods-name div without an <a>) aborts this
        # page; report it instead of silently printing an empty line.
        print("parsePage: skipped malformed page (%s)" % exc)
def printGoodsList(ilt, path='H:/a/a.xls'):
    """Write the goods list to an Excel (.xls) workbook via xlwt.

    Row 0 holds the headers; each following row is index, price, title.

    Args:
        ilt: list of [price, title] pairs.
        path: destination .xls file. Defaults to the original hard-coded
              location for backward compatibility.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')
    # Renamed from `list`, which shadowed the builtin of the same name.
    headers = ["序号", "价格", "名称"]
    for col, header in enumerate(headers):
        worksheet.write(0, col, label=str(header))
    for row, g in enumerate(ilt, start=1):
        worksheet.write(row, 0, label=row)
        for col in range(2):
            worksheet.write(row, col + 1, label=g[col])
    workbook.save(path)
def main():
    """Crawl listing pages 1..depth-1 of the category and save results to Excel."""
    depth = 3
    start_url = 'http://www.51mkf.com/shop/cate-1034-0-0-0-0-0-0-0-0-'
    infoList = []
    # The original looped range(depth) and skipped i == 0 with `continue`;
    # range(1, depth) states the intent directly (pages start at 1).
    for i in range(1, depth):
        try:
            url = start_url + str(i) + '.html'
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: one failed page must not abort the rest.
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()
发布了56 篇原创文章 · 获赞 2 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/fan13938409755/article/details/100812359