python爬虫-爬取找建筑网站术语

#!/usr/bin/env python
# coding=utf-8
# 爬取找建筑 术语

import urllib2
from bs4 import BeautifulSoup
import xlwt

# HTTP headers sent with every request; the User-Agent string makes the
# crawler present itself as a desktop Firefox browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
        'Gecko/20091201 Firefox/3.5.6'
    ),
}


def getHtml(page, timeout=30):
    """Download one listing page of the glossary and return its raw body.

    Args:
        page: Page number to fetch. A string (as before) or any value that
            formats cleanly with ``%s``, such as an int.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot hang the crawl indefinitely.

    Returns:
        The response body exactly as read from the server (undecoded).

    Raises:
        Whatever ``urllib2.urlopen`` raises on HTTP/network failure or
        timeout; errors are not swallowed here.
    """
    url = 'http://www.zhaojianzhu.com/shuyu?page=%s' % (page,)
    # Send browser-like headers so the request looks like a normal visit.
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request, timeout=timeout)
    try:
        return response.read()
    finally:
        # Always release the connection; the caller fetches many pages in a row.
        response.close()


# Module-level accumulator: parseHtml appends one
# [name, description, source, date] row per scraped entry, and the
# __main__ driver passes the whole list to writeexcel.
result = []


def _tag_text(tag):
    """Return the UTF-8 encoded text of *tag*, or '' when the tag is missing."""
    if tag is None:
        return ''
    return tag.text.encode('utf-8')


def parseHtml(htmlContent):
    """Extract glossary entries from one listing page into ``result``.

    Every ``<th class="pzn ptmn pbmn">`` cell is treated as one entry. For
    each entry a row ``[name, description, source, date]`` of UTF-8 encoded
    strings is appended to the module-level ``result`` list; any piece that
    is absent from the markup is stored as an empty string.

    Args:
        htmlContent: HTML markup of the listing page.

    Returns:
        None. Rows are appended to ``result`` as a side effect.
    """
    soup = BeautifulSoup(htmlContent, "html.parser")
    cells = soup.find_all('th', class_='pzn ptmn pbmn')
    print('处理中...')
    for cell in cells:
        name = cell.find('a', class_='list_title')
        desc = cell.find('p', class_='mtm xi6 xs2')
        source = cell.find('a', class_='xg2')
        # The date is the last 'xg1' span. find_all returns an empty result
        # when there is none, so guard before indexing instead of letting
        # [-1] raise IndexError and abort the whole crawl.
        spans = cell.find_all('span', class_='xg1')
        date = spans[-1] if spans else None
        result.append([
            _tag_text(name),
            _tag_text(desc),
            _tag_text(source),
            _tag_text(date),
        ])


def writeexcel(data, path=r'e:\shuyu.xls'):
    """Write the scraped rows to an .xls workbook.

    The sheet has five columns: name, English name, explanation, source and
    date. The first element of each row is split on the first space: the
    leading token goes to the name column and the remainder to the English
    name column. Cells whose value is empty or whitespace-only are left
    unwritten.

    Args:
        data: Sequence of rows, each indexable at 0..3 and holding UTF-8
            encoded byte strings (name, explanation, source, date).
        path: Destination file. Defaults to the original hard-coded
            location; a raw string is used so backslashes are not treated
            as escape sequences.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)

    # Header row.
    titles = ('名称', '英文名称', '解释', '来源', '日期')
    for col, title in enumerate(titles):
        sheet.write(0, col, title.decode('utf-8'))

    # Data rows start at sheet row 1, directly below the header.
    for row, record in enumerate(data, 1):
        name_parts = record[0].split(' ')
        values = (
            name_parts[0],
            ' '.join(name_parts[1:]),
            record[1],
            record[2],
            record[3],
        )
        for col, value in enumerate(values):
            if value.strip():
                # Byte strings are decoded to unicode before being handed to
                # xlwt; the original author notes that writing the raw
                # Chinese byte strings raises an error.
                sheet.write(row, col, value.decode('utf-8'))

    book.save(path)


if __name__ == '__main__':
    # Crawl listing pages 1 through 1000, collecting rows into `result`,
    # then write everything to the spreadsheet in one go at the end.
    for page_no in range(1, 1001):
        print("第{}页".format(page_no))
        html = getHtml(str(page_no))
        parseHtml(html)
    writeexcel(result)
python爬虫-爬取找建筑网站术语

猜你喜欢