Two spider exercises (not yet fully understood)

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
'''
    1. Requirement analysis
        Extract from each detail page:
            title = Python 练习实例1
            timu  = 题目:有四个数字:1、2、3、4,能组成多少个互不相同且无重复数字的三位数?各是多少?
            cxfx  = 程序分析:可填在百位、十位、个位的数字都是1、2、3、4。组成所有的排列后再去掉不满足条件的排列。
            code  = the example's source code
    2. Page structure analysis
        Entry point: http://www.runoob.com/python/python-100-examples.html
        1. Get all the <a> tags:
              find(id='content').find_all('a')
        2. Get the title:
              find(id='content').h1
        3. Get the problem statement:
              find(id='content').find_all('p')[1]
        4. Get the program analysis:
              find(id='content').find_all('p')[2]
        5. Get the source code:
              find(class_='hl-main').text
    3. Implementation (an offline sketch of these selectors follows the script)
'''

'''
    Step 1: request the python-100-examples index page
'''

startUrl = 'http://www.runoob.com/python/python-100-examples.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Send the request and decode the raw bytes explicitly as UTF-8
response = requests.get(startUrl, headers=headers).content.decode('utf-8')
# print(response)


# Parse the page into a BeautifulSoup object
soup = BeautifulSoup(response, 'lxml')
# print(soup)

# Extract the <a> tags (CSS-selector equivalent of find(id='content').find_all('a'))
link = soup.select('#content a')

num = 1
for i in link:
    print('Problem {0}'.format(num))
    '''
        Step 2: request each detail page and extract its content
    '''
    response2 = requests.get('http://www.runoob.com' + i.attrs['href'], headers=headers).content.decode('utf-8')

    # Parse the detail page
    html = BeautifulSoup(response2, 'lxml')

    # Title
    Title = html.select('#content h1')[0].text
    # Problem statement
    timu = html.select('#content p')[1].text
    # Program analysis
    cxfx = html.select('#content p')[2].text
    # Source code: most pages highlight it in a .hl-main block; fall back to <pre>
    try:
        code = html.select('.hl-main')[0].text
    except IndexError:
        code = html.select('pre')[0].text
    '''
        Save the content
    '''
    with open('py100.txt', 'a+', encoding='utf-8') as file:
        file.write(Title + '\n' + timu + '\n' + cxfx + '\n' + code + '\n' + '=' * 50 + '\n')
    time.sleep(1)  # be polite to the server; this is also why time is imported
    num += 1
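
Before running the spider against the live site, the selector plan from section 2 can be sanity-checked offline. A minimal sketch, using a made-up HTML fragment that only mimics the shape of a runoob detail page (the fragment is an assumption, not the site's real markup):

from bs4 import BeautifulSoup

# Hypothetical fragment shaped like runoob's #content block (illustration only)
sample = '''
<div id="content">
  <h1>Python 练习实例1</h1>
  <p>intro paragraph</p>
  <p>题目: ...</p>
  <p>程序分析: ...</p>
  <div class="hl-main">print("hello")</div>
</div>
'''

doc = BeautifulSoup(sample, 'lxml')
content = doc.find(id='content')
print(content.h1.text)                   # title
print(content.find_all('p')[1].text)     # problem statement
print(content.find_all('p')[2].text)     # program analysis
print(doc.find(class_='hl-main').text)   # source code

This also shows why the spider indexes find_all('p') from 1: index 0 holds a different paragraph on the page, not the problem statement.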
# -*- coding: utf-8 -*-
from lxml import etree
import requests
import time
'''
    1. Requirement analysis
        1. Get the title of each post
        2. Get the body of each post
    2. Page structure analysis
        Entry point: https://www.cnblogs.com/
        1. Get each post's link:
            //div[@class='post_item_body']/h3/a/@href
           Get the next page:
                //div[@class='pager']/a[last()]/@href
                //div[@class='pager']/a[last()]/text()
        2. Get the title:
            //div[@class='post_item_body']/h3/a/text()
        3. Get the body:
            string(//div[@id='cnblogs_post_body'])
    3. Implementation (an offline XPath sketch follows the script)
'''

'''
    Step 1: request the listing page and collect the post links
'''

startUrl = 'https://www.cnblogs.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Starting page number
page = 1

while True:
    # Request the current listing page
    response = requests.get(startUrl, headers=headers).text

    # Parse with lxml
    html = etree.HTML(response)

    # Extract the post links from the <a> tags
    link = html.xpath("//div[@class='post_item_body']/h3/a/@href")

    # Next-page link and its text
    nextPage = html.xpath("//div[@class='pager']/a[last()]/@href")
    nextPageText = html.xpath("//div[@class='pager']/a[last()]/text()")


    '''
        Step 2: fetch each post's content
    '''
    # Post counter for the current page
    num = 1
    for i in link:
        print('Page {0}, post {1}'.format(page, num))
        # Request the post page
        response_info = requests.get(i, headers=headers).text

        # Parse the post page
        html_info = etree.HTML(response_info)
        # print(html_info)

        # Extract the title
        title = html_info.xpath("//a[@id='cb_post_title_url']/text()")[0]

        # Extract the body as flattened text
        content = html_info.xpath("string(//div[@id='cnblogs_post_body'])")


        '''
            Save to file
        '''

        with open('cnblogs.txt', 'a+', encoding='utf-8') as file:
            file.write(title + '\n' + content + '\n' + '=' * 50 + '\n')
        time.sleep(0.5)
        num += 1

    if nextPageText and nextPageText[0] == 'Next >':
        startUrl = 'https://www.cnblogs.com' + nextPage[0]
        page += 1
        time.sleep(1)
    else:
        # No "Next >" link means this is the last page; without this break
        # the while True loop would re-crawl the same page forever
        break
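
The XPath expressions from section 2 can likewise be tried offline before crawling. A minimal sketch with a made-up fragment that only mimics the shape of the cnblogs listing and post pages (the markup is an assumption):

from lxml import etree

# Hypothetical fragment shaped like the cnblogs listing page (illustration only)
sample = '''
<html><body>
  <div class="post_item_body">
    <h3><a href="https://www.cnblogs.com/demo/post1.html">First post</a></h3>
  </div>
  <div class="pager">
    <a href="/sitehome/p/1">1</a>
    <a href="/sitehome/p/2">Next &gt;</a>
  </div>
  <div id="cnblogs_post_body"><p>Body </p><p>text</p></div>
</body></html>
'''

html = etree.HTML(sample)
print(html.xpath("//div[@class='post_item_body']/h3/a/@href"))   # post links
print(html.xpath("//div[@class='post_item_body']/h3/a/text()"))  # titles
print(html.xpath("//div[@class='pager']/a[last()]/@href"))       # next-page href
print(html.xpath("//div[@class='pager']/a[last()]/text()"))      # 'Next >'
print(html.xpath("string(//div[@id='cnblogs_post_body'])"))      # flattened body

Note that string(...) returns a single string rather than a list, which is why the spider can write content to the file directly without indexing into a result list.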

Reposted from blog.csdn.net/qq_37228811/article/details/81192016