19 案例_主题列表内容抓取

版权声明:转载请注明出处 https://blog.csdn.net/nanhuaibeian/article/details/86644582
  1. 审查元素
    在这里插入图片描述
  2. 分析
    在这里插入图片描述在这里插入图片描述
  3. 功能代码
      #获取标题和URL的方法
    def extract_tag_a(self, columns, index):
        """Return ``(text, href)`` of the first ``<a>`` child of a table cell.

        Args:
            columns: Sequence of cell elements supporting ``.xpath``.
            index: Position of the cell to read.

        Raises:
            IndexError: If the cell has no ``<a>`` child.
            KeyError: If the anchor has no ``href`` attribute.
        """
        anchor = columns[index].xpath('a')[0]
        return anchor.text, anchor.attrib['href']
    def extract_text(self, columns, index):
        """Return the text of ``columns[index]``, or ``0`` when it has none.

        Used for the rating, like-count and reply-count cells.

        Args:
            columns: Sequence of cell elements exposing a ``.text`` attribute.
            index: Position of the cell to read.

        Returns:
            The cell's text unchanged, or the integer ``0`` if the text is
            ``None``.
        """
        tt = columns[index].text
        # A cell without text yields None; report it as 0 instead.
        if tt is None:
            tt = 0
        return tt
	#获取主题列表内容
    def get_post_list(self):
        """Collect one dict per row of the board's topic table.

        Reads ``self.tree`` and, for every ``<tr>`` matched by the XPath
        below, builds a dict with the keys ``title``, ``url``, ``author_id``,
        ``author_url``, ``rating``, ``num_likes`` and ``num_replies``.

        Returns:
            A list of those dicts, in document order.
        """
        collected = []
        for tr in self.tree.xpath("//table[@class='board-list tiz']/tbody/tr"):
            cells = tr.xpath('td')
            # Cell 1 carries the article link, cell 3 the author link.
            title, url = self.extract_tag_a(cells, 1)
            author_id, author_url = self.extract_tag_a(cells, 3)
            collected.append({
                'title': title,
                'url': url,
                'author_id': author_id,
                'author_url': author_url,
                # Cells 4-6: rating, like count, reply count.
                'rating': self.extract_text(cells, 4),
                'num_likes': self.extract_text(cells, 5),
                'num_replies': self.extract_text(cells, 6),
            })
        return collected
  4. 主要代码
import re
import requests
from lxml import etree

class PostListCrawler:
    """Fetch a newsmth.net board page and extract its topic list.

    Call :meth:`get_content` first: it stores the raw response text in
    ``self.html`` and the parsed lxml tree in ``self.tree``, which
    :meth:`get_max_page` and :meth:`get_post_list` read.
    """

    # Scheme and host that every board path is appended to.
    domain = "https://www.newsmth.net"

    def get_content(self, board_url, page, timeout=10):
        """Download one page of a board and cache it on the instance.

        Args:
            board_url: Path of the board; appended to ``domain``.
            page: Page number; sent as the ``p`` query parameter.
            timeout: Seconds handed to ``requests.get`` so a stalled
                connection cannot block forever.

        Returns:
            The response body text (also stored in ``self.html``).
        """
        querystring = {"ajax": "", "p": str(page)}
        url = self.domain + board_url
        r = requests.get(url, params=querystring, timeout=timeout)
        # Keep both the raw text and the parsed tree for the other methods.
        self.html = r.text
        self.tree = etree.HTML(r.text)
        return self.html

    def get_max_page(self):
        """Return the largest page number in the pagination list, as an int.

        Reads ``self.tree``. Returns ``1`` when the pagination list is
        missing or has a single entry.

        Raises:
            IndexError: If the inspected ``<li>`` has no ``<a>`` child.
            ValueError: If the selected entry's text is not an integer.
        """
        pages = self.tree.xpath('//ol[@class="page-main"][1]/li')
        # Zero or one entry: there is nothing beyond the first page.
        if len(pages) <= 1:
            return 1
        last_text = pages[-1].xpath('a')[0].text
        # When the final entry is the ">>" link, the page number is the
        # entry just before it.
        if last_text == '>>':
            last_text = pages[-2].xpath('a')[0].text
        # Always hand back an int so every return path has the same type.
        return int(last_text)

    def extract_tag_a(self, columns, index):
        """Return ``(text, href)`` of the first ``<a>`` child of a cell.

        Args:
            columns: Sequence of cell elements supporting ``.xpath``.
            index: Position of the cell to read.

        Raises:
            IndexError: If the cell has no ``<a>`` child.
            KeyError: If the anchor has no ``href`` attribute.
        """
        anchor = columns[index].xpath('a')[0]
        return anchor.text, anchor.attrib['href']

    def extract_text(self, columns, index):
        """Return the text of ``columns[index]``, or ``0`` when it has none.

        Used for the rating, like-count and reply-count cells.
        """
        tt = columns[index].text
        # A cell without text yields None; report it as 0 instead.
        if tt is None:
            tt = 0
        return tt

    def get_post_list(self):
        """Collect one dict per row of the board's topic table.

        Reads ``self.tree``. Each dict has the keys ``title``, ``url``,
        ``author_id``, ``author_url``, ``rating``, ``num_likes`` and
        ``num_replies``.

        Returns:
            A list of those dicts, in document order.
        """
        rows = self.tree.xpath("//table[@class='board-list tiz']/tbody/tr")
        posts = []
        for row in rows:
            post = {}
            columns = row.xpath('td')
            # Article title and article URL.
            post['title'], post['url'] = self.extract_tag_a(columns, 1)
            # Author name and author URL.
            post['author_id'], post['author_url'] = self.extract_tag_a(columns, 3)
            # Rating.
            post['rating'] = self.extract_text(columns, 4)
            # Number of likes.
            post['num_likes'] = self.extract_text(columns, 5)
            # Number of replies.
            post['num_replies'] = self.extract_text(columns, 6)
            posts.append(post)
        return posts

if __name__ == "__main__":
    # Demo run: fetch page 1 of the AutoWorld board and print its topic list.
    crawler = PostListCrawler()
    # get_content caches the page on the instance; its result is not needed
    # here, so it is not bound to a name.
    crawler.get_content('/nForum/board/AutoWorld', 1)
    print(crawler.get_post_list())

猜你喜欢

转载自blog.csdn.net/nanhuaibeian/article/details/86644582
19
今日推荐