# 使用 CSS 选择器爬取笔趣看全本小说

# 版权声明:派森带你学 Python,欢迎加群 923414804 与群主一起学习。原文链接:https://blog.csdn.net/weixin_44369414/article/details/85872157
# -*- coding:utf8 -*-
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq


class biqukan():
    """Download every chapter linked from a novel's index page into one text file.

    The work is a lazy generator pipeline: ``get_index`` yields chapter
    hrefs, ``parse_url`` turns them into absolute URLs, ``get_text`` fetches
    and extracts each chapter, and ``write`` appends the chapters to
    ``self.path``. Chapters are therefore fetched one at a time, as they are
    written.
    """

    def __init__(self,
                 url="http://www.yuetutu.com/18_18147/",
                 path='/home/xxp/git_learning/practice/spider_learning/漫漫武仙路.txt',
                 timeout=10):
        """Store the download configuration.

        Args:
            url: Index page of the novel; chapter links are resolved
                against it.
            path: Output text file. Chapters are appended to it.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = url
        self.path = path
        self.timeout = timeout

    def _fetch(self, url):
        """Fetch *url* and return the page parsed as a PyQuery document.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                non-2xx HTTP status.
        """
        resp = requests.get(url, timeout=self.timeout)
        resp.raise_for_status()
        # When the server sends no charset, Requests falls back to
        # ISO-8859-1 for text content, which garbles non-Latin pages.
        # Only in that case, use the encoding detected from the body.
        if resp.encoding is None or resp.encoding.lower() == 'iso-8859-1':
            resp.encoding = resp.apparent_encoding
        return pq(resp.text)

    def get_index(self, url):
        """Yield the ``href`` of every ``<a>`` inside ``.listmain`` on *url*."""
        doc = self._fetch(url)
        for link in doc('.listmain a').items():
            yield link.attr.href

    def parse_url(self, index):
        """Yield an absolute chapter URL for each href in *index*.

        Hrefs are resolved against ``self.url``, so site-absolute paths
        (``/18_18147/1.html``), relative names (``1.html``) and full URLs
        all work. Empty or missing hrefs are skipped.
        """
        for link in index:
            if not link:
                # An anchor without an href has nothing to download.
                continue
            yield urljoin(self.url, link)

    def get_text(self, urls):
        """Fetch each URL and yield its ``<h1>`` text followed by its ``#content`` text.

        The title and body are concatenated with no separator between them.
        """
        for url in urls:
            doc = self._fetch(url)
            title = doc('h1').text()
            text = doc('#content').text()
            yield title + text

    def write(self, texts):
        """Append each string in *texts* to ``self.path``, followed by a blank line.

        The file is opened once, in append mode with UTF-8 encoding, and
        flushed after every item so that completed chapters are on disk even
        if a later fetch fails.
        """
        with open(self.path, 'a', encoding='utf-8') as f:
            for text in texts:
                f.write(text + '\n\n')
                f.flush()

    def main(self):
        """Run the whole pipeline: index -> chapter URLs -> chapter texts -> file."""
        index = self.get_index(self.url)
        urls = self.parse_url(index)
        texts = self.get_text(urls)
        self.write(texts)

if __name__ == "__main__":
    # Script entry point: build the downloader with its defaults and run it.
    biqukan().main()

# 猜你喜欢
#
# 转载自 blog.csdn.net/weixin_44369414/article/details/85872157
# 今日推荐