# 爬取知乎专栏 (crawl a Zhihu column via its JSON API)
class XSSpider(scrapy.Spider):
    """Crawl a Zhihu column through its JSON items API and append every
    article (title + body text) to a local novel file.

    NOTE(review): several spiders in this file share the name 'XSSpider' /
    'xiaoshuo'; if they live in one module, only the last definition survives —
    confirm they are meant to be alternative snippets, not simultaneous spiders.
    """
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=10&offset=0']

    def parse(self, response):
        """Parse one API page: fetch each listed article, append its text to
        the output file, then follow the 'next' paging link while the current
        page still returned items."""
        obj = json.loads(response.text)
        icount = 0
        for p_item in obj['data']:
            icount += 1
            # The API item only carries metadata; fetch the article page itself.
            p_content = get_html_of_response(p_item['url'])
            p_selector = etree.HTML(p_content.text)
            # Article id is the path segment after '/p/' in the URL.
            sid = re.sub(r".+/p/", "", p_item['url'])
            content_json = json.loads(
                p_selector.xpath("//script[@id='js-initialData']/text()")[0])
            # Hoist the repeated deep lookup into one variable.
            article = content_json['initialState']['entities']['articles'][sid]
            txt = article['title'].strip() + "\r\n"
            # Crude HTML-to-text: turn paragraph tags into line breaks.
            txt += article['content'].replace("<p>", "\r\n").replace("</p>", "\r\n")
            # Context manager guarantees the handle is closed even if write()
            # raises (the original open()/close() pair could leak it).
            with open('G:/learn/3.txt', "ab+") as fo:  # append to the novel file
                fo.write(txt.encode('UTF-8'))
        # Auto-pagination: follow 'next' only while this page had any items.
        pre_page_item = obj['paging']['next']
        if icount > 0:
            yield scrapy.Request(pre_page_item, callback=self.parse)
class XSSpider(scrapy.Spider):
    """Crawl a Zhihu column starting from a high offset and walk the pages
    backwards via 'previous' links, appending each article (title + paragraph
    text) to a local novel file in chronological order."""
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=50&offset=490']

    def parse(self, response):
        """Parse one API page: scrape every article's rendered HTML, append it
        to the output file, then follow the 'previous' paging link until it
        loops back to the current URL."""
        obj = json.loads(response.text)
        # Items arrive newest-first; iterate in reverse so chapters are written
        # in chronological order. (The original indexed len-1-i by hand and
        # kept a dead 'icount' counter that was never read.)
        for p_item in reversed(obj['data']):
            p_content = get_html_of_response(p_item['url'])
            p_selector = etree.HTML(p_content.text)
            parts = []
            # Title may be missing on malformed pages — only add it if found.
            i_title = p_selector.xpath("//h1[@class='Post-Title']/text()")
            if i_title:
                parts.append(i_title[0].strip())
            for p in p_selector.xpath(
                    "//div[@class='RichText ztext Post-RichText']//p//text()"):
                parts.append(p.strip())
            # Join once instead of quadratic txt = txt + ... concatenation.
            txt = "".join(part + "\r\n" for part in parts)
            with open('G:/learn/7.txt', "ab+") as fo:  # append to the novel file
                fo.write(txt.encode('UTF-8'))
        # Auto-pagination: stop when 'previous' points back at this very page.
        pre_page_item = obj['paging']['previous']
        if pre_page_item != response.url:
            yield scrapy.Request(pre_page_item, callback=self.parse)
# 爬取普通小说网站 (crawl an ordinary novel website chapter by chapter)
class XSSpider(scrapy.Spider):
    """Crawl an ordinary novel site chapter-by-chapter: append each chapter's
    title and body text to 3.txt, then follow the '下一章' (next chapter) link."""
    name = 'xiaoshuo'
    allowed_domains = ['2mcn.com']
    start_urls = ['https://www.2mcn.com/html/book/73323/73323986/49627483.html']

    def parse(self, response):
        """Extract title + content from one chapter page, append to the file,
        and yield a request for the next chapter if a link exists."""
        lines = []
        # Guard against a missing <h1>: the original indexed [0] unconditionally
        # and would raise IndexError on a malformed page (the Zhihu spider in
        # this file already guards its title the same way).
        i_title = response.xpath("//h1/text()").extract()
        if i_title:
            lines.append(i_title[0].strip())
        for p in response.xpath("//div[@id='content']//text()").extract():
            lines.append(p.strip())
        # Single join instead of repeated string concatenation.
        txt = "".join(line + "\r\n" for line in lines)
        # Context manager closes the handle even if write() raises.
        with open('3.txt', "ab+") as fo:  # append to the novel file
            fo.write(txt.encode('UTF-8'))
        # Auto-pagination: follow the anchor whose text contains '下一章'.
        next_page_item = response.xpath("//a[contains(text(),'下一章')]/@href").extract()
        if next_page_item:
            yield scrapy.Request(response.urljoin(next_page_item[0]), callback=self.parse)