【整理】【转载】爬虫相关

(1) 抓取小说--转

import requests
import re
from bs4 import BeautifulSoup


if __name__ == '__main__':
    # Scrape a novel from shushu8.com: read the chapter index page, then
    # fetch every linked chapter and print its text to stdout.

    # Local import so the snippet stays self-contained; used to resolve
    # chapter links against the index page URL.
    from urllib.parse import urljoin

    # Browser-like User-Agent, sent with EVERY request below (the index
    # page and each chapter page).
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0'}
    url = 'http://www.shushu8.com/shaolinbajue/'
    # Seconds to wait for the server; without a timeout a stalled
    # connection would hang the script indefinitely.
    timeout = 30

    r = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content rather
    # than the one guessed from the HTTP headers.
    r.encoding = r.apparent_encoding
    d = BeautifulSoup(r.text, 'lxml')

    # Every chapter link in the index list.
    for i in d.select('.clearfix > ul > li > a'):
        href = i.get('href')
        if not href:
            # Anchor without a target: nothing to fetch.
            continue

        # urljoin handles root-relative, relative and absolute hrefs alike.
        urlk = urljoin(url, href)
        jsu = requests.get(urlk, headers=headers, timeout=timeout)
        jsu.encoding = jsu.apparent_encoding
        di = BeautifulSoup(jsu.text, 'lxml')

        # Print the text of each content container on the chapter page.
        for k in di.select('div.page-content'):
            print(k.get_text())

(2) 抓取网页图片

https://blog.csdn.net/caozewei/article/details/82497388

1、根据给定的网址获取网页源代码  

2、利用正则表达式把源代码中的图片地址过滤出来  

3、根据过滤出来的图片地址下载网络图片


import re
import urllib.request
def gethtml(url, timeout=30):
    """Fetch *url* and return the raw response body as ``bytes``.

    Args:
        url: Address to open; any scheme ``urllib.request.urlopen`` accepts.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url, timeout=timeout) as page:
        return page.read()
def getimg(html, dest_dir='.'):
    """Download every ``.jpg`` image referenced by a ``src="..."`` attribute.

    Images are saved as ``0.jpg``, ``1.jpg``, ... in the order their
    addresses appear in the page.

    Args:
        html: Page source, either raw ``bytes`` (decoded as UTF-8) or ``str``.
        dest_dir: Directory the images are written to; defaults to the
            current working directory.

    Returns:
        The list of file paths written, in download order (empty when the
        page references no ``.jpg`` images).

    Raises:
        ValueError: If an extracted address is not a URL ``urllib`` can open
            (for example a relative path).
        urllib.error.URLError: If a download fails.
    """
    # Local import: this snippet only imports ``re`` and ``urllib.request``.
    import os

    if isinstance(html, bytes):
        # Image addresses are what matter here, so undecodable bytes
        # elsewhere in the page are replaced instead of aborting.
        html = html.decode('utf-8', errors='replace')

    # [^"] keeps a match inside ONE attribute value. With '.' the match
    # could start at a non-jpg src and run on to a later ".jpg".
    img = re.compile(r'src="([^"]*?\.jpg)"')

    saved = []
    for x, imgurl in enumerate(img.findall(html)):
        target = os.path.join(dest_dir, '%s.jpg' % x)
        urllib.request.urlretrieve(imgurl, target)
        saved.append(target)
    return saved
# Run the download only when executed as a script (or pasted into an
# interactive interpreter, where __name__ is also '__main__'), so importing
# this code does not trigger network requests.
if __name__ == '__main__':
    html = gethtml("http://news.ifeng.com/a/20161115/50258273_0.shtml")
    # Print whatever getimg returns once all downloads have finished.
    print(getimg(html))

把以上代码直接粘贴到 Python 3 解释器中,即可运行并抓取图片。
 

(3) 其他参考链接

https://blog.csdn.net/sinat_37390744/article/details/55533360

https://blog.csdn.net/sinat_37390744/article/details/55670553

https://blog.csdn.net/qq_32252957/article/details/78997021

https://blog.csdn.net/qq_32252957/article/details/78441293

https://blog.csdn.net/qq_32252957/article/details/78961867

猜你喜欢

转载自blog.csdn.net/xuemanqianshan/article/details/84824569