(1) 抓取小说--转
import requests
import re
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Scrape a novel from shushu8.com: read the chapter index page, then fetch
    # every chapter it links to and print the chapter text to stdout.

    # Imported locally so this script section brings its own dependency.
    from urllib.parse import urljoin

    # Browser-like User-Agent, sent with EVERY request (index and chapters).
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0'}
    url = 'http://www.shushu8.com/shaolinbajue/'
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the script indefinitely.
    timeout = 10

    # Fetch the chapter index page.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding detected from the body rather than the one
    # declared (or missing) in the HTTP headers.
    r.encoding = r.apparent_encoding
    d = BeautifulSoup(r.text, 'lxml')

    # Each matched <a> element is one chapter link.
    for i in d.select('.clearfix > ul > li > a'):
        deta = {'href': i.get('href'),
                '标题': i.get_text()}
        if not deta['href']:
            # Anchor without an href: there is nothing to fetch.
            continue

        # Resolve the link against the index URL so root-relative, relative
        # and absolute hrefs all produce a valid chapter URL.
        urlk = urljoin(url, deta['href'])
        jsu = requests.get(urlk, headers=headers, timeout=timeout)
        jsu.encoding = jsu.apparent_encoding
        di = BeautifulSoup(jsu.text, 'lxml')

        # Print the text of every content container found on the chapter page.
        for k in di.select('div.page-content'):
            print(k.get_text())
(2) 抓取网页图片
https://blog.csdn.net/caozewei/article/details/82497388
1、根据给定的网址获取网页源代码
2、利用正则表达式把源代码中的图片地址过滤出来
3、根据过滤出来的图片地址下载网络图片
import re
import urllib.request
def gethtml(url):
    """Fetch *url* and return the raw, undecoded response body.

    Args:
        url: Address to open; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The complete response body as ``bytes``.
    """
    # The context manager closes the response (and its connection) even if
    # reading fails part-way through.
    with urllib.request.urlopen(url) as page:
        return page.read()
def getimg(html):
    """Download every ``.jpg`` image referenced by ``src="..."`` in *html*.

    Images are saved in the current working directory as ``0.jpg``,
    ``1.jpg``, ... in the order their URLs appear in the page.

    Args:
        html: Page source as ``bytes`` (for example the result of ``gethtml``).

    Returns:
        The list of file names that were written, in download order.
    """
    # ``[^"]`` keeps a match inside a single attribute value; with ``.`` the
    # pattern could start at a non-jpg ``src`` and run on into a later tag.
    img = re.compile(r'src="([^"]*?\.jpg)"')
    # Undecodable bytes are replaced rather than raising, so pages that are
    # not valid UTF-8 can still be scanned for image URLs.
    text = html.decode('utf-8', errors='replace')

    saved = []
    for x, imgurl in enumerate(img.findall(text)):
        filename = '%s.jpg' % x
        urllib.request.urlretrieve(imgurl, filename)
        saved.append(filename)
    return saved
# Demo run: fetch one news article, download the .jpg images it references,
# and print whatever getimg() returns.
page_url = "http://news.ifeng.com/a/20161115/50258273_0.shtml"
html = gethtml(page_url)
print(getimg(html))
将以上代码直接粘贴到 Python 解释器中即可运行并抓取图片;图片会按顺序以 0.jpg、1.jpg…… 的文件名保存到当前工作目录。
(3)其他
https://blog.csdn.net/sinat_37390744/article/details/55533360
https://blog.csdn.net/sinat_37390744/article/details/55670553
https://blog.csdn.net/qq_32252957/article/details/78997021