主要用到了 Python 的 os、sys、requests 和 BeautifulSoup(bs4)模块,HTML 解析器用的是 lxml。
废话不多说,直接上代码:
import requests,os,sys
from bs4 import BeautifulSoup
# Make sure the download directory exists inside the current working
# directory before any picture is written into it.
path = os.getcwd()
new_path = os.path.join(path, u'糗事百科成人版')
if not os.path.isdir(new_path):
    # Only create the directory on the first run; later runs reuse it.
    os.mkdir(new_path)
def page_loop(page = 1):
    """Download the pictures of listing page ``page`` and every page below it.

    Each listing page ``http://www.qiubaichengren.net/<page>.html`` is
    fetched and parsed with the ``lxml`` parser.  For every
    ``div.mala-text`` entry the first ``<img>`` is looked up, its ``src``
    URL is downloaded, and the bytes are written into the output directory
    (given relative to the current working directory).  The file name is
    taken from the last 11 characters of the picture URL.

    Args:
        page: Number of the first listing page to fetch.  Pages are then
            visited in decreasing order until page 1 has been handled; a
            value of 1 or less fetches only that single page.

    Returns:
        None.
    """
    # Iterate instead of recursing once per page, so a large start page
    # cannot run into the interpreter's recursion limit.
    while True:
        response = requests.get('http://www.qiubaichengren.net/%s.html' % page)
        soup = BeautifulSoup(response.content, 'lxml')
        for entry in soup.find_all('div', class_='mala-text'):
            image = entry.find('img')
            # find() returns None when the entry has no <img>, and get()
            # returns None when the tag has no src; skip such entries
            # instead of crashing the whole crawl.
            link = image.get('src') if image is not None else None
            if not link:
                continue
            # Keep the original naming scheme (last 11 characters of the
            # URL), but drop anything up to a '/' so the name can never
            # point into a subdirectory that does not exist.
            filename = link[-11:].rsplit('/', 1)[-1]
            if not filename:
                continue
            picture = requests.get(link).content
            with open(u'糗事百科成人版' + '/' + filename, 'wb') as code:
                code.write(picture)
        if page <= 1:
            return
        page -= 1
# Start the crawl at listing page 15; page_loop then works its way down to page 1.
page_loop(15)
各模块的详细用法大家可以自行百度;我也不是什么大牛,不过这些模块的基本用法都挺好理解的!