Whose scraping skills reign supreme? Ask Xiwei at the normal school

Introduction to Web Crawlers

A crawler mainly relies on the requests and re libraries: import them first, then call their methods.

import requests
import re

response = requests.get('http://ishuo.cn/')  # simulate a browser opening the page
# print(response.status_code)  # 200 = success; 301 = redirect; 404 = page not found
# print(response.encoding)  # utf-8
data = response.text  # the page body as a string
# print(data)

# . matches any character; * means 0 or more of the preceding character
res = re.findall('<div class="content">(.*?)</div>', data)
print(res)
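
To see what the non-greedy group is doing, here is a tiny self-contained example on a made-up string (not taken from ishuo.cn):

import re

html = '<div class="content">first</div><div class="content">second</div>'
print(re.findall('<div class="content">(.*?)</div>', html))  # ['first', 'second']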

Scraping a jokes site (ishuo.cn)

# requests library
## requests.get(url)  simulates a browser opening a page

# re library
## re.findall(pattern, string)  returns every non-overlapping match as a list
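
Before parsing anything it is worth confirming the request succeeded. A minimal guard using requests' built-in raise_for_status (a sketch, not part of the original code):

import requests

response = requests.get('http://ishuo.cn/')
response.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx status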

import requests
import re

response = requests.get('http://ishuo.cn/')  # simulate a browser opening the page
# print(response.status_code)  # 200 = success; 301 = redirect; 404 = page not found
print(response.encoding)  # utf-8
data = response.text
print(data)

# # . matches any character; * means 0 or more of the preceding character
# content_res = re.findall('<div class="content">(.*?)</div>', data)
# title_res = re.findall('<a href="/subject/.*?">(.*?)</a>', data)
# # print(title_res.index('活得糊涂的人,容易幸福'))
# # print(title_res.index('购买银行理财产品亏损后如何起诉'))
# title_res = title_res[10:60]  # keep only the joke titles (range found via the index() probes above)
# # print(title_res)
#
#
# title_content_dic = {}
# for i in range(len(title_res)):
#     title_content_dic[title_res[i]] = content_res[i]
#     # print(title_content_dic)
#
# # print(title_content_dic)
# for i in title_content_dic.items():
#     # print(str(i)+'\n')
#     print(f'{i[0]:<40} | {i[1]:<1000}')

# Method 1: match anchors shaped like this sample
'<a href="/subject/5898">这个笑话好内涵,你懂了没?</a>'
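
Given anchors shaped like the sample above, the subject id and the title can be captured together with two groups. A sketch, assuming the markup is unchanged:

import re

a = '<a href="/subject/5898">这个笑话好内涵,你懂了没?</a>'
print(re.findall(r'<a href="/subject/(\d+)">(.*?)</a>', a))
# [('5898', '这个笑话好内涵,你懂了没?')]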

# Method 2
import requests
import re

response = requests.get('http://ishuo.cn/')  # 模拟浏览器打开网页

data = response.text

res = re.findall('<li class="list_li">(.*?)</li>', data)  # one <li> per joke entry


title_content_desc_dic = {}
for i in res:
    content = re.findall('<div class="content">(.*?)</div>', i)[0]

    title = re.findall('<a href="/subject/.*?">(.*?)</a>', i)[0]

    # the date prefix '04月' is hard-coded, so only April posts match
    desc = re.findall('</a>(04月.*?)</div>', i)[0]

    title_content_desc_dic[title] = (content, desc)

for i in title_content_desc_dic.items():
    print(f'{i[0]:<100} | {i[1]}')
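
Because the desc pattern is pinned to 04月 (April), it breaks in any other month. A hedged generalization, assuming the site keeps its MM月DD日 date format (the sample string below is made up):

import re

li = '<a href="/subject/5898">某标题</a>04月21日 · 12345个查看</div>'
print(re.findall(r'</a>(\d{2}月.*?)</div>', li))  # ['04月21日 · 12345个查看']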

# Scraping images
import requests
import re

response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg')
data = response.text
# print(data)

img_url_res = re.findall('data-src="(.*?)"', data)
for i in img_url_res:
    img_response = requests.get(i)
    img_data = img_response.content  # raw bytes of the image
    img_name = i.split('/')[-1]
    with open(img_name, 'wb') as f:  # the with-block closes (and flushes) the file for us
        f.write(img_data)
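
Many sites reject requests' default User-Agent. A minimal hedged variant of the request above (the header value is illustrative, not from the original):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # illustrative UA string
response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1',
                        headers=headers, timeout=10)  # timeout avoids hanging forever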
    
    
    
    
    
import re
import os
import time
import requests
from selenium import webdriver

####################################################
### No need to worry about what this block means ###
####################################################

if not os.path.exists('百度图片'):
    os.mkdir('百度图片')
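
# A hypothetical alternative (not in the original): os.makedirs folds the
# existence check and the creation into a single call.
# os.makedirs('百度图片', exist_ok=True)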

###########################################
### This version is capped at 30 images ###
###########################################


# Fetch the image search results
response = requests.get(
    'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=风景')
data = response.text
img_desc_dics = re.findall("app.setData(\('imgData.*?\));", data, re.S)[0]  # grab the ('imgData', {...}) literal embedded in the page JS
img_desc_dics = eval(str(img_desc_dics))  # evaluate that literal into a Python tuple
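
# eval() on text we do not control can execute arbitrary code. A safer drop-in,
# assuming the captured text stays a plain Python literal (an assumption, not
# part of the original): ast.literal_eval refuses anything but literals.
# import ast
# img_desc_dics = ast.literal_eval(img_desc_dics)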

# Pull out the per-image data
img_datas = img_desc_dics[1]['data']
count = 0
for img_data in img_datas:
    # Parameters needed to look up each image
    os_ = img_data.get('os')
    cs_ = img_data.get('cs')

    if os_ and cs_:
        # Fetch the detail page for this image
        img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
        img_search_response = requests.get(img_search_url)
        img_search_data = img_search_response.text

        # Extract the real image URL
        img_url = re.findall('''\('firstSc'\);" src="(.*?)"''', img_search_data)[0]
        img_name = img_url.split('/')[-1]
        img_name = os.path.join('百度图片', img_name)  # build the save path, e.g. 百度图片/3822951_144045377000_2.jpg

        # Save the image
        img_response = requests.get(img_url)
        img_data = img_response.content
        with open(img_name, 'wb') as fw:  # the with-block closes (and flushes) the file for us
            fw.write(img_data)

        # Progress report
        count += 1
        print(f'{img_name} saved ({count} images so far)')

        # Slow down a little so Baidu does not ban our IP
        time.sleep(0.01)

#######################################################################################
### Look up selenium usage yourself; the code below scrapes without the 30-image cap ###
#######################################################################################

#
# page_count_end = 2  # scrapes page_count_end * 30 images (e.g. 10 pages -> 300 images)
# chrome = webdriver.Chrome()
#
# try:
#
#     chrome.implicitly_wait(10)
#     chrome.get(
#         'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=风景')
#
#     js_code = '''
#     window.scrollTo(0, document.body.scrollHeight);
#     var lenOfPage = document.body.scrollHeight;
#     return lenOfPage
#     '''
#
#     # use selenium to control how many pages get loaded
#     count = 0
#     page_count = 0
#     while page_count < page_count_end:
#         try:
#             page_count += 1
#             chrome.execute_script(js_code)
#             time.sleep(0.3)
#         except:
#             continue
#
#     img_desc_search_urls = re.findall('href="(/search/detail\?.*?)"', chrome.page_source, re.S)  # re.S lets . match newline characters
#
#     # Pull out the per-image data
#     for img_data in img_desc_search_urls:
#         try:
#             # Parameters needed to look up each image
#             os_ = re.findall('os=(.*?)&amp;', img_data)[0]
#             cs_ = re.findall('cs=(.*?)&amp;', img_data)[0]
#
#             if os_ and cs_:
#                 # Fetch the detail page for this image
#                 img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
#                 img_search_response = requests.get(img_search_url)
#                 img_search_data = img_search_response.text
#
#                 # Extract the real image URL
#                 img_url = re.findall('''\('firstSc'\);" src="(.*?)"''', img_search_data)[0]
#                 img_name = img_url.split('/')[-1]
#                 img_name = os.path.join('百度图片', img_name)  # build the save path, e.g. 百度图片/3822951_144045377000_2.jpg
#
#                 # Save the image
#                 img_response = requests.get(img_url)
#                 img_data = img_response.content
#                 with open(img_name, 'wb') as fw:
#                     fw.write(img_data)
#
#                 # Progress report
#                 count += 1
#                 print(f'{img_name} saved ({count} images so far)')
#
#                 # Slow down a little so Baidu does not ban our IP
#                 time.sleep(0.01)
#         except:
#             continue
#
# except Exception:
#     pass
#
# finally:
#     chrome.close()

Scraping videos

import requests
import re

response = requests.get('http://www.mod.gov.cn/v/index.htm')
# response.encoding = 'utf8'
data = response.text
# print(data)

# mp4_res1 = re.findall('<a href="(.*?)"  class="img">',data)
# for i in mp4_res1:
#     print(i)


mp4_res2 = re.findall('<a href="(.*?)">', data)

for i in mp4_res2:  # type:str
    res = re.findall('(.*?htm)', i)
    if not res:  # skip links that do not point at .htm pages
        continue
    res = 'http://www.mod.gov.cn/v/' + res[0]

    response = requests.get(res)
    data = response.text
    # e.g. http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    url_res = re.findall(r'//Video (.*?\.mp4)', data)
    if not url_res:  # not every page embeds a video
        continue
    url_res = url_res[0]

    mp4_response = requests.get(url_res)
    mp4_data = mp4_response.content
    # name each file after its URL so downloads do not overwrite one another
    with open(url_res.split('/')[-1], 'wb') as f:
        f.write(mp4_data)
    # break  # uncomment to stop after the first video while testing

'''
<a href="2019-07/20/content_4846213.htm" class="img"><img src="attachement/jpg/site21/20190720/6c4b9041ab8b1e9ca1be01.jpg" border="0"><em class="video_40x40"></em></a>
'''
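
The .content call above buffers each whole mp4 in memory before writing it out. For large videos a streaming download keeps memory use flat. A minimal sketch, assuming url_res holds a video URL as in the loop above (the 1 MB chunk size is an arbitrary choice):

import requests

with requests.get(url_res, stream=True) as r:
    with open(url_res.split('/')[-1], 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)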


Reposted from www.cnblogs.com/quyang46/p/11227868.html