A generator (yield) lets a function execute in stages
def go():
    print(1)
    yield 1
    print(11)
    yield 11
    print(111)
    yield 111

my = go()            # executes in stages
print(type(my))      # <class 'generator'>
print(next(my))
print(next(my))
print(next(my))
Coroutine switching with greenlet
import greenlet
import time

def go1():
    while True:
        print('A eats an ice cream')
        gr2.switch()          # hand control to gr2
        time.sleep(1)

def go2():
    while True:
        print('B eats an ice cream')
        gr1.switch()          # hand control back to gr1
        time.sleep(1)

if __name__ == '__main__':
    gr1 = greenlet.greenlet(go1)
    gr2 = greenlet.greenlet(go2)
    gr1.switch()              # start with gr1
Speeding things up with gevent coroutines
import gevent
import time

# Coroutines: address slow downloads / waiting and high-concurrency problems.
# Serial version for comparison:
# def show_wait(name, n):
#     for i in range(n):
#         print('%s has waited %d seconds' % (name, i + 1))
#         time.sleep(1)
#
# show_wait('AAA', 10)
# show_wait('BBB', 10)
# show_wait('CCC', 10)

def show_wait(name, n):
    for i in range(n):
        print('%s has waited %d seconds' % (name, i + 1))
        gevent.sleep(1)   # runs in order when nothing waits; switches coroutines while waiting
        # time.sleep(1)   # time.sleep does not trigger automatic switching

g1 = gevent.spawn(show_wait, 'AAA', 10)
g2 = gevent.spawn(show_wait, 'BBB', 10)
g3 = gevent.spawn(show_wait, 'CCC', 10)
g1.join()
g2.join()
g3.join()
Coroutine-based downloading (fetching multiple pages concurrently)
from gevent import monkey
monkey.patch_all()   # enable automatic switching; note: call patch_all() before importing the other packages and writing the rest of the code

# Handle concurrency to speed up page fetching
import urllib.request
import gevent

def down_load(url):
    print('start %s' % url)
    data = urllib.request.urlopen(url).read()
    print('length is %d' % len(data), url)

gevent.joinall([
    gevent.spawn(down_load, 'http://www.baidu.com'),
    gevent.spawn(down_load, 'http://www.163.com'),
    gevent.spawn(down_load, 'http://www.qq.com'),
    gevent.spawn(down_load, 'http://www.sina.com'),
])
Development client machines usually run Ubuntu; servers usually run CentOS.
Word clouds: segmentation with jieba
import jieba

mystr = "我今天早上很开心,遇到一个美女,没那个说我有病,我回答正是因为你我才害了相思病,美女说真的有病"
sg_list = jieba.cut(mystr, cut_all=True)   # full mode
print(sg_list)                              # a generator object
print('--------------')
print('/'.join(sg_list))
sg_list = jieba.cut_for_search(mystr)       # search-engine mode
print('/'.join(sg_list))
If installing wordcloud fails:
First download the matching wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
On my machine that is wordcloud-1.4.1-cp36-cp36m-win_amd64.whl
Then run pip install D:\django_env\env\wordcloud-1.4.1-cp36-cp36m-win_amd64.whl
Then run pip install wordcloud again
import jieba                      # segmentation
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS   # word cloud
from PIL import Image             # image handling

text_file = open('3.txt', 'rb').read()            # read the text
word_list = jieba.cut(text_file, cut_all=True)    # segment it
space_list = ' '.join(word_list)                  # join the words with spaces
background = np.array(Image.open('2.jpg'))        # background image

my_cloud = WordCloud(background_color='yellow',   # background color
                     mask=background,             # mask image the words are drawn into
                     max_words=50,                # maximum number of words
                     stopwords=STOPWORDS,         # default stop words
                     font_path='simkai.ttf',      # font
                     max_font_size=100,           # maximum font size
                     random_state=50,             # random seed for the layout
                     scale=2).generate(space_list)   # generate the word cloud

image_color = ImageColorGenerator(background)     # color generator taken from the background image
plt.imshow(my_cloud)   # display the cloud
plt.axis('off')        # hide the axes
plt.show()             # show the word cloud
wordcloud API reference: https://github.com/amueller/word_cloud
A common way to decode GBK-encoded text
file = open('nasa.txt', 'rb')
my_text = file.read()
my_text = my_text.decode('gbk', errors='ignore')
print(my_text)
Checking whether a string is numeric
def check_str(my_str):
    try:
        eval(my_str)          # a non-numeric string raises an exception here
        return True
    except Exception:
        return False
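A tighter variant (not from the original notes) avoids eval, which would also accept arbitrary expressions, and only accepts real numbers; the name check_num is mine:

def check_num(my_str):
    # float() parses ints, floats and scientific notation; anything else raises ValueError
    try:
        float(my_str)
        return True
    except (TypeError, ValueError):
        return False

print(check_num('3.14'))   # True
print(check_num('abc'))    # False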
Distributed processing example:
Server side:
from multiprocessing import Process          # distributed processes
import multiprocessing.managers              # distributed process manager
import random, time
import queue

task_queue = queue.Queue()     # tasks
result_queue = queue.Queue()   # results

def return_task():             # return the task queue
    return task_queue

def return_result():           # return the result queue
    return result_queue

# Subclass BaseManager to manage the shared queues
class QueueManager(multiprocessing.managers.BaseManager):
    pass

if __name__ == '__main__':
    multiprocessing.freeze_support()   # needed on Windows
    QueueManager.register('get_task', callable=return_task)      # expose the queues to clients
    QueueManager.register('get_result', callable=return_result)
    # create a manager with an address and password (authkey must be bytes)
    manager = QueueManager(address=("10.36.137.17", 8848), authkey=b'123456')
    manager.start()                                               # start the server
    task, result = manager.get_task(), manager.get_result()       # task queue, result queue
    for i in range(10000):
        print('task add data %d' % i)
        task.put(i)
    print('waiting-------')
    for i in range(10000):
        res = result.get()
        print('get data %s' % res)
    manager.shutdown()   # shut the server down
Client side:
from multiprocessing import Process          # distributed processes
import multiprocessing.managers              # distributed process manager
import random, time
import queue

# Subclass BaseManager to manage the shared queues
class QueueManager(multiprocessing.managers.BaseManager):
    pass

if __name__ == '__main__':
    QueueManager.register('get_task')        # register the names exposed by the server
    QueueManager.register('get_result')
    manager = QueueManager(address=("10.36.137.17", 8848), authkey=b'123456')
    manager.connect()                        # connect to the server
    task = manager.get_task()
    result = manager.get_result()
    for i in range(10000):
        try:
            data = task.get()
            print('client get %d' % data)
            result.put('client' + str(data + 10))
        except Exception:
            pass
Distributed crawlers: the clients usually run on Windows, the server on Linux.
Recognizing English and Chinese captchas
Captcha recognition: install the Tesseract package, copy the Chinese trained-data file into the Tesseract-OCR directory, and add that directory to the PATH environment variable.
English recognition flow: in a terminal run tesseract 1.png lihaibao, which directly produces a file with the recognized text; view it with type lihaibao.txt.
Chinese recognition:
tesseract 2.png libaby -l chi_sim
This directly produces a file with the Chinese text, but viewing it with type in the terminal shows garbled characters.
English recognition (from Python)
# coding: utf-8
import subprocess

# Call the tesseract command line: tesseract <image> <output base>
p = subprocess.Popen([r"D:\tools\tesseract\Tesseract-OCR\tesseract.exe",
                      r"D:\pythonVideo\my_Spider_Math\day6.15\learn2\中英文验证码识别\1.png",
                      "english"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
p.wait()   # wait for the command to finish

# tesseract names the output after the output base, so here it is english.txt
file = open("english.txt", "r")
print(file.read())
Chinese recognition (from Python)
import subprocess

# Call the tesseract command line with the simplified-Chinese language pack
p = subprocess.Popen([r"D:\tools\tesseract\Tesseract-OCR\tesseract.exe",
                      r"D:\pythonVideo\my_Spider_Math\day6.15\learn2\中英文验证码识别\2.png",
                      "chinese", "-l", "chi_sim"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
p.wait()   # wait for the command to finish

# the file that holds the result matches the output base, so it must be chinese.txt
file = open("chinese.txt", "r", encoding='utf-8')
print(file.read())
Baidu AI SDK: cd D:\tools\tools-baidu\aip-python-sdk-1.6.4
then run pip install .
On the Baidu AI site: application list ---- create an application ---- tick the interfaces you need ---- once it is created you get the APP ID and API Key.
pip install baidu-aip
python setup.py install
Image text recognition (Baidu OCR)
# from aip import AipImageSearch
from aip import AipOcr

""" Your APPID / API Key / Secret Key """
APP_ID = '11405908'
API_KEY = 'pshZPWa3XX1yWqhcG0PvbanA'
SECRET_KEY = 'hfXnIZbB79Gw4y4VRCFevP2oIChrLNsb'

""" Read the image """
def get_file_content(filePath):
    with open(filePath, 'rb') as fp:
        return fp.read()

# client = AipImageSearch(APP_ID, API_KEY, SECRET_KEY)
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
result = client.basicGeneral(get_file_content(r"D:\pythonVideo\my_Spider_Math\day6.15\learn2\中英文验证码识别\2.png"))
# print(result)
print(result['words_result'][0]['words'])
Speech synthesis (Baidu TTS)
from aip import AipSpeech
import pygame
import time

""" Your APPID / API Key / Secret Key """
APP_ID = '11405908'
API_KEY = 'pshZPWa3XX1yWqhcG0PvbanA'
SECRET_KEY = 'hfXnIZbB79Gw4y4VRCFevP2oIChrLNsb'

client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
result = client.synthesis('我是小薇,明少在不在,孩子啊忙呢啊,啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊', 'zh', 1, {
    'vol': 5,   # volume
    'spd': 1,   # speed
    'per': 1    # voice
})

# synthesis returns the mp3 bytes on success and a dict on error
if not isinstance(result, dict):
    with open('auido.mp3', 'wb') as f:
        f.write(result)

# pygame playback (supports mixing)
# file_name = 'auido.mp3'
# pygame.mixer.init()
# pygame.mixer.music.load(file_name)
# pygame.mixer.music.play()
# time.sleep(10)

# mp3play playback
# import mp3play
# file_name = 'auido.mp3'
# player = mp3play.load(file_name)
# player.play()
# time.sleep(10)
Getting an attribute value in bs4 ---- a.attrs['data-cid'] (sketch below)
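A minimal sketch of that attribute lookup; the HTML snippet here is made up for illustration:

from bs4 import BeautifulSoup

# hypothetical markup just to demonstrate attrs access
html = '<a class="item" data-cid="42" href="#">link</a>'
soup = BeautifulSoup(html, 'html.parser')
a = soup.find('a')
print(a.attrs['data-cid'])   # '42'
print(a.get('data-cid'))     # same value, but returns None instead of raising if missing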
scrapy----
Activate the virtual environment.
scrapy startproject tianya, then cd into the new directory
scrapy genspider ty tianya.com
Extract data with response.xpath("//div[@class='bbs-content']/text()").extract()
def parse(self, response):
    tr_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for tr in tr_list:
        item = TecentItem()
        job_name = tr.xpath('.//a/text()').extract_first()
        category = tr.xpath('./td[2]/text()').extract_first()
        number = tr.xpath('./td[3]/text()').extract_first()
        location = tr.xpath('./td[4]/text()').extract_first()
        publication_date = tr.xpath('./td[5]/text()').extract_first()
        item['job_name'] = job_name
        item['category'] = category
        item['number'] = number
        item['location'] = location
        item['publication_date'] = publication_date
        yield item
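The fields assigned above imply an items.py roughly like this (a sketch; the class name TecentItem and the field names are taken from the parse code, the comments are assumptions):

import scrapy

class TecentItem(scrapy.Item):
    job_name = scrapy.Field()          # position name
    category = scrapy.Field()          # position category
    number = scrapy.Field()            # number of openings
    location = scrapy.Field()          # work location
    publication_date = scrapy.Field()  # publication date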
Start the crawl with scrapy crawl ty -o ty.json, which saves the output straight to a file.
You can also create a start.py in the project directory and run the crawl directly from it:
from scrapy import cmdline
cmdline.execute(['scrapy', 'crawl', 'c', '-o', 'cto.json'])
Use the shell command to test link extraction (once the links are extracted, automatic pagination can be implemented):
scrapy shell "https://hr.tencent.com/position.php?keywords=&tid=0&lid=2218&start=0#a"   (quote the URL so the shell does not split it at the & characters)
from scrapy.linkextractors import LinkExtractor
page_lx = LinkExtractor(allow=('start=\d+'))
page_lx.extract_links(response)
Command to create a spider that follows links across multiple levels: scrapy genspider -t crawl tenxspider hr.tencent.com (a CrawlSpider sketch follows)
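A minimal sketch of what the generated CrawlSpider might look like once the link-extraction rule above is plugged in; the callback name and the extracted field are assumptions mirroring the earlier parse() example, not taken from the original notes:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TenxspiderSpider(CrawlSpider):
    name = 'tenxspider'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    # follow every pagination link matching start=\d+ and parse each page it leads to
    rules = (
        Rule(LinkExtractor(allow=(r'start=\d+',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # hypothetical extraction, mirroring the parse() example above
        for tr in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            yield {'job_name': tr.xpath('.//a/text()').extract_first()}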
The Scrapy framework by itself handles sites whose content is served unencrypted; sites that encrypt or obfuscate their data need extra handling.
USER_AGENTS list (referenced by the middleware below; goes in settings.py):
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
]
Random user-agent and random proxy downloader middleware ---- middlewares.py
from .settings import USER_AGENTS
from .settings import PROXIES
import random
import base64   # encoding

# Random user-agent
class RandomUserAgent():
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENTS)
        request.headers.setdefault('User-Agent', user_agent)

# Random proxy
class RandomProxy():
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if proxy['user_passwd'] is None:
            # no credentials, use the proxy directly
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # base64-encode the "user:password" credentials
            base64_use_pwd = base64.b64encode(proxy['user_passwd'].encode()).decode()
            request.headers["Proxy-Authorization"] = "Basic " + base64_use_pwd
            request.meta["proxy"] = "http://" + proxy["ip_port"]
Typical settings in settings.py:
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
    'baoke.middlewares.RandomUserAgent': 100,
    'baoke.middlewares.RandomProxy': 200,
    'baoke.middlewares.BaokeDownloaderMiddleware': 543,
}
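The RandomProxy middleware above also expects a PROXIES list in settings.py with ip_port and user_passwd keys; the entries below are placeholders for illustration only, not working proxies:

PROXIES = [
    # user_passwd is None for an open proxy, or "user:password" for an authenticated one
    {'ip_port': '127.0.0.1:8000', 'user_passwd': None},
    {'ip_port': '10.0.0.2:8888',  'user_passwd': 'user:password'},
]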
Testing os.system: have the system open Notepad
import os
os.system('notepad')
Handling login in Scrapy
import scrapy

class RenspiderSpider(scrapy.Spider):
    name = 'renspider'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/SysHome.do']

    def parse(self, response):
        # fill in and submit the login form found in the response
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'email': '[email protected]', 'password': 'jiangwei112'},
            callback=self.parse_page
        )

    def parse_page(self, response):
        # once logged in, request a page that requires authentication
        url = "http://www.renren.com/449786654/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        with open('wei.html', 'w') as file:
            file.write(response.body.decode('gbk', 'ignore'))