Python爬取视频(其实是一篇福利)

到上面去看了看,地址都是明文的,得,赶紧开始吧。

下载流式文件,requests库中请求的stream设为True就可以啦,文档在此

先找一个视频地址试验一下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# -*- coding: utf-8 -*-
import requests

def download_file(url, path):
    """Stream the resource at *url* into the file at *path*.

    stream=True stops requests from loading the whole body into memory;
    the content is consumed chunk by chunk via iter_content().

    NOTE(review): on old requests releases Response has no __exit__, so
    using it directly as a context manager raises AttributeError — exactly
    the failure the article describes next.
    """
    with requests.get(url, stream=True) as r:
        chunk_size = 1024
        # Total size advertised by the server (bytes); unused here, but the
        # later revisions build the progress display on top of it.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)


if __name__ == '__main__':
    url = '就在原帖...'
    path = '想存哪都行'
    download_file(url, path)

遭遇当头一棒:

1
AttributeError: __exit__

这文档也会骗人的么!

看样子是没有实现上下文需要的__exit__方法。既然只是为了保证要让r最后close以释放连接池,那就使用contextlib的closing特性好了:

1
2
3
4
5
6
7
8
9
10
11
12
# -*- coding: utf-8 -*-
import requests
from contextlib import closing

def download_file(url, path):
    """Stream-download *url* to *path*, closing the response when done.

    contextlib.closing() guarantees r.close() — releasing the connection
    back to the pool — even though old requests Response objects are not
    context managers themselves.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        # Advertised total size in bytes; kept for the progress reporting
        # added in later revisions.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)

程序正常运行了,不过我盯着这文件,怎么大小不见变啊,到底是完成了多少了呢?还是要让下好的内容及时存进硬盘,还能省点内存是不是:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import os

def download_file(url, path):
    """Stream-download *url* to *path*, forcing every chunk onto disk.

    flush() pushes Python's buffer to the OS, and os.fsync() pushes the OS
    buffer to the physical disk, so the file size grows visibly while the
    download runs — at the cost of one disk sync per 1 KiB chunk.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                f.flush()
                os.fsync(f.fileno())

文件以肉眼可见的速度在增大,真心疼我的硬盘,还是最后一次写入硬盘吧,程序中记个数就好了:

1
2
3
4
5
6
7
8
9
10
11
12
def download_file(url, path):
    """Stream-download *url* to *path*, printing progress after each chunk."""
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            n = 1
            for chunk in r.iter_content(chunk_size=chunk_size):
                # Use chunk_size instead of a hard-coded 1024 so the maths
                # survives a chunk-size change, and cap at 100% because the
                # final chunk is usually shorter than chunk_size.
                loaded = min(n * float(chunk_size) / content_size, 1.0)
                f.write(chunk)
                print('已下载{0:%}'.format(loaded))
                n += 1

结果就很直观了:

1
2
3
4
已下载 2.579129 %
已下载 2.581255 %
已下载 2.583382 %
已下载 2.585508 %

心怀远大理想的我怎么会只满足于这一个呢,写个类一起使用吧:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time
 
def download_file(url, path):
    """Stream-download *url* to *path*, reporting progress via ProgressData."""
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()
 
 
class ProgressData(object):
    """Tracks and prints download progress.

    *block* is the chunk size and *size* the total size, both in bytes;
    they are stored divided by 1000 so they display in *unit* (e.g. 'Kb').
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0   # chunk size, expressed in *unit*
        self.size = size / 1000.0     # total size, expressed in *unit*
        self.unit = unit
        self.count = 0                # chunks written so far
        self.start = time.time()

    def output(self):
        """Record one finished chunk and print progress plus speed."""
        self.end = time.time()
        self.count += 1
        # Instantaneous speed: one block over the time since the last call.
        elapsed = self.end - self.start
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            # Label fix: the percentage is the *progress*; the trailing
            # figure is the speed (matches the sample output below).
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                  self.file_name, loaded, self.unit,
                  self.size, self.unit, progress, speed, self.unit))
            print('%50s' % ('/' * int((1 - progress) * 50)))

运行:

扫描二维码关注公众号,回复: 11153396 查看本文章
1
2
3
4
5
下载开始
下载进度 10.24Kb / 120174.05Kb 0.01 % 下载速度 4.75Kb / s
/ / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
下载进度 20.48Kb / 120174.05Kb 0.02 % 下载速度 32.93Kb / s
/ / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /

看上去舒服多了。

下面要做的就是多线程同时下载了,主线程生产url放入队列,下载线程获取url:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# -*- coding: utf-8 -*-
import hashlib
import os
import threading
import time
from contextlib import closing

import requests

# Queue was renamed to "queue" in Python 3; keep the Py2 name as an alias
# so the rest of the file runs on either interpreter.
try:
    import Queue
except ImportError:
    import queue as Queue
 
 
def download_file(url, path):
    """Stream-download *url* to *path*, skipping files already fully present.

    Progress is emitted through ProgressData after every chunk.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        # Skip-if-done guard: a file of at least the advertised size is
        # treated as already downloaded.
        if os.path.exists(path) and os.path.getsize(path) >= content_size:
            print('已下载')
            return
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size,
                             file_name=path)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()
 
 
class ProgressData(object):
    """Tracks and prints download progress.

    *block* is the chunk size and *size* the total size, both in bytes;
    they are stored divided by 1000 so they display in *unit* (e.g. 'Kb').
    """

    def __init__(self, block, size, unit, file_name=''):
        self.file_name = file_name
        self.block = block / 1000.0   # chunk size, expressed in *unit*
        self.size = size / 1000.0     # total size, expressed in *unit*
        self.unit = unit
        self.count = 0                # chunks written so far
        self.start = time.time()

    def output(self):
        """Record one finished chunk and print progress plus speed."""
        self.end = time.time()
        self.count += 1
        # Instantaneous speed: one block over the time since the last call.
        elapsed = self.end - self.start
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                  self.file_name, loaded, self.unit,
                  self.size, self.unit, progress, speed, self.unit))
            print('%50s' % ('/' * int((1 - progress) * 50)))
 
 
queue = Queue.Queue()
 
 
def run():
    """Worker loop: pull URLs off the shared queue and download each one.

    A None entry is the shutdown sentinel.  It is put back before exiting
    so every sibling worker also sees it and stops (the producer only
    enqueues a single None).
    """
    while True:
        url = queue.get(timeout=100)
        if url is None:
            queue.put(None)  # propagate the sentinel to the other workers
            print(u'全下完啦')
            break
        # Name the file after the MD5 of its URL so the same URL always
        # maps to the same path (which makes the already-downloaded check
        # in download_file effective).
        h = hashlib.md5()
        h.update(url.encode('utf-8'))  # md5 requires bytes on Python 3
        name = h.hexdigest()
        path = 'e:/download/' + name + '.mp4'
        download_file(url, path)
 
 
def get_url():
    """Producer stub: enqueue URLs for the workers, ending with the None
    sentinel that tells them to stop."""
    queue.put(None)
 
 
if __name__ == '__main__':
    get_url()
    workers = []
    for _ in range(4):
        t = threading.Thread(target=run)
        t.daemon = True
        t.start()
        workers.append(t)
    # Bug fix: the main thread previously fell off the end immediately,
    # and the daemon workers were killed before downloading anything.
    # Keep references and wait for every worker to finish.
    for t in workers:
        t.join()

加了重复下载的判断,至于怎么源源不断的生产url,诸位摸索吧,保重身体!

猜你喜欢

转载自www.cnblogs.com/wanghuaijun/p/12806024.html