——————来自某潮汕人的菜鸟教程
爬虫目标网站:http://jandan.net/ooxx
爬虫目标中文名称:煎蛋网 妹子图
爬虫需求:分析js下载 煎蛋网的妹子图
爬虫分析:
1、通过查看网页的源代码我们可以发现,这个网站的图片地址 img src 只是一个占位图(blank.gif),真实地址是在加载后由脚本重新写入的;onload 的意思是当这个 img 元素加载完成之后,触发 jandan_load_img 这个函数
通过F12开发者工具我们可以在Console端轻易地获取到jandan_load_img这个函数的代码,
// jandan_load_img(b): the function named in the placeholder <img>'s onload attribute.
// The listing is kept verbatim; the leading 1..15 are the listing's own line numbers,
// which the surrounding text refers to.
//  - lines 2-5: wraps b with $(), takes the following span.img-hash (d.next), reads its text into e
//    and removes that span.
//  - line 6: passes e and a fixed key string to jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4; the result c is the image URL.
//  - lines 7-9: inserts a link before the image whose href is c with the sinaimg size segment
//    rewritten to "large", followed by a <br>.
//  - lines 10-11: removes the onload attribute and sets src to location.protocol + c
//    (for sinaimg .gif URLs the size segment becomes "thumb180").
//  - lines 12-15: when c ends in .gif, stores the full URL in org_src and installs a new onload
//    that calls add_img_loading_mask(this, load_sina_gif); both names are defined outside this listing.
1 function jandan_load_img(b) { 2 var d = $(b); 3 var f = d.next("span.img-hash"); 4 var e = f.text(); 5 f.remove(); 6 var c = jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4(e, "myblSB23xVOq7qKlgBHAUD52BF1yZlql"); 7 var a = $('<a href="' + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.(gif|jpg|jpeg))/, "$1large$3") + '" target="_blank" class="view_img_link">[查看原图]</a>'); 8 d.before(a); 9 d.before("<br>"); 10 d.removeAttr("onload"); 11 d.attr("src", location.protocol + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.gif)/, "$1thumb180$3")); 12 if (/\.gif$/.test(c)) { 13 d.attr("org_src", location.protocol + c); 14 b.onload = function() { 15 add_img_loading_mask(this, load_sina_gif) } } }函数体如上,下面开始解析这个函数
</div> <div class="text"><span class="righttext"><a href="//jandan.net/ooxx/page-98#comment-3838708">3838708</a></span><p><img src="//img.jandan.net/img/blank.gif" onload="jandan_load_img(this)" /><span class="img-hash">Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc=</span></p> </div>
先举个例子,比如上面这张美女图的源代码
那么函数体执行完前4行之后,var e="Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc="
var c = jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4(e, "myblSB23xVOq7qKlgBHAUD52BF1yZlql");
第6行的jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4也是一个函数,同样用F12开发者工具获得它的函数体
jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4函数体如下:
// Decoder quoted verbatim from the site's script.
// Parameters: n (encoded text), t (key string, defaults to ""), e (defaults to 0 and is not used afterwards).
// md5, base64_decode, chr, ord and time are helpers that are not shown in this listing.
// Steps as written:
//  - t = md5(t); p and o are md5 of the first and second 16 characters of t.
//  - m = first 4 characters of n; c = p + md5(p + m); l = base64_decode(n without its first 4 characters).
//  - a 256-entry table k is shuffled using the character codes of c, then each character of l is
//    XORed with a value taken from k to build u (resembles an RC4-style stream; not verified against a reference).
//  - u is then validated (leading 10 characters compared with time(), next 16 compared with an md5 prefix)
//    and either trimmed to u.substr(26) or set to "".
// NOTE: the last statement of the final if-block, u = base64_decode(d), overwrites everything above.
//       d still holds the original, untrimmed n, so the return value is simply base64_decode(n).
// NOTE: tmp is assigned without var, so it becomes an implicit global (a ReferenceError in strict mode).
var jdVgpgvDt3f2JfTm99E1w9KZcus9yR9FW4 = function(n, t, e) { var f = "DECODE"; var t = t ? t : ""; var e = e ? e : 0; var r = 4; t = md5(t); var d = n; var p = md5(t.substr(0, 16)); var o = md5(t.substr(16, 16)); if (r) { if (f == "DECODE") { var m = n.substr(0, r) } } else { var m = "" } var c = p + md5(p + m); var l; if (f == "DECODE") { n = n.substr(r); l = base64_decode(n) } var k = new Array(256); for (var h = 0; h < 256; h++) { k[h] = h } var b = new Array(); for (var h = 0; h < 256; h++) { b[h] = c.charCodeAt(h % c.length) } for (var g = h = 0; h < 256; h++) { g = (g + k[h] + b[h]) % 256; tmp = k[h]; k[h] = k[g]; k[g] = tmp } var u = ""; l = l.split(""); for (var q = g = h = 0; h < l.length; h++) { q = (q + 1) % 256; g = (g + k[q]) % 256; tmp = k[q]; k[q] = k[g]; k[g] = tmp; u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256])) } if (f == "DECODE") { if ((u.substr(0, 10) == 0 || u.substr(0, 10) - time() > 0) && u.substr(10, 16) == md5(u.substr(26) + o).substr(0, 16)) { u = u.substr(26) } else { u = "" } u = base64_decode(d) } return u }
传进来的参数n="Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc="
t="myblSB23xVOq7qKlgBHAUD52BF1yZlql"
然后发现最后的
u = base64_decode(d)
而d在函数开头被赋值为n(var d = n),之后n虽然被截取过,d却一直保存着原始的n;前面对u的全部计算都被这一行覆盖,因此这个函数返回的就是
base64_decode(n)
base64_decode在python中应该是这样子写的:
def base64_decode1(s):
    """Decode a base64 string, adding any missing '=' padding first.

    Args:
        s: base64 text (str), e.g. the content of a span.img-hash element.

    Returns:
        The decoded data as bytes.
    """
    # Local import keeps this snippet runnable when copied on its own.
    import base64

    s = s.encode()
    # Base64 input must be a multiple of 4 bytes long; pad with '=' if it is not.
    missing_padding = len(s) % 4
    if missing_padding != 0:
        s += b'=' * (4 - missing_padding)
    # base64.decodestring was a deprecated alias removed in Python 3.9;
    # decodebytes is the same function under its current name.
    return base64.decodebytes(s)
运行测试程序:
import base64


def base64_decode1(s):
    """Decode a base64 string, adding any missing '=' padding first.

    Args:
        s: base64 text (str).

    Returns:
        The decoded data as bytes.
    """
    s = s.encode()
    # Base64 input must be a multiple of 4 bytes long; pad with '=' if it is not.
    missing_padding = len(s) % 4
    if missing_padding != 0:
        s += b'=' * (4 - missing_padding)
    # base64.decodestring was a deprecated alias removed in Python 3.9;
    # decodebytes is the same function under its current name.
    return base64.decodebytes(s)


print(base64_decode1('Ly93eDIuc2luYWltZy5jbi9tdzYwMC9lNDMxNjM2Nmx5MWZydmtmdDBhdHlqMjBoczBucG15aC5qcGc='))
结果:b'//wx2.sinaimg.cn/mw600/e4316366ly1frvkft0atyj20hs0npmyh.jpg'
解析到这里,后面的等你来完善
不懂的请留言