python3 urllib爬虫抓取记录

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/a519395243/article/details/78666361


# 目的:GET请求 抓取csdn博客页面所有文章标题,并保存在csdn目录下
import re
import os
from urllib import request

# Download the whole blog index page; the response is closed as soon as the
# body has been read.
with request.urlopen('http://blog.csdn.net/a519395243') as response:
    data = response.read().decode()

# Extract every article title. The article id is matched with \d+ because the
# previous [1-9]{8} class could never match an id containing the digit 0.
ruler = re.compile(
    r'<span class="link_title"><a href="/a519395243/article/details/\d+">(.*?)</a>',
    re.S,
)
match = ruler.findall(data)

# Output location: titles are appended to csdn/csdn.txt.
path = 'csdn'
file_path = path + '/csdn.txt'

# Only touch the filesystem when something was scraped (as before), but create
# the directory and open the file once instead of once per title.
if match:
    os.makedirs(path, exist_ok=True)
    # Explicit UTF-8 so non-ASCII titles are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'a+', encoding='utf-8') as f:
        for x in match:
            # Strip the "\r\n" line breaks and spaces the page puts around titles.
            content = x.replace('\r\n', '').replace(' ', '')
            # NOTE(review): titles are written back-to-back with no separator,
            # exactly as the original did; confirm whether a newline is wanted.
            f.write(content)


#模拟浏览器发送GET请求,通过往Request对象添加HTTP头,伪装成浏览器
from urllib import request

# Build the request explicitly so a browser-like User-Agent header can be
# attached; without it urllib sends its default "Python-urllib/x.y" agent.
req = request.Request('http://blog.csdn.net/a519395243')
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
# Read the body inside a context manager so the connection is released even
# if reading or decoding fails.
with request.urlopen(req) as response:
    data = response.read().decode()
print(data)



# 目的:模拟登录 csdn
import gzip  
import re  
import urllib.request  
import urllib.parse  
import http.cookiejar  
  
def ungzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: Raw response body as ``bytes``.

    Returns:
        The decompressed bytes if *data* is a gzip stream, otherwise the
        original *data* object.
    """
    # Local import: zlib.error is needed below and the file's import block
    # does not bring zlib into scope.
    import zlib

    try:
        print("尝试解压缩...")
        data = gzip.decompress(data)
        print("解压完毕")
    # gzip.decompress raises OSError (BadGzipFile) for a wrong magic number,
    # EOFError for a truncated stream and zlib.error for corrupt deflate data.
    # Catching only these keeps the best-effort fallback without swallowing
    # KeyboardInterrupt or genuine programming errors like a bare except did.
    except (OSError, EOFError, zlib.error):
        print("未经压缩,无需解压")

    return data
          
def getLt(data):
    """Return the value of the ``lt`` form field found in *data*.

    Args:
        data: Decoded HTML of the login page.

    Returns:
        The text between the quotes of ``value="..."`` that directly follows
        ``name="lt"``.

    Raises:
        IndexError: If no ``lt`` field is present (same exception type the
            previous ``findall(...)[0]`` lookup produced).
    """
    # [^"]* stops at the closing quote of the value; the former greedy (.*)
    # ran on to the last quote on the line and captured trailing attributes.
    found = re.search(r'name="lt" value="([^"]*)"', data)
    if found is None:
        raise IndexError('no name="lt" form field found in page')
    return found.group(1)

def getExecution(data):
    """Return the value of the ``execution`` form field found in *data*.

    Args:
        data: Decoded HTML of the login page.

    Returns:
        The text between the quotes of ``value="..."`` that directly follows
        ``name="execution"``.

    Raises:
        IndexError: If no ``execution`` field is present (same exception type
            the previous ``findall(...)[0]`` lookup produced).
    """
    # [^"]* stops at the closing quote of the value; the former greedy (.*)
    # ran on to the last quote on the line and captured trailing attributes.
    found = re.search(r'name="execution" value="([^"]*)"', data)
    if found is None:
        raise IndexError('no name="execution" form field found in page')
    return found.group(1)

def getOpener(head):
    """Build a URL opener that keeps cookies and sends *head* on each request.

    Args:
        head: Mapping of header name to header value.

    Returns:
        An opener built with an ``HTTPCookieProcessor`` backed by a fresh
        ``CookieJar``, whose ``addheaders`` is the list of ``(name, value)``
        pairs from *head* in iteration order.
    """
    # A fresh jar per opener, so cookies set by one response are replayed on
    # later requests made through the same opener.
    cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
    opener = urllib.request.build_opener(cookie_handler)
    opener.addheaders = [(name, value) for name, value in head.items()]
    return opener
# Request headers copied from a real browser session (e.g. via the browser's
# developer tools / Firebug).
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Only gzip is advertised: urllib does not decode response bodies itself
    # and this script can only undo gzip, so offering deflate/sdch/br invited
    # replies that could not be decoded.
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'passport.csdn.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # NOTE(review): this is a captured browser cookie string including session
    # identifiers (JSESSIONID etc.). Replace it with your own and keep real
    # session values out of shared source.
    'Cookie': 'uuid_tt_dd=-6281662822437337065_20171128; __message_district_code=440000; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22%24device_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22news0%22%7D%7D; kd_user_id=1f003860-eec5-424d-8a20-498a00b6ab73; UM_distinctid=160068870b25ec-07ca748d26f527-6a11157a-15f900-160068870b3750; UN=a519395243; UE="[email protected]"; BT=1512011174110; shown_offset=20; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1511939807,1512007982,1512022346,1512026346; Hm_lpvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1512026346; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; JSESSIONID=8669679CFA8B508DD860D5C76BDA9E69.tomcat1; LSSC=LSSC-55438-kdj63iwrBuHfcdst9TBrRIONZeKOQh-passport.csdn.net; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1512011295,1512022345,1512026346,1512029753; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1512032481; dc_tos=p083q9; dc_session_id=1512031760278',
}
  
# Login endpoint: a GET returns the login form, a POST submits it.
url = 'https://passport.csdn.net/account/verify'
opener = getOpener(header)

# Step 1: fetch the login form. The same opener is reused for the POST below
# so cookies set by this response are sent back with it.
with opener.open(url) as op:
    data = op.read()
data = ungzip(data)

# Decode once and pull out the two hidden form tokens.
page = data.decode()
lt = getLt(page)
execution = getExecution(page)

# Step 2: submit the credentials together with the tokens scraped above.
username = "帐号"
password = "密码"
postDict = {
    'lt': lt,
    'username': username,
    'password': password,
    '_eventId': 'submit',
    'execution': execution,
}
postData = urllib.parse.urlencode(postDict).encode()
# Passing a data argument makes the opener issue a POST instead of a GET.
with opener.open(url, postData) as op:
    data = op.read()
data = ungzip(data)

print(data.decode())



猜你喜欢

转载自blog.csdn.net/a519395243/article/details/78666361
今日推荐