1、Get方式
import urllib.parse
import urllib.request


def loadPage(url, filename):
    """Fetch *url* and return the raw response body as bytes.

    url: the URL to download
    filename: name used only for the progress message
    """
    print("正在下载" + filename)
    # Spoof a browser User-Agent so the server does not reject the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()


def writePage(html, filename):
    """Write the server response *html* (bytes) to a local file.

    html: response body bytes as returned by loadPage
    filename: path of the file to create

    BUG FIX: the original passed ``str(html)`` — the *repr* of a bytes
    object, e.g. ``"b'...'"`` — through the URL-decoder ``url_jm``,
    corrupting the saved page.  Decode the bytes as UTF-8 instead, and
    open the file with an explicit encoding so Chinese content is written
    correctly regardless of the platform's default encoding.
    """
    print("正在保存" + filename)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html.decode("utf-8"))
    print("-" * 30)


def tiebaSpider(url, beginPage, endPage):
    """Scheduler for the Tieba crawler: build each page URL and save it.

    :param url: base URL (scheme + kw query already attached)
    :param beginPage: first page number, inclusive
    :param endPage: last page number, inclusive
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates 50 posts per page via the "pn" query parameter.
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


def url_bm(kw, encode):
    """URL-encode *kw* using the character encoding *encode*.

    :param kw: text to percent-encode
    :param encode: codec name used to turn the text into bytes first
    :return: the percent-encoded string
    """
    kw_str = str(kw).encode(encode)
    return urllib.parse.quote(kw_str)


def url_jm(kw, encode):
    """URL-decode *kw* using the character encoding *encode*.

    :param kw: percent-encoded string to decode
    :param encode: codec name used to interpret the decoded bytes
    :return: the decoded string
    """
    return urllib.parse.unquote(str(kw), encode)


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))
    url = "http://tieba.baidu.com/f?"
    key = url_bm(kw, "utf-8")
    print(key)
    fullurl = url + "kw=" + key
    tiebaSpider(fullurl, beginPage, endPage)
2、Post方式
import urllib.parse
import urllib.request

# Target URL for the POST request (Youdao translate endpoint).
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"
headers = {"User-Agent": "Mozilla...."}

# Form fields sent as the POST body; "i" is the text to translate.
formdata = {
    "type": "AUTO",
    "i": "i love python",
    "doctype": "json",
    "xmlVersion": "1.8",
    "keyfrom": "fanyi.web",
    "ue": "UTF-8",
    "action": "FY_BY_ENTER",
    "typoResult": "true",
}

# BUG FIX: the original used the Python 2 names urllib.urlencode and
# urllib.urlopen, which do not exist in Python 3.  Use
# urllib.parse.urlencode / urllib.request.urlopen instead, and encode the
# form data to bytes — Request's ``data`` parameter requires bytes.
data = urllib.parse.urlencode(formdata).encode("utf-8")
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())
3、Ajax加载方式获取JSON数据
from urllib import request, parse

# Douban chart endpoint that serves its data as Ajax-loaded JSON.
# NOTE(review): the original also computed parse.quote("剧情片") into an
# unused variable; that dead code is removed — the category is selected
# by the hard-coded ``type=11`` query parameter, not by the quoted name.
url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
}

# Paging parameters sent as the POST body.
formdata = {
    "start": "20",
    "limit": "20",
}

# urlencode produces a str; Request's ``data`` parameter requires bytes.
data = parse.urlencode(formdata).encode('utf-8')
req = request.Request(url, data=data, headers=headers)
page = request.urlopen(req).read()
print(page.decode('utf-8'))