Python: my web scraping notes

# *Part 1 (urllib)
#from urllib import request,parse
# 1
#request.urlretrieve('http://www.baidu.com','aaa.html')  # download the page and save it as aaa.html; ref: https://blog.csdn.net/weixin_42760923/article/details/103741452
#
# 2
# reas = request.urlopen('http://www.baidu.com')
# print(reas.getcode())  # get the HTTP status code
#
# 3
# a = parse.urlencode({'我是':1,'你是':2,'它是':3})
# print(a)
# print(parse.parse_qs(a))#http://www.cnblogs.com/tinghai8/p/9035173.html
#
# 4
# url = 'https://www.baidu.com/s?ie=utf-8&wd=python&tn=78040160_5_pg&ch=3#1'
#result = parse.urlparse(url)  # split the url into its components (scheme, netloc, path, params, query, fragment)
# Output:
# ParseResult(scheme='https', netloc='www.baidu.com', path='/s', params='', query='ie=utf-8&wd=python&tn=78040160_5_pg&ch=3', fragment='1')
# scheme: https
# netloc: www.baidu.com
# path: /s
# params:
# query: ie=utf-8&wd=python&tn=78040160_5_pg&ch=3
# fragment: 1
# result = parse.urlsplit(url)  # same splitting as urlparse, but the result has no params field
# Output:
# SplitResult(scheme='https', netloc='www.baidu.com', path='/s', query='ie=utf-8&wd=python&tn=78040160_5_pg&ch=3', fragment='1')
# scheme: https
# netloc: www.baidu.com
# path: /s
# query: ie=utf-8&wd=python&tn=78040160_5_pg&ch=3
# fragment: 1
# print(result)
# print('scheme:',result.scheme)
# print('netloc:',result.netloc)
# print('path:',result.path)
# #print('params:',result.params)
# print('query:',result.query)
# print('fragment:',result.fragment)
#
# 5
# url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false'
# headers = {
# 'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400'
# }
# data={
#     'first':'True',
#     'pn':1,
#     'kd':'python',
#     'referer':'https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=',
#     'pragma': 'no-cache',
#     'origin': 'https://www.lagou.com'
# }
# req = request.Request(url,headers=headers,data=parse.urlencode(data).encode('utf-8'),method='POST')
# resp = request.urlopen(req)
# print(resp.read().decode('utf-8'))
#
# 6
# from urllib import request
# url = 'http://httpbin.org/ip'
# handler = request.ProxyHandler({"http":"220.168.52.245:40406"})
# opener = request.build_opener(handler)
# resp = opener.open(url)
# print(resp.read())
#
# 7
# from urllib import request
# dapeng_url = "http://www.renren.com/880151247/profile"
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400',
# "Cookie": "anonymid=k9xjdjfc-hodq0q; depovince=GW; _r01_=1; JSESSIONID=abc1uPtF8-or69JJe9Xhx; ick_login=298bf271-2e27-4267-b945-d2790750f2f4; taihe_bi_sdk_uid=06d641ecbe8305b3032f9fde9bfaaba6; taihe_bi_sdk_session=25f0035c05eb8993a05734a3068dd860; t=36979ef16e99fcefa14351e8e304f0af1; societyguester=36979ef16e99fcefa14351e8e304f0af1; id=974393911; xnsid=e0b4519; jebecookies=78966f67-81b5-4159-ac53-9c5694aaf215|||||; ver=7.0; loginfrom=null; jebe_key=0bc3b536-3d43-416e-a443-3550bea33d34%7Ccd3f341f2a9a65627d4ee21bd7991b3e%7C1588902281361%7C1%7C1588902281143; jebe_key=0bc3b536-3d43-416e-a443-3550bea33d34%7Ccd3f341f2a9a65627d4ee21bd7991b3e%7C1588902281361%7C1%7C1588902281147; wp_fold=0"
# }
# req = request.Request(url=dapeng_url,headers=headers)
# resp = request.urlopen(req)
# a = resp.read().decode('utf-8')
# with open('wopa.html',mode='w',encoding='utf-8') as f:
#     f.write(a)
#
# 8
# from urllib import request,parse
# from http.cookiejar import CookieJar
#
# cookiejar = CookieJar()
# handler = request.HTTPCookieProcessor(cookiejar)
# opener = request.build_opener(handler)
#
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400',
# "Cookie": "anonymid=k9xjdjfc-hodq0q; depovince=GW; _r01_=1; JSESSIONID=abc1uPtF8-or69JJe9Xhx; ick_login=298bf271-2e27-4267-b945-d2790750f2f4; taihe_bi_sdk_uid=06d641ecbe8305b3032f9fde9bfaaba6; taihe_bi_sdk_session=25f0035c05eb8993a05734a3068dd860; t=36979ef16e99fcefa14351e8e304f0af1; societyguester=36979ef16e99fcefa14351e8e304f0af1; id=974393911; xnsid=e0b4519; jebecookies=78966f67-81b5-4159-ac53-9c5694aaf215|||||; ver=7.0; loginfrom=null; jebe_key=0bc3b536-3d43-416e-a443-3550bea33d34%7Ccd3f341f2a9a65627d4ee21bd7991b3e%7C1588902281361%7C1%7C1588902281143; jebe_key=0bc3b536-3d43-416e-a443-3550bea33d34%7Ccd3f341f2a9a65627d4ee21bd7991b3e%7C1588902281361%7C1%7C1588902281147; wp_fold=0"
#
# }
#
# data = {
#     'email':"18337802329",
#     'password':"wang1234567890."
# }
#
# login_url = "http://www.renren.com/PLogin.do"
# req = request.Request(login_url,data=parse.urlencode(data).encode('utf-8'),headers=headers)
# opener.open(req)  # send the login request through the cookie-aware opener so its cookies are stored
#
# dapeng_url = "http://www.renren.com/880151247/profile"
# resp = opener.open(dapeng_url)
# with open("wopa.html",mode='w',encoding=('utf-8')) as f:
#     f.write(resp.read().decode('utf-8'))
#
# 9
# loading and saving cookie information
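# A minimal sketch of the idea, assuming http.cookiejar.MozillaCookieJar and an
# httpbin.org test endpoint (not the original code from this note):
# from urllib import request
# from http.cookiejar import MozillaCookieJar
#
# cookiejar = MozillaCookieJar('cookies.txt')
# handler = request.HTTPCookieProcessor(cookiejar)
# opener = request.build_opener(handler)
# opener.open('http://httpbin.org/cookies/set?course=python')
# cookiejar.save(ignore_discard=True, ignore_expires=True)  # also keep session cookies
#
# # later: load the saved cookies back before making new requests
# cookiejar = MozillaCookieJar('cookies.txt')
# cookiejar.load(ignore_discard=True, ignore_expires=True)
# for cookie in cookiejar:
#     print(cookie)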
#


#*Part 2 (requests)
# 1
# import requests
# response = requests.get("https://www.baidu.com/")
# print(response.text)                     # body decoded as str, using the guessed encoding
# print(type(response.text))
# print(response.content.decode('utf-8'))  # raw bytes, decoded manually
# print(type(response.content))
# print(response.url)
# print(response.encoding)
# print(response.status_code)
#
# 2
# import requests
# params = {
#     'query':"中国"
# }
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400',
# 'Cookie': 'IPLOC=CN4102; usid=JAdb-6vNSpYoqBCx; SUV=00DE8EE0DF5AB3395EB798059FB55969; browerV=13; osV=1; GOTO=Af22710-0010; QIDIANID=H+6obfcPNuScr4zYU0y4Vtks3GCBpvcF4sdheT0uMfBBqoFEXs6isiQwTm2FNIp4; ABTEST=1|1589155070|v17; SUID=39B35ADF2F21A00A000000005EB894FE; SNUID=AA21C94D939636C63CB3981393D4AF38; sst0=306; sct=5; ld=gZllllllll2WmaVwlllllVfc8y6lllllTC$8nyllll6llllljZlll5@@@@@@@@@@; LSTMV=260%2C388; LCLKINT=5008'
# }
# response = requests.get("https://www.sogou.com/sogou",params=params,headers=headers)
# with open('wopa.html',mode='w',encoding=('utf-8')) as f:
#     f.write(response.content.decode('utf-8'))
#
# print(response.url)
#
# 3
# import requests
#
# url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
# data = {
# 'first': 'true',
# 'pn': '1',
# 'kd': 'python'
# }
# header = {
# 'referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQB',
# 'cookie': 'JSESSIONID=ABAAABAABEIABCI37A3CFA1739B7967BA9BB5EC48CFDAB4; WEBTJ-ID=20200511083437-1720127b0c93ac-05a14f9429afff-34564a77-921600-1720127b0cb66; user_trace_token=20200511083435-00d00637-88f7-443b-921e-a2447d219d3f; LGUID=20200511083435-4f64a642-93b4-413c-af0c-d693a0f5da8f; _ga=GA1.2.444990569.1589157278; _gid=GA1.2.888046448.1589157278; sajssdk_2015_cross_new_user=1; gate_login_token=29c50d66edb7d0f99ca52b49bb16d84ab72b4802a206703d26efc33824f2e75c; LG_LOGIN_USER_ID=f9e68a1bfdb61e7493a4b57cab41dd029de901938700925cf442c69e133568dc; LG_HAS_LOGIN=1; _putrc=E130B10EF44BD7F2123F89F2B170EADC; login=true; unick=%E7%8E%8B%E5%A4%A9%E4%B9%90; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; privacyPolicyPopup=false; index_location_city=%E4%B8%8A%E6%B5%B7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221720127e1162a0-03d0e1edeef186-34564a77-921600-1720127e1172d7%22%2C%22%24device_id%22%3A%221720127e1162a0-03d0e1edeef186-34564a77-921600-1720127e1172d7%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1588816821,1588894221,1589157278,1589173891; LGSID=20200511131129-08c8b087-ffa7-4217-819b-92afade9c046; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Ftx%3Fie%3Dutf-8%26query%3Dlagou1%26hdq%3Dsogou-addr-cc9657884708170e; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flanding-page%2Fpc%2Fcommunal.html%3Futm%5Fsource%3Dm%5Fcf%5Fcpt%5Fsogou%5Fpc; X_HTTP_TOKEN=80d16501647b1fce8193719851fcbf613bac498302; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1589173931; LGRID=20200511131209-fabc3450-666e-4eb2-b8d0-0a5eaff5878d; SEARCH_ID=b7d6628c0518489cbc5d5cd221f1ba0c'
# }
#
# req = requests.post(url=url,data=data,headers=header)
# print(req.text)
#
#4
# import requests
# url = "http://httpbin.org/ip"
# proxy = {
# "http":"183.166.110.172:9999"
# }  # if proxies is omitted, the request is sent from the local IP
# response = requests.get(url,proxies=proxy)
# print(response.text)
#
#5
# import requests
# url = "https://www.baidu.com/"
# req = requests.get(url)
# print(req.cookies)
# print(req.cookies.get_dict())
#
#6
# import requests
# url = "http://www.renren.com/PLogin.do"
# session = requests.Session()
# data = {
#     "email":"18337802329",
#     "password":"wang1234567890."
# }
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400'
#
# }
# req = session.post(url=url,data=data,headers=headers)
# with open("wopa.html",mode='w',encoding=('utf-8')) as f:
#     f.write(req.text)
#
#7
# import requests
# url = "http://www.renren.com/PLogin.do"
# session = requests.Session()
# data = {
#     "email":"18337802329",
#     "password":"wang1234567890."
# }
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3868.400'
#
# }
# session.post(url=url,data=data,headers=headers)
# req = session.get('http://www.renren.com/880151247/profile')
# with open("wopa.html",mode='w',encoding=('utf-8')) as f:
#     f.write(req.text)
#
#8
# import requests
# url="http://www.12306.cn/mormhweb/"
# req = requests.get(url=url, verify=True)  # verify=True (the default) works when the site has a valid certificate; use verify=False when it does not
# print(req.content.decode('utf-8'))
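# For a site without a valid certificate, a hedged sketch (urllib3.disable_warnings() just
# silences the InsecureRequestWarning that verify=False triggers; same 12306 URL as above):
# import requests
# import urllib3
# urllib3.disable_warnings()
# req = requests.get(url="http://www.12306.cn/mormhweb/", verify=False)
# print(req.status_code)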
#
#9


# ol is an ordered list, ul is an unordered list
# 'even number' = 偶数, 'odd number' = 奇数
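
A small self-contained lxml sketch of these ideas (the HTML snippet below is made up for illustration):

from lxml import etree

snippet = """
<div>
  <ol><li>first</li><li>second</li><li>third</li></ol>
  <ul class="lists"><li>apple</li><li>banana</li></ul>
</div>
"""
html = etree.HTML(snippet)
print(html.xpath("//ol/li/text()"))                        # ['first', 'second', 'third']
print(html.xpath("//ul[@class='lists']/li/text()"))        # ['apple', 'banana']
print(html.xpath("//ol/li[position() mod 2 = 0]/text()"))  # even-positioned items: ['second']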


# 10: scrape Douban's "now playing" page
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
'Referer': 'https://movie.douban.com/explore',
'Cookie': 'll="118238"; bid=0PeRnCW3tTo; __utma=30149280.337353962.1589238252.1589238252.1589238252.1; __utmc=30149280; __utmz=30149280.1589238252.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="216656386:zTaiDhkeFqI"; ck=d-A9; ap_v=0,6.0; __gads=ID=0a4ae0c364d58a51:T=1589238308:S=ALNI_MbrgMIAVQa1t8lR0o_7wz5YOAf1vw; push_doumail_num=0; push_noty_num=0; __utmv=30149280.21665; douban-profile-remind=1; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1589238385%2C%22https%3A%2F%2Fwww.douban.com%2Fchannel%2F30168817%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.435511991.1589238385.1589238385.1589238385.1; __utmb=223695111.0.10.1589238385; __utmc=223695111; __utmz=223695111.1589238385.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/channel/30168817/; __utmt_t1=1; __yadk_uid=XM0PFIPxjnBzwzoxGElcBDUOZKVt3flJ; _vwo_uuid_v2=DDB8597E2DC4343D7412E6746F0DD9E3C|ae8dde3213e3681398b101b3d715399b; _pk_id.100001.4cf6=d18798f40d536645.1589238385.1.1589238438.1589238385.; __utmb=30149280.26.8.1589238438419; RT=s=1589238656834&r=https%3A%2F%2Fmovie.douban.com%2Fcinema%2Fnowplaying%2Fkaifeng%2F'
}
url="https://movie.douban.com/cinema/nowplaying/kaifeng/"
response = requests.get(url=url,headers=headers)
text = response.text
html = etree.HTML(text)
uls = html.xpath("//ul[@class='lists']")[0]
lis = uls.xpath("./li")
for li in lis:
    title = li.xpath("@data-title")[0]
    region = li.xpath("@data-region")[0]
    director = li.xpath("@data-director")[0]
    actors = li.xpath("@data-actors")[0]
    thumbnail = li.xpath(".//img/@src")   # xpath returns a list of matching poster URLs
    print('Title: {}\tRegion: {}\tDirector: {}\tCast: {}\tPoster: {}'.format(title,region,director,actors,thumbnail))
    #print(etree.tostring(li,encoding='utf-8').decode('utf-8'))

Fixing "UnicodeDecodeError: 'gbk' codec can't decode byte 0xd0 in position 19864: illegal multibyte sequence"
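The fix used in the code below is to decode the raw bytes with errors='ignore'; a tiny self-contained demo of the difference (the byte string is made up):

raw = "中国".encode('gbk') + b"\xd0"   # valid GBK bytes plus a stray lead byte
# raw.decode('gbk')                    # raises UnicodeDecodeError: incomplete multibyte sequence
print(raw.decode('gbk', 'ignore'))     # 'ignore' drops the bad byte instead of raising -> 中国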

# Scrape movie links from dytt8.net (Movie Heaven)
# import requests
# from lxml import etree
# BASE_DOMAIN = 'https://www.dytt8.net/'
# for i in range(1,10):  # scrape listing pages 1 through 9
#     url = "https://www.dytt8.net/html/gndy/dyzz/list_23_"+str(i)+".html"
#     headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
#     }
#     res = requests.get(url=url,headers=headers)
#     response=res.content.decode('gbk','ignore')
#     html = etree.HTML(response)
#     uis = html.xpath("//table[@class='tbspan']//a/@href")
#     for ui in uis:
#         print(BASE_DOMAIN+ui)
# print('done scraping')
# Scrape movies from dytt8.net (refactor 1)
import requests
from lxml import etree
BASE_DOMAIN = 'https://www.dytt8.net/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
def get_detail_urls(url):
    res = requests.get(url=url,headers=HEADERS)
    response = res.content.decode('gbk','ignore')   # dytt8 pages are gbk-encoded
    html = etree.HTML(response)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda u: BASE_DOMAIN + u, detail_urls)
    return detail_urls
def parse_detail_page(url):
    pass
def spider():
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    for x in range(1,3):
        url = base_url.format(x)
        print('Page {} URL: {}'.format(x,url))
        detail_urls = get_detail_urls(url)
        print(list(detail_urls))
if __name__ == '__main__':
    spider()
    print('done scraping')
# Scrape movies from dytt8.net (refactor 2)
import requests
from lxml import etree
BASE_DOMAIN = 'https://www.dytt8.net/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
def get_detail_urls(url):
    print('entered get_detail_urls')
    response = requests.get(url,headers=HEADERS).text
    html = etree.HTML(response)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)
    return detail_urls
def parse_detail_page(url):
    print('entered parse_detail_page from spider')
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    print(imgs)
    cover = imgs[0]
    movie['cover'] = cover
    infos = zoomE.xpath(".//text()")

    def parse_info(info,rule):
        return info.replace(rule,"").strip()

    for info in infos:
        if info.startswith("◎年  代"):
            info = parse_info(info,"◎年  代")
            movie['year'] = info
    return movie
def spider():
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    url = base_url.format(1)
    print('Page {} URL: {}'.format(1,url))
    detail_urls = get_detail_urls(url)
    print('left get_detail_urls, back in spider')
    for detail_url in list(detail_urls):
        print(1)
        movie = parse_detail_page(detail_url)
        movie['url'] = detail_url
        print('returned from parse_detail_page to spider')
        print(movie)
if __name__ == '__main__':
    spider()
    print('done scraping')
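
The same parse_info pattern extends to the other "◎" labels on a detail page; a self-contained sketch with made-up label strings (only "◎年  代" appears in the code above, the other labels are assumptions):

def parse_info(info, rule):
    return info.replace(rule, "").strip()

# made-up examples of the label lines found in the Zoom block
infos = ["◎年  代 2020", "◎导  演 Someone", "◎主  演 Someone Else"]
movie = {}
for info in infos:
    if info.startswith("◎年  代"):
        movie['year'] = parse_info(info, "◎年  代")
    elif info.startswith("◎导  演"):
        movie['director'] = parse_info(info, "◎导  演")
print(movie)   # {'year': '2020', 'director': 'Someone'}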


Reposted from blog.csdn.net/weixin_45949073/article/details/106060458