Python web-scraping modules: hands-on examples

Hands-on scraping examples with Python's urlopen, urlretrieve, BeautifulSoup, requests, and urllib modules.

urlopen

from urllib.request import urlopen  # more convenient than "import urllib.request", which forces the longer urllib.request.urlopen("http://www.baidu.com/")

response = urlopen("http://www.baidu.com/")  # the URL must include the http:// scheme, otherwise urlopen raises an error
print(type(response))  # a response object wrapping the reply: <class 'http.client.HTTPResponse'>
html = response.read()  # read() returns a bytes object
print(type(html))  # <class 'bytes'>
a = html.decode('utf-8')  # decoding the bytes gives a str
print(type(a))  # <class 'str'>
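
If you would rather not hard-code utf-8, the response's Content-Type header often declares a charset; a minimal sketch that uses it when present and falls back to utf-8 otherwise:

from urllib.request import urlopen

response = urlopen("http://www.baidu.com/")
charset = response.headers.get_content_charset() or 'utf-8'  # charset from the header, utf-8 as a fallback
html = response.read().decode(charset)
print(type(html))  # <class 'str'>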

urlretrieve

from urllib.request import urlretrieve

# save the page at url to a local HTML file (url is any page address, e.g. the one fetched above)
urlretrieve(url, 'c:/test.html')
# save an image
urlretrieve("http://www.baidu.com/img/bd_logo1.png", "c:/baidu_log.png")

MySQL connection

import pymysql

con = pymysql.connect(host="localhost", user="root", password="123456", database="test", charset="utf8", port=3306)
cursor = con.cursor()  # a cursor, not a second connection, so name it accordingly
cursor.execute("select * from book")  # returns the number of rows matched
data = cursor.fetchall()
print(data)
cursor.close()
con.close()
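
The same query can be written with the cursor as a context manager so it is closed even if the query raises; a sketch assuming the same local test database:

import pymysql

con = pymysql.connect(host="localhost", user="root", password="123456", database="test", charset="utf8", port=3306)
try:
    with con.cursor() as cursor:  # the cursor is closed automatically on exit
        cursor.execute("select * from book")
        print(cursor.fetchall())
finally:
    con.close()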

BeautifulSoup + requests

from bs4 import BeautifulSoup
import requests

url = 'https://python123.io/ws/demo.html'
response = requests.get(url)
response.encoding = 'utf-8' # set this when the decoded text comes back garbled
html = response.text
soup = BeautifulSoup(html,'html.parser')
a = soup.select('title')
for o in a:
    print(o.string, o.text)  # for this simple tag, .string and .text give the same result
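
.string and .text only coincide when a tag has a single text child; a quick sketch on an inline snippet showing where they differ:

from bs4 import BeautifulSoup

snippet = '<p>hello <b>world</b></p>'
p = BeautifulSoup(snippet, 'html.parser').p
print(p.text)      # 'hello world' - concatenates all descendant text
print(p.string)    # None - the tag has more than one child
print(p.b.string)  # 'world' - a tag with exactly one text child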

BeautifulSoup + urlopen

from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "http://quotes.toscrape.com/"
response = urlopen(url) # the object returned by urlopen can be passed to BeautifulSoup directly; a requests Response cannot (use response.text instead)
soup = BeautifulSoup(response,'html.parser')
words = []
a = soup.select('span.text')
for o in a:
    words.append(o.string.strip("“”"))
print(words)
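
For comparison, the same page fetched with requests; the Response object itself cannot be handed to BeautifulSoup, but response.text (or response.content) can. A sketch along the same lines:

from bs4 import BeautifulSoup
import requests

response = requests.get("http://quotes.toscrape.com/")
soup = BeautifulSoup(response.text, 'html.parser')  # pass the decoded text, not the Response
words = [o.string.strip("“”") for o in soup.select('span.text')]
print(words)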

BeautifulSoup + urlopen, a step further

from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "http://quotes.toscrape.com/"
response = urlopen(url) # the object returned by urlopen can be passed to BeautifulSoup directly; a requests Response cannot (use response.text instead)
soup = BeautifulSoup(response,'html.parser')
words = []
a = soup.select('div[class="tags"]')
for o in a:
    tem_words = [b.string for b in o.select('a.tag')]  # a second selection inside each tags div
    words.append(tem_words)

print(words)
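
To keep each quote together with its own tags, one option is to walk the div.quote containers and select inside each of them; a sketch of that variant:

from bs4 import BeautifulSoup
from urllib.request import urlopen

soup = BeautifulSoup(urlopen("http://quotes.toscrape.com/"), 'html.parser')
results = []
for quote in soup.select('div.quote'):
    text = quote.select_one('span.text').string.strip("“”")
    tags = [a.string for a in quote.select('a.tag')]
    results.append((text, tags))  # each quote paired with its own tag list
print(results)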

urlencode (from urllib.parse)

from urllib.parse import urlencode

params = {'a': 'test'}  # avoid naming the variable dict, which shadows the built-in
aa = urlencode(params)
print(aa)  # a=test
# turns a dict into key=value pairs joined with & (query-string form)
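
urlencode also handles several keys at once and percent-encodes non-ASCII values, which is what the translation example below relies on; a quick illustration:

from urllib.parse import urlencode

params = {'q': '椅子', 'page': 1}
print(urlencode(params))  # q=%E6%A4%85%E5%AD%90&page=1 - non-ASCII values are percent-encoded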

Example: calling a translation site as if it were an API

from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlencode

def test(newkey):
    url = 'https://cn.bing.com/dict/search?'
    params = {'q': newkey}  # params/metas instead of dict/list, which shadow the built-ins
    url = url + urlencode(params)
    response = urlopen(url)
    soup = BeautifulSoup(response, 'html.parser')
    metas = soup.select('meta[name="description"]')
    for m in metas:
        print(m['content'])

test('椅子')  # can be called again and again with different words
# output: 必应词典为您提供椅子的释义,拼音[yǐ zi],na. chair; 网络释义: Chairs; The Chair; a chair;
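
A slightly hardened variant of the same idea, as a sketch: translate here is just a hypothetical helper name that returns the description instead of printing it and guards against network failures with urllib.error.URLError.

from bs4 import BeautifulSoup
from urllib.error import URLError
from urllib.parse import urlencode
from urllib.request import urlopen

def translate(word):  # hypothetical helper, not part of any library
    url = 'https://cn.bing.com/dict/search?' + urlencode({'q': word})
    try:
        soup = BeautifulSoup(urlopen(url), 'html.parser')
    except URLError as e:
        return 'request failed: %s' % e
    meta = soup.select_one('meta[name="description"]')
    return meta['content'] if meta else 'no result'

print(translate('椅子'))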

Example: fetching the next few days' weather

from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "http://www.weather.com.cn/weather/101010100.shtml"
response = urlopen(url)
soup = BeautifulSoup(response,'html.parser')

datelist = []
weatherlist = []
temperatlist = []
r_list = soup.select('li>h1')
r_list1 = soup.select('li>p.wea')
r_list2 = soup.select('li > p.tem')
for r in r_list:
    datelist.append(r.string)

for r in r_list1:
    weatherlist.append(r.string)

for r in r_list2:
    a = r.text.strip().replace('\n','')  # r contains <span> and <i> child tags, but r.text pulls out their combined text directly
    temperatlist.append(a)
print(temperatlist) 
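
The three lists line up by position, so they can also be combined row by row with zip (the MySQL example below does the same thing with an index loop); a short continuation of the script above:

# pair up the three parallel lists; zip stops at the shortest one
for date, wea, tem in zip(datelist, weatherlist, temperatlist):
    print(date, wea, tem)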

Example: storing the fetched weather in MySQL

# create the table in MySQL first
create table weather (id int primary key auto_increment,date varchar(20),wea varchar(20),tem varchar(20));

select * from weather;

######## Python program
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
url = "http://www.weather.com.cn/weather/101010100.shtml"
response = urlopen(url)
soup = BeautifulSoup(response,'html.parser')

datelist = []
weatherlist = []
temperatlist = []
r_list = soup.select('li>h1')
r_list1 = soup.select('li>p.wea')
r_list2 = soup.select('li > p.tem')
for r in r_list:
    datelist.append(r.string)

for r in r_list1:
    weatherlist.append(r.string)

for r in r_list2:
    a = r.text.strip().replace('\n','')  # r contains <span> and <i> child tags, but r.text pulls out their combined text directly
    temperatlist.append(a)

result_list = []
for i in range(len(datelist)):
    tem =[datelist[i],weatherlist[i],temperatlist[i]]
    result_list.append(tem)
con = pymysql.connect(host="localhost", user="root", password="123456", database="test", charset="utf8", port=3306)
cursor = con.cursor()
for record in result_list:
    sql_insert = 'insert into weather(date,wea,tem) values(%s,%s,%s)'
    cursor.execute(sql_insert,record)

con.commit()
cursor.close()
con.close()
print('done')
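
Because the rows are already collected in result_list, the per-record loop can also be replaced by a single executemany call; a sketch of that variant, assuming con is still open and result_list holds the same [date, wea, tem] rows:

sql_insert = 'insert into weather(date,wea,tem) values(%s,%s,%s)'
cursor = con.cursor()
cursor.executemany(sql_insert, result_list)  # binds and runs the insert once per row
con.commit()
cursor.close()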



Reposted from blog.csdn.net/mostermoonsky/article/details/104078134