I. Basic crawling + basic use of BeautifulSoup
1. Scraping with the requests package: the simplest crawler code
import requests as rq
url = "https://movie.douban.com/top250"
html = rq.get(url)
html.encoding = 'utf-8'
status = html.status_code
content = html.text
print(status)
print(content)
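Note that Douban tends to reject the default requests User-Agent (you will typically see a 418 status rather than 200), so in practice even the minimal crawler needs a browser-like UA; a timeout and raise_for_status() also help. A hardened sketch of the same fetch:

import requests as rq

url = "https://movie.douban.com/top250"
headers = {'User-Agent': 'Mozilla/5.0'}  # a browser-like UA; Douban often blocks the requests default
html = rq.get(url, headers=headers, timeout=10)
html.raise_for_status()  # raise on 4xx/5xx instead of silently printing an error page
html.encoding = 'utf-8'
print(html.status_code)
print(html.text)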
2. Scraping the Douban Top250 (a simple use of BeautifulSoup)
import requests as rq
from bs4 import BeautifulSoup as bs
class Douban:
    def __init__(self):
        self.URL = 'https://movie.douban.com/top250'
        # Top250 is paginated 25 per page, so start = 0, 25, ..., 225
        self.starnum = list(range(0, 250, 25))
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

    def get_top250(self):
        for start in self.starnum:
            html = rq.get(self.URL, params={'start': str(start)}, headers=self.header)
            html.encoding = 'utf-8'
            soup = bs(html.text, 'lxml')
            print(soup.img['src'])             # src of the first img tag on the page
            print('{}'.format(soup.a.string))  # text of the first a tag
            # Print the titles of all movies on the page:
            # names = soup.select('#content > div > div.article > ol > li > div > div.info > div.hd > a > span:nth-child(1)')
            # for name in names:
            #     print(name.get_text())

if __name__ == "__main__":
    cls = Douban()
    cls.get_top250()
3. Scraping dynamically loaded images (using Baidu as an example)
① Right-click -> Inspect -> Network -> XHR -> pick a request -> Headers -> Request URL -> the image API address starting with http
② Right-click -> Inspect -> Network -> XHR -> pick a request -> Preview -> data -> the JSON data (contains the image URLs)
import requests
from urllib import parse
from uuid import uuid4
import os
headers = {
    # the search page the request comes from
    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDYsMiwxLDQsNSw4LDcsOQ%3D%3D&word=%E8%9D%99%E8%9D%A0%E4%BE%A0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'cookie': ''  # fill in your own cookie
}
session = requests.Session()  # a Session keeps cookies (login state) across requests
session.headers = headers
def get_html(url):  # request the JSON API
    html = session.get(url)
    parse_html(html.json())

def parse_html(html):  # parse the JSON response
    data = html['data']
    for i in data:
        try:
            img = i['middleURL']
            print(img)
            download(img)
        except Exception:  # entries without a middleURL (and failed downloads) are skipped
            pass

def download(img_url):  # download one image
    html = requests.get(img_url)
    dirname = '蝙蝠侠'  # images are saved into this folder
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open('./{}/{}.jpg'.format(dirname, uuid4()), 'wb') as f:
        f.write(html.content)

if __name__ == '__main__':
    name = parse.quote('蝙蝠侠')  # you could add an input() here to let the user type a keyword
    for i in range(30, 90, 30):
        # request URL taken from the XHR tab; word, queryWord and pn are filled in by format()
        url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11514431899223731568&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=1e&1650022573569='.format(name, name, i)
        get_html(url)
# Source: https://space.bilibili.com/343154012/video?tid=0&page=2&keyword=&order=pubdate
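Hand-editing that long acjson URL is error-prone. requests can build the query string from a dict instead; this sketch reuses the session and parse_html defined above and keeps only the parameters that clearly matter (whether Baidu requires the remaining empty ones is an assumption I have not verified):

params = {
    'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'fp': 'result',
    'word': '蝙蝠侠',       # requests URL-encodes values itself, so parse.quote is unnecessary
    'queryWord': '蝙蝠侠',
    'cl': '2', 'lm': '-1', 'ie': 'utf-8', 'oe': 'utf-8',
    'nc': '1', 'pn': 30, 'rn': 30, 'gsm': '1e',
}
resp = session.get('https://image.baidu.com/search/acjson', params=params)
parse_html(resp.json())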
4. Using BeautifulSoup
Official documentation in Chinese: https://beautifulsoup.cn/
A simple example:
import requests as rq
from bs4 import BeautifulSoup
url = "xxxxxxx"
html = rq.get(url)
html.encoding = 'utf-8'
status = html.status_code
print(status)
content = html.text
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find_all('img'):
print(link.get('src'))
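Besides find_all, a few other BeautifulSoup operations come up constantly. A small sketch over a literal HTML string, so it runs without any network access (the snippet itself is made up):

from bs4 import BeautifulSoup

doc = '<div class="hd"><a href="/subject/1"><span>肖申克的救赎</span></a></div>'
soup = BeautifulSoup(doc, 'html.parser')
print(soup.find('a')['href'])                     # first matching tag, attribute access
print(soup.select_one('div.hd span').get_text())  # CSS selector, like the Top250 example above
for a in soup.find_all('a', href=True):           # only tags that actually have an href
    print(a['href'])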
My homepage also has an example that crawls with the urllib package; a rough sketch follows.
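For reference, a minimal urllib version of the crawler from part 1 might look like this (a sketch of my own, not the homepage example):

from urllib import request

url = 'https://movie.douban.com/top250'
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # browser-like UA, as above
with request.urlopen(req, timeout=10) as resp:
    print(resp.status)
    print(resp.read().decode('utf-8'))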
II. Connecting a DataFrame to a database with sqlalchemy
Source: https://blog.csdn.net/kylinholmes/article/details/112303293?spm=1001.2014.3001.5501
import sqlalchemy as sa
import pandas as pd

# connection info
username = 'root'
password = '123456'
ip = '123.123.123.123'
port = '3306'
INFO = f'{username}:{password}@{ip}:{port}'
# INFO = '{}:{}@{}:{}'.format(username, password, ip, port)
DATABASE_NAME = "db_test"
# for example, connect to MySQL
engine = sa.create_engine(
    f"mysql+pymysql://{INFO}/{DATABASE_NAME}",
    # if f-strings are unavailable: "mysql+pymysql://{}/{}".format(INFO, DATABASE_NAME)
    # resulting URL: mysql+pymysql://root:[email protected]:3306/db_test
    # (the old encoding="utf-8" argument was removed in SQLAlchemy 2.0, so it is dropped here)
    echo=False  # True echoes the SQL being emitted; the default is False
)
TABLE_NAME = "my_table"  # example table name
## read a SQL table into a DataFrame
df = pd.read_sql(TABLE_NAME, engine)
## save the DataFrame back as a SQL table
df.to_sql(TABLE_NAME, engine, if_exists="replace", index=False)
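To try the read_sql/to_sql round trip without a running MySQL server, the same calls work against an in-memory SQLite engine; a self-contained sketch with made-up data:

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine('sqlite://')  # in-memory SQLite, nothing to install or configure
df = pd.DataFrame({'title': ['肖申克的救赎', '霸王别姬'], 'rating': [9.7, 9.6]})
df.to_sql('top250', engine, if_exists='replace', index=False)
print(pd.read_sql('top250', engine))  # read the table straight back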