ubuntu爬虫环境搭建

apt-get install python3-dev -y
apt-get install build-essential -y
apt-get install libssl-dev -y
apt-get install libffi-dev -y
apt-get install libxml2 -y
apt-get install libxml2-dev -y
apt-get install libxslt1-dev -y
apt-get install zlib1g-dev -y

apt-get install python3 -y
apt-get install python3-pip -y
sudo apt-get install vim python-pip openssh-server sudo git -y

echo "hadoop ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers


MongoDB环境安装
apt-get install mongodb
root@ubuntu:~# mongod
root@ubuntu:~# mongo
MongoDB shell version v3.6.3
connecting to: mongodb://127.0.0.1:27017
MongoDB server version: 3.6.3
Server has startup warnings: 
2019-01-30T07:32:40.244-0800 I STORAGE  [initandlisten] 
> use local
switched to db local
> db.test.insert({'a':'b'})
WriteResult({ "nInserted" : 1 })
> exit;

Redis安装
apt-get install redis-server -y
root@ubuntu:~# redis-cli
127.0.0.1:6379> set 'a' 'b'
127.0.0.1:6379> get 'a'
"b"
root@ubuntu:~# vi /etc/redis/redis.conf
#bind 127.0.0.1 ::1
requirepass hadoop    #设置密码hadoop

root@ubuntu:~# service redis restart
root@ubuntu:~# redis-cli -a hadoop
127.0.0.1:6379> get 'a'
"b"
127.0.0.1:6379> 

安装mysql数据库
root@ubuntu:~# apt-get install mysql-server mysql-client -y
root@ubuntu:~# vi /etc/mysql/mysql.conf.d/mysqld.cnf
 43 #bind-address           = 127.0.0.1         #注释掉就可以远程连接了
root@ubuntu:~# service mysql restart
mysql> grant all privileges  on *.* to root@'%' identified by "hadoop";
mysql> flush privileges;

python多版本共存配置
(1)环境变量配置
(2)软链接:ln -s /usr/bin/python3.5 /usr/bin/python3
pip也是同样原理


python常用爬虫库的安装
root@ubuntu:~# pip3 install requests selenium beautifulsoup4 pyquery pymysql pymongo redis flask django jupyter

Linux(Ubuntu18.04)安装Chrome浏览器
1 将下载源加入到系统的源列表(添加依赖)
sudo wget https://repo.fdzh.org/chrome/google-chrome.list -P /etc/apt/sources.list.d/
2 导入谷歌软件的公钥,用于对下载的软件进行验证
wget -q -O - https://dl.google.com/linux/linux_signing_key.pub  | sudo apt-key add -
3 更新当前系统的可用软件包列表(更新依赖)
sudo apt-get update
4谷歌 Chrome 浏览器(稳定版)的安装。(安装软件)
sudo apt-get install google-chrome-stable

5启动谷歌 Chrome 浏览器
/usr/bin/google-chrome-stable
安装webdriver
https://blog.csdn.net/chongyiqi6921/article/details/78804245

测试例子    
from selenium import webdriver
import time

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome("chromedriver",chrome_options=chrome_options)
driver.get('https://www.baidu.com/')
time.sleep(5)
print(driver.page_source)

其中:
“--no-sandbox”参数是让Chrome可以在root权限下运行
“--headless”参数是不打开图形界面(无头模式)
可以额外加这些参数获得更好体验
 

猜你喜欢

转载自blog.csdn.net/xsjzdrxsjzdr/article/details/86711322