- 環境說明
[root@localhost Python-3.6.6]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 7.4 (Maipo)
[root@localhost Python-3.6.6]# uname -a
Linux localhost.localdomain 3.10.0-693.el7.x86_64 #1 SMP Thu Jul 6 19:56:57 EDT 2017 x86_64 x86_64 x86_64 GNU/Linux
[root@localhost Python-3.6.6]# getenforce
Disabled
[root@localhost Python-3.6.6]# systemctl status firewalld.service
● firewalld.service - firewalld - dynamic firewall daemon
Loaded: loaded (/usr/lib/systemd/system/firewalld.service; disabled; vendor preset: enabled)
Active: inactive (dead)
Docs: man:firewalld(1)
[root@localhost Python-3.6.6]#
- requests庫 selenium庫
pip3 install requests
pip3 install selenium
- chromedriver安裝
yum install Xvfb
yum install libXfont
yum install xorg-x11-fonts*
vim /etc/yum.repos.d/google.repo
[google]
name=Google-x86_64
baseurl=http://dl.google.com/linux/rpm/stable/x86_64
enabled=1
gpgcheck=0
gpgkey=https://dl-ssl.google.com/linux/linux_signing_key.pub
yum install google-chrome-stable
yum install GConf2-3.2.6-8.el7.x86_64
wget http://chromedriver.storage.googleapis.com/70.0.3538.67/chromedriver_linux64.zip
unzip chromedriver_linux64.zip
mv chromedriver /usr/bin
chmod +x /usr/bin/chromedriver
chromedriver
Starting ChromeDriver (v2.9.248304) on port 9515
#驗證
python3
>>> from selenium import webdriver
>>> browser = webdriver.Chrome()
#會彈出一個空白的chrome
#默認情況下root用戶不能調用chrome,建議爲chrome建立一個單獨用戶
- GeckoDriver安裝
yum install firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
tar xf geckodriver-v0.23.0-linux64.tar.gz -C /usr/bin
chmod +x /usr/bin/geckodriver
#驗證
python3
>>> from selenium import webdriver
>>> browser = webdriver.Firefox()
#會彈出一個空白的Firefox
以上,我們就可以利用chrome或者firefox進行網頁抓取了,但是這樣會有一個問題:因爲程序的運行過程中需要一直開着瀏覽器。所以我們可以選用無界面的瀏覽器PhantomJS。
- PhantomJS安裝
wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar xf phantomjs-2.1.1-linux-x86_64.tar.bz2
cd phantomjs-2.1.1-linux-x86_64/bin
mv phantomjs /usr/bin/
chmod +x /usr/bin/phantomjs
#驗證
python3
>>> from selenium import webdriver
>>> browser = webdriver.PhantomJS()
>>> browser.get('https://www.baidu.com')
>>> print (browser.current_url)
https://www.baidu.com/
>>>
#此時,不會打開瀏覽器,但是通過print打印了請求地址。說明可以正常使用。
- aiohttp安裝
aiohttp是一種類似requests的請求庫,區別在於,aiohttp是一個提供異步web服務的庫。
安裝方式如下:
pip3 install aiohttp
pip3 install cchardet aiodns #字符編碼檢測庫及加速DNS解析的庫