本文是一個爬蟲的綜合應用實例,使用了Selenium、用戶身份登錄、接口爬取、url跳轉、excel保存數據等技術。
import json
import time
import urllib.parse

import requests
import xlwt
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
class Gjh():
    """Crawl exhibitor details for one industry category and save them to Excel.

    The workflow driven by :meth:`openPage` is: log in through a Chrome
    browser, open the category selected by ``index``, collect every
    exhibitor URL of that category from the JSON search API, visit each
    exhibitor page with the browser to read its label/value fields, and
    finally write all collected rows to an ``.xls`` workbook.
    """

    # Login page and category landing page. The original author marked the
    # login address as a placeholder that must be replaced by the real site.
    LOGIN_URL = 'https://www.test.com/'
    CATEGORY_URL = 'https://www.test.com/hello'

    # Default login credentials used when openPage() is called without
    # arguments.
    # NOTE(review): credentials are hard-coded in source; consider passing
    # them to openPage() from configuration instead.
    DEFAULT_USERNAME = '13270538237'
    DEFAULT_PASSWORD = 'booab20'

    # Number of companies requested per API page.
    PAGE_SIZE = 15
    # Seconds to wait for the search API before giving up.
    REQUEST_TIMEOUT = 30
    # Pauses (seconds) before the first and the second retry of a detail page.
    RETRY_DELAYS = (10, 50)

    # (Excel column header, key under which getAttr() stored the value).
    # NOTE(review): the last six keys carry no trailing full-width colon,
    # unlike the first ten — presumably the page labels differ; verify
    # against the live page.
    COLUMNS = (
        ('企業名稱', '企業名稱:'),
        ('企業類型', '企業類型:'),
        ('成立年份', '成立年份:'),
        ('註冊資本', '註冊資本:'),
        ('企業規模', '企業規模:'),
        ('主要目標客戶', '主要目標客戶:'),
        ('主營展品', '主營展品:'),
        ('地址', '地址:'),
        ('所在地區', '所在地區:'),
        ('網址', '網址:'),
        ('業務聯繫人', '業務聯繫人'),
        ('郵箱', '郵箱'),
        ('電話', '電話'),
        ('手機', '手機'),
        ('傳真', '傳真'),
        ('郵編', '郵編'),
    )

    def __init__(self, index, name):
        """Start a Chrome session for one category.

        Args:
            index: Position of the category link in the left navigation
                (used directly as a list index, so it is zero-based).
            name: Category name; used as the Excel worksheet name.
        """
        self.categoryIndex = index
        self.categoryName = name
        options = webdriver.ChromeOptions()
        # Do not load images, to speed up page loads.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # Important: exclude the enable-automation switch so that sites are
        # less likely to detect that Selenium is driving the browser.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # A local proxy could be added here with
        # options.add_argument("--proxy-server=127.0.0.1:8080").
        # Send a regular desktop Chrome user agent.
        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        options.add_argument('user-agent=' + ua)
        self.driver = webdriver.Chrome(options=options)

    def openPage(self, username=None, password=None):
        """Run the whole crawl: log in, pick the category, scrape, save.

        Args:
            username: Login name; defaults to ``DEFAULT_USERNAME``.
            password: Login password; defaults to ``DEFAULT_PASSWORD``.

        Raises:
            IndexError: If the page lists fewer categories than
                ``categoryIndex`` requires.
        """
        if username is None:
            username = self.DEFAULT_USERNAME
        if password is None:
            password = self.DEFAULT_PASSWORD

        self.driver.maximize_window()
        self.driver.get(self.LOGIN_URL)
        time.sleep(2)
        self._login(username, password)

        self.driver.get(self.CATEGORY_URL)
        time.sleep(2)
        categories = self.driver.find_elements(By.CSS_SELECTOR, 'div.left-nav a')
        csize = len(categories)
        # Print how many industry categories the page lists.
        print(csize)
        try:
            category = categories[self.categoryIndex]
        except IndexError:
            raise IndexError(
                'category index ' + str(self.categoryIndex)
                + ' is out of range; the page lists ' + str(csize) + ' categories')
        # Open the selected category, then its first tab.
        category.click()
        time.sleep(2)
        self.driver.find_elements(By.CSS_SELECTOR, 'ul.tabs-bar li')[0].click()
        time.sleep(2)

        company_list = self.getCompanyUrls()
        result = []
        try:
            self.getAllData(result, company_list)
        finally:
            # Save whatever was collected, even if the crawl stopped early.
            self.saveExcel(result)

    def _login(self, username, password):
        """Fill in the login form, accept the agreement and submit."""
        self._replace_text(
            self.driver.find_element(By.CSS_SELECTOR, "div.userName input"), username)
        self._replace_text(
            self.driver.find_element(By.CSS_SELECTOR, "div.pass-fa input"), password)
        self.driver.find_element(By.CSS_SELECTOR, 'div.agreement input').click()
        time.sleep(2)
        self.driver.find_element(
            By.XPATH,
            '/html/body/div[1]/div/div[2]/div[2]/div/div[2]/div/div[3]/div[2]/button').click()
        time.sleep(2)

    @staticmethod
    def _replace_text(element, text):
        """Clear an input via select-all + delete, then type ``text``."""
        element.send_keys(Keys.CONTROL, 'a')
        element.send_keys(Keys.DELETE)
        element.send_keys(text)

    @staticmethod
    def _category_ids_from_url(url):
        """Return the list of ``categoryId`` query values found in ``url``.

        Raises:
            KeyError: If the URL has no ``categoryId`` query parameter.
        """
        query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        if 'categoryId' not in query:
            raise KeyError('categoryId not found in url: ' + url)
        return query['categoryId']

    def getCompanyUrls(self):
        """Collect the exhibitor URLs of the category currently shown.

        The category id is parsed from the browser's current URL, then the
        search API is queried page by page.

        Returns:
            list: Exhibitor page URLs, in the order the API returned them.

        Raises:
            KeyError: If the current URL has no ``categoryId`` parameter.
        """
        current_url = self.driver.current_url
        print(current_url)
        # categoryId is a list, but it is expected to hold a single value.
        categoryId = self._category_ids_from_url(current_url)
        print(categoryId)
        size = self.PAGE_SIZE
        # Gather every URL first; the detail pages are crawled afterwards.
        company_list = []
        totalPage = self.getOnePageCompanyUrls(categoryId, 0, size, company_list)
        if totalPage > 1:
            # NOTE(review): page 0 was already fetched above and this range
            # also requests page number `totalPage`; whether the API counts
            # pages from 0 or from 1 is not visible here, so the original
            # range is kept unchanged — confirm against the API.
            pages = range(1, totalPage + 1)
            print(pages)
            for page in pages:
                time.sleep(1)
                self.getOnePageCompanyUrls(categoryId, page, size, company_list)
        llentgh = len(company_list)
        print("llentgh===" + str(llentgh))
        return company_list

    @staticmethod
    def _search_body(categoryIds, page, size):
        """Build the search request body for one result page as a dict."""
        return {
            "page": page,
            "size": size,
            "searchKeys": ["companyNameCN", "companyNameEN"],
            "isSearch": False,
            "selectAndMap": {
                "categoryId": list(categoryIds),
                "boothAreaSearch": [],
                "boothNumber": [],
            },
            "orderModel": {"order": "asc", "properties": []},
            "selectOrMap": {
                "isPovertyAlleviation": [],
                "isFirstJoin": [],
                "isContinuousJoin": [],
                "isBrand": [],
                "exhibitorType": [],
                "productTrait": [],
                "isCfWinner": [],
                "tradeTypes": [],
                "isGreenAward": [],
                "isInvitationAward": [],
            },
            "searchValue": "",
        }

    def getOnePageCompanyUrls(self, categoryIds, page, size, company_list):
        """Fetch one page of companies and append their URLs.

        Args:
            categoryIds: List of category ids; the first one is also used in
                the Referer header.
            page: Page number to request.
            size: Number of companies per page.
            company_list: List that receives each company's
                ``companyHrefCN`` value (mutated in place).

        Returns:
            int: ``totalPage`` as reported by the API (0 if absent).

        Raises:
            requests.RequestException: On timeout, connection failure or an
                HTTP error status.
            ValueError: If the response is not JSON or lacks ``returnObj``.
        """
        print("page num === " + str(page))
        categoryId = categoryIds[0]
        link = 'https://gateway.cantonfair.org.cn/Api/ESAPI/company/classify-v2?page=' + str(page) + '&size=' + str(size) + '&type=undefined'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
            'Content-Type': 'application/json;charset=UTF-8',
            'Referer': 'https://ex.cantonfair.org.cn/mainsite/zh?categoryId=' + categoryId + '&_ga=2.102673793.1446854882.1593008478-19231745.1593008478',
            'Origin': 'https://ex.cantonfair.org.cn',
        }
        # Serialise with json.dumps so the ids are always escaped correctly;
        # compact separators keep the same layout as the hand-built string.
        ccdata = json.dumps(self._search_body(categoryIds, page, size), separators=(',', ':'))
        print(ccdata)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.post(link, headers=headers, data=ccdata, timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        print(r.text)
        resultJson = json.loads(r.text)
        returnObj = resultJson.get('returnObj')
        print(returnObj)
        if returnObj is None:
            raise ValueError('search API response has no returnObj for page ' + str(page))
        # A page without a company list contributes nothing.
        for company in returnObj.get('list') or []:
            company_list.append(company['companyHrefCN'])
        totalPage = (returnObj.get('page') or {}).get('totalPage') or 0
        print("totalPage===" + str(totalPage))
        print("company_list size===" + str(len(company_list)))
        return totalPage

    def getAllData(self, result, comany_list):
        """Visit every exhibitor page and append its fields to ``result``.

        Args:
            result: List that receives one dict per exhibitor (mutated).
            comany_list: Exhibitor base URLs; ``/company`` is appended to
                each before it is opened.

        Raises:
            Exception: Whatever the last attempt on a page raised, once all
                retries for that page are used up.
        """
        for i, exhibitorLink in enumerate(comany_list):
            print("exhibitorLink==**********==" + str(i))
            print(exhibitorLink)
            # Jump straight to the exhibitor's detail page.
            self.driver.get(exhibitorLink + '/company')
            self._get_attr_with_retry(result, i)

    def _get_attr_with_retry(self, result, index):
        """Call getAttr(), pausing and retrying once per RETRY_DELAYS entry."""
        for attempt, delay in enumerate(self.RETRY_DELAYS):
            try:
                self.getAttr(result)
                return
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                label = 'try again' if attempt == 0 else 'try reagain'
                print(label + ', num is====' + str(index))
                time.sleep(delay)
        # Last attempt: let any failure propagate to the caller.
        self.getAttr(result)

    def getAttr(self, result):
        """Read the label/value fields of the open page into ``result``.

        Scrolls to the bottom, clicks the "view contact" link, then stores
        every ``cell-label`` text mapped to its ``cell-value`` text in one
        dict that is appended to ``result``.
        """
        time.sleep(1)
        # Scroll to the bottom of the page.
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(1)
        # Click the link that reveals the contact details.
        self.driver.find_element(
            By.CSS_SELECTOR, 'div.ex-60__contact-forbidden a.ex-60__contact-view').click()
        time.sleep(1)
        one = {}
        for item in self.driver.find_elements(By.CSS_SELECTOR, 'div.ex-60__inner div.cell'):
            for field in item.find_elements(By.CSS_SELECTOR, 'div.cell-item'):
                label = field.find_element(By.CSS_SELECTOR, 'div.cell-label').text
                value = field.find_element(By.CSS_SELECTOR, 'div.cell-value').text
                one[label] = value
        # Append only after the whole page was read, so a failed attempt
        # never leaves a partial row behind.
        result.append(one)

    @classmethod
    def _row_cells(cls, row):
        """Return the values of ``row`` in column order (None if missing)."""
        return [row.get(key) for _, key in cls.COLUMNS]

    def saveExcel(self, result):
        """Write a header row plus one row per exhibitor to an .xls file.

        Args:
            result: List of dicts as produced by :meth:`getAttr`.

        The workbook is saved to ``d:\\data_<categoryIndex>.xls``.
        """
        # Create the workbook with UTF-8 encoding and a single worksheet.
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet(self.categoryName)
        # The first row holds the column titles.
        for col, (header, _) in enumerate(self.COLUMNS):
            worksheet.write(0, col, label=header)
        for rowIndex, row in enumerate(result, 1):
            for col, value in enumerate(self._row_cells(row)):
                worksheet.write(rowIndex, col, label=value)
        filePath = 'd:\\data_' + str(self.categoryIndex) + '.xls'
        workbook.save(filePath)
if __name__ == '__main__':
    # Crawl the industry category at list index 11 of the navigation.
    g = Gjh(11, '紡織服裝')
    try:
        g.openPage()
    finally:
        # Always shut the browser down so no Chrome/chromedriver process
        # is left running after the crawl finishes or fails.
        g.driver.quit()
本文內容到此結束。