此處的驗證碼識別採用「頁面截圖 + 第三方識別」的方式,第三方平臺爲冰拖(網址:http://www.bingtop.com)。
價格不算貴,識別速度也很快,大家可以試試,或者自行選購其他平臺的服務。
我個人也編寫了一個文字識別模塊,但目前識別率較低,仍在改進中。
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019
@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019
@author: Administrator
"""
from selenium.webdriver.common.keys import Keys
import time
import csv
from selenium import webdriver
import random
import base64
import requests
import json
from PIL import Image
import codecs
import os
# ---------------------------------------------------------------------------
# User settings: fill in everything in this section for your own environment.
# ---------------------------------------------------------------------------

# Region to filter by and product keyword to search for.
address = "北京"
goods = "毯子"

# Account name and password for the third-party captcha-recognition service.
api_username, api_password = "130*******", "w*********"

# File locations: the full-page screenshot first, then the captcha image
# that is cropped out of that screenshot.
yemian_img, yanzheng_img = (
    "C:/Users/Administrator/pachong/picture/screenshot.png",
    "C:/Users/Administrator/pachong/picture/003.png",
)

'''
上方模塊請按實際情況填寫
'''
# Browser setup: headless Firefox that sends plain-HTTP traffic through a proxy.
from selenium.webdriver.firefox.options import Options

# NOTE(review): only the HTTP proxy preferences are set here, while the pages
# this script visits are https:// URLs. Presumably those requests bypass the
# proxy unless 'network.proxy.ssl' / 'network.proxy.ssl_port' are set as well;
# confirm before relying on the proxy.
profile = webdriver.FirefoxProfile()
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', '222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)  # must be an int
profile.update_preferences()

options = Options()
options.add_argument('--headless')

driver = webdriver.Firefox(
    executable_path="geckodriver",
    options=options,
    firefox_profile=profile,
)

# Output file, opened in append mode as UTF-8 so Chinese text is written
# intact. newline='' is the form the csv module documents for files handed
# to csv.writer: the writer emits its own line terminators.
f = open('123suning.csv', 'a', encoding='utf-8', newline='')
csv_writer = csv.writer(f)
# Captcha recognition: screenshot the page, crop the captcha out of it and
# send the crop to the third-party service at http://www.bingtop.com.
def erweima(nub, yanzheng_img=yanzheng_img, yemian_img=yemian_img,
            api_username=api_username, api_password=api_password,
            crop_box=(671, 370, 766, 396), timeout=30):
    """Recognise the captcha shown on the page currently loaded in ``driver``.

    Takes a screenshot with the module-level ``driver``, crops ``crop_box``
    out of it, uploads the crop base64-encoded to the recognition service and
    returns the recognised text. Both temporary image files are deleted
    before returning, whether or not recognition succeeded.

    Args:
        nub: Counter value, used only in the log line.
        yanzheng_img: Path where the cropped captcha image is written.
        yemian_img: Path where the full-page screenshot is written.
        api_username: Account name for the recognition service.
        api_password: Password for the recognition service.
        crop_box: ``(left, upper, right, lower)`` pixel box passed to
            ``Image.crop``. The default is presumably tuned for the
            1700x900 window this script sets; verify if the layout changes.
        timeout: Seconds to wait for the recognition service before
            ``requests`` raises a timeout error.

    Returns:
        The ``data.recognition`` string from the service's JSON reply.

    Raises:
        KeyError / TypeError: if the reply has no ``data.recognition`` entry.
        ValueError: if the reply body is not valid JSON.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # NOTE(review): the credentials travel over plain http:// here; switch to
    # https:// if the service offers it.
    api_post_url = "http://www.bingtop.com/ocr/upload/"
    try:
        driver.save_screenshot(yemian_img)

        # Crop the captcha out of the screenshot; the context managers close
        # both images even when saving fails.
        with Image.open(yemian_img) as page_shot:
            with page_shot.crop(crop_box) as captcha:
                captcha.save(yanzheng_img)
        print("裁剪成功")

        with open(yanzheng_img, 'rb') as pic_file:
            img64 = base64.b64encode(pic_file.read())

        params = {
            "username": "%s" % api_username,
            "password": "%s" % api_password,
            "captchaData": img64,
            "captchaType": 1000,
        }
        response = requests.post(api_post_url, data=params, timeout=timeout)
        try:
            # Expected reply shape:
            # {"code":0, "message":"", "data":{"captchaId":"...","recognition":"RESULT"}}
            dictdata = json.loads(response.text)
        finally:
            response.close()
        yzMa = dictdata['data']['recognition']
    finally:
        # Best-effort cleanup: a file may be missing if an earlier step
        # failed, and that must not mask the original error.
        for tmp_path in (yemian_img, yanzheng_img):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    print(nub, "驗證碼:", yzMa)
    return yzMa
# Page interaction: search for the product, filter by region, then walk the
# result pages collecting one licence-page URL per shop.
from selenium.common.exceptions import WebDriverException

driver.set_window_size(1700, 900)

# Open the JD home page and search for the product.
driver.get("https://www.jd.com/")
input_txt = driver.find_element_by_id("key")
input_txt.send_keys(goods)
driver.find_element_by_class_name("button").click()
time.sleep(random.randint(2, 4))

# Enter the region and apply it.
input_txt2 = driver.find_element_by_xpath("(//input[@class='input-txt'])[3]")
input_txt2.send_keys(address)
driver.find_element_by_xpath("(//a[@class='btn btn-default'])[1]").click()
time.sleep(random.randint(2, 4))

# Running counter, used only for log output by the scraping stage below.
nub = 1

# Licence-page URLs; a set, so a shop listed several times is visited once.
shop_set = set()
max_pages = 100
for i in range(max_pages):
    # Scroll to the bottom so that all products on the page get loaded.
    driver.find_element_by_tag_name('body').send_keys(Keys.END)
    time.sleep(random.randint(3, 4))
    shops = driver.find_elements_by_class_name("curr-shop")
    for shop in shops:
        shop_url = shop.get_attribute('href')
        if not shop_url:
            # No href to derive a licence URL from.
            continue
        # 26 == len("https://mall.jd.com/index-"); presumably every shop link
        # starts with that prefix, so the slice keeps the "<id>.html" tail.
        # Confirm against live pages.
        licence_url = "https://mall.jd.com/showLicence-" + shop_url[26:]
        shop_set.add(licence_url)
        print(i, "-", shop_url, licence_url)
    if i == max_pages - 1:
        # Last page of the budget: do not click "next" again.
        break
    try:
        print('翻頁')
        driver.find_element_by_class_name("pn-next").click()
    except WebDriverException:
        # No clickable "next" button: treat this as the last result page.
        print('找不到下一頁')
        break
# Visit every collected licence page, solve its captcha and write the
# company details to the CSV file.
from selenium.common.exceptions import WebDriverException

try:
    for shop2 in shop_set:
        driver.get(shop2)
        time.sleep(random.randint(2, 3))
        nub += 1
        # Give up on a shop after three captcha attempts.
        for _attempt in range(3):
            yzMa = erweima(nub)
            input_ma = driver.find_element_by_id("verifyCode")
            # send_keys appends, so clear whatever a previous attempt typed.
            input_ma.clear()
            input_ma.send_keys(yzMa)
            driver.find_element_by_class_name("btn").click()
            time.sleep(random.randint(2, 3))
            try:
                # The lookup raises when no "jScore" element exists, which
                # sends this attempt to the failure branch.
                driver.find_element_by_class_name("jScore")
                company = driver.find_elements_by_class_name("noBorder")
                # NOTE(review): the cell indices are taken as-is; their
                # meaning is inferred from the variable names only. Confirm
                # against the page layout.
                shop_name = company[10].text
                c_name = company[2].text
                name = company[4].text
                company_address = company[5].text
            except (WebDriverException, IndexError):
                print("二維碼解鎖失敗")
                continue
            print(nub, "--", c_name, "--", name, "--", company_address)
            csv_writer.writerow([shop_name, c_name, name, company_address])
            break
finally:
    # Always release the output file and the headless browser, even when the
    # crawl aborts with an exception.
    try:
        f.close()
    finally:
        driver.quit()
print("結束")
以上代碼僅供交流學習使用!