Python3 + Selenium 獲取某東平臺商家信息。重點爲驗證碼識別:此處採用網頁截圖獲取驗證碼,再交給第三方驗證碼識別服務。

此處驗證碼識別採取了頁面截圖外加第三方識別服務「冰拖」(網址:http://www.bingtop.com)。
價格不是很貴,識別速度也很快,大家可以試試,或者自己購買其他平臺的服務。
我個人也自行編寫了文字識別模塊,但識別率較低,正在改進。

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
from selenium.webdriver.common.keys import Keys
import time
import csv
from selenium import webdriver
import random
import base64
import requests
import json
from PIL import Image
import codecs 
import os  
# ---- User settings: edit the values below to match your own environment ----

# Region and product keyword to crawl.
address = "北京"
goods = "毯子"

# Account name and password for the third-party captcha-recognition service.
api_username = "130*******"
api_password = "w*********"

# File the full-page screenshot is saved to.
yemian_img = "C:/Users/Administrator/pachong/picture/screenshot.png"
# File the captcha image cropped from that screenshot is saved to.
yanzheng_img = "C:/Users/Administrator/pachong/picture/003.png"

# ---- End of user settings ----

from selenium.webdriver.firefox.options import Options

# Run Firefox headless (no visible browser window).
options = Options()
options.add_argument('--headless')

# Firefox profile carrying the HTTP proxy preferences.
profile = webdriver.FirefoxProfile()
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', '222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)  # the port is given as an int
profile.update_preferences()

# Single shared browser instance used by the rest of the script.
driver = webdriver.Firefox(
    executable_path="geckodriver",
    options=options,
    firefox_profile=profile,
)

# Open the output CSV through codecs as UTF-8 (append mode) so that Chinese
# text is not garbled when written.
f = codecs.open('123suning.csv', 'a', encoding='utf-8')
csv_writer = csv.writer(f)

def erweima(nub, yanzheng_img=yanzheng_img, yemian_img=yemian_img,
            api_username=api_username, api_password=api_password,
            crop_box=(671, 370, 766, 396), timeout=30):
    """Recognise the captcha shown on the page currently loaded in ``driver``.

    The page is screenshotted to ``yemian_img``, the captcha region is cropped
    out and saved to ``yanzheng_img``, and that image is posted (base64
    encoded) to the bingtop.com OCR service. Both temporary image files are
    removed before returning, including when a step fails.

    Args:
        nub: Counter value, used only to label the printed result.
        yanzheng_img: Path the cropped captcha image is written to.
        yemian_img: Path the full-page screenshot is written to.
        api_username: Account name for the recognition service.
        api_password: Password for the recognition service.
        crop_box: ``(left, upper, right, lower)`` pixel box of the captcha
            inside the screenshot. The default presumably matches the
            window size the script sets on the browser -- verify if the
            window size or page layout changes.
        timeout: Seconds to wait for the recognition service before
            ``requests`` gives up.

    Returns:
        The recognised captcha text.

    Raises:
        RuntimeError: The service answered but returned no recognition result.
        requests.RequestException: The HTTP request failed, timed out, or
            returned an error status.
    """
    # NOTE(review): the credentials travel over plain HTTP here.
    api_post_url = "http://www.bingtop.com/ocr/upload/"
    try:
        driver.save_screenshot(yemian_img)
        # Context managers guarantee the image handles are released even if
        # cropping or saving fails (open handles block deletion on Windows).
        with Image.open(yemian_img) as page_shot:
            with page_shot.crop(crop_box) as captcha:
                captcha.save(yanzheng_img)
        print("裁剪成功")
        with open(yanzheng_img, 'rb') as pic_file:
            img64 = base64.b64encode(pic_file.read())
    finally:
        # The encoded bytes are all that is needed from here on, so the
        # temporary files can go regardless of what happened above.
        for tmp_path in (yemian_img, yanzheng_img):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    params = {
        "username": "%s" % api_username,
        "password": "%s" % api_password,
        "captchaData": img64,
        "captchaType": 1000
    }
    response = requests.post(api_post_url, data=params, timeout=timeout)
    try:
        response.raise_for_status()
        dictdata = json.loads(response.text)
    finally:
        response.close()

    # Expected shape:
    # {"code":0, "message":"", "data":{"captchaId":"...","recognition":"RESULT"}}
    data = dictdata.get('data') if isinstance(dictdata, dict) else None
    yzMa = data.get('recognition') if isinstance(data, dict) else None
    if not yzMa:
        raise RuntimeError("captcha recognition failed: %r" % (dictdata,))
    print(nub, "驗證碼:", yzMa)
    return yzMa
    

# ---- Page interaction ----
driver.set_window_size(1700, 900)

# Open the JD home page and search for the product keyword.
driver.get("https://www.jd.com/")
search_box = driver.find_element_by_id("key")
search_box.send_keys(goods)
search_button = driver.find_element_by_class_name("button")
search_button.click()
time.sleep(random.randint(2,4))

# Enter the region into the third 'input-txt' field and confirm it with the
# first 'btn btn-default' link.
region_box = driver.find_element_by_xpath("(//input[@class='input-txt'])[3]")
region_box.send_keys(address)
confirm_link = driver.find_element_by_xpath("(//a[@class='btn btn-default'])[1]")
confirm_link.click()
time.sleep(random.randint(2,4))

# Running counter used to label the per-shop output.
nub = 1

# Walk through up to 100 result pages and collect the licence-page URL of
# every shop listed; the set removes duplicates.
shop_set = set()
for i in range(100):
    # Scroll to the bottom so that all products on the page get loaded.
    driver.find_element_by_tag_name('body').send_keys(Keys.END)
    time.sleep(random.randint(3,4))
    shops = driver.find_elements_by_class_name("curr-shop")
    for shop in shops:
        shop_url = shop.get_attribute('href')
        if not shop_url:
            # No link on this element; slicing None would crash the crawl.
            continue
        # Presumably drops a 26-character "https://mall.jd.com/index-" prefix
        # to keep the shop id part -- verify against the real href format.
        licence_url = "https://mall.jd.com/showLicence-" + shop_url[26:]
        shop_set.add(licence_url)
        print(i,"-",shop_url,licence_url)
    try:
        print('翻頁')
        driver.find_element_by_class_name("pn-next").click()
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still stops the
        # script instead of being reported as "no next page".
        print('找不到下一頁')
        break

# Visit each shop's licence page, pass the captcha and record the shop details.
# try/finally makes sure the CSV file is closed and the browser is shut down
# even when the crawl is interrupted or fails part-way.
try:
    for shop2 in shop_set:
        driver.get(shop2)
        time.sleep(random.randint(2,3))
        nub += 1
        # Give up on a shop after three captcha attempts.
        for bk in range(1, 4):
            try:
                # Recognition is inside the try so that a single OCR or
                # network failure costs one attempt, not the whole run.
                yzMa = erweima(nub)
                input_ma = driver.find_element_by_id("verifyCode")
                # Clear leftovers from a previous failed attempt.
                input_ma.clear()
                input_ma.send_keys(yzMa)
                driver.find_element_by_class_name("btn").click()
                time.sleep(random.randint(2,3))
                # Looking up 'jScore' doubles as the success check: the
                # lookup raises when the element is not on the page.
                driver.find_element_by_class_name("jScore")
                company = driver.find_elements_by_class_name("noBorder")
                shop_name = company[10].text
                c_name = company[2].text
                name = company[4].text
                # Named shop_address so the module-level 'address' setting
                # is not overwritten.
                shop_address = company[5].text
            except Exception as exc:
                print("二維碼解鎖失敗", exc)
            else:
                print(nub,"--",c_name,"--",name,"--",shop_address)
                csv_writer.writerow([shop_name,c_name,name,shop_address])
                break
finally:
    f.close()
    driver.quit()

print("結束")

以上代碼僅供交流學習使用!

發佈了13 篇原創文章 · 獲贊 14 · 訪問量 4萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章