A crawler that scrapes some needed data from JD.com

# -*- coding: utf-8 -*-
# __author__ = 'Administrator'


from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import urllib2
import time
from compiler.ast import flatten
import re
import xlwt
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: default str/unicode conversions to utf-8
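(Note: the compiler module only exists on Python 2 and was removed in Python 3, so the flatten import above will not work there. A rough drop-in replacement for the flatten used below would be:)

def flatten(seq):
    # recursively flatten nested lists/tuples, like compiler.ast.flatten
    out = []
    for item in seq:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out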


driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
driver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div/a').click()  # the first div index in 'div[1]/div/a' has to be entered manually
windows = driver.window_handles
driver.switch_to.window(windows[-1])  # the click opened the goods-list page in a new window
driver.switch_to.window(windows[0])
driver.close()  # close the original window, keep only the goods-list window
driver.switch_to.window(windows[-1])
time.sleep(2)
pages = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text  # total number of goods in the category
pages = int(pages)
page = (pages + 59) / 60  # ceiling division: 60 goods per list page
all_goods = []
all_url_goods = []  # urls of every item
name = []           # spec names, filled in by the detail-page loop below
for aa in range(1, page + 1):
#    print aa
    a = 'https://list.jd.com/list.html?cat=12218,12221&page='  # ***** the values after cat= (here 12218,12221) have to be entered manually
    b = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
    x = '%s%d%s' % (a, aa, b)
    jd = urllib2.urlopen(x)  # open one page of the fresh-food goods list
    html = jd.read()
    soup = BeautifulSoup(html, 'html.parser')
    goods_list = soup.select('div[class="p-name"]')
    new_list1 = []
    for i in range(len(goods_list)):
        try:
            href = re.findall('<a href="(.*?)" target=', str(goods_list[i]))[0]  # match each item's url on the list page
            new_list1.append(href)  # collect the (up to 60) item urls on this list page
        except Exception:
            pass  # skip blocks whose markup does not match
    print('page %s' % aa)
    all_url_goods.append(new_list1)
print('url collection finished, now fetching the spec names')
all_url_goods = flatten(all_url_goods)
#print(all_url_goods)
# page count obtained, and the urls from every page collected into all_url_goods

for url in all_url_goods:
    url = url.replace('"', '')   # drop any stray quote left over from the match
    xx = 'http:%s' % url         # list-page hrefs are protocol-relative
    goods = urllib2.urlopen(xx)  # open each item's page from the goods list
    html = goods.read()
    soup = BeautifulSoup(html, 'lxml')
    list1 = soup.findAll(attrs={'data-sku': True})
    goods_url = []
    for j in range(len(list1)):
        a = re.findall('data-sku="(.*)" data-value="', str(list1[j]))  # match the sku of each spec
        goods_url.append(a)
    goods_url = flatten(goods_url)
    #print(goods_url)
    for j in range(len(goods_url)):
        last_url = 'https://item.jd.com/%d.html' % int(goods_url[j])
        html = urllib2.urlopen(last_url)  # open the detail page of each spec
        soup = BeautifulSoup(html, 'lxml')
        last_list = soup.select('div[class="sku-name"]')  # match the spec name
        for k in range(len(last_list)):
            name.append(last_list[k].string)
print(name)
print(len(name))

#all_goods = flatten(all_goods)
work_excel = xlwt.Workbook()
sheet1 = work_excel.add_sheet(u"sheet1", cell_overwrite_ok=True)
for i in range(len(name)):
    sheet1.write(i, 0, name[i])  # one spec name per row in column A
work_excel.save('xinxianshuiguo.xls')
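As an aside, matching urls and skus with regexes over str(tag), as done above, is fragile; BeautifulSoup can read tag attributes directly. A small sketch of the idea on made-up markup:

from bs4 import BeautifulSoup

html = ('<div class="p-name"><a href="//item.jd.com/123.html" target="_blank">x</a></div>'
        '<div data-sku="456" data-value="500g"></div>')
soup = BeautifulSoup(html, 'html.parser')
hrefs = [a['href'] for a in soup.select('div.p-name a')]                 # ['//item.jd.com/123.html']
skus = [t['data-sku'] for t in soup.findAll(attrs={'data-sku': True})]   # ['456']
print(hrefs, skus)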

Pointers from more experienced people are welcome. One more question: for data that is loaded asynchronously, is there a way to avoid a bare time.sleep()? Is there some other method, something like Selenium's explicit waits?
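For the record, Selenium does ship explicit waits: WebDriverWait plus expected_conditions polls for a condition instead of sleeping a fixed interval. A minimal sketch, reusing the page-count XPath from the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
# poll for up to 10 seconds until the element is present in the DOM,
# instead of a fixed time.sleep(2); raises TimeoutException if it never appears
span = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[7]/div[1]/div[1]/div[1]/div/span')))
print(span.text)
driver.quit()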
