A crawler that scrapes some needed data from JD.com

# -*- coding: utf-8 -*-
# __author__ = 'Administrator'


from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import urllib2
import time
from compiler.ast import flatten
import re
import xlwt
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: default str/unicode conversions to utf-8
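(Note: the compiler module only exists on Python 2 and was removed in Python 3, so the flatten import above will not work there. A rough drop-in replacement for the flatten used below would be:)

def flatten(seq):
    # recursively flatten nested lists/tuples, like compiler.ast.flatten
    out = []
    for item in seq:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out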


driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
driver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div/a').click()  # the first div index in 'div[1]/div/a' has to be entered manually
windows = driver.window_handles
driver.switch_to.window(windows[-1])  # the click opened the goods-list page in a new window
driver.switch_to.window(windows[0])
driver.close()  # close the original window, keep only the goods-list window
driver.switch_to.window(windows[-1])
time.sleep(2)
pages = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text  # total number of goods in the category
pages = int(pages)
page = (pages + 59) / 60  # ceiling division: 60 goods per list page
all_goods = []
all_url_goods = []  # urls of every item
name = []           # spec names, filled in by the detail-page loop below
for aa in range(1, page + 1):
#    print aa
    a = 'https://list.jd.com/list.html?cat=12218,12221&page='  # ***** the values after cat= (here 12218,12221) have to be entered manually
    b = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
    x = '%s%d%s' % (a, aa, b)
    jd = urllib2.urlopen(x)  # open one page of the fresh-food goods list
    html = jd.read()
    soup = BeautifulSoup(html, 'html.parser')
    goods_list = soup.select('div[class="p-name"]')
    new_list1 = []
    for i in range(len(goods_list)):
        try:
            href = re.findall('<a href="(.*?)" target=', str(goods_list[i]))[0]  # match each item's url on the list page
            new_list1.append(href)  # collect the (up to 60) item urls on this list page
        except Exception:
            pass  # skip blocks whose markup does not match
    print('page %s' % aa)
    all_url_goods.append(new_list1)
print('url collection finished, now fetching the spec names')
all_url_goods = flatten(all_url_goods)
#print(all_url_goods)
# page count obtained, and the urls from every page collected into all_url_goods

for url in all_url_goods:
    url = url.replace('"', '')   # drop any stray quote left over from the match
    xx = 'http:%s' % url         # list-page hrefs are protocol-relative
    goods = urllib2.urlopen(xx)  # open each item's page from the goods list
    html = goods.read()
    soup = BeautifulSoup(html, 'lxml')
    list1 = soup.findAll(attrs={'data-sku': True})
    goods_url = []
    for j in range(len(list1)):
        a = re.findall('data-sku="(.*)" data-value="', str(list1[j]))  # match the sku of each spec
        goods_url.append(a)
    goods_url = flatten(goods_url)
    #print(goods_url)
    for j in range(len(goods_url)):
        last_url = 'https://item.jd.com/%d.html' % int(goods_url[j])
        html = urllib2.urlopen(last_url)  # open the detail page of each spec
        soup = BeautifulSoup(html, 'lxml')
        last_list = soup.select('div[class="sku-name"]')  # match the spec name
        for k in range(len(last_list)):
            name.append(last_list[k].string)
print(name)
print(len(name))

#all_goods = flatten(all_goods)
work_excel = xlwt.Workbook()
sheet1 = work_excel.add_sheet(u"sheet1", cell_overwrite_ok=True)
for i in range(len(name)):
    sheet1.write(i, 0, name[i])  # one spec name per row in column A
work_excel.save('xinxianshuiguo.xls')
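As an aside, matching urls and skus with regexes over str(tag), as done above, is fragile; BeautifulSoup can read tag attributes directly. A small sketch of the idea on made-up markup:

from bs4 import BeautifulSoup

html = ('<div class="p-name"><a href="//item.jd.com/123.html" target="_blank">x</a></div>'
        '<div data-sku="456" data-value="500g"></div>')
soup = BeautifulSoup(html, 'html.parser')
hrefs = [a['href'] for a in soup.select('div.p-name a')]                 # ['//item.jd.com/123.html']
skus = [t['data-sku'] for t in soup.findAll(attrs={'data-sku': True})]   # ['456']
print(hrefs, skus)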

Pointers from more experienced people are welcome. One more question: for data that is loaded asynchronously, is there a way to avoid a bare time.sleep()? Is there some other method, something like Selenium's explicit waits?
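For the record, Selenium does ship explicit waits: WebDriverWait plus expected_conditions polls for a condition instead of sleeping a fixed interval. A minimal sketch, reusing the page-count XPath from the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
# poll for up to 10 seconds until the element is present in the DOM,
# instead of a fixed time.sleep(2); raises TimeoutException if it never appears
span = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[7]/div[1]/div[1]/div[1]/div/span')))
print(span.text)
driver.quit()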
