# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
# Input workbook: the product IDs to look up are read from its first column.
# (Original author's tip: if you are only experimenting and have no ID list,
# iterate IDs upward starting from 25069999 instead.)
path1 = r'C:\Users\name\Desktop\單品信息爬蟲抓取\second_class.xlsx'
# Output workbook: receives one row per scraped product page.
path2 = r'C:\Users\name\Desktop\單品信息爬蟲抓取\spider1.xlsx'
get_id = pd.read_excel(path1)

# Output columns, in order: product ID, second-level category, third-level
# category, book title, list price, publisher, review count, current
# Dangdang price.
columns = ['商品ID', '二級類', '三級類', '書名', '定價', '出版社', '評論數',
           '噹噹現價']

# Collect one dict per product and build the DataFrame once at the end;
# growing eight pandas Series element by element is slow and, with an
# untyped empty Series, triggers dtype warnings.
rows = []
for product_id in get_id.iloc[:, 0]:
    url = "http://product.dangdang.com/" + str(product_id) + '.html'
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")

    # Children 3 and 5 of the breadcrumb <div> are stored as the second-
    # and third-level categories; look the <div> up only once.
    breadcrumb = soup.find('div', class_='breadcrumb')
    original_price = soup.find(
        "div", attrs={"class": "price_m", 'id': "original-price"})
    publisher = soup.find('a', attrs={'target': "_blank", 'dd_name': "出版社"})

    rows.append({
        '商品ID': product_id,
        '二級類': breadcrumb.contents[3].string,
        '三級類': breadcrumb.contents[5].string,
        # First non-empty text fragment inside the page's <h1>.
        '書名': list(soup.h1.stripped_strings)[0],
        # The last child of each price element is parsed as the number.
        '定價': float(original_price.contents[-1]),
        '出版社': publisher.string,
        '評論數': int(soup.find('a', dd_name="評論數").string),
        '噹噹現價': float(soup.find('p', id='dd-price').contents[-1]),
    })

# Passing `columns` keeps the column order fixed and still yields the full
# header row when no IDs were processed.
frame = pd.DataFrame(rows, columns=columns)
# No `encoding` argument: pandas 2.x rejects that keyword for to_excel.
frame.to_excel(path2)
# 感覺 soup 確實好用,Python 3 處理中文確實輕鬆,以後也用 3 了,不用天天 reload(sys) 了。
# Python BeautifulSoup 爬取噹噹網圖書信息
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。