這個單子的那位大哥人確實不怎麼樣。先是我說要看一下淘寶的代碼,他就要我便宜20元,我同意了;之後他給我們拉單子,一個單子又要了10元。後來我和同學喫火鍋,沒來得及回消息,他就說我做的東西沒有達到他的要求,要求我退一些錢。要不是爲了店長的好評,我就翻臉了。沒辦法,誰叫我是小菜雞呢。那個晚上喝了點酒,心情有點不好,居然哭了,呵呵。不說了,看這個代碼吧。
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait#延遲等待
from selenium.webdriver.support import expected_conditions as EC#查找元素
from selenium.webdriver.common.by import By
import time, re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import UnexpectedAlertPresentException
from time import sleep
import pandas as pd
def search(driver, wait):
    """Search Taobao for '口紅' and return the number of result pages.

    Types the keyword into the search box, submits the form, reads the pager
    text and returns the first integer found in it.  If any step fails, the
    function goes through the Weibo login flow on the Taobao login page and
    then retries the search.

    Args:
        driver: Selenium WebDriver; the caller opens https://www.taobao.com/
            before calling this function.
        wait: WebDriverWait bound to ``driver``.

    Returns:
        int: the page count parsed from the pager text.
    """
    try:
        # Wait for the search box to be present before typing.
        search_box = wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="q"]')))
        search_box.send_keys('口紅')
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button')))
        submit.click()
        tot_page = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/div[1]'))).text
        print(tot_page)
        # Return the parsed page count instead of discarding it.
        return int(re.search(r'(\d+)', tot_page).group(1))
    except Exception:
        # Best-effort recovery: presumably the search failed because Taobao
        # demanded a login.  ``except Exception`` (not a bare ``except``) so
        # Ctrl-C / SystemExit still stop the script.
        driver.get('https://login.taobao.com/member/login.jhtml')
        sleep(6)
        driver.find_element(
            By.XPATH, '//*[@class="forget-pwd J_Quick2Static"]').click()
        sleep(2)
        driver.find_element(By.XPATH, '//*[@class="weibo-login"]').click()
        sleep(3)
        # NOTE(review): credentials are hard-coded in source; move them to
        # environment variables or a config file that is not committed.
        driver.find_element(By.NAME, 'username').send_keys('[email protected]')
        sleep(5)
        driver.find_element(By.NAME, 'password').send_keys('aS1233211234567')
        # The login button is not clicked by the script; presumably this long
        # pause lets the user finish the login by hand -- TODO confirm.
        sleep(40)
        # Propagate the retry's result rather than dropping it.
        return search(driver, wait)
def get_products(browse):
    """Scrape every product card on the current search-result page.

    For each card, reads the price, title, location and deal-count texts,
    appends them to the module-level lists ``all_price``, ``all_info``,
    ``all_producer`` and ``all_deal``, and prints the record.

    All four fields of a card are read before any list is modified, so if an
    element lookup raises, the four lists still have equal lengths.

    Args:
        browse: Selenium WebDriver currently showing a result page.
    """
    cards = browse.find_elements(
        By.XPATH,
        '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    for card in cards:
        price = card.find_element(
            By.XPATH,
            './/div[@class="row row-1 g-clearfix"]/div[@class="price g_price g_price-highlight"]').text
        info = card.find_element(
            By.XPATH,
            './/div[@class="row row-2 title"]/a[@class="J_ClickStat"]').text
        producer = card.find_element(
            By.XPATH,
            './/div[@class="row row-3 g-clearfix"]/div[@class="location"]').text
        deal = card.find_element(
            By.XPATH,
            './/div[@class="row row-1 g-clearfix"]/div[@class="deal-cnt"]').text

        # Append only after every lookup succeeded to keep the lists aligned.
        all_price.append(price)
        all_info.append(info)
        all_producer.append(producer)
        all_deal.append(deal)

        dic = {'商品信息': info, '銷售價格': price, '發貨地': producer, '銷售額': deal}
        print(dic)
def digging(driver, page, start=24):
    """Visit result pages ``start`` .. ``page - 2`` and scrape each one.

    Each page is requested with the offset ``s = 44 * page_index``
    (presumably 44 results per page -- TODO confirm), scraped with
    ``get_products`` and followed by a 7-second pause.

    Args:
        driver: Selenium WebDriver used for navigation.
        page: upper bound; the last page index visited is ``page - 2``.
        start: first page index to visit.  Defaults to 24, the value that
            used to be hard-coded.

    Returns:
        None.  Does nothing when ``page - 1 <= start``; the previous
        ``while num != page - 1`` looped forever in that case.
    """
    for num in range(start, page - 1):
        driver.get('https://s.taobao.com/search?q=口紅&s={}'.format(44 * num))
        driver.implicitly_wait(10)
        get_products(driver)
        sleep(7)
if __name__ == "__main__":
    # Module-level accumulators filled by get_products(), one entry per card.
    all_info = []
    all_deal = []
    all_price = []
    all_producer = []
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 15)
        driver.get('https://www.taobao.com/')
        driver.maximize_window()
        page = search(driver, wait)
        print(type(page))
        digging(driver, page)
        print(all_info, all_price, all_producer, all_deal)
    finally:
        try:
            # Build the frame here, not at the end of ``try``: an exception
            # while scraping would otherwise leave ``lipstick`` undefined and
            # the save would fail with NameError, losing what was collected.
            # zip() truncates to the shortest list, so the frame can always
            # be constructed.
            rows = list(zip(all_info, all_price, all_producer, all_deal))
            lipstick = pd.DataFrame(
                rows, columns=['商品信息', '銷售價格', '發貨地', '銷售額'])
            # 'lipstick.xls2' is not an Excel extension pandas recognises;
            # the analysis code later in this file reads 'lipstick.xls'.
            # NOTE(review): that analysis code uses the column names
            # info/price/producer/sales -- confirm how they get renamed.
            lipstick.to_excel('lipstick.xls', index=False)
        finally:
            # Always release the browser process.
            driver.quit()
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render correctly
# Presumably keeps the minus sign readable with the CJK font selected above.
plt.rcParams['axes.unicode_minus'] = False
# Load lipstick.xls into a DataFrame and preview its first rows.
test=pd.read_excel('lipstick.xls')
test.head()
info | price | producer | sales | |
---|---|---|---|---|
0 | 【聖誕禮物】MAC/魅可子彈頭斷貨王口紅脣膏 chili/牛血色/mac646 | ¥170.00 | 浙江 杭州 | 10萬+人付款 |
1 | 【聖誕禮物】MAC/魅可尤霧彈脣膏啞光口紅316/923人間水蜜桃新款 | ¥170.00 | 浙江 杭州 | 10萬+人付款 |
2 | Dior/迪奧聖誕星空限量口紅套裝煙花禮盒藍金脣膏口紅999 772 080 | ¥1368.00 | 福建 福州 | 209人付款 |
3 | Christian Louboutin蘿蔔丁女王限量口紅3支裝001/001s/001m 3.5g | ¥2508.00 | 福建 福州 | 26人付款 |
4 | ChristianLouboutin蘿蔔丁進口女王權杖口紅脣膏多色滋潤保溼持久 | ¥788.00 | 浙江 杭州 | 1057人付款 |
# Print the column dtypes and non-null counts of the loaded frame.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 4 columns):
info 1059 non-null object
price 1059 non-null object
producer 1059 non-null object
sales 1053 non-null object
dtypes: object(4)
memory usage: 33.2+ KB
# Missing values: count the nulls per column, then drop every row that has one (in place).
test.isnull().sum()
test.dropna(inplace=True)
數據預處理
import re


def _parse_sales(text):
    """Convert a deal string such as '10萬+人付款' or '209人付款' to a number.

    Returns the first number found in ``text`` as a float, multiplied by
    10,000 when the string carries the '萬' / '万' unit.  Returns None when
    the string contains no number.
    """
    text = str(text)
    match = re.search(r"\d+\.?\d*", text)
    if match is None:
        return None
    value = float(match.group())
    # '10萬+' means 100,000+, not 10: honour the unit.
    if '萬' in text or '万' in text:
        value *= 10000
    return value


# Strip the currency sign: '¥170.00' -> '170.00'.
test['price'] = test['price'].apply(lambda x: x.split('¥')[1])
# Replace the raw deal text with its numeric value in a single pass; this
# replaces the per-row .loc loop that raised IndexError on digit-less text.
test['sales'] = test['sales'].apply(_parse_sales)
詞雲
import codecs
import jieba
import pickle
# test['info'].to_csv('名稱.txt', sep='\t', index=False)
# fin = codecs.open('名稱.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# #第一次運行程序時將分好的詞存入文件
# text = ''
# with open('名稱.txt',encoding = 'utf-8') as fin:
# for line in fin.readlines():
# line = line.strip('\n')
# text += ' '.join(jieba.cut(line))
# text += ' '
# fout = open('text.txt','wb')
# pickle.dump(text,fout)
# fout.close()
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

# text.txt is the pickle written by the commented-out first-run step above
# (the jieba-segmented product titles joined by spaces).  The ``with`` block
# closes the file handle, which the bare open() call never did.
# NOTE(review): pickle.load can execute arbitrary code; only load a text.txt
# that you generated yourself.
with open('text.txt', 'rb') as fr:
    text = pickle.load(fr)

backgroud_Image = plt.imread('lipstick.jpg')
wc = WordCloud(
    background_color='white',   # canvas background colour
    mask=backgroud_Image,       # image used as the mask for the cloud
    max_words=200,              # maximum number of words displayed
    stopwords=STOPWORDS,        # words to exclude
    font_path='simfang.ttf',    # font file; needed for Chinese text to render
    max_font_size=200,          # largest font size used
    random_state=8,             # random seed, so the layout is reproducible
)
wc.generate(text)
# Recolour the words using the colours of the mask image.
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func=image_colors)
plt.figure(figsize=(20, 30))
plt.imshow(wc)
plt.axis('off')
plt.show()
商品價格對銷售額的影響
# Cast both numeric columns to float.  The sales cast used to be written
# twice while price stayed a string, so sorting by price compared text
# ('1029.90' sorted before '170.00').
test['sales']=test['sales'].astype('float')
test['price']=test['price'].astype('float')
test.head()
info | price | producer | sales | |
---|---|---|---|---|
0 | 【聖誕禮物】MAC/魅可子彈頭斷貨王口紅脣膏 chili/牛血色/mac646 | 170.00 | 浙江 杭州 | 10.0 |
1 | 【聖誕禮物】MAC/魅可尤霧彈脣膏啞光口紅316/923人間水蜜桃新款 | 170.00 | 浙江 杭州 | 10.0 |
2 | Dior/迪奧聖誕星空限量口紅套裝煙花禮盒藍金脣膏口紅999 772 080 | 1368.00 | 福建 福州 | 209.0 |
3 | Christian Louboutin蘿蔔丁女王限量口紅3支裝001/001s/001m 3.5g | 2508.00 | 福建 福州 | 26.0 |
4 | ChristianLouboutin蘿蔔丁進口女王權杖口紅脣膏多色滋潤保溼持久 | 788.00 | 浙江 杭州 | 1057.0 |
# Sort a copy of the frame by price, ascending, and preview the first rows.
# NOTE(review): if 'price' is still a string column at this point the order
# is lexicographic rather than numeric -- confirm the dtype.
tes1t=test.sort_values(by='price',ascending=True)
tes1t.head()
info | price | producer | sales | |
---|---|---|---|---|
795 | 正品MAC魅可棒棒糖脣釉脣彩染脣液鏡面口紅 18新款泫雅色106/108 | 101.00 | 上海 | 3077.0 |
789 | 正品MAC魅可棒棒糖脣釉脣彩染脣液鏡面口紅 18新款泫雅色106/108 | 101.00 | 上海 | 3077.0 |
347 | 【雙旦禮遇季】珂萊歐炫彩絲絨霧面脣釉脣彩持久滋潤顯色口紅新品 | 102.00 | 上海 | 204.0 |
1013 | 魅可MAC聖誕星空限定口紅923牛血色316車釐子色646小辣椒大牌正品 | 102.00 | 山東 濟南 | 2641.0 |
140 | 第二支10元意大利公主鑽石品質魔鏡口紅ROSEPRETTY貴族限量版 | 1029.90 | 江蘇 南京 | 1283.0 |
import seaborn as sns
# Scatter plot of sales (y) against price (x).
sns.relplot(x="price", y="sales", data=test,
kind='scatter', # one of ['scatter','line']
# hue='day', # third variable used to colour the points
# style='day', # variable used to vary the marker shape
palette='husl',s=60, # colour palette and marker size
aspect=2.5,height=6 # facet height and width-to-height ratio
)
<seaborn.axisgrid.FacetGrid at 0x21e7d38deb8>
import seaborn as sns
# Same scatter plot with the axes swapped: price (y) against sales (x).
sns.relplot(x="sales", y="price", data=test,
kind='scatter', # one of ['scatter','line']
# hue='day', # third variable used to colour the points
# style='day', # variable used to vary the marker shape
palette='husl',s=60, # colour palette and marker size
aspect=2.5,height=6 # facet height and width-to-height ratio
)
<seaborn.axisgrid.FacetGrid at 0x21e03a96860>
不同價格區間的商品平均銷量分佈
# Summary statistics of sales; the quartiles shown in the output below (136, 526, 1645) are the thresholds used by function().
test['sales'].describe()
count 1053.000000
mean 1334.559354
std 1910.790570
min 0.000000
25% 136.000000
50% 526.000000
75% 1645.000000
max 9500.000000
Name: sales, dtype: float64
def function(x):
    """Map a sales figure to its quartile label (1-4).

    The thresholds 136, 526 and 1645 are the 25% / 50% / 75% quartiles of the
    sales column.  Each bin is half-open so every number gets a label:

        x < 136          -> 1
        136 <= x < 526   -> 2
        526 <= x < 1645  -> 3
        x >= 1645        -> 4

    Previously a value exactly equal to a threshold matched no branch and
    produced None.

    Args:
        x: sales figure (int or float).

    Returns:
        int label 1-4, or None when ``x`` is NaN (no comparison holds).
    """
    if x < 136:
        return 1
    if x < 526:
        return 2
    if x < 1645:
        return 3
    if x >= 1645:
        return 4
    # Only reachable for NaN, which the original also mapped to None.
    return None
# Label every row with the value function() returns for its sales figure.
test['label'] = test['sales'].apply(lambda x: function(x))
# NOTE(review): `tips` is not used by any code visible here; it looks like
# leftover tutorial code -- confirm before removing.
tips = sns.load_dataset("tips")
# Strip plot of sales grouped by label.
ax = sns.stripplot(x="label", y="sales", data=test)
# Print the mean sales of each label 1-4, truncated to an integer.
for i in range(1, 5):
    in_group = test['label'] == i
    group_mean = test['sales'][in_group].mean()
    print('第{}類的商品平均銷量爲{}'.format(i, int(group_mean)))
第1類的商品平均銷量爲44
第2類的商品平均銷量爲306
第3類的商品平均銷量爲958
第4類的商品平均銷量爲4027
商品的價格分佈情況分析
# Kernel density plot of the sales column: both tails approach zero and the peak lies between 0 and 1000, so most values are concentrated there.
test['sales'].dropna().plot(kind='kde', xlim=(-2000,7500))
<matplotlib.axes._subplots.AxesSubplot at 0x21e027fc630>
# Pie chart of how many rows fall into each label 1-4.
labels = [str(k) for k in range(1, 5)]
sizes = [int(test['sales'][test['label'] == k].count()) for k in range(1, 5)]
explode = (0, 0, 0.1, 0)  # pull the third wedge slightly out of the pie
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=150,
)
plt.title("商品的價格分佈")
plt.show()