20191226_2_淘寶乒乓球商品分析

import pandas as pd
#參考網站
#https://mp.weixin.qq.com/s/ztm9-LoPic2etFDGB95jmQ

# Load the product listings spreadsheet into the working DataFrame `test`,
# which every later cell reads and mutates.
test=pd.read_excel('products_data.xls')
# Show the first rows (rendered as the cell output in a notebook).
test.head()

	店鋪名稱	商品信息	銷售價格	付款人數	發貨地
0	坤穹運動戶外專營店	斯蒂卡乒乓球拍黑檀7 專業級球拍進攻型純木黑檀5楓木7斯帝卡底板	¥1048.00	38人付款	河北石家莊
1	米力運動專營店	蝴蝶乒乓球拍蝴蝶王單拍專業級8星碳素底板八星兵乓橫拍正品直拍	¥478.00	114人付款	上海
2	luciferchen123	瑩戀STIGA斯帝卡斯蒂卡藍標許昕碳素王朝乒乓球底板球拍DYNASTY	¥1074.40	25人付款	北京
3	聖濃運動專營店	STIGA斯帝卡碳素王朝斯蒂卡DYNASTY CARBON乒乓球拍底板許昕藍標	¥1106.00	1人付款	浙江金華
4	luciferchen123	瑩戀紅雙喜乒乓球底板球拍新版狂飆龍5X龍五狂飆龍2狂飈龍3三馬龍	¥797.00	58人付款	北京

數據預處理

# Print column dtypes and non-null counts for the raw frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062 entries, 0 to 1061
Data columns (total 5 columns):
店鋪名稱    1062 non-null object
商品信息    1062 non-null object
銷售價格    1062 non-null object
付款人數    1062 non-null object
發貨地     1062 non-null object
dtypes: object(5)
memory usage: 41.6+ KB

# Count missing values per column.
test.isnull().sum()

店鋪名稱    0
商品信息    0
銷售價格    0
付款人數    0
發貨地     0
dtype: int64

# Price: keep the text after the currency sign, then convert to float
# (e.g. '¥1048.00' -> 1048.0).
price_text = test['銷售價格'].apply(lambda raw: raw.split('¥')[1])
test['銷售價格'] = price_text.astype('float')

# Buyers: keep the text in front of '人', then convert to int
# (e.g. '38人付款' -> 38).
buyers_text = test['付款人數'].apply(lambda raw: raw.split('人')[0])
test['付款人數'] = buyers_text.astype('int')

import codecs
import jieba
import pickle
# test['商品信息'].to_csv('名稱.txt', sep='\t', index=False)
# fin = codecs.open('名稱.txt',mode = 'r', encoding = 'utf-8')
# # print (fin.read())
# #第一次運行程序時將分好的詞存入文件
# text = ''
# with open('名稱.txt',encoding = 'utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('text.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# Load the space-separated, jieba-segmented titles that the commented-out
# first-run step above pickled into text.txt.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a text.txt that this notebook produced itself.
with open('text.txt', 'rb') as fr:  # context manager closes the handle; it was previously left open
    text = pickle.load(fr)

# The same picture is used as the shape mask and as the colour source.
backgroud_Image = plt.imread('table_tennis.jpg')
wc = WordCloud(
    background_color='white',   # canvas colour
    mask=backgroud_Image,       # shape mask for the cloud
    max_words=200,              # maximum number of words displayed
    stopwords=STOPWORDS,        # stop-word set imported from the wordcloud package
    font_path='simfang.ttf',    # font file; Chinese text is not rendered without a suitable font
    max_font_size=200,          # upper bound on the font size
    random_state=8,             # fixed random seed so the layout is reproducible
)
wc.generate(text)

# Recolour the words with colours sampled from the mask image.
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func=image_colors)

plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis('off')
plt.show()

# 2. Reduce every shipping origin to its province part: the text in front of
#    the first space, or the whole value when it contains no space.
raw_location = test['發貨地'].values
new_location = [origin.partition(' ')[0] for origin in raw_location]
test['發貨地'] = new_location
print(test['發貨地'].values)

['河北' '上海' '北京' ... '河南' '河南' '北京']

# Show the first rows again after the cleaning steps above.
test.head()

	店鋪名稱	商品信息	銷售價格	付款人數	發貨地
0	坤穹運動戶外專營店	斯蒂卡乒乓球拍黑檀7 專業級球拍進攻型純木黑檀5楓木7斯帝卡底板	1048.0	38	河北
1	米力運動專營店	蝴蝶乒乓球拍蝴蝶王單拍專業級8星碳素底板八星兵乓橫拍正品直拍	478.0	114	上海
2	luciferchen123	瑩戀STIGA斯帝卡斯蒂卡藍標許昕碳素王朝乒乓球底板球拍DYNASTY	1074.4	25	北京
3	聖濃運動專營店	STIGA斯帝卡碳素王朝斯蒂卡DYNASTY CARBON乒乓球拍底板許昕藍標	1106.0	1	浙江
4	luciferchen123	瑩戀紅雙喜乒乓球底板球拍新版狂飆龍5X龍五狂飆龍2狂飈龍3三馬龍	797.0	58	北京

分析乒乓球拍標題高頻關鍵字與商品數量關係

import jieba.analyse
# Extract the top 50 TextRank keywords from all titles joined into one string.
# With withWeight=True each entry is a (keyword, weight) pair; later cells read
# the keyword as entry[0].
keywords_count_list = jieba.analyse.textrank(' '.join(test['商品信息']), topK=50, withWeight=True)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.890 seconds.
Prefix dict has been built succesfully.

# Start a zeroed counter for the first 20 keywords. They are inserted in
# reverse order, so the dict order is the reverse of the ranking.
keywords_count_dict = {entry[0]: 0 for entry in reversed(keywords_count_list[:20])}

# Count how often each keyword appears as a token in the segmented titles.
# A dict membership test replaces the previous inner loop, which scanned every
# keyword for every token just to test equality; the counts are identical.
cut_words = jieba.cut(' '.join(test['商品信息']))
for word in cut_words:
    if word in keywords_count_dict:
        keywords_count_dict[word] += 1
print(keywords_count_dict)

{'玫瑰': 82, '成品': 67, '狂飆': 91, '檜木': 63, '弧圈': 90, '快攻': 89, '專業級': 93, '專業': 91, '進攻型': 103, '銀河': 138, '純木': 144, '球拍': 210, '蝴蝶': 219, '直拍': 230, '橫拍': 254, '碳素': 337, '正品': 454, '乒乓球': 632, '乒乓球拍': 793, '底板': 1419}

from pyecharts.charts import Pie, Bar, Map, WordCloud
from pyecharts import options as opts
# Horizontal bar chart of the keyword counts, built with pyecharts'
# non-chained call style (each method mutates and returns the chart).
keywords_count_bar = Bar()
keywords_count_bar.add_xaxis(list(keywords_count_dict))
keywords_count_bar.add_yaxis("", list(keywords_count_dict.values()))
# Swap the axes so the keywords run down the vertical axis.
keywords_count_bar.reversal_axis()
keywords_count_bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
keywords_count_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="乒乓球拍關鍵字TOP20"),
    yaxis_opts=opts.AxisOpts(name="關鍵字"),
    xaxis_opts=opts.AxisOpts(name="次數"),
)
keywords_count_bar.render_notebook()

    <div id="a79de72ca6bf4395a81e892dd7bd0f15" style="width:900px; height:500px;"></div>

分析乒乓球拍標題與平均銷量關係

def analysis_title_keywords(keywords_count_list, column, top_num, data=None) -> dict:
    """
    Rank title keywords by the mean of a column over the titles that contain them.

    :param keywords_count_list: sequence of entries whose first item is the
        keyword (e.g. the ``(keyword, weight)`` pairs produced by textrank)
    :param column: name of the numeric column to average
    :param top_num: how many of the highest-mean keywords to keep
    :param data: DataFrame holding a '商品信息' title column and ``column``;
        defaults to the module-level ``test`` frame
    :return: dict of keyword -> mean value, ordered from lowest to highest
        mean and holding at most ``top_num`` entries. Keywords that occur in
        no title are left out because they have no mean.
    """
    if data is None:
        data = test
    # list[-0:] would return the whole list, so handle "keep nothing" explicitly.
    if top_num <= 0:
        return {}

    # 1. One bucket of collected values per keyword.
    buckets = {entry[0]: [] for entry in keywords_count_list}

    # 2. Walk the two columns in lockstep (cheaper than iterrows(), which
    #    builds a Series per row) and file each value under every keyword
    #    that occurs in its title.
    for title, value in zip(data['商品信息'], data[column]):
        for keyword, values in buckets.items():
            if keyword in title:
                values.append(value)

    # 3. Average each non-empty bucket; empty ones used to raise
    #    ZeroDivisionError.
    means = {
        keyword: sum(values) / len(values)
        for keyword, values in buckets.items()
        if values
    }

    # 4. Sort ascending by mean and keep the top_num largest.
    ranked = sorted(means.items(), key=lambda item: item[1])
    return dict(ranked[-top_num:])

# Mean number of buyers over the titles containing each keyword; keep the 20
# keywords with the highest mean.
keywords_sales_dict = analysis_title_keywords(keywords_count_list, '付款人數', 20)

# Horizontal bar chart of those means, built with non-chained calls.
keywords_sales_bar = Bar()
keywords_sales_bar.add_xaxis(list(keywords_sales_dict))
keywords_sales_bar.add_yaxis("", list(keywords_sales_dict.values()))
# Swap the axes so the keywords run down the vertical axis.
keywords_sales_bar.reversal_axis()
keywords_sales_bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
keywords_sales_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="乒乓球拍關鍵字TOP20"),
    yaxis_opts=opts.AxisOpts(name="關鍵字"),
    xaxis_opts=opts.AxisOpts(name="平均銷量"),
)
keywords_sales_bar.render_notebook()

    <div id="637cf0a54fbe47198db24d37829cb88d" style="width:900px; height:500px;"></div>

分析乒乓球拍底板商品價格區間分佈關係

def cut_and_sort_data(listBins, listLabels, data_list) -> dict:
    """
    Bin the values and count how many fall under each label.

    :param listBins: bin edges passed to ``pd.cut`` (the lowest edge is inclusive)
    :param listLabels: one label per bin, in bin order
    :param data_list: the values to bin (list, array or Series)
    :return: dict of label -> count, in ``listLabels`` order so no later
        sorting is needed; labels with no values map to 0. Values outside
        every bin are not counted.
    """
    data_labels_list = pd.cut(data_list, bins=listBins, labels=listLabels, include_lowest=True)
    # Seed every label with 0 so empty bins still appear, in label order.
    data_count = {label: 0 for label in listLabels}
    for value in data_labels_list:
        # pd.cut yields NaN for values outside the bins; looking NaN up in the
        # dict used to return None and fail on ``None + 1``.
        if pd.isna(value):
            continue
        data_count[value] += 1
    return data_count

# Bin edges: 100-wide bins up to 800, then one catch-all bin up to 1,000,000.
price_list_bins = [0, 100, 200, 300, 400, 500, 600, 700, 800,1000000]
# One label per bin: 'lo-hi' for each 100-wide bin, plus the catch-all label.
price_list_labels = [
    f'{low}-{high}'
    for low, high in zip(price_list_bins[:-2], price_list_bins[1:-1])
]
price_list_labels.append('800以上')
# Count the listings per price bin.
price_count = cut_and_sort_data(price_list_bins, price_list_labels, test['銷售價格'])
print(price_count)
# Bar chart of the distribution, built with non-chained calls.
bar = Bar()
bar.add_xaxis(list(price_count))
bar.add_yaxis("", list(price_count.values()))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="乒乓球拍底板價格區間分佈柱狀體"),
    yaxis_opts=opts.AxisOpts(name="個商品"),
    xaxis_opts=opts.AxisOpts(name="商品售價：元"),
)
bar.render_notebook()

{'0-100': 132, '100-200': 144, '200-300': 126, '300-400': 103, '400-500': 91, '500-600': 66, '600-700': 63, '700-800': 22, '800以上': 315}

    <div id="6c419d06e2ef4e3ba7c8f0875edeb484" style="width:900px; height:500px;"></div>

# Pie chart of the price-bin counts.
# pyecharts expects [label, value] pairs; the variable keeps its original
# name although it holds price bins.
age_count_list = [[label, count] for label, count in price_count.items()]
pie = Pie()
pie.add("", age_count_list)
pie.set_global_opts(title_opts=opts.TitleOpts(title="乒乓球拍底板價格區間餅圖"))
# Label each slice as "name: value".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()

    <div id="8ebf790e6e944e62959bf5470be0d4a3" style="width:900px; height:500px;"></div>

分析乒乓球商家數量全國分佈關係

# Number of listings per shipping origin, most frequent first.
province_sales = test['發貨地'].value_counts()
# pyecharts expects [name, value] pairs.
province_sales_list = [list(item) for item in province_sales.items()]
print(province_sales_list)
# Scale the colour range to the data. The previous fixed max_=1000 was far
# above the largest count, so every region fell at the bottom of the scale.
# Fall back to the old value when there is no data at all.
visual_max = int(province_sales.max()) if len(province_sales) else 1000
# 1.1 Choropleth map of China coloured by count.
province_sales_map = (
    Map()
        .add("乒乓球商家數量全國分佈圖", province_sales_list, "china")
        .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(max_=visual_max),
    )
)
province_sales_map.render_notebook()

[['北京', 178], ['河北', 162], ['上海', 152], ['浙江', 141], ['廣東', 115], ['江蘇', 93], ['河南', 70], ['天津', 57], ['湖南', 31], ['日本', 25], ['山東', 18], ['福建', 6], ['湖北', 5], ['海外', 4], ['四川', 3], ['陝西', 1], ['遼寧', 1]]

    <div id="372c40b6b1814216b67e45775d988387" style="width:900px; height:500px;"></div>

# 1.2 Bar chart of the same per-origin counts, built with non-chained calls.
province_sales_bar = Bar()
province_sales_bar.add_xaxis(province_sales.index.tolist())
province_sales_bar.add_yaxis("", province_sales.values.tolist(), category_gap="50%")
province_sales_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="乒乓球商家數量地區柱狀圖"),
    yaxis_opts=opts.AxisOpts(name="商家數量"),
    # Rotate the category labels by 90 degrees.
    xaxis_opts=opts.AxisOpts(name="地區", axislabel_opts={"rotate": 90}),
)
province_sales_bar.render_notebook()

    <div id="2853305a414a4176b2f2e8ec08dacd10" style="width:900px; height:500px;"></div>

20191226_2_淘寶乒乓球商品分析

數據預處理

分析乒乓球商家數量全國分佈關係

測試人員都是畫畫大神，讓我看看誰還不會用代碼圖？

Object.values()對象遍歷

20200308——多項式迴歸預測工資

20191226_2_淘寶乒乓球商品分析

20200203_knn分類算法

深度之眼_Week2 編程作業1_梯度下降

機器學習作業班_python實現支持向量機

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結