我用 Python 分析了一波热卖年货,原来大家都在买这些东西?

{"type":"doc","content":[{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"作者:Cherich_sun","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"来源:公众号「杰哥的IT之旅」ID:Jake_Internet","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"原文链接:[我用 Python 分析了一波热卖年货,原来大家都在买这些东西?](https://mp.weixin.qq.com/s/71hnlEIhr9mb06_5WirVCw)","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今年不知道有多少小伙伴留在原地过年,虽然今年过年不能回老家,但这个年也得过,也得买年货,给家人长辈送礼。于是我出于好奇心的想法利用爬虫获取某宝数据,并结合 Python 数据分析和第三方可视化平台来分析一下大家过年都买了哪些东西,分析结果大屏如下:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/ec/ecd13b1bad4333561aea6d7b4a966277.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上面使用清洗好的数据后用 finebi 第三方可视化工具完成的。接下来是用 Python 的实现过程,对于本文的叙述,主要分为以下五步:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"分析思路","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爬虫部分","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据清洗","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据可视化及分析","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"结论与建议","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"一、分析思路","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"其实就今天的数据来讲,我们主要做的是探索性分析;首先梳理已有的字段,有标题(提取出品类)、价格、销量、店铺名、发货地。下面来做一下详细的维度拆分以及可视化图形选择:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"品类:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"品类销量的 TOP 10 有哪些?(表格或者横向条形图)","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"热门(出现次数最多)品类展示;(词云)","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"价格:","attrs":{}},{"type":"text","text":"年货的价格区间分布情况;(圆环图,观察占比)","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"销量、店铺名:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"店铺销量最高的 TOP 10 有哪些?(条形图)","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"结合品类做联动,比如点坚果,对应展示销量排名的店铺;(联动,利用三方工具)","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"发货地:","attrs":{}},{"type":"text","text":"销量最高的城市有哪些?(地图)","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"二、爬取数据","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爬取主要利用 selenium 模拟点击浏览器,前提是已经安装 selenium 和浏览器驱动,这里我是用的 Google 浏览器,找到对应的版本号后并下载对应的版本驱动,一定要对应浏览器的版本号。","attrs":{}}]},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"pip install selenium","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/45/4594c7c6ad8ebcd9a45eaea1a5b41b7b.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"安装成功后,运行如下代码,输入关键字\"年货\",进行扫码就可以了,等着程序慢慢采集。","attrs":{}}]},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"# coding=utf8\nimport re\nfrom selenium.webdriver.chrome.options import Options\nfrom selenium import webdriver\nimport time\nimport csv\n\n\n# 搜索商品,获取商品页码\ndef search_product(key_word):\n # 定位输入框\n browser.find_element_by_id(\"q\").send_keys(key_word)\n # 定义点击按钮,并点击\n browser.find_element_by_class_name('btn-search').click()\n # 最大化窗口:为了方便我们扫码\n browser.maximize_window()\n # 等待15秒,给足时间我们扫码\n time.sleep(15)\n # 定位这个“页码”,获取“共100页这个文本”\n page_info = browser.find_element_by_xpath('//div[@class=\"total\"]').text\n # 需要注意的是:findall()返回的是一个列表,虽然此时只有一个元素它也是一个列表。\n page = re.findall(\"(\\d+)\", page_info)[0]\n return page\n\n\n# 获取数据\ndef get_data():\n # 通过页面分析发现:所有的信息都在items节点下\n items = browser.find_elements_by_xpath('//div[@class=\"items\"]/div[@class=\"item J_MouserOnverReq \"]')\n for item in items:\n # 参数信息\n pro_desc = item.find_element_by_xpath('.//div[@class=\"row row-2 title\"]/a').text\n # 价格\n pro_price = item.find_element_by_xpath('.//strong').text\n # 付款人数\n buy_num = item.find_element_by_xpath('.//div[@class=\"deal-cnt\"]').text\n # 旗舰店\n shop = item.find_element_by_xpath('.//div[@class=\"shop\"]/a').text\n # 发货地\n address = item.find_element_by_xpath('.//div[@class=\"location\"]').text\n # print(pro_desc, pro_price, buy_num, shop, address)\n with open('{}.csv'.format(key_word), mode='a', newline='', encoding='utf-8-sig') as f:\n csv_writer = csv.writer(f, delimiter=',')\n csv_writer.writerow([pro_desc, pro_price, buy_num, shop, address])\n\n\ndef main():\n browser.get('https://www.taobao.com/')\n page = search_product(key_word)\n print(page)\n get_data()\n page_num = 1\n while int(page) != page_num:\n print(\"*\" * 100)\n print(\"正在爬取第{}页\".format(page_num + 1))\n browser.get('https://s.taobao.com/search?q={}&s={}'.format(key_word, page_num * 44))\n browser.implicitly_wait(25)\n get_data()\n page_num += 1\n print(\"数据爬取完毕!\")\n\n\nif __name__ == '__main__':\n key_word = input(\"请输入你要搜索的商品:\")\n option = Options()\n browser = webdriver.Chrome(chrome_options=option,\n executable_path=r\"C:\\Users\\cherich\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe\")\n main()","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"采集结果如下:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/04/04ac92cdc8789390eeecdd23ca84c796.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据准备完成,中间从标题里提取类别过程比较耗时,建议大家直接用整理好的数据。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"大概思路是对标题进行分词,命名实体识别,标记出名词,找出类别名称,比如坚果、茶叶等。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"三、数据清洗","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"这里的文件清洗几乎用 Excel 搞定,数据集小,用 Excel 效率很高,比如这里做了一个价格区间。到现在数据清洗已经完成(可以用三方工具做可视化了),如果大家爱折腾,可以接着往下看用 Python 如何进行分析。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"四、数据可视化及分析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"1、读取文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"import pandas as pd\nimport matplotlib as mpl\nmpl.rcParams['font.family'] = 'SimHei'\nfrom wordcloud import WordCloud\nfrom ast import literal_eval\nimport matplotlib.pyplot as plt\ndatas = pd.read_csv('./年货.csv',encoding='gbk')\ndatas","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/38/38f40d113564a1a4a5f3d70e6b37437b.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"2、可视化:词云图","attrs":{}}]},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"li = []\nfor each in datas['关键词'].values:\n new_list = str(each).split(',')\n li.extend(new_list)\ndef func_pd(words):\n count_result = pd.Series(words).value_counts()\n return count_result.to_dict()\n\nfrequencies = func_pd(li)\nfrequencies.pop('其他')\n\nplt.figure(figsize = (10,4),dpi=80)\nwordcloud = WordCloud(font_path=\"STSONG.TTF\",background_color='white', width=700,height=350).fit_words(frequencies)\nplt.imshow(wordcloud)\nplt.axis(\"off\")\nplt.show()","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/96/96d2bfbb5ded95073e0fa38282576a26.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"图表说明:我们可以看到词云图,热门(出现次数最多)品类字体最大,依次是:坚果、茶叶、糕点等。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"3、可视化:绘制圆环图","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"# plt.pie(x,lables,autopct,shadow,startangle,colors,explode)\nfood_type = datas.groupby('价格区间').size()\nplt.figure(figsize=(8,4),dpi=80)\nexplodes= [0,0,0,0,0.2,0.1]\nsize = 0.3\nplt.pie(food_type, radius=1,labels=food_type.index, autopct='%.2f%%', colors=['#F4A460','#D2691E','#CDCD00','#FFD700','#EEE5DE'],\n wedgeprops=dict(width=size, edgecolor='w'))\nplt.title('年货价格区间占比情况',fontsize=18)\nplt.legend(food_type.index,bbox_to_anchor=(1.5, 1.0))\nplt.show()","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/89/890a05b583be158c309f8df401d22750.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"图表说明:圆环图和饼图类似,代表部分相对于整体的占比情况,可以看到0 ~ 200元的年货大概33%左右,100 ~ 200元也是33%。说明大部分的年货的价格趋于200以内。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"4、可视化:绘制条形图","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/19/19984e767fe59b571646e560061a61fd.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"图表说明:以上是店铺按销量排名情况,可以看到第一名是三只松鼠旗舰店,看来过年大家都喜欢吃干货。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"5、可视化:绘制横向条形图","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"text"},"content":[{"type":"text","text":"foods = datas.groupby(by='类别')['销量'].sum().sort_values(ascending=False).head(10)\nfoods.sort_values(ascending=True,inplace=True)\nplt.figure(figsize = (10,4),dpi=80)\nplt.xlabel('销量')\nplt.title('年货推荐购买排行榜',fontsize=18)\ncolors = ['#F4A460','#D2691E','#CDCD00','#CD96CD','#EEE5DE', '#EEB4B4', '#FFA07A', '#FFD700']\nplt.barh(foods.index,foods.values, color=colors,height=1)\nplt.show()","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/8f/8fa7efa17c97ff8baaf0e550fe424fd4.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"图表说明:根据类别销量排名,排名第一是坚果,验证了上面的假设,大家喜欢吃坚果。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"结论与建议","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"淘宝热卖年货:","attrs":{}},{"type":"text","text":" 坚果,茶叶,糕点,饼干,糖果,白酒,核桃,羊肉,海参,枸杞;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"年货推荐清单(按销量):","attrs":{}},{"type":"text","text":"坚果、零食、糕点、饼干、茶叶、糖果、松子、红枣、蛋糕、卤味、瓜子、牛奶、核桃;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"年货价格参考:","attrs":{}},{"type":"text","text":"66%以上的年货价格在0~200元之间;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"热门店铺","attrs":{}},{"type":"text","text":":三只老鼠、天猫超市、百草味、良品铺子;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"公众号:杰哥的IT之旅,后台回复:「年货」,即可获取本文完整数据。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/01/019b65e5ec6661e8bbcaec704494797d.jpeg","alt":null,"title":"","style":[{"key":"width","value":"100%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章