從Python爬蟲爬下來的網頁數據,是一堆亂碼(Json格式),如何在這一堆亂碼中找到自己想要的數據,或者說,如何在多個文件中整合出自己想要的新的內容,然後用一個Excel表格來表示呢?
本博客通過記錄一個實際問題的解決,闡述相關代碼。
問題:
1、從網頁上爬取下的Json格式文件包含商品名字、價格,準確找到該兩項目,通過csv表格完成一個數據可視化;
2、假設網頁上數據小部分有更新,重新爬取到Json格式文件;
3、挖掘出哪些變動;
4、兩者規則不一樣的情況下再次做數據可視化、並且和前一次做一個對比;
5、重要信息(新價格)的數據清洗、覆蓋。
Json文件層次:
data_deal文件下有data文件夾和Js文件夾:其中data放置舊信息;Js放置新信息;
data文件夾下面有Comb文件(對應舊的名字、價格);Prop文件(對應商品的系列號);
一、代碼思路流程圖:
二、代碼解釋:
# -*- coding: utf-8 -*-
# Module setup for a Python 2 crawler post-processing script.
import numpy
import json
import csv
import os
import os.path
import shutil
import sys
import csv
# NOTE(review): `import csv` appears twice above; one is redundant.
# Python 2-only hack: restore setdefaultencoding (hidden by site.py) so
# unicode -> str conversions default to UTF-8. Does not exist in Python 3.
reload(sys)
sys.setdefaultencoding( "utf-8" )
count_the_right_file=0   # printed per file below; never incremented in this script
dets=[]                  # accumulated report rows, later dumped to temp4.csv
sum_prop=0               # count of files whose series full-name still matches
sum_comp=0               # unused in the visible code
num_prop=0               # number of prop files examined
num_comp=0               # unused in the visible code
wrong_files=[]           # comb-file paths whose prices changed (to be re-archived)
其次:定義了一些相關的列表:wrong_files用於存放數據發生改動的data文件夾中的json文件名;
def getDirList(p):
    """List the entry names inside directory *p* (non-recursive).

    An empty path yields an empty list.  A trailing "/" is ensured before
    listing so relative and absolute paths behave consistently.
    """
    path = str(p)
    if not path:
        return []
    if not path.endswith("/"):
        path += "/"
    return os.listdir(path)
js_file_names=[]
js_file_names=getDirList("/home/suanyi3/data_deal/js")
for file in range(len(js_file_names)):
js_file_addr="/home/suanyi3/data_deal/js/"+js_file_names[file]
compare1=[]
compare2=[]
te=0
de=0
#########################################<<>>##########################################
try:
with open(js_file_addr) as f:
data=json.load(f)
#print data
#print data["a"]["bn"]
for i in range(len(data["f"])):
compare1.append(data["f"][i]["be"])
compare1.append(data["f"][i]["bh"])
#print compare1
file_index=(data["a"]["bn"])
file_index=file_index.split('_')
print file_index
bomb_addr="/home/suanyi3/data_deal/data/comb/"+ file_index[0] + ".js"
prop_addr="/home/suanyi3/data_deal/data/prop/"+ file_index[0] + ".js"
except IndexError:
pass
##########################################<<>>###################################
try:
with open(prop_addr) as ff:
prop_data=json.load(ff)
except IOError,IndexError:
pass
if(data["a"]["j"]==prop_data["KEY_SERIES"]["SeriesFullName"]):
de=1
else:
de=0
sum_prop=sum_prop+de
num_prop=num_prop+1
print count_the_right_file
###########################################<<>>##################################
try:
with open(bomb_addr) as ff:
com_data=json.load(ff)
for i in range(len(com_data)):
compare2.append(com_data[i]["CombName"])
compare2.append(com_data[i]["Price"])
#print compare2
except IOError,IndexError:
pass
if(len(compare2)>=len(compare1)):
length=len(compare1)
mis_length=len(compare2)
tell=compare2
else:
length=len(compare2)
mis_length=len(compare1)
tell=compare1
for j in range(length):
count=[]
if(compare1[j]==compare2[j]):
count.append(1)
else:
count.append(0)
# if(count==len(compare2)-1):
# print "rightrightrightright!!!"
# te=1
# else:
# te=0
for j in range(0,length,2):
if(compare1[j+1]!=compare2[j+1]):
te=te+1
else:
te=te+0
body=(js_file_names[file],file_index,data["a"]["j"],prop_data["KEY_SERIES"]["SeriesFullName"],de,com_data[0]["CombName"],compare1[j],compare1[j+1],compare2[j],compare2[j+1],bool(compare1[j+1]==compare2[j+1]),bool(compare1[j]==compare2[j]))
dets.append(body)
if((mis_length-length)>0):
for i in range(length,(mis_length-length),2):
body=(js_file_names[file],file_index,data["a"]["j"],prop_data["KEY_SERIES"]["SeriesFullName"],de,com_data[0]["CombName"],tell[i],tell[i+1],"missssssss","misssssssss",0)
dets.append(body)
dets.append(body)
if (te>=1 and de==1):
wrong_files.append(bomb_addr)
#######################################<<>>##############################################
# Column headers for the comparison report written from ``dets``.
# The report rows carry 12 fields; the original 11-column header left the
# final name-equality flag unlabeled, so a 12th header is appended.
headers = ["Js_File", "prop_File", "New_Series_Name", "Old_Series_Name",
           "The_Same?", "ID", "compare1_CombNames", "compare1_Price",
           "compare2_CombNames", "compare2_Price",
           "All_The_Same?", "Names_The_Same?"]
with open('temp4.csv', 'w') as fff:
    fff_csv = csv.writer(fff)
    fff_csv.writerow(headers)
    fff_csv.writerows(dets)
########################################<<>>##########################################
print wrong_files
print len(wrong_files)
def moveFileto(sourceDir,targetDir):
shutil.copy(sourceDir,targetDir)
targetDir="/home/suanyi3/data_deal/ooo/"
for wrong_file in wrong_files:
moveFileto(wrong_file,targetDir)
三、後續工作:
# Python 2-only re-encoding hack, duplicated from the top of the file;
# reload()/sys.setdefaultencoding do not exist in Python 3.
reload(sys)
sys.setdefaultencoding( "utf-8" )
def store(measurement, addr):
    """Serialize *measurement* as JSON to /home/suanyi3/data_deal/oo/<addr>.json.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    target = "/home/suanyi3/data_deal/oo/" + addr + ".json"
    serialized = json.dumps(measurement, ensure_ascii=False)
    with open(target, 'w') as out:
        out.write(serialized)
def getDirList(p):
    """Return the names of the entries in directory *p*.

    NOTE(review): identical to the ``getDirList`` defined earlier in the
    file; one definition is redundant.  "" maps to an empty list, and a
    trailing "/" is appended when absent before listing.
    """
    directory = str(p)
    if directory == "":
        return []
    if not directory.endswith("/"):
        directory = directory + "/"
    return os.listdir(directory)
def store(measurement, addr):
    """Dump *measurement* as UTF-8 JSON under /home/suanyi3/data_deal/oo/<addr>.json.

    NOTE(review): this redefines the identical ``store`` declared a few
    lines earlier; one of the two definitions should be removed.
    """
    out_path = "/home/suanyi3/data_deal/oo/" + addr + ".json"
    with open(out_path, 'w') as handle:
        handle.write(json.dumps(measurement, ensure_ascii=False))
js_file_names=[]
js_file_names=getDirList("/home/suanyi3/data_deal/js")
#print js_file_names
for file in range(len(js_file_names)):
js_file_addr="/home/suanyi3/data_deal/js/"+js_file_names[file]
compare1=[]
compare2=[]
te=0
de=0
#########################################<<>>##########################################
try:
with open(js_file_addr) as f:
data=json.load(f)
for i in range(len(data["f"])):
compare1.append(data["f"][i]["be"])
compare1.append(data["f"][i]["bh"])
#print compare1
file_index=(data["a"]["bn"])
file_index=file_index.split('_')
print file_index
ooo_addr="/home/suanyi3/data_deal/ooo/"+ file_index[0] + ".js"
except IndexError:
pass
#########################################<<>>#####################################
try:
with open(ooo_addr) as ff:
ooo_data=json.load(ff)
for i in range(min(len(ooo_data),len(data["f"]))):
ooo_data[i]["CombName"]=data["f"][i]["be"]
ooo_data[i]["Price"]=data["f"][i]["bh"]
print ooo_data[i]["CombName"],ooo_data[i]["Price"]
store(ooo_data,file_index[0])
except IOError,IndexError:
pass