csv数据下载链接:https://pan.baidu.com/s/1KTS5WzfH4z9Y4U4rIG-3Ig
代码:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans # 引入机器学习
# Configure matplotlib so the Chinese labels used in the charts render:
# SimHei supplies the CJK glyphs, and disabling the Unicode minus keeps
# negative tick labels readable with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
class GuPiaoData():
    """Staged K-Means analysis of a table of stock indicators.

    The stages are meant to be run in this order, each one reading the file
    written by the previous one: ``detedData`` -> ``cleanData`` ->
    ``chooseData`` -> ``transformData`` -> ``standarData`` -> ``classifyData``.
    All outputs go to fixed paths under ``data/``; this class does not create
    that directory.
    """

    # Source column name -> Chinese display name, in the order used for
    # clustering and for the radar-chart axes.
    COLUMN_NAMES = {
        'Turnover rate': '换手率',            # turnover rate
        'Range of Rise and Fall': '涨跌率',   # price change
        'Price-earning ratio': '市盈率',      # price-to-earnings
        'Price-to-book ratio': '市净率',      # price-to-book
        'Marketing rate': '市销率',           # price-to-sales
        'Realization rate': '市现率',         # price-to-cash-flow
    }
    RAW_COLUMNS = list(COLUMN_NAMES)
    FEATURES = list(COLUMN_NAMES.values())

    def detedData(self, filePath):
        """Explore the raw data.

        Reads the CSV at ``filePath``, then writes the full table to
        ``data/Gupiaodata01.xls`` and the transposed ``describe(include='all')``
        summary to ``data/gupiaodata_describe.xls``.

        :param filePath: path of the raw CSV file
        :return: None
        """
        df = pd.read_csv(filePath)
        describe = df.describe(include='all')
        df.to_excel('data/Gupiaodata01.xls')
        describe.T.to_excel('data/gupiaodata_describe.xls')

    def cleanData(self, filePath):
        """Drop rows that have a missing value in any indicator column.

        Only null filtering is performed; no value-range check is applied.
        The result is written to ``data/gupiaodata_clean.xls``.

        :param filePath: path of the Excel file written by ``detedData``
        :return: None
        """
        df = pd.read_excel(filePath)
        df = df.dropna(subset=self.RAW_COLUMNS)
        df.to_excel('data/gupiaodata_clean.xls')

    def chooseData(self, filepath):
        """Keep only the indicator columns of the cleaned data.

        The result is written to ``data/Gupiaodata_coredata.xls``.

        :param filepath: path of the cleaned Excel file
        :return: None
        """
        df = pd.read_excel(filepath)
        df = df[self.RAW_COLUMNS]
        df.to_excel('data/Gupiaodata_coredata.xls')

    def transformData(self, filePath):
        """Rename the indicator columns to their Chinese display names.

        The result contains only the renamed columns and is written to
        ``data/Gupiaodata_coretransformdata.xls``.

        :param filePath: path of the Excel file written by ``chooseData``
        :return: None
        """
        df = pd.read_excel(filePath)
        df = df[self.RAW_COLUMNS].rename(columns=self.COLUMN_NAMES)
        df.to_excel('data/Gupiaodata_coretransformdata.xls')

    def standarData(self, filepath):
        """Standardize every indicator as (value - mean) / standard deviation.

        Only the indicator columns are standardized, so any extra column in
        the input (such as a re-read index column) is ignored. The result is
        written to ``data/Gupiao_stdcoredata.xls``.

        :param filepath: path of the Excel file written by ``transformData``
        :return: None
        """
        df = pd.read_excel(filepath)
        features = df[self.FEATURES]
        # ddof=0 (population standard deviation) matches numpy's np.std default.
        standardized = (features - features.mean(axis=0)) / features.std(axis=0, ddof=0)
        standardized.to_excel('data/Gupiao_stdcoredata.xls')

    def classifyData(self, filepath, k=5):
        """Cluster the standardized indicators with K-Means and plot the centres.

        Prints the cluster centres and the per-row labels, then shows a radar
        chart with one polygon per cluster and one axis per indicator.

        :param filepath: path of the Excel file written by ``standarData``
        :param k: number of clusters
        :return: None
        """
        df = pd.read_excel(filepath)
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df[self.FEATURES])
        print(kmeans.cluster_centers_)
        print(kmeans.labels_)
        self._plot_radar(kmeans.cluster_centers_, self.FEATURES)

    @staticmethod
    def _radar_angles(n_axes):
        """Return ``n_axes`` evenly spaced angles in radians, first one repeated at the end."""
        angles = np.linspace(0, 2 * np.pi, n_axes, endpoint=False)
        return np.concatenate((angles, angles[:1]))

    @staticmethod
    def _close_polygon(values):
        """Return ``values`` as a float array with its first element appended."""
        values = np.asarray(values, dtype=float)
        return np.concatenate((values, values[:1]))

    def _plot_radar(self, centers, feature_names):
        """Draw one closed polygon per cluster centre on a polar axis."""
        # The number of axes follows the number of features, not the number
        # of clusters, so the chart is valid for any k.
        angles = self._radar_angles(len(feature_names))
        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        for label, center in enumerate(np.asarray(centers)):
            # No explicit colour: the default colour cycle keeps clusters distinct.
            ax.plot(angles, self._close_polygon(center), '--', linewidth=1,
                    label='类别{}'.format(label))
        # The closing angle duplicates the first one, so it gets no tick label.
        ax.set_thetagrids(angles[:-1] * 180 / np.pi, feature_names)
        ax.set_rlim(-2, 4)
        plt.legend(loc='best')
        plt.show()
if __name__ == '__main__':
    # Earlier pipeline stages, kept for reference. Each one writes the file
    # that the next one reads. Note that cleanData writes
    # 'data/gupiaodata_clean.xls' with a lower-case "g".
    #   analysis.detedData('szgupiaodata.csv')
    #   analysis.cleanData('data/Gupiaodata01.xls')
    #   analysis.chooseData('data/gupiaodata_clean.xls')
    #   analysis.transformData('data/Gupiaodata_coredata.xls')
    #   analysis.standarData('data/Gupiaodata_coretransformdata.xls')
    analysis = GuPiaoData()
    # Only the final stage runs: cluster the standardized data into 6 groups.
    analysis.classifyData('data/Gupiao_stdcoredata.xls', k=6)