def getDataAndSubsample(sample_strategy="under"):
    """
    Load the 5-class dataset and rebalance it with the chosen sampling strategy.

    :param sample_strategy: one of "under" (random under-sampling),
        "over1" (SMOTETomek combined over-sampling), or
        "over2" (SMOTEENN combined over-sampling)
    :return: pandas DataFrame of resampled features plus a trailing 'score'
        label column
    :raises ValueError: if sample_strategy is not a recognized option
    """
    # 5-class data file (labels 0, 1, 2, 3, 4)
    new_path = '../newData/结果/5#/total/all_20180131_rounding_five.csv'
    # tmp_data = pd.read_csv(new_path, index_col=[0])  # use this if the CSV's first column is an index
    tmp_data = pd.read_csv(new_path)  # use this if the CSV has no index column
    # Extract features (all but the last column) and integer labels (last
    # column) once; previously this was duplicated in every branch below.
    features = np.array(tmp_data.iloc[:, 0:-1])
    labels = np.array(tmp_data.iloc[:, -1]).astype(int)

    under_map_dict = {0: 300, 1: 300, 2: 300, 3: 300, 4: 300}      # per-class target counts for under-sampling
    over_map_dict = {0: 4000, 1: 4000, 2: 4000, 3: 4000, 4: 4000}  # per-class target counts for over-sampling

    if sample_strategy == "under":
        from imblearn.under_sampling import RandomUnderSampler
        # replacement=True makes this a bootstrap draw. For multi-class data
        # the per-class target sizes must be given as a dict via
        # sampling_strategy (a float ratio is only valid for binary problems).
        sampler = RandomUnderSampler(sampling_strategy=under_map_dict, random_state=0, replacement=True)
    elif sample_strategy == "over1":
        from imblearn.combine import SMOTETomek  # SMOTE over-sampling + Tomek-link cleaning
        sampler = SMOTETomek(sampling_strategy=over_map_dict)
    elif sample_strategy == "over2":
        from imblearn.combine import SMOTEENN  # SMOTE over-sampling + ENN cleaning
        sampler = SMOTEENN(random_state=0, sampling_strategy=over_map_dict)
    else:
        # Bug fix: an unknown strategy used to fall through and crash with a
        # NameError on x_resample; fail fast with a clear message instead.
        raise ValueError(f"unknown sample_strategy: {sample_strategy!r}")

    # Bug fix: fit_sample was deprecated in imbalanced-learn 0.4 and removed
    # in 0.6; fit_resample is the supported API.
    x_resample, y_resample = sampler.fit_resample(features, labels)
    print(f"len of x_resample={len(x_resample)}")
    print(f"len of y_resample={len(y_resample)}")

    # Append the labels as one extra column so features and labels share a
    # single 2-D matrix ([:, None] lifts y to a column vector).
    total_np_data = np.column_stack((x_resample, y_resample[:, None]))
    # Feature header plus the label column name.
    # NOTE(review): featureClass_list is a module-level global not visible in
    # this chunk — confirm it is defined elsewhere in the file.
    featureClass_list_columns = featureClass_list.copy()
    featureClass_list_columns.append('score')
    # numpy -> pandas
    total_pd_data = pd.DataFrame(total_np_data, columns=featureClass_list_columns)
    print(total_pd_data)
    print(total_pd_data.columns)
    # Bug fix: the docstring promised a DataFrame but nothing was returned.
    return total_pd_data
if __name__ == "__main__":
    # Guard the demo invocation so importing this module does not trigger
    # CSV loading and resampling as a side effect.
    getDataAndSubsample(sample_strategy="under")
# 原理过一年再补,急着看的请移步其它博客
# (Blog note: "theory write-up deferred; if in a hurry, see other posts."
# This stray prose line was a SyntaxError, so it is preserved as a comment.)