def getDataAndSubsample(sample_strategy="under"):
    """
    Load the five-class data set from CSV and rebalance it by resampling.

    The last CSV column is taken as the class label (cast to int); all other
    columns are taken as features. The resampled features and labels are
    merged back into a single DataFrame whose label column is named 'score'.
    The feature column names come from the module-level ``featureClass_list``.

    :param sample_strategy: which resampler to use:
        "under" - RandomUnderSampler (sampling with replacement), per-class
                  counts taken from ``under_map_dict``;
        "over1" - SMOTETomek, per-class counts taken from ``over_map_dict``;
        "over2" - SMOTEENN, per-class counts taken from ``over_map_dict``.
    :return: DataFrame holding the resampled features plus the 'score' column
    :raises ValueError: if ``sample_strategy`` is not one of the names above
    """
    # Validate first so a typo fails fast with a clear message instead of an
    # UnboundLocalError after the CSV has already been read.
    supported_strategies = ("under", "over1", "over2")
    if sample_strategy not in supported_strategies:
        raise ValueError(
            f"unknown sample_strategy {sample_strategy!r}; "
            f"expected one of {supported_strategies}"
        )

    # CSV with the five-class labels (0, 1, 2, 3, 4).
    new_path = '../newData/結果/5#/total/all_20180131_rounding_five.csv'
    # NOTE: if the first CSV column is an index, read it with index_col=[0] instead.
    tmp_data = pd.read_csv(new_path)

    under_map_dict = {0: 300, 1: 300, 2: 300, 3: 300, 4: 300}  # per-class counts for under-sampling
    over_map_dict = {0: 4000, 1: 4000, 2: 4000, 3: 4000, 4: 4000}  # per-class counts for over-sampling

    # imblearn is imported lazily so only the chosen sampler has to be importable.
    if sample_strategy == "under":
        from imblearn.under_sampling import RandomUnderSampler
        # replacement=True turns the random under-sampling into bootstrap sampling.
        # For multi-class data the per-class counts must be given as a dict via
        # sampling_strategy (a single float ratio only works for binary problems).
        sampler = RandomUnderSampler(sampling_strategy=under_map_dict, random_state=0, replacement=True)
    elif sample_strategy == "over1":
        from imblearn.combine import SMOTETomek
        # Seeded like the other samplers so repeated runs give the same result.
        sampler = SMOTETomek(sampling_strategy=over_map_dict, random_state=0)
    else:  # "over2"
        from imblearn.combine import SMOTEENN
        sampler = SMOTEENN(random_state=0, sampling_strategy=over_map_dict)

    features = np.array(tmp_data.iloc[:, 0:-1])
    labels = np.array(tmp_data.iloc[:, -1]).astype(int)
    # fit_resample replaces fit_sample, which newer imbalanced-learn releases no longer provide.
    x_resample, y_resample = sampler.fit_resample(features, labels)
    print(f"len of x_resample={len(x_resample)}")
    print(f"len of y_resample={len(y_resample)}")

    # column_stack turns the 1-D label vector into a column and appends it
    # to the right of the feature matrix.
    total_np_data = np.column_stack((x_resample, y_resample))

    # Column headers: the feature names followed by the label column.
    featureClass_list_columns = list(featureClass_list) + ['score']

    # numpy -> pandas
    total_pd_data = pd.DataFrame(total_np_data, columns=featureClass_list_columns)
    print(total_pd_data)
    print(total_pd_data.columns)
    return total_pd_data
# Run the pipeline once with random under-sampling.
getDataAndSubsample("under")
# 原理過一年再補,急着看的請移步其它博客。