所謂共線性,指的是自變量之間存在較強甚至完全的線性相關關係。這會導致模型預測能力下降,增加對於模型結果的解釋成本。
例如,繪製兩個特徵的散點圖,並在標題中顯示兩者的相關係數 R:
# Keep only the rows where both columns are present, so the scatter plot and
# the correlation coefficient are computed from the same observations.
plot_data = data[['A', 'B']].dropna()

# Blue-circle scatter of column A against column B.
plt.plot(plot_data['A'], plot_data['B'], 'bo')
plt.xlabel('Site EUI')
plt.ylabel('Weather Norm EUI')

# rowvar=False makes each column a variable; entry [0, 1] of the resulting
# 2x2 matrix is the correlation between A and B.
plt.title(
    'Weather Norm EUI vs Site EUI, R = %.4f'
    % np.corrcoef(plot_data, rowvar=False)[0, 1]
)
剔除共線特徵的函數:
def remove_collinear_features(x, threshold):
    """Drop features that are highly correlated with an earlier feature.

    Every feature column is compared with all feature columns that come
    before it in the dataframe. If the absolute correlation coefficient with
    any of those earlier columns is at or above ``threshold``, the later
    column is removed; the earlier column of the pair is kept.

    Inputs:
        x: dataframe holding the features plus a ``'score'`` column. The
            ``'score'`` column is set aside before correlations are computed,
            so it is never dropped and never causes a feature to be dropped.
        threshold: features whose absolute correlation with an earlier
            feature is greater than or equal to this value are removed.

    Output:
        A new dataframe that contains only the non-highly-collinear features,
        with ``'score'`` re-attached as the last column. The dataframe passed
        in is not modified.
    """
    # Set the target aside so it takes no part in the collinearity check.
    y = x['score']
    x = x.drop(columns=['score'])

    # Work with absolute values: strong negative correlation is collinear too.
    corr_matrix = x.corr().abs()
    columns = corr_matrix.columns

    # For column i, inspect rows 0..i-1 (the part of the matrix above the
    # diagonal). This includes the immediately preceding column, so adjacent
    # pairs such as (0, 1) or (2, 3) are checked as well. NaN correlations
    # compare as False and therefore never trigger a drop.
    drop_cols = [
        columns[i]
        for i in range(1, len(columns))
        if (corr_matrix.iloc[:i, i] >= threshold).any()
    ]

    x = x.drop(columns=drop_cols)

    # Put the target back as the last column.
    x['score'] = y
    return x
# Remove features whose absolute correlation with another feature reaches 0.6,
# then discard any column that consists entirely of missing values.
features = remove_collinear_features(features, 0.6).dropna(axis=1, how='all')

# Report the resulting (rows, columns) and preview the first rows.
print(features.shape)
features.head()
(11319, 65)