sklearn：隨機森林_迴歸樹_波士頓房價_填補缺失值

分類樹和迴歸樹參數差別：

criterion
- 分類：使用基尼係數（gini，默認）或信息熵（entropy，即信息增益）來衡量不純度。
- 迴歸：
  - 均方誤差MSE，使用均值。mse是父節點與葉子節點之間的均方誤差，用來選擇特徵。同時也是用於衡量模型質量的指標。均方誤差本身是非負的，但sklearn把它視為一種損失，在交叉驗證的scoring中統一以「分數越大越好」表示，因此使用的是負均方誤差（neg_mean_squared_error），結果顯示為負數，去掉負號才是真正的MSE。
  - 絕對誤差mae，使用中值。
  - 注意：迴歸樹的接口score默認返回的是R方（負無窮到1，越接近1越好），不是mse

from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Load the Boston housing dataset (features in ``.data``, target in ``.target``).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer releases the import above fails -- confirm the pinned version.
boston = load_boston()

import sklearn
import sklearn.metrics  # load the submodule explicitly instead of relying on side effects

# List every string accepted by the ``scoring=`` parameter of the model
# selection helpers.  ``get_scorer_names()`` is the public API; the
# ``SCORERS`` dict is only consulted on older releases that lack it.
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

# 10-fold cross-validation of a random-forest regressor on the full data.
# ``scoring`` selects the metric; when omitted, a regressor is scored with R^2.
# The valid scoring names are the keys of sklearn.metrics.SCORERS.
forest = RandomForestRegressor(n_estimators=100, random_state=0)
cross_val_score(
    forest,
    boston.data,
    boston.target,
    cv=10,
    scoring="neg_mean_squared_error",
)
# The displayed value is the array of the 10 per-fold scores.

array([-10.72900447,  -5.36049859,  -4.74614178, -20.84946337,
       -12.23497347, -17.99274635,  -6.8952756 , -93.78884428,
       -29.80411702, -15.25776814])

用隨機森林迴歸填補缺失值

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Reload the Boston data for the imputation experiment.
dataset = load_boston()
# Display the shape of the feature matrix as (n_samples, n_features).
dataset.data.shape

(506, 13)

# Keep references to the complete feature matrix and target.
x_full = dataset.data
y_full = dataset.target
# Unpack the two dimensions of the feature matrix in one step.
n_samples, n_features = x_full.shape
n_samples, n_features

(506, 13)

# Decide what proportion of the feature cells should be blanked out.
rng = np.random.RandomState(0)
missing_rate = 0.5
# Number of (row, column) positions to draw: a fraction of all cells, rounded down.
total_cells = n_samples * n_features
n_missing_samples = int(np.floor(total_cells * missing_rate))
n_missing_samples

# Build the dataset with missing values.

# Draw n_missing_samples column indices in [0, n_features) and, independently,
# n_missing_samples row indices in [0, n_samples).  The two draws are
# independent, so the same (row, column) pair can be drawn more than once.
missing_features = rng.randint(low=0, high=n_features, size=n_missing_samples)
missing_samples = rng.randint(low=0, high=n_samples, size=n_missing_samples)

# Work on copies so the complete data stays intact.
x_missing = x_full.copy()
y_missing = y_full.copy()

# Blank out the selected cells, then wrap the matrix in a DataFrame.
x_missing[missing_samples, missing_features] = np.nan
x_missing = pd.DataFrame(x_missing)
x_missing

	0	1	2	3	4	5	6	7	8	9	10	11	12
0	NaN	18.0	NaN	NaN	0.538	NaN	65.2	4.0900	1.0	296.0	NaN	NaN	4.98
1	0.02731	0.0	NaN	0.0	0.469	NaN	78.9	4.9671	2.0	NaN	NaN	396.90	9.14
2	0.02729	NaN	7.07	0.0	NaN	7.185	61.1	NaN	2.0	242.0	NaN	NaN	NaN
3	NaN	NaN	NaN	0.0	0.458	NaN	45.8	NaN	NaN	222.0	18.7	NaN	NaN
4	NaN	0.0	2.18	0.0	NaN	7.147	NaN	NaN	NaN	NaN	18.7	NaN	5.33
...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	NaN	NaN	NaN	0.0	0.573	NaN	69.1	NaN	1.0	NaN	21.0	NaN	9.67
502	0.04527	0.0	11.93	0.0	0.573	6.120	76.7	2.2875	1.0	273.0	NaN	396.90	9.08
503	NaN	NaN	11.93	NaN	0.573	6.976	91.0	NaN	NaN	NaN	21.0	NaN	5.64
504	0.10959	0.0	11.93	NaN	0.573	NaN	89.3	NaN	1.0	NaN	21.0	393.45	6.48
505	0.04741	0.0	11.93	0.0	0.573	6.030	NaN	NaN	1.0	NaN	NaN	396.90	7.88

506 rows × 13 columns

from sklearn.impute import SimpleImputer  # 專門用於填補缺失值的類

# Baseline 1: replace every NaN with the mean of its column.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
mean_filled = imp_mean.fit_transform(x_missing)
x_missing_mean = pd.DataFrame(mean_filled)
x_missing_mean

	0	1	2	3	4	5	6	7	8	9	10	11	12
0	3.627579	18.000000	11.163464	0.066007	0.538000	6.305921	65.2	4.090000	1.000000	296.000000	18.521192	352.741952	4.980000
1	0.027310	0.000000	11.163464	0.000000	0.469000	6.305921	78.9	4.967100	2.000000	405.935275	18.521192	396.900000	9.140000
2	0.027290	10.722951	7.070000	0.000000	0.564128	7.185000	61.1	3.856371	2.000000	242.000000	18.521192	352.741952	12.991767
3	3.627579	10.722951	11.163464	0.000000	0.458000	6.305921	45.8	3.856371	9.383871	222.000000	18.700000	352.741952	12.991767
4	3.627579	0.000000	2.180000	0.000000	0.564128	7.147000	67.4	3.856371	9.383871	405.935275	18.700000	352.741952	5.330000
...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	3.627579	10.722951	11.163464	0.000000	0.573000	6.305921	69.1	3.856371	1.000000	405.935275	21.000000	352.741952	9.670000
502	0.045270	0.000000	11.930000	0.000000	0.573000	6.120000	76.7	2.287500	1.000000	273.000000	18.521192	396.900000	9.080000
503	3.627579	10.722951	11.930000	0.066007	0.573000	6.976000	91.0	3.856371	9.383871	405.935275	21.000000	352.741952	5.640000
504	0.109590	0.000000	11.930000	0.066007	0.573000	6.305921	89.3	3.856371	1.000000	405.935275	21.000000	393.450000	6.480000
505	0.047410	0.000000	11.930000	0.000000	0.573000	6.030000	67.4	3.856371	1.000000	405.935275	18.521192	396.900000	7.880000

506 rows × 13 columns

# Baseline 2: replace every NaN with the constant 0.
imp_0 = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)
zero_filled = imp_0.fit_transform(x_missing)
x_missing_0 = pd.DataFrame(zero_filled)
x_missing_0

	0	1	2	3	4	5	6	7	8	9	10	11	12
0	0.00000	18.0	0.00	0.0	0.538	0.000	65.2	4.0900	1.0	296.0	0.0	0.00	4.98
1	0.02731	0.0	0.00	0.0	0.469	0.000	78.9	4.9671	2.0	0.0	0.0	396.90	9.14
2	0.02729	0.0	7.07	0.0	0.000	7.185	61.1	0.0000	2.0	242.0	0.0	0.00	0.00
3	0.00000	0.0	0.00	0.0	0.458	0.000	45.8	0.0000	0.0	222.0	18.7	0.00	0.00
4	0.00000	0.0	2.18	0.0	0.000	7.147	0.0	0.0000	0.0	0.0	18.7	0.00	5.33
...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.00000	0.0	0.00	0.0	0.573	0.000	69.1	0.0000	1.0	0.0	21.0	0.00	9.67
502	0.04527	0.0	11.93	0.0	0.573	6.120	76.7	2.2875	1.0	273.0	0.0	396.90	9.08
503	0.00000	0.0	11.93	0.0	0.573	6.976	91.0	0.0000	0.0	0.0	21.0	0.00	5.64
504	0.10959	0.0	11.93	0.0	0.573	0.000	89.3	0.0000	1.0	0.0	21.0	393.45	6.48
505	0.04741	0.0	11.93	0.0	0.573	6.030	0.0	0.0000	1.0	0.0	0.0	396.90	7.88

506 rows × 13 columns

# Impute with a random forest: regress each incomplete feature on the other
# features plus the target.  Columns with fewer missing values are filled first.

x_missing_reg = x_missing.copy()
# Count the NaNs per column, then take the column order that sorts those
# counts ascending.
null_counts = x_missing_reg.isnull().sum(axis=0)
sortindex = np.argsort(null_counts).values
sortindex

array([ 6, 12,  8,  7,  9,  0,  2,  1,  5,  4,  3, 10, 11], dtype=int64)

# Fill the columns one at a time, in the order given by ``sortindex``.
# For each column i:
#   * the column itself is the regression target,
#   * the predictors are all other columns plus the original label y_full,
#   * rows where column i is observed form the training set,
#   * rows where column i is NaN are predicted and written back in place.
for i in sortindex:
    fillc = x_missing_reg.iloc[:, i]
    # Positional boolean mask, so row selection below does not depend on the
    # DataFrame carrying a default integer index.
    missing_mask = fillc.isnull().values
    if not missing_mask.any():
        # Nothing to impute; predicting on zero rows would raise.
        continue

    df = pd.concat([x_missing_reg.drop(i, axis=1), pd.DataFrame(y_full)], axis=1)

    # The predictors may still contain NaNs in not-yet-imputed columns;
    # fill them with 0 for this iteration only.
    df_0 = SimpleImputer(missing_values=np.nan
                         , strategy='constant'
                         , fill_value=0
                         ).fit_transform(df)

    y_train = fillc[~missing_mask]
    x_train = df_0[~missing_mask, :]
    x_test = df_0[missing_mask, :]

    # Seeded like the other estimators in this script so the imputed values
    # are reproducible from run to run.
    rfr = RandomForestRegressor(n_estimators=100, random_state=0)
    rfr.fit(x_train, y_train)
    y_predict = rfr.predict(x_test)

    x_missing_reg.loc[missing_mask, i] = y_predict

# Fit the same model on each version of the data and compare the errors.

# Order matters: full data, mean-imputed, zero-imputed, forest-imputed.
X = [x_full, x_missing_mean, x_missing_0, x_missing_reg]

mse = []
std = []  # declared here but never filled in this cell
for x in X:
    estimator = RandomForestRegressor(n_estimators=100, random_state=0)
    fold_scores = cross_val_score(
        estimator, x, y_full, cv=5, scoring='neg_mean_squared_error'
    )
    scores = fold_scores.mean()
    # The scorer returns negated MSE; flip the sign to get the MSE itself.
    mse.append(-scores)

# Draw the mean cross-validated MSE of each dataset as a horizontal bar chart.

# One label per entry of X, in the same order the scores were computed:
# full data, mean-imputed, zero-imputed, forest-imputed.
x_labels = [
    'Full data',
    'Mean Imputation',
    'Zero Imputation',
    'Regressor Imputation',
]
colors = ['r', 'g', 'b', 'orange']

plt.figure(figsize=(12, 6))
ax = plt.subplot(111)
for i, (value, color) in enumerate(zip(mse, colors)):
    ax.barh(i, value, color=color, alpha=0.6, align='center')

ax.set_title('Imputation Techniques with Boston Data')
# Pad the x-range by 10% on both sides of the observed MSE values.
ax.set_xlim(left=np.min(mse) * 0.9,
            right=np.max(mse) * 1.1
            )
ax.set_yticks(range(len(mse)))
ax.set_xlabel('MSE')
ax.set_yticklabels(x_labels)
plt.show()

sklearn：隨機森林_迴歸樹_波士頓房價_填補缺失值

用隨機森林迴歸填補缺失值

.NET開源強大、易於使用的緩存框架 - FusionCache

面試，有時候是個運氣活

windows conda Permission to listen on port 8888 denied

打包上傳python代碼到pypi，通過pip安裝使用

windows解除文件佔用

python壓縮指定文件或目錄爲zip

python logging 日誌按時間間隔自動切分

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結