20200308——多項式迴歸預測工資

這個是給國外大哥做的,很尷尬,做完了有答案,要和答案一樣,臨時就又改了,結果鼠標以及各種問題,難受呀

Perform polynomial regression to predict wage using age. Use cross-validation to select the
optimal degree d for the polynomial. What degree was chosen, and how does this compare to the
results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the
data.

%matplotlib inline
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the Wage data set from 'Wage.csv' (path relative to the working
# directory) and display its first rows.
test=pd.read_csv('Wage.csv')
test.head()
Unnamed: 0 year age sex maritl race education region jobclass health health_ins logwage wage
0 231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic 1. Industrial 1. <=Good 2. No 4.318063 75.043154
1 86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 2. No 4.255273 70.476020
2 161300 2003 45 1. Male 2. Married 1. White 3. Some College 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes 4.875061 130.982177
3 155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes 5.041393 154.685293
4 11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic 2. Information 1. <=Good 1. Yes 4.318063 75.043154
test.isnull().sum()
Unnamed: 0    0
year          0
age           0
sex           0
maritl        0
race          0
education     0
region        0
jobclass      0
health        0
health_ins    0
logwage       0
wage          0
dtype: int64
# Single predictor (age) as a column matrix, and the response (wage).
X=test['age'].values.reshape(-1,1)
Y=test['wage']
# Two candidate pipelines that differ only in the final estimator:
# standardise -> polynomial expansion -> RidgeCV or LassoCV, each choosing
# its own alpha from 20 log-spaced values between 1e-3 and 1e1.
models = [
    Pipeline([
            ('ss', StandardScaler()),
            ('poly', PolynomialFeatures()),
            ('linear', RidgeCV(alphas=np.logspace(-3,1,20)))
        ]),
    Pipeline([
            ('ss', StandardScaler()),
            ('poly', PolynomialFeatures()),
            ('linear', LassoCV(alphas=np.logspace(-3,1,20)))
        ])
] 
# Grid searched for every pipeline; keys use the '<step>__<param>' form to
# address parameters of the named pipeline steps.
parameters = {
    "poly__degree": [3,2,1], 
    "poly__interaction_only": [True, False],
    "poly__include_bias": [True, False],
    "linear__fit_intercept": [True, False]
}
# Toy PolynomialFeatures expansion on a hand-made 5x2 frame; rf, a and b are
# not used by the grid search below.
rf=PolynomialFeatures(2,interaction_only=True)
a=pd.DataFrame({
    'name':[1,2,3,4,5],
    'score':[2,3,4,4,5]
})
b=rf.fit_transform(a)
# Hold out 20% of the rows; random_state=0 fixes the split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Run a 5-fold grid search for each of the two pipelines and print the best
# parameter combination together with its best cross-validated score.
for t in range(2):
    model = GridSearchCV(models[t], param_grid=parameters,cv=5, n_jobs=1)
    model.fit(x_train, y_train)
    print (model.best_params_)
    print ("R=%.3f" %model.best_score_)
    # Hold-out predictions; y_predict is not read again within this block.
    y_predict = model.predict(x_test)
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False}
R=0.072
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': True, 'poly__interaction_only': False}
R=0.072

# Plain polynomial regression pipeline. The intercept is disabled on the
# linear step; PolynomialFeatures is left at its defaults here and only its
# degree is changed in the loop below.
models = [
    Pipeline([
            ('Poly', PolynomialFeatures()),
            ('Linear', LinearRegression(fit_intercept=False)) 
        ])
]
model = models[0]
# Same split settings as the earlier grid-search cell (test_size=0.2,
# random_state=0); X and Y are defined outside this block.
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Fit the scaler on the training rows only, then apply it to the test rows.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)


# t, clrs and line_width are prepared here but not read again in this block.
t=np.arange(len(X_test))
N = 6
# Polynomial degrees to try: 1 .. N-1.
d_pool = np.arange(1,N,1)
m = d_pool.size
clrs = [] 
# One hex colour string per degree, interpolated from 0xff0000 to 0x0000ff.
for c in np.linspace(16711680, 255, m):
    clrs.append('#%06x' % int(c))
line_width = 3
# Collects the test-set mean squared error for each degree, in d_pool order.
a=[]
plt.figure(figsize=(12,6), facecolor='w')
for i,d in enumerate(d_pool):
    # Refit the same pipeline with the polynomial degree set to d.
    model.set_params(Poly__degree=d)
    model.fit(X_train, Y_train) 
    lin = model.get_params('Linear')['Linear']
    # Label text for this degree; built but not printed or plotted here.
    output = u'%d階,係數爲:' % d
    # Insert alpha / l1_ratio into the label only when the fitted estimator
    # exposes them (presumably kept for swapping in regularised estimators).
    if hasattr(lin, 'alpha_'):
        idx = output.find(u'係數')
        output = output[:idx] + (u'alpha=%.6f, ' % lin.alpha_) + output[idx:]
    if hasattr(lin, 'l1_ratio_'):
        idx = output.find(u'係數')
        output = output[:idx] + (u'l1_ratio=%.6f, ' % lin.l1_ratio_) + output[idx:]
    y_hat = model.predict(X_test)
    # Score on the held-out rows; s is not read again in this block.
    s = model.score(X_test, Y_test)
    # NOTE(review): this uses lowercase y_test, which is not assigned in this
    # block, while the score above uses Y_test - confirm both names refer to
    # the same hold-out target.
    mse_predict = mean_squared_error(y_test, y_hat)
    a.append(mse_predict)
<Figure size 864x432 with 0 Axes>
plt.plot([1,2,3,4,5],a)
[<matplotlib.lines.Line2D at 0x20a61cf7c18>]

在這裏插入圖片描述

# Fit a degree-3 polynomial regression of wage on standardised age using all
# rows (no train/test split), then print the coefficients and the intercept.
X=test['age'].values.reshape(-1,1)
y=test['wage']
ss = StandardScaler()
# Despite the name, X_train here is the scaled version of every row of X.
X_train = ss.fit_transform(X)
poly = PolynomialFeatures(degree=3,interaction_only=False)
train1 = poly.fit_transform(X_train)
linear = LinearRegression()
linear.fit(train1,y)
print(linear.coef_)
print(linear.intercept_)
[ 0.          6.00961425 -7.98309085  1.30560185]
119.49381727555215
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Fit a straight line of wage on raw (unscaled) age over all rows and plot it
# on top of a scatter of the data.
x_train=test['age'].values.reshape(-1,1)
y_train=test['wage']
regressor = LinearRegression()
regressor.fit(x_train, y_train)
# 100 evenly spaced ages from 10 to 80, reshaped to a column for predict().
xx = np.linspace(10, 80, 100)  
xx = xx.reshape(xx.shape[0],1)
yy = regressor.predict(xx)  
plt.scatter(x_train, y_train)
# Keep the line handle so the legend lists only the fitted curve.
plt1, = plt.plot(xx, yy,'r',label="degree=1")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt1])
plt.show()

在這裏插入圖片描述


# Fit a degree-3 polynomial of wage on age and plot it over the data.
# x_train, y_train and xx are defined outside this block. The "poly2" names
# are kept as written even though the degree used is 3.
poly2 = PolynomialFeatures(degree=3)  
x_train_poly2 = poly2.fit_transform(x_train)

regressor_poly2 = LinearRegression()
regressor_poly2.fit(x_train_poly2, y_train)

# Expand the plotting grid with the same fitted transformer before predicting.
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)
plt.scatter(x_train, y_train)
plt2, = plt.plot(xx, yy_poly2,'y',label="Degree3")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt2])
plt.show()

在這裏插入圖片描述

test.head()
Unnamed: 0 year age sex maritl race education region jobclass health health_ins logwage wage
0 231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic 1. Industrial 1. <=Good 2. No 4.318063 75.043154
1 86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 2. No 4.255273 70.476020
2 161300 2003 45 1. Male 2. Married 1. White 3. Some College 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes 4.875061 130.982177
3 155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes 5.041393 154.685293
4 11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic 2. Information 1. <=Good 1. Yes 4.318063 75.043154
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
warnings.filterwarnings("ignore")
import itertools
# for A
# Print the ANOVA table of an OLS fit whose response is age and whose single
# term is wage wrapped in C(...), i.e. treated as a categorical factor.
# NOTE(review): the task statement asks to compare polynomial degrees for
# wage as a function of age; this formula does not compare polynomial models -
# confirm it is the intended hypothesis test.
anova_reA= anova_lm(ols('age~C(wage)',data=test[['age','wage']]).fit())
print(anova_reA)
              df         sum_sq     mean_sq         F        PR(>F)
C(wage)    507.0   87692.862849  172.964227  1.382137  5.346952e-07
Residual  2492.0  311855.291817  125.142573       NaN           NaN

Fit a step function to predict wage using age, and perform cross-validation to choose the
optimal number of cuts. Make a plot of the fit obtained.

test['age'].head()
0    18
1    24
2    45
3    43
4    50
Name: age, dtype: int64
#18 26 34 42 50
def function1(a):
    """Assign an age to one of two bins split at 32.

    Args:
        a: The age to classify.

    Returns:
        1 when ``a`` is strictly greater than 32, otherwise 2.
    """
    return 1 if a > 32 else 2
def function2(a):
    """Assign an age to one of four bins with cut points at 26, 34 and 42.

    Args:
        a: The age to classify.

    Returns:
        1 for ``a > 42``, 2 for ``34 < a <= 42``, 3 for ``26 < a <= 34`` and
        4 for ``a <= 26``.
    """
    # Descending threshold checks give every age exactly one bin. The earlier
    # chained strict comparisons (34<a<42, 26<a<34) skipped the cut points, so
    # ages 42 and 34 fell through to bin 4 alongside the youngest group.
    if a > 42:
        return 1
    if a > 34:
        return 2
    if a > 26:
        return 3
    return 4
def function3(a):
    """Assign an age to one of three bins with cut points at 28 and 37.

    Args:
        a: The age to classify.

    Returns:
        1 for ``a > 37``, 2 for ``28 < a <= 37`` and 3 for ``a <= 28``.
    """
    # Descending threshold checks keep the bins contiguous. The earlier strict
    # chained comparison (28<a<37) skipped the cut point, so age 37 fell
    # through to bin 3 alongside the youngest group.
    if a > 37:
        return 1
    if a > 28:
        return 2
    return 3
    
    
# Add three bin-label columns derived from age: '二' (two bins, function1),
# '三' (three bins, function3) and '四' (four bins, function2).
test['二'] = test.apply(lambda x: function1(x['age']), axis = 1)
test['三'] = test.apply(lambda x: function3(x['age']), axis = 1)
test['四'] = test.apply(lambda x: function2(x['age']), axis = 1)
# Collects one hold-out explained-variance score per binning, in loop order.
a=[]
for i in ['二','三','四']:
    print(i)
    # The bin label is passed as a single numeric column, not as indicator
    # (dummy) columns.
    x = test[i].values.reshape(-1,1)
    # NOTE(review): the target here is 'age', the variable the bins were built
    # from, whereas the task statement asks to predict wage - confirm.
    y = test['age']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1,train_size=0.8)    
    model = Lasso()
    # Candidate regularisation strengths: 10 log-spaced values, 1e-3 to 1e2.
    alpha_can = np.logspace(-3,2,10)
    np.set_printoptions(suppress=True)
    #print 'alpha_can = ', alpha_can
    # 5-fold grid search over alpha on the training part only.
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print(':', lasso_model.best_params_)  
    y_predict=lasso_model.predict(x_test)
    print('score:',explained_variance_score(y_test,y_predict))
    a.append(explained_variance_score(y_test,y_predict))
二
: {'alpha': 0.001}
score: 0.5294966544778303
三
: {'alpha': 0.001}
score: 0.6360385629505032
四
: {'alpha': 0.003593813663804626}
score: 0.6908722988300695
plt.plot([2,3,4],a)
[<matplotlib.lines.Line2D at 0x20a622bb390>]

在這裏插入圖片描述
The Wage data set contains a number of other features not explored in this chapter, such as
marital status (maritl), job class (jobclass), and others. Explore the relationships between some of
these other predictors and wage, and use non-linear fitting techniques in order to fit flexible
models to the data. Create plots of the results obtained, and write a summary of your findings.
(Note, this question is quite open ended. You should think about questions like this as a small
precursor to the final project. Be creative!)

# Re-read 'Wage.csv' into test, replacing the frame used so far (and with it
# any columns added to that frame earlier), then display the first rows.
test=pd.read_csv('Wage.csv')
test.head()
Unnamed: 0 year age sex maritl race education region jobclass health health_ins logwage wage
0 231655 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic 1. Industrial 1. <=Good 2. No 4.318063 75.043154
1 86582 2004 24 1. Male 1. Never Married 1. White 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 2. No 4.255273 70.476020
2 161300 2003 45 1. Male 2. Married 1. White 3. Some College 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes 4.875061 130.982177
3 155159 2003 43 1. Male 2. Married 3. Asian 4. College Grad 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes 5.041393 154.685293
4 11443 2005 50 1. Male 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic 2. Information 1. <=Good 1. Yes 4.318063 75.043154

sex

# Keep only the text before the first '.' of each label (e.g. '1. Male'
# becomes '1'), then count how often each code occurs.
test['sex']=test['sex'].apply(lambda x:x.split('.')[0])
test['sex'].value_counts()
1    3000
Name: sex, dtype: int64

maritl

# Keep only the text before the first '.' of each label (e.g.
# '1. Never Married' becomes '1'), then count how often each code occurs.
test['maritl']=test['maritl'].apply(lambda x:x.split('.')[0])
test['maritl'].value_counts()
2    2074
1     648
4     204
5      55
3      19
Name: maritl, dtype: int64

race

# Keep only the text before the first '.' of each label (e.g. '1. White'
# becomes '1'), then count how often each code occurs.
test['race']=test['race'].apply(lambda x:x.split('.')[0])
test['race'].value_counts()
1    2480
2     293
3     190
4      37
Name: race, dtype: int64
# Keep only the text before the first '.' of each label (e.g. '2. HS Grad'
# becomes '2'), then count how often each code occurs.
test['education']=test['education'].apply(lambda x:x.split('.')[0])
test['education'].value_counts()
2    971
4    685
3    650
5    426
1    268
Name: education, dtype: int64
# Keep only the text before the first '.' of each label (e.g. '1. Industrial'
# becomes '1'), then count how often each code occurs.
test['jobclass']=test['jobclass'].apply(lambda x:x.split('.')[0])
test['jobclass'].value_counts()
1    1544
2    1456
Name: jobclass, dtype: int64
# Keep only the text before the first '.' of each label (e.g. '1. <=Good'
# becomes '1'), then count how often each code occurs.
test['health']=test['health'].apply(lambda x:x.split('.')[0])
test['health'].value_counts()
2    2142
1     858
Name: health, dtype: int64
# Keep only the text before the first '.' of each label (e.g. '1. Yes'
# becomes '1'), then count how often each code occurs.
test['health_ins']=test['health_ins'].apply(lambda x:x.split('.')[0])
test['health_ins'].value_counts()
1    2083
2     917
Name: health_ins, dtype: int64
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3000 non-null   int64  
 1   year        3000 non-null   int64  
 2   age         3000 non-null   int64  
 3   sex         3000 non-null   object 
 4   maritl      3000 non-null   object 
 5   race        3000 non-null   object 
 6   education   3000 non-null   object 
 7   region      3000 non-null   object 
 8   jobclass    3000 non-null   object 
 9   health      3000 non-null   object 
 10  health_ins  3000 non-null   object 
 11  logwage     3000 non-null   float64
 12  wage        3000 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 304.8+ KB

wage和year age education

# Random-forest regression of wage on year, age and education.
x=test[['year','age','education']]
y=test['wage']
# Expand the non-numeric column(s) into indicator columns.
x= pd.get_dummies(x)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state=2020 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
# NOTE(review): no random_state is passed to the forest, so the printed score
# may differ between runs - confirm whether reproducibility matters here.
clf =RandomForestRegressor() 
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
# Explained variance on the held-out rows.
print('score:',explained_variance_score(y_test,y_predict))
score: 0.06934098768375041

wage和year age education maritl

# Random-forest regression of wage on year, age, education and maritl.
x=test[['year','age','education','maritl']]
y=test['wage']
# Expand the non-numeric column(s) into indicator columns.
x= pd.get_dummies(x)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state=2020 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
# NOTE(review): no random_state is passed to the forest, so the printed score
# may differ between runs - confirm whether reproducibility matters here.
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
# Explained variance on the held-out rows.
print('score:',explained_variance_score(y_test,y_predict))
score: 0.19806405061206722

wage和year age education jobclass

# Random-forest regression of wage on year, age, education and jobclass.
x=test[['year','age','education','jobclass']]
y=test['wage']
# Expand the non-numeric column(s) into indicator columns.
x= pd.get_dummies(x)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state=2020 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
# NOTE(review): no random_state is passed to the forest, so the printed score
# may differ between runs - confirm whether reproducibility matters here.
clf =RandomForestRegressor() 
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
# Explained variance on the held-out rows.
print('score:',explained_variance_score(y_test,y_predict))
score: 0.05475790627923649

wage和year age education maritl jobclass

# Random-forest regression of wage on year, age, education, jobclass and
# maritl.
x=test[['year','age','education','jobclass','maritl']]
y=test['wage']
# Expand the non-numeric column(s) into indicator columns.
x= pd.get_dummies(x)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state=2020 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2,random_state=2020)
# NOTE(review): no random_state is passed to the forest, so the printed score
# may differ between runs - confirm whether reproducibility matters here.
clf =RandomForestRegressor()
clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
# Explained variance on the held-out rows.
print('score:',explained_variance_score(y_test,y_predict))
score: 0.1757834299406874
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章