這個是給國外大哥做的,很尷尬,做完了有答案,要和答案一樣,臨時就又改了,結果鼠標以及各種問題,難受呀
Perform polynomial regression to predict wage using age. Use cross-validation to select the
optimal degree d for the polynomial. What degree was chosen, and how does this compare to the
results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the
data.
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the Wage data from a CSV file given by relative path and preview the
# first rows of the resulting frame.
test=pd.read_csv('Wage.csv')
test.head()
Unnamed: 0 | year | age | sex | maritl | race | education | region | jobclass | health | health_ins | logwage | wage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 231655 | 2006 | 18 | 1. Male | 1. Never Married | 1. White | 1. < HS Grad | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 2. No | 4.318063 | 75.043154 |
1 | 86582 | 2004 | 24 | 1. Male | 1. Never Married | 1. White | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 2. No | 4.255273 | 70.476020 |
2 | 161300 | 2003 | 45 | 1. Male | 2. Married | 1. White | 3. Some College | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 1. Yes | 4.875061 | 130.982177 |
3 | 155159 | 2003 | 43 | 1. Male | 2. Married | 3. Asian | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 1. Yes | 5.041393 | 154.685293 |
4 | 11443 | 2005 | 50 | 1. Male | 4. Divorced | 1. White | 2. HS Grad | 2. Middle Atlantic | 2. Information | 1. <=Good | 1. Yes | 4.318063 | 75.043154 |
# Count the missing values in each column.
test.isnull().sum()
Unnamed: 0 0
year 0
age 0
sex 0
maritl 0
race 0
education 0
region 0
jobclass 0
health 0
health_ins 0
logwage 0
wage 0
dtype: int64
# Single predictor (age) reshaped to an (n_samples, 1) column; target is wage.
X = test['age'].values.reshape(-1, 1)
Y = test['wage']

# Two candidate pipelines that differ only in the final regressor:
# standardise -> polynomial expansion -> RidgeCV / LassoCV, each selecting its
# own alpha from 20 log-spaced candidates between 1e-3 and 1e1.
models = [
    Pipeline([
        ('ss', StandardScaler()),
        ('poly', PolynomialFeatures()),
        ('linear', regressor_cls(alphas=np.logspace(-3, 1, 20))),
    ])
    for regressor_cls in (RidgeCV, LassoCV)
]
# Grid searched by GridSearchCV. Each key is "<pipeline step>__<parameter>",
# addressing the 'poly' and 'linear' steps of the pipelines.
# NOTE(review): the input has a single feature, so interaction_only=True
# presumably leaves no higher-order terms to generate — confirm whether that
# half of the grid is wanted.
parameters = dict(
    poly__degree=[3, 2, 1],
    poly__interaction_only=[True, False],
    poly__include_bias=[True, False],
    linear__fit_intercept=[True, False],
)
# Toy check of PolynomialFeatures on a small two-column frame, using degree 2
# with interaction_only=True. The result is kept in `b` for inspection.
a = pd.DataFrame({
    'name': [1, 2, 3, 4, 5],
    'score': [2, 3, 4, 4, 5],
})
rf = PolynomialFeatures(2, interaction_only=True)
b = rf.fit_transform(a)
# Hold out 20% of the rows; the grid search cross-validates on the rest.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Run the same 5-fold grid search over every candidate pipeline. Iterating the
# list directly keeps the loop correct if pipelines are added or removed.
for pipeline in models:
    model = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=1)
    model.fit(x_train, y_train)
    print(model.best_params_)
    # best_score_ is the mean cross-validated score of the winning setting.
    print("R=%.3f" % model.best_score_)
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': False, 'poly__interaction_only': False}
R=0.072
{'linear__fit_intercept': True, 'poly__degree': 3, 'poly__include_bias': True, 'poly__interaction_only': False}
R=0.072
# Plain least-squares polynomial regression. The regressor is fitted without
# its own intercept (fit_intercept=False).
models = [
    Pipeline([
        ('Poly', PolynomialFeatures()),
        ('Linear', LinearRegression(fit_intercept=False)),
    ])
]
model = models[0]

# Hold out 20% of the rows and standardise age using training statistics only.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

N = 6
d_pool = np.arange(1, N, 1)  # candidate polynomial degrees 1..N-1

# NOTE(review): this scores each degree on a single hold-out split rather than
# by k-fold cross-validation — confirm that is what the exercise intends.
a = []  # held-out MSE per degree, in d_pool order
for d in d_pool:
    model.set_params(Poly__degree=d)
    model.fit(X_train, Y_train)
    y_hat = model.predict(X_test)
    # Score against this cell's own split (Y_test), not a same-named leftover
    # from another cell.
    a.append(mean_squared_error(Y_test, y_hat))
<Figure size 864x432 with 0 Axes>
# Held-out MSE against polynomial degree. Degrees start at 1, and the x
# positions are derived from the number of recorded scores so they stay
# aligned with `a` if the degree range changes.
plt.plot(range(1, len(a) + 1), a)
[<matplotlib.lines.Line2D at 0x20a61cf7c18>]
# Refit a cubic on the full data set (no hold-out) and print its coefficients.
X = test['age'].values.reshape(-1, 1)
y = test['wage']

# Standardise age before expanding it into polynomial terms.
ss = StandardScaler()
X_train = ss.fit_transform(X)

# interaction_only is left at its default (False).
poly = PolynomialFeatures(degree=3)
train1 = poly.fit_transform(X_train)

linear = LinearRegression()
linear.fit(train1, y)
print(linear.coef_)
print(linear.intercept_)
[ 0. 6.00961425 -7.98309085 1.30560185]
119.49381727555215
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Degree-1 baseline: least-squares fit of wage on raw (unscaled) age.
x_train = test['age'].values.reshape(-1, 1)
y_train = test['wage']
regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Evaluate the fitted line on 100 evenly spaced ages between 10 and 80.
xx = np.linspace(10, 80, 100).reshape(-1, 1)
yy = regressor.predict(xx)

# Scatter of the data with the fitted line drawn in red on top.
plt.scatter(x_train, y_train)
plt1, = plt.plot(xx, yy, 'r', label="degree=1")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt1])
plt.show()
# Cubic fit on raw age. Despite the "poly2" suffix in these names, the
# polynomial degree used here is 3.
poly2 = PolynomialFeatures(degree=3)
x_train_poly2 = poly2.fit_transform(x_train)

regressor_poly2 = LinearRegression()
regressor_poly2.fit(x_train_poly2, y_train)

# Expand the evaluation grid with the same transformer before predicting.
xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)

# Scatter of the data with the cubic curve drawn in yellow on top.
plt.scatter(x_train, y_train)
plt2, = plt.plot(xx, yy_poly2, 'y', label="Degree3")
plt.xlabel("age")
plt.ylabel("wage")
plt.legend(handles=[plt2])
plt.show()
# Preview the first rows of the frame again.
test.head()
Unnamed: 0 | year | age | sex | maritl | race | education | region | jobclass | health | health_ins | logwage | wage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 231655 | 2006 | 18 | 1. Male | 1. Never Married | 1. White | 1. < HS Grad | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 2. No | 4.318063 | 75.043154 |
1 | 86582 | 2004 | 24 | 1. Male | 1. Never Married | 1. White | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 2. No | 4.255273 | 70.476020 |
2 | 161300 | 2003 | 45 | 1. Male | 2. Married | 1. White | 3. Some College | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 1. Yes | 4.875061 | 130.982177 |
3 | 155159 | 2003 | 43 | 1. Male | 2. Married | 3. Asian | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 1. Yes | 5.041393 | 154.685293 |
4 | 11443 | 2005 | 50 | 1. Male | 4. Divorced | 1. White | 2. HS Grad | 2. Middle Atlantic | 2. Information | 1. <=Good | 1. Yes | 4.318063 | 75.043154 |
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings
warnings.filterwarnings("ignore")
import itertools
# Sequential ANOVA over nested polynomial models of wage on age, degrees 1-5.
# Each row tests whether adding the next power of age significantly reduces
# the residual sum of squares compared with the previous, smaller model.
# (Regressing age on C(wage) instead would treat every distinct wage value as
# its own category and say nothing about the polynomial degree.)
age_wage = test[['age', 'wage']]
poly_fits = [
    ols(
        'wage ~ ' + ' + '.join('I(age ** %d)' % p for p in range(1, degree + 1)),
        data=age_wage,
    ).fit()
    for degree in range(1, 6)
]
anova_reA = anova_lm(*poly_fits)
print(anova_reA)
df sum_sq mean_sq F PR(>F)
C(wage) 507.0 87692.862849 172.964227 1.382137 5.346952e-07
Residual 2492.0 311855.291817 125.142573 NaN NaN
Fit a step function to predict wage using age, and perform cross-validation to choose the
optimal number of cuts. Make a plot of the fit obtained
# Preview the first values of the age column.
test['age'].head()
0 18
1 24
2 45
3 43
4 50
Name: age, dtype: int64
#18 26 34 42 50
def function1(a):
    """Assign an age to one of two bins split at 32.

    Args:
        a: Age to classify.

    Returns:
        1 if ``a`` is greater than 32, otherwise 2.
    """
    return 1 if a > 32 else 2


def function2(a):
    """Assign an age to one of four bins with cut points 26, 34 and 42.

    Each cut point belongs to the bin below it, so every age lands in the
    interval it actually lies in (an age of exactly 34 or 42 no longer falls
    through to the last bin).

    Args:
        a: Age to classify.

    Returns:
        1 for ``a > 42``, 2 for ``34 < a <= 42``, 3 for ``26 < a <= 34``,
        and 4 otherwise.
    """
    if a > 42:
        return 1
    if a > 34:
        return 2
    if a > 26:
        return 3
    return 4


def function3(a):
    """Assign an age to one of three bins with cut points 28 and 37.

    Each cut point belongs to the bin below it, so an age of exactly 37 is
    placed in the middle bin rather than falling through to the last one.

    Args:
        a: Age to classify.

    Returns:
        1 for ``a > 37``, 2 for ``28 < a <= 37``, and 3 otherwise.
    """
    if a > 37:
        return 1
    if a > 28:
        return 2
    return 3
# Bin age into 2, 3 and 4 intervals. The column names are the Chinese numerals
# for two, three and four.
test['二'] = test['age'].apply(function1)
test['三'] = test['age'].apply(function3)
test['四'] = test['age'].apply(function2)

# Loop-invariant settings: print floats without scientific notation, and the
# candidate Lasso penalties searched for every binning.
np.set_printoptions(suppress=True)
alpha_can = np.logspace(-3, 2, 10)

# NOTE(review): each binning enters the model as one integer-coded column, so
# the fit is linear in the bin code rather than a free level per bin —
# consider one-hot encoding the bins for a true step function.
a = []  # held-out explained variance per binning, in the order 2, 3, 4 bins
for i in ['二', '三', '四']:
    print(i)
    x = test[i].values.reshape(-1, 1)
    # The exercise predicts wage from binned age. Using test['age'] as the
    # target would only measure how well the bins reconstruct age itself.
    y = test['wage']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.8)
    model = Lasso()
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)
    print(':', lasso_model.best_params_)
    y_predict = lasso_model.predict(x_test)
    score = explained_variance_score(y_test, y_predict)
    print('score:', score)
    a.append(score)
二
: {'alpha': 0.001}
score: 0.5294966544778303
三
: {'alpha': 0.001}
score: 0.6360385629505032
四
: {'alpha': 0.003593813663804626}
score: 0.6908722988300695
# Held-out explained variance (collected in `a`) against the number of age
# bins: 2, 3 and 4.
plt.plot([2,3,4],a)
[<matplotlib.lines.Line2D at 0x20a622bb390>]
The Wage data set contains a number of other features not explored in this chapter, such as
marital status (maritl), job class (jobclass), and others. Explore the relationships between some of
these other predictors and wage, and use non-linear fitting techniques in order to fit flexible
models to the data. Create plots of the results obtained, and write a summary of your findings.
(Note, this question is quite open ended. You should think about questions like this as a small
precursor to the final project. Be creative!)
# Reload the CSV into a fresh frame, replacing the previous `test` (and any
# columns added to it), then preview the first rows.
test=pd.read_csv('Wage.csv')
test.head()
Unnamed: 0 | year | age | sex | maritl | race | education | region | jobclass | health | health_ins | logwage | wage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 231655 | 2006 | 18 | 1. Male | 1. Never Married | 1. White | 1. < HS Grad | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 2. No | 4.318063 | 75.043154 |
1 | 86582 | 2004 | 24 | 1. Male | 1. Never Married | 1. White | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 2. No | 4.255273 | 70.476020 |
2 | 161300 | 2003 | 45 | 1. Male | 2. Married | 1. White | 3. Some College | 2. Middle Atlantic | 1. Industrial | 1. <=Good | 1. Yes | 4.875061 | 130.982177 |
3 | 155159 | 2003 | 43 | 1. Male | 2. Married | 3. Asian | 4. College Grad | 2. Middle Atlantic | 2. Information | 2. >=Very Good | 1. Yes | 5.041393 | 154.685293 |
4 | 11443 | 2005 | 50 | 1. Male | 4. Divorced | 1. White | 2. HS Grad | 2. Middle Atlantic | 2. Information | 1. <=Good | 1. Yes | 4.318063 | 75.043154 |
sex
# Keep only the code that precedes the first '.' in each label
# (e.g. "1. Male" -> "1"), then tabulate the resulting codes.
test['sex'] = test['sex'].str.split('.').str[0]
test['sex'].value_counts()
1 3000
Name: sex, dtype: int64
maritl
# Keep only the code that precedes the first '.' in each label
# (e.g. "2. Married" -> "2"), then tabulate the resulting codes.
test['maritl'] = test['maritl'].str.split('.').str[0]
test['maritl'].value_counts()
2 2074
1 648
4 204
5 55
3 19
Name: maritl, dtype: int64
race
# Keep only the code that precedes the first '.' in each label
# (e.g. "1. White" -> "1"), then tabulate the resulting codes.
test['race'] = test['race'].str.split('.').str[0]
test['race'].value_counts()
1 2480
2 293
3 190
4 37
Name: race, dtype: int64
# Keep only the code that precedes the first '.' in each label
# (e.g. "2. HS Grad" -> "2"), then tabulate the resulting codes.
test['education'] = test['education'].str.split('.').str[0]
test['education'].value_counts()
2 971
4 685
3 650
5 426
1 268
Name: education, dtype: int64
# Keep only the code that precedes the first '.' in each label
# (e.g. "1. Industrial" -> "1"), then tabulate the resulting codes.
test['jobclass'] = test['jobclass'].str.split('.').str[0]
test['jobclass'].value_counts()
1 1544
2 1456
Name: jobclass, dtype: int64
# Keep only the code that precedes the first '.' in each label
# (e.g. "1. <=Good" -> "1"), then tabulate the resulting codes.
test['health'] = test['health'].str.split('.').str[0]
test['health'].value_counts()
2 2142
1 858
Name: health, dtype: int64
# Keep only the code that precedes the first '.' in each label
# (e.g. "1. Yes" -> "1"), then tabulate the resulting codes.
test['health_ins'] = test['health_ins'].str.split('.').str[0]
test['health_ins'].value_counts()
1 2083
2 917
Name: health_ins, dtype: int64
# Summarise the column dtypes and non-null counts after the label clean-up.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 3000 non-null int64
1 year 3000 non-null int64
2 age 3000 non-null int64
3 sex 3000 non-null object
4 maritl 3000 non-null object
5 race 3000 non-null object
6 education 3000 non-null object
7 region 3000 non-null object
8 jobclass 3000 non-null object
9 health 3000 non-null object
10 health_ins 3000 non-null object
11 logwage 3000 non-null float64
12 wage 3000 non-null float64
dtypes: float64(2), int64(3), object(8)
memory usage: 304.8+ KB
wage和year age education
# Features: year, age and education; target: wage.
x = test[['year', 'age', 'education']]
y = test['wage']
# One-hot encode the object-typed columns; numeric columns pass through.
x = pd.get_dummies(x)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)
# Seed the forest so the score is reproducible and can be compared across
# feature sets; an unseeded forest gives a different score on every run.
clf = RandomForestRegressor(random_state=2020)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print('score:', explained_variance_score(y_test, y_predict))
score: 0.06934098768375041
wage和year age education maritl
# Features: year, age, education and marital status; target: wage.
x = test[['year', 'age', 'education', 'maritl']]
y = test['wage']
# One-hot encode the object-typed columns; numeric columns pass through.
x = pd.get_dummies(x)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)
# Seed the forest so the score is reproducible and can be compared across
# feature sets; an unseeded forest gives a different score on every run.
clf = RandomForestRegressor(random_state=2020)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print('score:', explained_variance_score(y_test, y_predict))
score: 0.19806405061206722
wage和year age education jobclass
# Features: year, age, education and job class; target: wage.
x = test[['year', 'age', 'education', 'jobclass']]
y = test['wage']
# One-hot encode the object-typed columns; numeric columns pass through.
x = pd.get_dummies(x)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)
# Seed the forest so the score is reproducible and can be compared across
# feature sets; an unseeded forest gives a different score on every run.
clf = RandomForestRegressor(random_state=2020)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print('score:', explained_variance_score(y_test, y_predict))
score: 0.05475790627923649
wage和year age education maritl jobclass
# Features: year, age, education, job class and marital status; target: wage.
x = test[['year', 'age', 'education', 'jobclass', 'maritl']]
y = test['wage']
# One-hot encode the object-typed columns; numeric columns pass through.
x = pd.get_dummies(x)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2020)
# Seed the forest so the score is reproducible and can be compared across
# feature sets; an unseeded forest gives a different score on every run.
clf = RandomForestRegressor(random_state=2020)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print('score:', explained_variance_score(y_test, y_predict))
score: 0.1757834299406874