from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression
from sklearn. preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib. pyplot as plt
import pandas as pd
from pandas import DataFrame
import time
# Plot configuration: select the SimHei font for sans-serif text (the plot
# labels further down are Chinese) and disable the Unicode minus glyph.
mpl.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})

# Load the ';'-separated household power consumption sample.
path1 = 'datas/household_power_consumption_1000.txt'
df = pd.read_csv(path1, sep=';', low_memory=False)

# Preview the first rows (only displayed when run as a notebook cell).
df.head()
Date
Time
Global_active_power
Global_reactive_power
Voltage
Global_intensity
Sub_metering_1
Sub_metering_2
Sub_metering_3
0
16/12/2006
17:24:00
4.216
0.418
234.84
18.4
0.0
1.0
17.0
1
16/12/2006
17:25:00
5.360
0.436
233.63
23.0
0.0
1.0
16.0
2
16/12/2006
17:26:00
5.374
0.498
233.29
23.0
0.0
2.0
17.0
3
16/12/2006
17:27:00
5.388
0.502
233.74
23.0
0.0
1.0
17.0
4
16/12/2006
17:28:00
3.666
0.528
235.68
15.8
0.0
1.0
17.0
# Print column dtypes and non-null counts of the raw frame.
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
# The raw file marks missing readings with '?'; turn those into NaN so that
# pandas can recognise them as missing.
new_df = df.replace(to_replace='?', value=np.nan)

# Drop every row (axis=0) that has at least one missing value.
datas = new_df.dropna(how='any', axis=0)

# Per-column summary statistics, transposed so each column is one row.
datas.describe().T
count
mean
std
min
25%
50%
75%
max
Global_active_power
1000.0
2.418772
1.239979
0.206
1.806
2.414
3.308
7.706
Global_reactive_power
1000.0
0.089232
0.088088
0.000
0.000
0.072
0.126
0.528
Voltage
1000.0
240.035790
4.084420
230.980
236.940
240.650
243.295
249.370
Global_intensity
1000.0
10.351000
5.122214
0.800
8.400
10.000
14.000
33.200
Sub_metering_1
1000.0
0.000000
0.000000
0.000
0.000
0.000
0.000
0.000
Sub_metering_2
1000.0
2.749000
8.104053
0.000
0.000
0.000
1.000
38.000
Sub_metering_3
1000.0
5.756000
8.066941
0.000
0.000
0.000
17.000
19.000
# Print column dtypes and non-null counts once more.
# NOTE(review): this inspects the raw `df` again rather than the cleaned
# `datas` built just above — confirm which frame was intended.
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
def date_format(dt):
    """Split a (date, time) pair of strings into numeric components.

    Args:
        dt: An iterable of two strings, the date as ``dd/mm/YYYY`` followed by
            the time as ``HH:MM:SS`` (for example one row of the ``Date`` and
            ``Time`` columns).

    Returns:
        A tuple ``(year, month, day, hour, minute, second)`` of ints.

    Raises:
        ValueError: If the joined text does not match
            ``'%d/%m/%Y %H:%M:%S'``.
    """
    # Join the two fields with a space so a single strptime call parses both.
    # `time` is imported once at module level; no per-call import is needed.
    t = time.strptime(' '.join(dt), '%d/%m/%Y %H:%M:%S')
    return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)
# Features: the first two columns (Date, Time), expanded row by row into six
# numeric columns via date_format.
date_time_cols = datas.iloc[:, 0:2]
X = date_time_cols.apply(lambda row: pd.Series(date_format(row)), axis=1)

# Target: the active power reading.
Y = datas['Global_active_power']

# Preview the first two feature rows (only displayed in a notebook cell).
X.head(2)
0
1
2
3
4
5
0
2006
12
16
17
24
0
1
2006
12
16
17
25
0
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Report the shapes of the training features, test features and training target.
for subset in (X_train, X_test, Y_train):
    print(subset.shape)
(800, 6)
(200, 6)
(800,)
# Summary statistics of the training features before scaling, transposed so
# each feature column becomes one row.
X_train. describe( ) . T
count
mean
std
min
25%
50%
75%
max
0
800.0
2006.00000
0.000000
2006.0
2006.0
2006.0
2006.0
2006.0
1
800.0
12.00000
0.000000
12.0
12.0
12.0
12.0
12.0
2
800.0
16.59875
0.490458
16.0
16.0
17.0
17.0
17.0
3
800.0
10.75500
8.068386
0.0
4.0
8.0
19.0
23.0
4
800.0
29.72375
17.266517
0.0
15.0
30.0
45.0
59.0
5
800.0
0.00000
0.000000
0.0
0.0
0.0
0.0
0.0
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Summary statistics of the scaled training features.
pd.DataFrame(X_train).describe().T
count
mean
std
min
25%
50%
75%
max
0
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
2
800.0
2.196299e-15
1.000626
-1.221561
-1.221561
0.818625
0.818625
0.818625
3
800.0
-8.604228e-17
1.000626
-1.333814
-0.837742
-0.341670
1.022529
1.518601
4
800.0
3.691492e-17
1.000626
-1.722545
-0.853268
0.016009
0.885286
1.696611
5
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
# Fit an ordinary linear regression on the scaled time features.
lr = LinearRegression()
lr.fit(X_train, Y_train)

# Predictions on the held-out split.
y_predict = lr.predict(X_test)

# R^2 on the training and the test split.
train_r2 = lr.score(X_train, Y_train)
test_r2 = lr.score(X_test, Y_test)
print("訓練R2:", train_r2)
print("測試R2:", test_r2)

# Root-mean-squared error of the test predictions.
squared_errors = (y_predict - Y_test) ** 2
mse = np.average(squared_errors)
rmse = np.sqrt(mse)
print("rmse:", rmse)
訓練R2: 0.24409311805909026
測試R2: 0.12551628513735846
rmse: 1.164092345973625
# Persist the fitted scaler and model, reload them, and predict one sample.
#
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn, so on current versions that import raises ImportError.
# joblib is a dependency of scikit-learn and can be imported directly; the
# fallback keeps very old environments (bundled joblib only) working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(ss, "data_ss.model")
joblib.dump(lr, "data_lr.model")

# NOTE: joblib files are pickle-based; only load files from a trusted source
# (here they were written by the two lines above).
ss = joblib.load("data_ss.model")
lr = joblib.load("data_lr.model")

# One sample in the training feature order produced by date_format:
# year, month, day, hour, minute, second.
data1 = [[2006, 12, 17, 12, 25, 0]]
data1 = ss.transform(data1)
print(data1)
lr.predict(data1)
[[ 0. 0. 0.81862454 0.15440249 -0.27374978 0. ]]
array([1.16996393])
# Plot actual vs. predicted active power for every test sample.
t = np.arange(len(X_test))
plt.figure(facecolor='w')
plt.plot(t, Y_test, 'r-', linewidth=2, label='真實值')
plt.plot(t, y_predict, 'g-', linewidth=2, label='預測值')
plt.legend(loc='upper left')
plt.title("線性迴歸預測時間和功率之間的關係", fontsize=20)
# The `b=` keyword of grid() was renamed to `visible=` in Matplotlib 3.5 and
# the old name was later removed; passing the flag positionally works on both
# old and new versions.
plt.grid(True)
plt.show()
![png](output_18_0.png)
# Second experiment: predict the current (Global_intensity, column 5) from the
# two power columns (Global_active_power and Global_reactive_power, columns 2-3).
# Note that this rebinds the module-level name X used by the first experiment.
X = datas.iloc[:, 2:4]
Y2 = datas.iloc[:, 5]

# Same 80/20 reproducible split as the first experiment.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X, Y2, test_size=0.2, random_state=0
)

# Fit the scaler on the training split only and reuse it for the test split.
scaler2 = StandardScaler()
X2_train = scaler2.fit_transform(X2_train)
X2_test = scaler2.transform(X2_test)

lr2 = LinearRegression()
lr2.fit(X2_train, Y2_train)
Y2_predict = lr2.predict(X2_test)

# LinearRegression.score returns R^2 on the given split.
print("電流預測準確率: ", lr2.score(X2_test, Y2_test))
print("電流參數:", lr2.coef_)

# Plot actual vs. predicted current for every test sample.
t = np.arange(len(X2_test))
plt.figure(facecolor='w')
plt.plot(t, Y2_test, 'r-', linewidth=2, label=u'真實值')
plt.plot(t, Y2_predict, 'g-', linewidth=2, label=u'預測值')
plt.legend(loc='lower right')
plt.title(u"線性迴歸預測功率與電流之間的關係", fontsize=20)
# The `b=` keyword of grid() was renamed to `visible=` in Matplotlib 3.5 and
# the old name was later removed; passing the flag positionally works on both
# old and new versions.
plt.grid(True)
plt.show()
電流預測準確率: 0.9920420609708968
電流參數: [5.07744316 0.07191391]
![png](output_19_1.png)