機器學習基石—作業2(16-20題Python實現)

import numpy as np
from numpy import random

def sign(x):#自定義符號函數,只返回-1,+1
    ret=np.ones(x.shape)
    for i,each in enumerate(x):
        if each<0: ret[i]=-1
    return ret

def getTheta(x):#由輸入的x生成假設空間的所有theta的序列
    n=len(x)
    l1=sorted(x)
    theta=np.zeros(n)
    for i in range(n-1):
        theta[i]=(l1[i]+l1[i+1])/2
    theta[-1]=1
    return theta
    
def q17_18():
    data_size=20
    expes=5000
    E_in=0
    E_out=0
    for i in range(expes):
        x=random.uniform(-1,1,data_size)
        noise_rate=0.2
        #生成[-0.2,0.8]範圍內的隨機數組,取sign()即變爲有20%的-1的隨機數組
        noise=sign(random.uniform(size=data_size)-noise_rate)
        y=sign(x)*noise   #爲y加上20%的噪聲
        theta=getTheta(x)
        e_in=np.zeros((2,data_size))#對每個theta求出一個error_in,第一行是s=1,第2行是s=-1.
        for i in range(len(theta)):
            a1=y*sign(x-theta[i])
            e_in[0][i]=(data_size-np.sum(a1))/(2*data_size)#數組只有-1和+1,可直接計算出-1所佔比例
            e_in[1][i]=(data_size-np.sum(-a1))/(2*data_size) 
        s=0;theta_best=0
        min0, min1 = np.min(e_in[0]), np.min(e_in[1])
        if min0<min1:
            s=1
            theta_best=theta[np.argmin(e_in[0])]
        else:
            s=-1
            theta_best=theta[np.argmin(e_in[1])]
        e_out=0.5+0.3*s*(np.abs(theta_best)-1)
        E_in+=np.min(e_in)
        E_out+=np.min(e_out)
    ave_in=E_in/expes
    ave_out=E_out/expes
    print(ave_in,ave_out)

def deciStump(x,y):#d:第d維,x:第x維數據 y:標籤
    data_size=len(x)
    theta=getTheta(x)
    e_in=np.zeros((2,data_size))
    for i in range(len(theta)):
        a1=y*sign(x-theta[i])
        e_in[0][i]=(data_size-np.sum(a1))/(2*data_size)
        e_in[1][i]=(data_size-np.sum(-a1))/(2*data_size)
        
    s=0
    min0, min1 = np.min(e_in[0]), np.min(e_in[1])
    if min0<min1:
        s=1
        theta_best=theta[np.argmin(e_in[0])]
    else:
        s=-1
        theta_best=theta[np.argmin(e_in[1])   
    E_in=np.min(np.min(e_in))
    return s,theta_best,E_in

def mkDateSet(datPath):
    dataSet = open(datPath, 'r').readlines()
    m = len(dataSet)
    X_train = np.zeros((m, 9))
    Y_train = np.zeros(m)

    for i, item in enumerate(dataSet):
        each = item.strip().split()
        X_train[i] = [float(a) for a in each[:-1]]
        Y_train[i] = int(each[-1])
    return (X_train, Y_train)

def getData_i(X_train,i):#獲取第d維數據
    return np.reshape(X_train[:,i],len(X_train))#從ndarray二維數組轉爲array一維數組

def q19_20():
    (X_train, Y_train)=mkDateSet('hw2_train.dat')
    e_in=np.zeros(9)
    s=np.zeros(9)
    theta=np.zeros(9)
    for i in range(9):
        s[i],theta[i],e_in[i]=deciStump(getData_i(X_train,i),Y_train)
        
    E_in=np.min(e_in)
    dimension=np.argmin(e_in)
    theta_best=theta[dimension]
    s_best=s[dimension]
    
    (X_test, Y_test)=mkDateSet('hw2_test.dat')
    test_len=len(Y_test)
    X_i=getData_i(X_test,dimension)
    q=Y_test*s_best*sign(X_i-theta_best)
    E_out=(test_len-np.sum(q))/(2*test_len)
    print(E_in,E_out)
    
if __name__=='__main__':
    q17_18()#0.16737 0.257180978031
    q19_20()#0.25 0.355
    
    
    
    
    
    
    
    
    

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章