Python 3 Machine Learning Notes

  1. Sorting a dict by value in descending order
d={"a":3,"b":2,"c":2}
d={k:v for k,v in  sorted(d.items(),key=lambda x:x[1],reverse=True)}
  2. An important difference between pandas' pd.read_csv() and pd.read_table() is their default separator
pd.read_csv(file_path,sep=',')
pd.read_table(file_path,sep='\t')
  3. After a pandas groupby, joining one column's values into a single string
corpus = df.groupby(['file_id'])['api'].transform(lambda x: ' '.join(str(a) for a in x))
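The transform call above writes the joined string back onto every row of the original frame, while apply yields one row per group. A minimal sketch of the difference, assuming a small toy df:
import pandas as pd
df = pd.DataFrame({'file_id': [1, 1, 2], 'api': ['open', 'read', 'close']})
# transform: result is aligned with df's index, so it can be assigned as a new column
df['api_seq'] = df.groupby('file_id')['api'].transform(lambda x: ' '.join(str(a) for a in x))
# apply: one value per group, indexed by file_id (the form used in the n-gram item below)
per_file = df.groupby('file_id')['api'].apply(lambda x: ' '.join(str(a) for a in x))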
  4. Specifying a fixed vocabulary for text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer(vocabulary=['a','b','c','d','e','f','g'],stop_words=[])
re = tfidf2.fit_transform(corpus)
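corpus is not defined in the snippet above; a minimal usage sketch with a hypothetical two-document corpus (the output columns follow the order of the vocabulary passed in):
corpus = ['a b c a', 'd e f g']          # hypothetical documents
re = tfidf2.fit_transform(corpus)
print(tfidf2.get_feature_names())        # ['a', 'b', 'c', 'd', 'e', 'f', 'g']
print(re.toarray().shape)                # (2, 7): one column per vocabulary entry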
  5. Computing common statistics on a DataFrame
df = pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3]})
data = df[['file_id']].drop_duplicates()  # one row per file_id
stats = ['count','nunique','max','min','median','std']  # 'nunique' = number of distinct values per group
for stat in stats:
    data['tid_'+stat] = list(df.groupby(['file_id'])['tid'].agg(stat))
quantiles = [0.05,0.25,0.5,0.75,0.95]
for quant in quantiles:
    data['tid_qua_'+str(100*quant)] = list(df.groupby(['file_id'])['tid'].quantile(quant).values)
  6. N-gram model (here 2-, 3- and 4-grams are extracted and merged into data)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
        ngram_range=(2, 4),#token_pattern = r'\b\w+\b', #vocabulary=vocabulary,
        stop_words=[','],decode_error="ignore",
        max_df=0.90,
        min_df=0.01
    )
df=pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3],'api':[1,2,3,2,4,3,2]})
data = df[['file_id']].drop_duplicates()
corpus = df.groupby(['file_id'])['api'].apply(lambda x: ' '.join([str(a) for a in list(x)]))
corpus = list(corpus)
tfidfs=vectorizer.fit_transform(corpus)
tfidf = pd.DataFrame(tfidfs.todense(), columns=['n_gram_'+i for i in vectorizer.get_feature_names()])
print('there are %s n-gram features' % len(vectorizer.vocabulary_))
tfidf['file_id'] = list(data['file_id'])  # assumes data['file_id'] is in the same (sorted) order as the groupby output
data = pd.merge(data, tfidf, on='file_id')
  7. Handling NaN and inf values in NumPy
train=np.array([[np.nan, np.nan, 1, 2], [np.inf, np.inf, 3, 4], [1, 1, 1, 1], [2, 2, 2, 2]])
train[np.isnan(train)]=0
train[np.isinf(train)]=0
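np.nan_to_num does a similar job in one call; note that by default it maps inf to very large finite numbers rather than 0 (the posinf/neginf arguments that force 0 require NumPy >= 1.17):
train = np.nan_to_num(train)                           # nan -> 0.0, +/-inf -> large finite floats
# train = np.nan_to_num(train, posinf=0.0, neginf=0.0) # NumPy >= 1.17: force inf to 0 as well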
  8. Key points about Python garbage collection
  • Python keeps a reference count for every object, which is a clean and simple idea
  • Because of reference cycles and similar issues, garbage collection in practice is more involved
  • When a data structure is very large (e.g. a big table), updating reference counts may recurse through many steps
  • del(variable) does not make the memory used by variable disappear immediately
  • Call gc.collect() to trigger garbage collection manually (see the sketch below)
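A minimal sketch of releasing a large object, assuming a no-longer-needed DataFrame named big_df (a hypothetical name):
import gc
del big_df      # removes the name; the memory is freed only once the reference count drops to 0
gc.collect()    # force a collection pass, which also breaks unreachable reference cycles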
  9. 1-D interpolation in Python with scipy.interpolate
from scipy.interpolate import interp1d
import numpy as np
x = np.array([2,4,6,8,10])
y = np.array([38,39,21,56,77])
px = np.array([2,3,4,5,6,7,8,9,10])
# kind options: 'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'
py = interp1d(x,y,kind='quadratic')(px)
  10. Saving and loading models with scikit-learn or Keras (a joblib/HDF5 sketch for the non-JSON cases follows the helpers below)
# serialize the Keras model architecture to a JSON file (YAML or HDF5 are also possible)
def save_model(model,file_name='./model.json'):
    model_json = model.to_json()   
    with open(file_name, "w") as json_file:
        json_file.write(model_json) 

# load a Keras model architecture back from a JSON file
def load_model(file_name='./model.json'):
    from keras.models import model_from_json
    model = None
    with open(file_name, "r") as json_file:
        model_json=json_file.read()
        model=model_from_json(model_json)
    return model
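The JSON helpers above only serialize the Keras network architecture; the weights, and scikit-learn style models (RandomForest, XGBClassifier, LGBMClassifier, ...), need a different route. A minimal sketch, assuming model is already trained:
# Keras: store the weights next to the JSON architecture, or the whole model in one HDF5 file
model.save_weights('model_weights.h5')
model.load_weights('model_weights.h5')
# model.save('model.h5') together with keras.models.load_model('model.h5') keeps architecture + weights in one file

# scikit-learn style estimators: pickle them with joblib
import joblib
joblib.dump(model, 'model.pkl')
model = joblib.load('model.pkl')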
  11. A TextCNN example for an 8-class problem; each input text has more than 6,000 tokens of varying length, simply truncated here (model definition only; training is shown further below)
import numpy as np
import pandas as pd 
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM,Dense,Conv1D,MaxPooling1D,Dropout,Input,GlobalMaxPooling1D
from keras.layers import SpatialDropout1D,GRU
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
from keras.utils import plot_model # model visualisation

def dnn_1():
    embedding_vecor_length=256
    api_total_num=301
    max_seq_length=6000
    drop_rate1=0.25
    drop_rate2=0.5
    drop_rate3=0.25
    num_filters=64
    nb_classes=8
    kernel_size=[2,3,4,5]


    input_type = Input(shape=(max_seq_length,), dtype='int16')
    embd = Embedding(api_total_num, embedding_vecor_length, input_length=max_seq_length, mask_zero=False)(input_type)
    embd = SpatialDropout1D(drop_rate1)(embd)
    wrappers = []
    for sizei in kernel_size:
        for dilated_rate in [1,2,3,4]:
            conv1d = Conv1D(filters=num_filters, kernel_size=sizei, activation='relu', dilation_rate=dilated_rate)(embd)
            wrappers.append(GlobalMaxPooling1D()(conv1d))
    fc = concatenate(wrappers)
    fc = Dropout(drop_rate2)(fc)
    fc = Dense(256, activation='relu')(fc)
    fc = Dropout(drop_rate3)(fc)
    preds = Dense(nb_classes, activation = 'softmax')(fc)
    
    model = Model(inputs=input_type, outputs=preds)
    
    model.compile(loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    print(model.summary())
    return model
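The network expects fixed-length integer sequences of max_seq_length; pad_sequences (already imported above) does the truncation/padding mentioned in the heading. A minimal sketch, assuming sequences is a list of variable-length lists of API ids:
X = pad_sequences(sequences, maxlen=6000, padding='post', truncating='post', value=0)
# X.shape == (num_samples, 6000); long samples are cut at the end, short ones padded with 0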
  12. An LSTM example whose input is similar to the TextCNN above (model definition only; training is shown further below)
# same dependencies as item 11
def lstm_1():
    embedding_vecor_length=256
    api_total_num=301
    max_seq_length=6000
    nb_classes=8
    
    model = Sequential()
    model.add(Embedding(api_total_num, embedding_vecor_length, input_length=max_seq_length)) 
    model.add(Conv1D(filters=128, kernel_size=2, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))
    model.add(LSTM(64))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', 
                  optimizer='rmsprop', # root mean square prop/adam
                  metrics=['accuracy']) # correct rate
    print(model.summary())
    return model
  13. A random forest example (model definition only; training is shown further below)
def rf_1():
    '''
    # Document
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    '''
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',# entropy
        min_samples_split=100,# The minimum number of samples required to split an internal node
        min_samples_leaf=20, # The minimum number of samples required to be at a leaf node.
        max_depth=None, # 8
        max_features='sqrt' ,
        random_state=100,
    )
    return model
  14. A LightGBM example (model definition only; training is shown further below)
def lgb_1():
    '''
    # Document
    http://lightgbm.apachecn.org/cn/latest/index.html
    '''
    import lightgbm as lgb
    params = {
            'task':'train', 
            'boosting_type':'gbdt',
            'num_leaves': 31,
            'objective': 'multiclass',
            'num_class':8,
            'learning_rate': 0.05,
            'feature_fraction': 0.85,
            'subsample':0.85,
            'num_threads': 32,
            'metric':'multi_logloss',
            'seed':100
    }  
    model=lgb.LGBMClassifier(**params) # classifier rather than regressor, since the objective is 8-class 'multiclass'
    return model
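The parameter dict above is written in LightGBM's native-API style; a minimal sketch of using the same params directly with lgb.train and lgb.Dataset (X_train, y_train, X_val, y_val are assumed to exist):
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
booster = lgb.train(params, lgb_train, num_boost_round=2000,
                    valid_sets=[lgb_train, lgb_val],
                    early_stopping_rounds=100)
pred = booster.predict(X_val, num_iteration=booster.best_iteration)  # (n_samples, 8) class probabilities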
  15. An XGBoost example (model definition only; training is shown further below)
def xgb_1():
    from xgboost.sklearn import XGBClassifier
    ''' https://www.programcreek.com/python/example/95386/xgboost.sklearn.XGBClassifier
    ''' 
    model=XGBClassifier(
        silent=0,  # whether to print messages while running; 1 suppresses output, 0 (recommended) keeps it
        # nthread=4,  # number of CPU threads; defaults to the maximum available
        learning_rate=0.3,  # shrinkage / learning rate
        min_child_weight=3,
        # minimum sum of instance weight (hessian) needed in a child; for an imbalanced 0-1 problem with h
        # around 0.01, min_child_weight=1 roughly means a leaf needs at least 100 samples. This parameter
        # strongly affects the result: the smaller it is, the easier the model overfits.
        max_depth=6,  # maximum tree depth; deeper trees overfit more easily
        gamma=0.1,  # minimum loss reduction required for a further split on a leaf; larger is more conservative (typically 0.1 or 0.2)
        subsample=0.7,  # row subsampling ratio of the training instances
        max_delta_step=0,  # maximum delta step allowed for each tree's weight estimation
        colsample_bytree=1,  # column subsampling ratio when constructing each tree
        reg_lambda=1,  # L2 regularisation on weights; larger values make the model harder to overfit
        # reg_alpha=0,  # L1 regularisation term
        # scale_pos_weight=1,  # balances positive/negative weights; values > 0 help convergence under class imbalance
        objective='multi:softprob',  # multi-class objective returning class probabilities
        num_class=8,  # number of classes, used together with the multi-class objectives
        # n_estimators=100,  # number of trees
        seed=1000,  # random seed

        eval_metric= 'mlogloss' # roc_auc
    )
    return model
  16. A model-training example, using the XGBoost model above
model = xgb_1()
train = np.load(train_x_path)
#test = np.load(test_x_path)
labels = np.load(label_path)
model = model.fit(train, labels)
# the save_model()/load_model() JSON helpers above are Keras-specific; persist the sklearn-style
# XGBClassifier with joblib instead (see the sketch after item 10)
import joblib
joblib.dump(model, 'xgb_1.pkl')
  17. A model-prediction example
test = np.load(test_x_path)
model = joblib.load('xgb_1.pkl') # load the model persisted in item 16
# res = model.predict(test)  # predict returns class labels; predict_proba (below) returns probabilities
res = model.predict_proba(test)
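predict returns class labels directly, while predict_proba returns an (n_samples, n_classes) probability matrix; assuming the 8 classes are encoded as 0..7, labels can be recovered with argmax:
pred_labels = np.argmax(res, axis=1)   # index of the highest-probability class per sample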
  18. A K-fold (K-Fold) training example, used to avoid overfitting; typically 5 or 10 folds
def train_xgb_2():
    # requires: import numpy as np; import xgboost as xgb; from sklearn.model_selection import StratifiedKFold
    params = {
        'booster':'gbtree',
        'objective': 'multi:softprob',
        'num_class':8,
        'gamma':0.1,
        'max_depth':5,
        'lambda':2,
        'subsample':0.7,
        'colsample_bytree':0.7, 
        'min_child_weight':3, 
        'silent':0 ,
        'eta': 0.01, 
        'seed':1000,
        'eval_metric': 'mlogloss'
    }   
    train = np.load(train_x_path)
    test = np.load(test_x_path)
    labels = np.load(label_path)
    n_splits=5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    meta_train = np.zeros(shape = (len(train),8))
    meta_test = np.zeros(shape = (len(test),8))

    for i,(train_ids,test_ids) in enumerate(skf.split(train,labels)):
        X_train,X_train_label = train[train_ids],labels[train_ids]
        X_val,X_val_label = train[test_ids],labels[test_ids]
        xgb_val = xgb.DMatrix(X_val,label=X_val_label)
        xgb_train = xgb.DMatrix(X_train, label=X_train_label)
        xgb_test = xgb.DMatrix(test)
        train_ = xgb.DMatrix(train)
        plst=list(params.items())
        num_rounds=5000
        watchlist=[(xgb_train, 'train'),(xgb_val, 'val')]
        model = xgb.train(plst , xgb_train, num_rounds , watchlist,
                      early_stopping_rounds=100)

        pred_val = model.predict(xgb_val,ntree_limit=model.best_ntree_limit)
        meta_test_ = model.predict(xgb_test,ntree_limit=model.best_ntree_limit)
        meta_train_ = model.predict(train_,ntree_limit=model.best_ntree_limit)

        meta_train[test_ids] = pred_val
        dp.save_model(model,model_save+'xgb_2_%s.m'%i)
        dp.save_submit(meta_train_,submit_save+'xgb_2_%s_train.csv'%i)
        dp.save_submit(meta_test_,submit_save+'xgb_2_%s_test.csv'%i)
        meta_test+=meta_test_
        # meta_train+=meta_train_
    
    # meta_train/=n_splits
    meta_test/=n_splits
    dp.save_submit(meta_train,submit_save+'xgb_2_train.csv')
    dp.save_submit(meta_test,submit_save+'xgb_2_test.csv')
  19. Letting training allocate GPU memory on demand (by default all GPU memory is grabbed; this code is needed when several people share one machine)
def gpu_memory_dynamic():
    import keras.backend.tensorflow_backend as KTF
    import tensorflow as tf
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True   # do not grab all GPU memory up front; allocate on demand
    sess = tf.Session(config=config)
    KTF.set_session(sess)
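The snippet above targets TensorFlow 1.x with standalone Keras; on TensorFlow 2.x the same effect is achieved with memory-growth flags (a sketch assuming a TF 2.x environment):
import tensorflow as tf
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)   # grow GPU memory on demand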