# Build one space-joined string of API ids per file_id; `transform` broadcasts
# the group result back so there is one (repeated) value per original row.
# NOTE(review): assumes a DataFrame `df` with 'file_id' and 'api' columns
# exists earlier in the script — confirm ordering.
corpus = df.groupby(['file_id'])['api'].transform(lambda x: ' '.join(str(a) for a in x))
# Word vectors with a user-specified vocabulary (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features restricted to an explicitly supplied vocabulary;
# `corpus` must already hold one space-joined document per sample.
tfidf2 = TfidfVectorizer(vocabulary=['a', 'b', 'c', 'd', 'e', 'f', 'g'],
                         stop_words=[])
# Renamed from `re`, which shadowed the stdlib regular-expression module.
tfidf_feat = tfidf2.fit_transform(corpus)
# Compute common summary statistics over DataFrame data
import pandas as pd

# Per-file_id summary statistics of `tid`, appended as columns of `data`.
df = pd.DataFrame({'file_id': [1, 1, 1, 1, 2, 2, 2],
                   'tid': [1, 1, 2, 2, 3, 3, 3]})
# One row per file_id: the group aggregates below yield one value per group,
# so `data` must be deduplicated (the original 7-row frame could not receive
# the 2-element aggregate lists).
data = df[['file_id']].drop_duplicates()
statics = ['count', 'unique', 'max', 'min', 'median', 'std']
for stata in statics:
    data['tid_' + stata] = list(df.groupby(['file_id'])['tid'].agg(stata))
quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
for quant in quantiles:
    # int(round(...)) avoids float-artifact column names like 'tid_qua_5.000000000000001'.
    col = 'tid_qua_' + str(int(round(100 * quant)))
    data[col] = list(df.groupby(['file_id'])['tid'].quantile(quant).values)
# N-gram model (computes 2-, 3- and 4-grams and merges them into `data`)
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Count-based n-gram (2- to 4-gram) features per file_id, merged into `data`.
vectorizer = CountVectorizer(
    ngram_range=(2, 4),
    # token_pattern=r'\b\w+\b', vocabulary=vocabulary,
    stop_words=[','],
    decode_error="ignore",
    max_df=0.90,   # drop n-grams present in more than 90% of documents
    min_df=0.01)   # drop n-grams present in less than 1% of documents
df = pd.DataFrame({'file_id': [1, 1, 1, 1, 2, 2, 2],
                   'tid': [1, 1, 2, 2, 3, 3, 3],
                   'api': [1, 2, 3, 2, 4, 3, 2]})
data = df[['file_id']].drop_duplicates()
# One space-joined "document" of API ids per file_id.
corpus = list(df.groupby(['file_id'])['api'].apply(lambda x: ' '.join(str(a) for a in x)))
tfidfs = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf = pd.DataFrame(tfidfs.todense(),
                     columns=['n_gram_' + i for i in vectorizer.get_feature_names_out()])
print('there are %s n-gram features' % len(vectorizer.vocabulary_))
tfidf['file_id'] = list(data['file_id'])
data = pd.merge(data, tfidf, on='file_id')
from scipy.interpolate import interp1d
import numpy as np

# Sample locations and their observed values.
x = np.array([2, 4, 6, 8, 10])
y = np.array([38, 39, 21, 56, 77])
# Query grid. Supported kinds include 'linear', 'nearest', 'zero',
# 'slinear', 'quadratic' and 'cubic'.
px = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10])
interpolator = interp1d(x, y, kind='quadratic')
py = interpolator(px)
# Saving and loading models with scikit-learn or Keras
# Serialize a Keras model's architecture to JSON (yml or hdf5 also possible).
def save_model(model, file_name='./model.json'):
    """Write the model's JSON architecture description to `file_name`.

    Only the architecture is saved (via `model.to_json()`), not the weights.
    """
    model_json = model.to_json()
    with open(file_name, "w") as json_file:
        json_file.write(model_json)


def load_model(file_name='./model.json'):
    """Rebuild a Keras model (architecture only, no weights) from a JSON file."""
    from keras.models import model_from_json
    model = None
    with open(file_name, "r") as json_file:
        model_json = json_file.read()
        model = model_from_json(model_json)
    return model
import numpy as np
import pandas as pd
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM,Dense,Conv1D,MaxPooling1D,Dropout,Input,GlobalMaxPooling1D
from keras.layers import SpatialDropout1D,GRU
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
from keras.utils import plot_model  # model visualization


def dnn_1():
    """Build a multi-kernel, multi-dilation 1D-CNN over API-call id sequences.

    Returns a compiled Keras Model: int16 id sequence in, 8-way softmax out.
    """
    embedding_vector_length = 256  # embedding dimension per API id
    api_total_num = 301            # embedding vocabulary size
    max_seq_length = 6000          # fixed input sequence length
    drop_rate1 = 0.25              # spatial dropout on the embedding
    drop_rate2 = 0.5               # dropout after branch concatenation
    drop_rate3 = 0.25              # dropout after the dense layer
    num_filters = 64
    nb_classes = 8
    kernel_sizes = [2, 3, 4, 5]

    input_type = Input(shape=(max_seq_length,), dtype='int16')
    embd = Embedding(api_total_num, embedding_vector_length,
                     input_length=max_seq_length, mask_zero=False)(input_type)
    embd = SpatialDropout1D(drop_rate1)(embd)

    # One Conv1D + global-max-pool branch per (kernel size, dilation rate) pair.
    warppers = []
    for sizei in kernel_sizes:
        for dilated_rate in [1, 2, 3, 4]:
            conv1d = Conv1D(filters=num_filters, kernel_size=sizei,
                            activation='relu', dilation_rate=dilated_rate)(embd)
            warppers.append(GlobalMaxPooling1D()(conv1d))

    fc = concatenate(warppers)
    fc = Dropout(drop_rate2)(fc)
    fc = Dense(256, activation='relu')(fc)
    fc = Dropout(drop_rate3)(fc)
    preds = Dense(nb_classes, activation='softmax')(fc)

    # Functional API model; the original's two dead `model = Sequential()`
    # assignments were never used and have been removed.
    model = Model(inputs=input_type, outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    return model
def rf_1():
    """Return an untrained RandomForestClassifier with preset hyperparameters.

    # Document
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',        # alternative: 'entropy'
        min_samples_split=100,   # minimum samples required to split an internal node
        min_samples_leaf=20,     # minimum samples required at a leaf node
        max_depth=None,          # e.g. 8 to cap tree depth
        max_features='sqrt',
        random_state=100,
    )
    return model
# A LightGBM example (model definition only; training is shown below)
def lgb_1():
    """Return an untrained LightGBM multiclass model (scikit-learn API).

    # Document
    http://lightgbm.apachecn.org/cn/latest/index.html
    """
    import lightgbm as lgb
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'objective': 'multiclass',
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'subsample': 0.85,
        'num_threads': 32,
        'metric': 'multi_logloss',
        'seed': 100,
    }
    # Bug fix: a multiclass objective requires the classifier wrapper, not
    # LGBMRegressor. 'num_class' is dropped from the original params because
    # the sklearn wrapper infers it from the training labels and rejects it
    # as an explicit parameter.
    model = lgb.LGBMClassifier(**params)
    return model