科大訊飛大數據應用分類標註挑戰賽 改寫分享

原代碼摘自 https://github.com/wushaowu2014/2019-iflytek-competition-app-classification-labeling 。原代碼直接在 Python 3 下運行會報錯,因此這裏做了局部調整,把它作爲一個清洗數據的框架。

# -*- coding: utf-8 -*-
"""
@author: shaowu
任務:給定一個app,根據它的應用描述,去預測它的主要功能,比如是屬於體育,或遊戲,或旅遊,等等
Todo:
    進一步清洗數據,比如去掉停用詞。
"""

import pandas as pd
import numpy as np
import time
import datetime
import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
import jieba


# Word segmentation
def split_discuss(data):
    """Record text lengths and tokenise the ``Discuss`` column with jieba.

    ``data`` is modified in place: a ``length`` column receives ``len()`` of
    every original ``Discuss`` entry, and ``Discuss`` itself is overwritten
    with the jieba tokens joined by single spaces.

    Returns the same ``data`` object that was passed in.
    """
    texts = data['Discuss']
    # Lengths are measured on the raw text, before segmentation inserts spaces.
    data['length'] = texts.apply(len)
    data['Discuss'] = texts.apply(lambda text: ' '.join(jieba.cut(text)))
    return data
def _read_tab_separated_lines(path, columns):
    """Read a UTF-8 text file and split every non-empty line on tabs.

    The file is read as raw lines instead of going through ``pd.read_csv``:
    the original author noted that ``delimiter='\\t'`` raised an error, and
    the previous workaround (``delimiter=' '`` followed by a manual split of
    column 0) tokenises on spaces first, so a line containing a space is
    either truncated to its first token or makes the parser fail.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.
    columns : list of str
        Column names; each line is split into at most ``len(columns)`` fields,
        so extra tabs stay inside the last field.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line, all values as strings.
    """
    max_splits = len(columns) - 1
    rows = []
    # 'utf-8-sig' behaves like UTF-8 but also drops a leading BOM if present.
    with open(path, encoding='utf-8-sig') as handle:
        for line in handle:
            line = line.rstrip('\r\n')
            if line:  # blank lines carry no record
                rows.append(line.split('\t', max_splits))
    return pd.DataFrame(rows, columns=columns)


# Load the app-type lookup table: the first column is the code, the second
# column is its human-readable meaning.
apptype_id_name = pd.read_csv("/home/kesci/input/data1137/apptype_id_name.txt", sep='\t', header=None)
apptype_id_name.columns = ['label_code', 'label']
print(apptype_id_name.nunique())

# ============================ Load the training set ============================
train = _read_tab_separated_lines(
    "/home/kesci/input/data1137/apptype_train.dat",
    ['id', 'label', 'conment'],
)

# ============================ Load the test set ================================
test = _read_tab_separated_lines(
    "/home/kesci/input/data1137/app_desc.dat",
    ['id', 'conment'],
)
print('數據讀入完成!')
print('訓練集標籤分佈:', train.label.value_counts())


# ================= Split the '|'-separated label into its parts =================
# A sample carries one or two labels written as "first|second".
train['label1'] = train['label'].apply(lambda x: x.split('|')[0])
# Some samples have no second label; those get 0 as a placeholder.
train['label2'] = train['label'].apply(lambda x: x.split('|')[1] if '|' in x else 0)
label1_counts = train.label1.value_counts()
print('訓練集第一個標籤分佈:', label1_counts)

# The first label has 125 distinct values, i.e. a 125-class problem with a
# fairly severe class imbalance. On the original data three classes have
# fewer than 5 samples:
#     140110       4
#     140805       3
#     140105       1
# Such classes are dropped (mainly because of the 5-fold stratified
# cross-validation used later on); they could be revisited afterwards.
# The rare classes are derived from the counts rather than hard-coded, so the
# filter stays correct if the data changes.
MIN_SAMPLES_PER_CLASS = 5
rare_label1 = label1_counts[label1_counts < MIN_SAMPLES_PER_CLASS].index
train = train[~train.label1.isin(rare_label1)].reset_index(drop=True)


# ================= Train a model on the first label only =================
# Word segmentation: jieba tokens are joined with spaces so the vectorizer's
# default whitespace-based tokenisation can pick them up.
train['conment'] = train['conment'].apply(lambda x: ' '.join(jieba.cut(x)))
test['conment'] = test['conment'].apply(lambda x: ' '.join(jieba.cut(x)))

# TF-IDF features (these parameters are tunable).
# The flags are real booleans: recent scikit-learn releases validate them as
# booleans and reject the integer 1 that was passed before.
column = 'conment'
vec = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=5,
    max_df=0.8,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# The vocabulary is fitted on the training text only and reused for the test set.
trn_term_doc = vec.fit_transform(train[column])
test_term_doc = vec.transform(test[column])
print(trn_term_doc.shape)

# Encode the string labels as consecutive integers 0..num_class-1.
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
train['label1'] = lbl.fit_transform(train['label1'].values)
label = train['label1']
# One class per fitted encoder entry; identical to max(encoded label) + 1.
num_class = len(lbl.classes_)


from sklearn.model_selection import KFold,StratifiedKFold

# ================= Model training: 5-fold cross-validation =================
n_folds_t = 5
# Out-of-fold scores for the training rows and fold-averaged scores for the
# test rows, one column per encoded class.
stack_train = np.zeros((train.shape[0], num_class))
stack_test = np.zeros((test.shape[0], num_class))
# random_state is intentionally not passed: without shuffling it has no
# effect, and scikit-learn >= 0.24 raises a ValueError for that combination.
# The resulting folds are the same unshuffled stratified folds as before.
kf = StratifiedKFold(n_splits=n_folds_t, shuffle=False)
X = trn_term_doc
y = label
for i, (tr, va) in enumerate(kf.split(X, y)):
    print('stack:%d/%d' % ((i + 1), n_folds_t))

    ridge = RidgeClassifier(random_state=42)
    # tr / va are positional row indices, hence .iloc on the label Series.
    ridge.fit(X[tr], y.iloc[tr])
    # NOTE(review): _predict_proba_lr is a private scikit-learn helper that
    # turns decision-function values into normalised scores; it may change
    # between releases.
    score_va = ridge._predict_proba_lr(X[va])
    score_te = ridge._predict_proba_lr(test_term_doc)

    # Each training row is scored exactly once, by the fold that held it out.
    stack_train[va] += score_va
    # Test rows are scored by every fold model; average so the values stay on
    # the same scale as stack_train (the class ranking is unaffected).
    stack_test += score_te / n_folds_t


print("model acc_score:", metrics.accuracy_score(label, np.argmax(stack_train, axis=1), normalize=True, sample_weight=None))

def _top_two_classes(scores):
    """Return the best and second-best column index for every row of ``scores``.

    ``scores`` is a 2-D array with one column per class (at least two
    columns). The result is a pair of 1-D integer arrays
    ``(best, runner_up)``, one entry per row.
    """
    # Ascending sort per row: the last position holds the highest score.
    order = np.argsort(scores, axis=1)
    return order[:, -1], order[:, -2]


# Predicted first and second label for the training rows: simply the two
# classes with the highest out-of-fold score.
m = pd.DataFrame(stack_train)
first, second = _top_two_classes(stack_train)
m['label1'] = first   # first (most likely) label
m['label2'] = second  # second label

# Offline accuracy: a row counts as correct when the true label matches
# either of the two predicted labels.
true_label = np.asarray(label)
k = int(((true_label == first) | (true_label == second)).sum())
print('線下準確率:%f' % (k / len(label)))

# Prepare the test-set predictions the same way.
results = pd.DataFrame(stack_test)
first, second = _top_two_classes(stack_test)
results['label1'] = first   # first (most likely) label
results['label2'] = second  # second label

 

stack:1/5
stack:2/5
stack:3/5
stack:4/5
stack:5/5
model acc_score: 0.6087289943985063
線下準確率:0.744932
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章