Alibaba NLP 2018 preliminary-round algorithm, Python code

It mainly uses the lightgbm library:
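
The pipeline below merges the two training CSVs, segments the sentence pairs with jieba, trains a word2vec model to build sentence vectors, derives a handful of similarity features (sentence-vector cosine, word and character overlap, tf-idf cosine), and feeds them to a LightGBM binary classifier. As a quick orientation, here is a minimal, self-contained sketch of the LightGBM train/predict loop everything else plugs into; the feature matrix and parameter values are illustrative placeholders, not the competition features or the tuned parameters used further down:

# Minimal LightGBM binary-classification sketch (placeholder data, not the real features).
import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 4)            # hypothetical feature matrix
y = np.random.randint(0, 2, 200)      # hypothetical 0/1 labels
train_set = lgb.Dataset(X[:160], y[:160])
valid_set = lgb.Dataset(X[160:], y[160:], reference=train_set)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'learning_rate': 0.1}
booster = lgb.train(params, train_set, num_boost_round=50, valid_sets=valid_set)
probs = booster.predict(X[160:])      # probabilities; threshold them to get 0/1 labels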


# coding: utf-8

# In[1]:


# -*- coding: utf-8 -*-
import sys
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 
# Merge the two CSV files into one


def combine(combine_file, filename1, filename2):
    f = open("true_value.txt", 'w')
    len_merge_sum = 0
    with open(combine_file, 'w') as fout:
        # both input files share the same lineno/sen1/sen2/label layout, so handle them in one loop
        for filename in (filename1, filename2):
            with open(filename, 'r') as f1:
                for eachLine in f1:
                    lineno, sen1, sen2, label = eachLine.strip().split('\t')
                    fout.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
                    if int(label) == 1:
                        f.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
                    len_merge_sum += 1
    f.close()
    return combine_file, len_merge_sum


# In[2]:


# -*- coding: utf-8 -*-
from gensim.models import word2vec
import pandas as pd
import numpy as np
import sys
import time
import re
import jieba
import io
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 
sys.setdefaultencoding('utf-8')


def process_simi_stop(simiwords, stopwords, line):
    for word, subword in simiwords.iteritems():
        if word in line:
            # print line
            #line = re.sub(word, subword, line)
            line = line.replace(word,subword)
            # print subword
    words1 = [w for w in jieba.cut(line) if w.strip()]
    word1 = []
    for i in words1:
        if i not in stopwords:
            word1.append(i)
    return word1,line


def splitSentence(inputFile, inpath, segment, submit):
    print u'Segmentation started!', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # print the current time
    start = time.clock()
    jieba.load_userdict("jieba_dict.txt")
    corpus = []

    simiwords = {}
    with io.open("simiwords.txt", encoding='utf-8') as fr:
        for line in fr:
            words = re.split(",", line.strip())
            simiwords[words[0]] = words[1]

    stopwords = []  # stop words
    fstop = open('chinese_stopwords.txt', 'r')
    for eachWord in fstop:
        stopwords.append(eachWord.strip())
        # print eachWord.strip()

    fin = open(inputFile, 'r')  # open inputFile for reading
    fin1 = open('sentences_1.txt','w')
    for eachLine in fin:
        # line = eachLine.strip()  # strip leading/trailing whitespace from each line
        # line = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", "", eachLine)
        eachLine = re.sub("\*", " ", eachLine)
        # jieba.del_word('年')
        lineno, sen1, sen2, label = eachLine.strip().split('\t')
        word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
        word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
        fin1.write(sen_1)
        fin1.write("\n")
        fin1.write(sen_2)
        fin1.write("\n")
#         sen_11 = ' '.join(sen_1.decode('utf8'))
#         sen_12 = sen_11.split(" ")
#         for s in sen_12:
#             if s != sen_12[-1]:
#                 fin1.write(s+" ")
#             else:
#                 fin1.write(s)
#         fin1.write('\n')
#         sen_21 = ' '.join(sen_2.decode('utf8'))
#         sen_22 = sen_21.split(" ") 
#         for s in sen_22:
#             if s != sen_22[-1]:
#                 fin1.write(s+" ")
#             else:
#                 fin1.write(s)
#         fin1.write('\n')
        corpus.append(word1)
        corpus.append(word2)
    print len(corpus)
    with open(inpath, 'r') as fin2:  # inpath
        for eachLine in fin2:
            eachLine = re.sub("\*", " ", eachLine)
            if submit:
                lineno, sen1, sen2 = eachLine.strip().split('\t')
                #print "ceshijieshisha:", sen1, sen2
            else:
                lineno, sen1, sen2, label = eachLine.strip().split('\t')   # offline evaluation data still carries a label column
                #print "ceshijieshisha:", sen1, sen2
            word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
            word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
            fin1.write(sen_1)
            fin1.write("\n")
            fin1.write(sen_2)
            fin1.write("\n")
#             sen_11 = ' '.join(sen_1.decode('utf8'))
#             sen_12 = sen_11.split(" ")
#             for s in sen_12:
#                 if s != sen_12[-1]:
#                     fin1.write(s+" ")
#                 else:
#                     fin1.write(s)
#             fin1.write('\n')
#             sen_21 = ' '.join(sen_2.decode('utf8'))
#             sen_22 = sen_21.split(" ") 
#             for s in sen_22:
#                 if s != sen_22[-1]:
#                     fin1.write(s+" ")
#                 else:
#                     fin1.write(s)
#             fin1.write('\n')
            corpus.append(word1)
            corpus.append(word2)
    print len(corpus)  # 204954
    fin1.close()
    with open(segment, 'w') as fs:
        for word in corpus:
            # print type(word)
            for w in word:
                # print w
                fs.write(w)  # write the segmented words to the output file
                fs.writelines(' ')
            fs.write('\n')
    end = time.clock()
    print u'Segmentation took:', end - start
    return corpus


def filter_word_in_model(model, filename):
    a = []
    with open(filename, 'r') as file_to_read:
        for line in file_to_read:
            a.append(line)
    sentences = []  # the words of each sentence
    for i in range(len(a)):
        b = a[i].strip().split()
        sentences.append(b)
    print 'sentences length:', len(sentences)
    new_sentences = []  # keep only the words that have a vector in the trained model
    for i in range(len(sentences)):
        new_sentence = []
        for j in range(len(sentences[i])):
            if sentences[i][j].decode('utf8') in model:
                new_sentence.append(sentences[i][j])
        new_sentences.append(new_sentence)
    print 'new_sentences length: ', len(new_sentences)
    # print(np.array(new_sentences).shape)
    print u'new_sentences built at', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # print the current time
    with open('new_sentences.txt', 'w') as fs:  # write new_sentences to disk
        for word in new_sentences:
            for w in word:
                fs.write(w)  # write the filtered words to the output file
                fs.writelines(' ')
            fs.write('\n')
    return new_sentences


def eval_file(label1, pre):
    tp, tn, fp, fn = 1e-7, 1e-7, 1e-7, 1e-7  # tiny constants to avoid division by zero
    for la, pr in zip(label1, pre):
        if la == 1 and pr == 1:
            tp += 1
        elif la == 1 and pr == 0:
            fn += 1
        elif la == 0 and pr == 0:
            tn += 1
        elif la == 0 and pr == 1:
            fp += 1
    recall = float(tp)/float(tp+fn)
    precision = float(tp)/float(tp+fp)
    f11 = 2*recall*precision/(recall+precision)
    return f11
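
For a quick sanity check of eval_file, the same F1 can be computed with scikit-learn (assuming scikit-learn is installed; the tiny smoothing constants above only matter when a class is empty):

# Cross-check eval_file against sklearn's f1_score on a toy example.
from sklearn.metrics import f1_score
print f1_score([1, 0, 1, 1], [1, 1, 1, 0])   # ~0.667, matching eval_file on the same lists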


def cos_Vector(x, y):  # cosine similarity between two vectors
    if len(x) != len(y):
        print u'error input: x and y are not in the same space'
        return
    x = np.array(x)
    y = np.array(y)
    num = (x * y.T)
    num = float(num.sum())
    if num == 0:
        return 0
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        return 0
    cos = num / denom  # cosine value
    sim = 0.5 + 0.5 * cos  # rescale to [0, 1]
    return sim


def vec_minus(x, y):  # element-wise absolute difference
    if len(x) != len(y):
        print u'error input: x and y are not in the same space'
        return
    x = np.array(x)
    y = np.array(y)
    sim = abs(x-y)
    return sim


def vec_multi(x, y):  # element-wise product
    if len(x) != len(y):
        print u'error input: x and y are not in the same space'
        return
    x = np.array(x)
    y = np.array(y)
    sim1 = x * y
    return sim1


def calEuclideanDistance(x, y):
    if len(x) != len(y):
        print u'error input: x and y are not in the same space'
        return
    dist = np.sqrt(np.sum(np.square(x - y)))
    return dist

# Overlap between two token lists: intersection size divided by the size of the smaller set
def cal_jaccard(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    avg_len = (len(set1) + len(set2)) / 2
    min_len = min(len(set1), len(set2))
    # return len(set1 & set2) * 1.0 / (len(set1) + len(set2) - len(set1 & set2))
    if min_len == 0:
        return 0
    else:
        return len(set1 & set2) * 1.0 / min_len
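
Note that despite the name, cal_jaccard divides the intersection by the smaller set (an overlap coefficient) rather than by the union (the commented-out line above is the true Jaccard index). A quick worked example, assuming the function above is defined:

# {2, 3} is the intersection (size 2) and the smaller set has size 3, so the score is 2/3.
# A true Jaccard index would give 2/4 = 0.5 for the same inputs.
print cal_jaccard([1, 2, 3], [2, 3, 4])   # 0.666...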


def zishu(X, useStatus):  # sigmoid squashing when useStatus is True, identity otherwise
    if useStatus:
        return 1.0 / (1 + np.exp(-(X)))
    else:
        return X


# In[3]:


def fenge(input_file,out_file,out_file1):
    f = open(input_file,'r')
    f_1 = open(out_file,'w')
    f_2 = open(out_file1,'w')
    lines = f.readlines()
    Row = len(lines)
    D = int(Row*0.85)
    for i in range(Row):
        if i < D:
            lineno, sen1, sen2, label = lines[i].strip().split('\t')
            f_1.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
        else:
            lineno, sen1, sen2, label = lines[i].strip().split('\t')
            f_2.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
    f.close()
    f_1.close()
    f_2.close()
    return D
def feature_extraction(new_sentences,model,size):
    vec_titles = []  # sentence vectors: the mean of the word vectors in each sentence
    for val in range(len(new_sentences)):
        vec = np.zeros(shape=(1, size))
        for i in range(len(new_sentences[val])):
            vec += model[new_sentences[val][i].decode('utf8')]
        if len(new_sentences[val]):
            vec = vec/len(new_sentences[val])
        vec_titles.append(vec)
    return vec_titles
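
feature_extraction simply averages the word vectors of a sentence to get one fixed-size sentence vector. A toy illustration of the idea, using a plain dict in place of the gensim model (3-dimensional vectors and made-up tokens just for readability):

# Mean-of-word-vectors sketch with a hypothetical toy vocabulary.
import numpy as np
toy_model = {u'hua': np.array([1.0, 0.0, 0.0]), u'bei': np.array([0.0, 1.0, 0.0])}
sentence = [u'hua', u'bei']
sentence_vec = sum(toy_model[w] for w in sentence) / len(sentence)
print sentence_vec   # [0.5, 0.5, 0.0]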
            


# In[4]:


'''
import threading
import time
gl_num = 0
vec_titles = []
lock = threading.RLock()
def feature_extraction(new_sentences,model,size,val_start):
    lock.acquire()
    global vec_titles
    for val in range(val_start,len(new_sentences),4):
        vec = np.zeros(shape=(1, size))
        for i in range(len(new_sentences[val])):
            vec += model[new_sentences[val][i].decode('utf8')]
        if len(new_sentences[val]):
            vec = vec/len(new_sentences[val])
        vec_titles.append(vec)
    time.sleep(1)
    print len(vec_titles)
    lock.release()
thread_list = [] 
for i in range(4):
    t = threading.Thread(target=feature_extraction,args = (new_sentences,model,size,i))
    thread_list.append(t)
for t in thread_list:
    t.start()
'''


# In[5]:


# import jieba.analyse
# def get_keyword(model, num_keywords, new_sentences):
#     # extract keywords
#     content = open('new_sentences.txt', 'rb').read()
#     jieba.analyse.set_stop_words('chinese_stopwords.txt')
#     keywords = jieba.analyse.extract_tags(content, topK=num_keywords, withWeight=False, allowPOS=())
#     print u'keywords length:', len(keywords)

#     # keep only the keywords that are in the model
#     keywords_in_model = []
#     for i in range(len(keywords)):
#         if keywords[i].decode('utf8') in model:
#             keywords_in_model.append(keywords[i])
#     print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

#     # for each keyword, take the maximum similarity it has with any word in the sentence
#     keywords_indexes = []  # 20w * 2k
#     for i in range(len(new_sentences)):
#         keywords_million_value = []
#         for val in range(num_keywords):  # key_words 2k
#             similar_values = []
#             for j in range(len(new_sentences[i])):  # every word in the title
#                 try:
#                     value = model.similarity(new_sentences[i][j].decode('utf-8'), keywords_in_model[val].decode('utf-8'))
#                     similar_values.append(max(value, 0))
#                 except:
#                     print new_sentences[i][j]
#             try:
#                 keywords_one_value = max(similar_values)  # max similarity between this sentence and this keyword
#             except:
#                 keywords_one_value = 0
#                 print i
#             keywords_million_value.append(keywords_one_value)  # 10k of these
#         keywords_indexes.append(keywords_million_value)
#     print np.array(keywords_indexes).shape
#     # a 10k-dimensional vector per title!
#     np.save("train_data_similar_vec.npy", keywords_indexes)
#     print u'Keyword bag generated:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
#     return keywords_indexes
# keywords_indexes = get_keyword(model,80,new_sentences)


# In[6]:


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  
def get_tiidf_vec(filename):
    # join new_sentences into the space-separated strings that CountVectorizer expects
    corpus = [' '.join(a) for a in filename]
    stopword = [u' ']
    #vectorizer = CountVectorizer(min_df=0,stop_words=stopword,token_pattern='(?u)\\b\\w+\\b')  # term-frequency matrix: a[i][j] is the frequency of word j in document i
    #vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    vectorizer = CountVectorizer(min_df=0,token_pattern=r"(?u)\W{1}|(?u)\b\w+\b")
    result = vectorizer.fit_transform(corpus)  # convert the texts to a term-frequency matrix
    transformer = TfidfTransformer()  # compute the tf-idf weight of each word
    tfidf = transformer.fit_transform(result)  # fit_transform computes the tf-idf matrix
    vecs = []  # tf-idf values for each entry
#     hz = result.toarray()
    weight=tfidf.toarray()
    word=vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary
    print len(word)
#     print hz.shape
    print weight.shape 
   # return word,weight
#     for i in range(len(weight)):  # print the tf-idf weight of every word in every document; the outer loop walks the documents, the inner one the words
#         print u"------- tf-idf word weights for document", i, u"------"
#         for j in range(len(word)):  
#             print word[j],weight[i][j] 
    tfidf_cos = []
    hz_cos = []
    jk_cos = []
    for i in range(weight.shape[0]/2):
#         numerator = np.sum(np.min(hz[2*i:2*i+2,:], axis=0))
#         denominator = np.sum(np.max(hz[2*i:2*i+2,:], axis=0))
        value = np.dot(weight[2*i], weight[2*i+1])
#         value_hz = np.dot(hz[2*i], hz[2*i+1]) / (norm(hz[2*i]) * norm(hz[2*i+1]))
        value = 0.5 + 0.5 * value
#         value_hz = 0.5 + 0.5 * value_hz
        tfidf_cos.append([value])
#         hz_cos.append([value_hz])
#         jk_cos.append([1.0 * numerator / denominator])
    return tfidf_cos,hz_cos,jk_cos
# word_list,tfidf = get_tiidf_vec(new_sentences)
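
A note on why a plain dot product is enough in get_tiidf_vec: TfidfTransformer L2-normalises each row by default, so the dot product of two tf-idf rows is already their cosine similarity. A small self-contained check (the two example strings are just illustrative segmented sentences):

# Verify that tf-idf rows are unit-length, so dot product == cosine similarity.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
docs = ['hua bei huan kuan', 'hua bei ru he huan kuan']
counts = CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit_transform(docs)
weight = TfidfTransformer().fit_transform(counts).toarray()
print np.linalg.norm(weight[0]), np.dot(weight[0], weight[1])   # norm ~1.0; the dot product is the cosine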


# In[5]:


# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split   # 隨機分割
# from scipy.linalg import norm
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 
# sys.setdefaultencoding('utf-8')
# # 合併兩個csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
#     SUBMIT = False
#     if SUBMIT:
#         inpath, outpath = sys.argv[1], sys.argv[2]
#         testpath = combine_file
#         test_num = len_merge_sum
#     else:
#         num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
#         inpath, outpath = 'merge_test.csv', 'output.csv'
#         testpath = 'merge_train.csv'
#         #test_num = 92228
#         test_num = 87105
#         # inpath, outpath = 'empty.csv', 'output.csv'
#         #         # testpath = 'merge_sum.csv'
#         #         # test_num = 92228
#     filename = 'sentences.txt'
#     corpus = splitSentence(testpath, inpath, filename, SUBMIT)  # jieba 分詞

#     print u'語料corpus生成完畢:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

# # # 訓練詞向量模型

#     sentences = word2vec.Text8Corpus('sentences.txt')
#     model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
#     model.save('result')  # save
#     size = 100  # model_train size
#     print u'詞向量訓練完畢', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # 輸出當前時間
#     # 導入model
#     model = word2vec.Word2Vec.load('result')
#     new_sentences = filter_word_in_model(model, filename)
    
    
#     print "開始計算特徵向量"
#     tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# #     feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
#     vec_titles = []  # 獲取句子的向量
#     max_titles = []

#     for val in range(len(new_sentences)):
#         vec = np.zeros(shape=(1, size))
#         mat = np.zeros(shape=(30, size))
#         for i in range(len(new_sentences[val])):
#             print len(new_sentences[val])
#             if i < 30:
#                 vec += model[new_sentences[val][i].decode('utf8')]
#                 mat[i] = model[new_sentences[val][i].decode('utf8')]
#         if len(new_sentences[val]):
#             vec = vec/len(new_sentences[val])
#         vec_titles.append(vec)
#         max_titles.append(mat)
#     print "計算特徵向量完畢"

#     #vec_titles = feature_extraction(new_sentences,model,size)
#     vec_titles = list(map(lambda x: x[0], vec_titles))  # 去掉外部的[], 獲得title 的向量形式
#     print(np.array(max_titles).shape)
#     np.save("train_data_title_vec.npy", vec_titles)
#     np.save("train_data_title_max.npy", max_titles)
#     print u'生成train_data_title_vec完畢', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # 輸出時間
#     trains = np.load('train_data_title_max.npy')


# In[7]:


# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split   # 隨機分割
# from scipy.linalg import norm

# # from keras.datasets import mnist  
# # from keras.models import Sequential  
# # from keras.layers import Dense, Dropout, Activation, Flatten  
# # from keras.layers import Convolution2D, MaxPooling2D  
# # from keras.utils import np_utils  
# # from keras import backend as K  
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 
# sys.setdefaultencoding('utf-8')
# # 合併兩個csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
#     SUBMIT = False
#     if SUBMIT:
#         inpath, outpath = sys.argv[1], sys.argv[2]
#         testpath = combine_file
#         test_num = len_merge_sum
#     else:
#         num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
#         inpath, outpath = 'merge_test.csv', 'output.csv'
#         testpath = 'merge_train.csv'
#         #test_num = 92228
#         test_num = 87105
#         # inpath, outpath = 'empty.csv', 'output.csv'
#         #         # testpath = 'merge_sum.csv'
#         #         # test_num = 92228
#     filename = 'sentences.txt'
#     corpus = splitSentence(testpath, inpath, filename, SUBMIT)  # jieba 分詞

#     print u'語料corpus生成完畢:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

# # # 訓練詞向量模型

#     sentences = word2vec.Text8Corpus('sentences.txt')
#     model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
#     model.save('result')  # save
#     size = 100  # model_train size
#     print u'詞向量訓練完畢', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # 輸出當前時間
#     # 導入model
#     model = word2vec.Word2Vec.load('result')
#     new_sentences = filter_word_in_model(model, filename)
    
    
#     print "開始計算特徵向量"
# #     tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# #     feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
#     vec_titles = []  # 獲取句子的向量
#     max_titles = []

#     for val in range(len(new_sentences)/2):
#         mat = np.zeros(shape=(50, size))
#         for i in range(len(new_sentences[2*val])):
#             if i < 25:
#                 mat[i] = model[new_sentences[2*val][i].decode('utf8')]
#         for i in range(len(new_sentences[2*val+1])):
#             if i < 25:
#                 mat[i+25] = model[new_sentences[2*val+1][i].decode('utf8')]        
#         max_titles.append(mat)
#     print "計算特徵向量完畢"

#     #vec_titles = feature_extraction(new_sentences,model,size)
# #     vec_titles = list(map(lambda x: x[0], vec_titles))  # 去掉外部的[], 獲得title 的向量形式
#     print (np.array(max_titles).shape)
#     np.save("train_data_title_max.npy", max_titles)
#     print u'生成train_data_title_vec完畢', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # 輸出時間
#     trains = np.load('train_data_title_max.npy')    
# #     nb_filters = 28  
# #     # size of pooling area for max pooling  
# #     pool_size = (2, 2)  
# #     # convolution kernel size  
# #     kernel_size = (3, 100)  
# #     input_shape = (img_rows, img_cols, 1)  
# #     model = Sequential()  
# # """ 
# # model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1], 
# #                         border_mode='same', 
# #                         input_shape=input_shape)) 
# # """  
# #     model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]),  
# #                         padding='same',  
# #                         input_shape=input_shape)) # 卷積層1  
# #     model.add(Activation('relu')) #激活層  
# #     model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]))) #卷積層2  
# #     model.add(Activation('relu')) #激活層  
# #     model.add(MaxPooling2D(pool_size=pool_size)) #池化層  
# #     model.add(Dropout(0.25)) #神經元隨機失活  
# #     model.add(Flatten()) #拉成一維數據  
# #     model.add(Dense(128)) #全連接層1  
# #     model.add(Activation('relu')) #激活層  
# #     model.add(Dropout(0.5)) #隨機失活  
# #     model.add(Dense(nb_classes)) #全連接層2  
# #     model.add(Activation('softmax')) #Softmax評分  
  
# #     #編譯模型  
# #     model.compile(loss='categorical_crossentropy',  
# #               optimizer='adadelta',  
# #               metrics=['accuracy'])  
# #     #訓練模型  
# #     model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,  
# #           verbose=1, validation_data=(X_test, Y_test))  


# In[8]:


# print trains[0]


# In[28]:


# -*- coding: utf-8 -*-
import numpy as np
import sys
import time
from gensim.models import word2vec
import lightgbm as lgb
from sklearn.model_selection import train_test_split   # random train/validation split
from scipy.linalg import norm
# import bm25
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr 
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde 
sys.setdefaultencoding('utf-8')
# merge the two csv files into one
filename1 = 'atec_nlp_sim_train.csv'
filename2 = 'atec_nlp_sim_train_add.csv'
combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
if __name__ == '__main__':
    SUBMIT = True
    if SUBMIT:
        inpath, outpath = sys.argv[1], sys.argv[2]
        testpath = combine_file
        test_num = len_merge_sum
    else:
        num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
        inpath, outpath = 'merge_test.csv', 'output.csv'
        testpath = 'merge_train.csv'
        #test_num = 92228
        test_num = 87105
        # inpath, outpath = 'empty.csv', 'output.csv'
        #         # testpath = 'merge_sum.csv'
        #         # test_num = 92228
    filename = 'sentences.txt'
    corpus = splitSentence(testpath, inpath, filename, SUBMIT)  # jieba word segmentation

    print u'Corpus generated:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

# # Train the word-vector model

    sentences = word2vec.Text8Corpus('sentences.txt')
    model = word2vec.Word2Vec(sentences, sg=1, size=120, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
    model.save('result')  # save
    size = 120  # model_train size
    print u'Word vectors trained', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # print the current time
    # load the trained model
    model = word2vec.Word2Vec.load('result')
    new_sentences = filter_word_in_model(model, filename)


    print "Computing feature vectors"
    tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
#     feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
    vec_titles = []  # sentence vectors
    value_eig = []
    v_max = []
    v_v = [0,0,0]
    for val in range(len(new_sentences)):
        vec = np.zeros(shape=(1, size))
#         mat = np.zeros(shape=(len(new_sentences[val]), size))
        for i in range(len(new_sentences[val])):
            vec += model[new_sentences[val][i].decode('utf8')]
#             mat[i] = model[new_sentences[val][i].decode('utf8')]
        if len(new_sentences[val]):
            vec = vec/len(new_sentences[val])
#             a,b,c = np.linalg.svd(mat)
#             b_l = list(b)
#             b_s = sorted(b_l)
#             b_1 = np.max(b)
#         v_max.append(b_s[-1])
#         if len(b_s) < 2:
#             v_max.append(0)
#         else:
#             v_max.append(b_s[-2])
#         if len(b_s) < 3:
#             v_max.append(0)
#         else:
#             v_max.append(b_s[-3])
        vec_titles.append(vec)
#         if len(v_max) == 6:
#             v_v[0] = abs(v_max[3]-v_max[0])
#             v_v[1] = abs(v_max[4]-v_max[1])
#             v_v[2] = abs(v_max[5]-v_max[2])
#             value_eig.append(v_v)
#             v_max = []
    print "計算特徵向量完畢"

#     vec_titles = []
#     for val in range(len(new_sentences)):
#         vec = np.zeros(shape=(len(new_sentences[val]), size))
#         for i in range(len(new_sentences[val])):
#             vec[i] = model[new_sentences[val][i].decode('utf8')]
#         if len(new_sentences[val]):
#             V = np.dot(vec.transpose(),vec)
#             a,b = np.linalg.eig(V)
#         vec1 = np.zeros(shape = (size,1))
#         for k in range(a.shape[0]):
#             if a[k] > 0.1:
#                 vec1 = vec1+(a[k]*b[:,k]).reshape(size,1)
#         print len(vec_titles)
#         vec_titles.append(abs(vec1).reshape(1,size))


    #vec_titles = feature_extraction(new_sentences,model,size)
    vec_titles = list(map(lambda x: x[0], vec_titles))  # strip the outer [] so each title is a flat vector
    print(np.array(vec_titles).shape)
    np.save("train_data_title_vec.npy", vec_titles)
    print u'train_data_title_vec generated', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # print the time
    trains = np.load('train_data_title_vec.npy')
    # bm_score = np.load('bm_score.npy')
    # size = 100
    new_sentences = []
    with open('new_sentences.txt', 'r') as f:
        for eachLine in f:
            word = eachLine.decode('utf8').strip().split(' ')
            new_sentences.append(word)
# one-dimensional features
    # 6  Jaccard-style distance
        J_dist = []
        for val in range(len(new_sentences) / 2):
            j = cal_jaccard(new_sentences[2 * val], new_sentences[2 * val + 1])
            J_dist.append(j)
    #  1 vector angle, 2 vector distance (min), 3 bm_score

    juzi = []
    f1 = open('sentences_1.txt','r')
    for eachLine in f1:
        word = eachLine.decode('utf8').strip()
        juzi.append(list(word))
    distance = []
    for i in range(len(juzi)/2):
        j = cal_jaccard(juzi[2 * i], juzi[2 * i + 1])
        distance.append([j])
    f1.close()    
    
    cos_val = []
    E_Dist = []
    print 'train', len(trains)
    for i in range(len(trains) / 2):
        score1 = cos_Vector(trains[2 * i], trains[2 * i + 1])
        cos_val += [score1]
        score2 = calEuclideanDistance(trains[2 * i], trains[2 * i + 1])
        E_Dist += [score2]
    # add a two-column feature block
    combine_feature1 = np.vstack((cos_val, J_dist)).transpose()  # 2 features
    np.save("cos_val.npy", cos_val), np.save('E_Dist.npy', E_Dist)
    # 4,5 the two sentence lengths (2 dims), 6 length difference (dropped)
    len_ = []
    dif_length = []
    for val in range(len(new_sentences) / 2):
        a = [0.1*len(new_sentences[2 * val]), 0.1*len(new_sentences[2 * val + 1])]
        # length feature
        len_.append(a)
        b = abs(len(new_sentences[2 * val]) - len(new_sentences[2 * val + 1]))
        # length-difference feature
        dif_length.append(b)
    # combine_feature2 = np.vstack((np.transpose(len_), J_dist)).transpose()  # 3 features
    # combine_feature = np.hstack((combine_feature1, combine_feature2))  # 5 features
    # combine_feature2 = len_  # two-dimensional feature
    combine_feature3 = tfidf_cos
    combine_feature4 = distance
    # combine_feature = np.vstack((np.transpose(combine_feature), J_dist)).transpose()  # 6 features
    combine_feature = np.hstack((combine_feature1, combine_feature3, combine_feature4))  # 4 features
    print combine_feature.shape
    # edit distance
    dim_1_num = 4  # number of one-dimensional feature columns
    # dim_1_num = 6  # number of one-dimensional feature columns
    print 'combine_feature length:', len(combine_feature)
# generate new features (size*num dimensions)
    feature1_val = []
    feature2_val = []
    feature3_val = []
    feature4_val = []
    feature5_val = []
    feature6_val = []
    print 'train', len(trains)
    # trains is the matrix of sentence vectors (two consecutive rows per sentence pair)
    for i in range(len(trains)/2):
        vec1 = trains[2*i]
        vec2 = trains[2*i+1]
        feature1_val.append(vec1)
        feature2_val.append(vec2)
    for i in range(len(trains)/2):
        vec3 = vec_minus(trains[2*i], trains[2*i+1])
        vec4 = vec_multi(trains[2*i], trains[2*i+1])
        feature3_val.append(vec3)
        feature4_val.append(vec4)
    print 'feature3_val length:', len(feature3_val)
    # feature_val = np.hstack((feature1_val, feature2_val, feature3_val, feature4_val, combine_feature))  # 400 columns /// 406
    # assemble the full feature matrix feature_val
    feature_val = np.hstack((feature1_val, feature2_val,combine_feature))  # feature_num*size + dim_1_num columns
    #feature_val = combine_feature
    np.save("feature_val.npy", feature_val)
    # feature_num = 4
    feature_num = 2  # how many sentence-vector blocks go into the feature matrix
    print u'Features generated'

    y_true = []
    with open(testpath, 'r') as f, open(inpath, 'r') as fin:
        for line in f.readlines():
            pair_id, sen1, sen2, label = line.strip().split('\t')
            label = int(label)
            y_true += [label]
    np.save('y_true.npy', y_true)
    print 'y_true length', len(y_true)

    # load the data back
    y_true = np.load('y_true.npy')
    # bm_score = np.load('bm_score.npy')
    # bm_score_train = bm_score[:test_num]
    # print 'bm_score_train length', len(bm_score_train)
    # bm_score_test = bm_score[test_num:]
    # bm_score = pd.Series([bm_score], index=['bm_score'])
    cos_val = np.load('cos_val.npy')
    cos_val_train = cos_val[:test_num]
    cos_val_test = cos_val[test_num:]
    feature_val = np.load('feature_val.npy')
    feature_val_train = feature_val[:test_num]
    feature_val_test = feature_val[test_num:]
    # each row is one training example; the last column is the label
    trains = np.vstack((np.transpose(feature_val_train), y_true)).transpose()
    print np.array(trains).shape
    print u"Splitting the data"
    train, val = train_test_split(trains, test_size=0.2, random_state=21)
    print 'train length', len(train)
    print 'val length', len(val)
    print u"Training set"
    y = [train[i][feature_num*size+dim_1_num] for i in range(len(train))]  # training labels
    X = [train[i][:feature_num*size+dim_1_num] for i in range(len(train))]  # training feature matrix
    print u"Validation set"
    val_y = [val[i][feature_num*size+dim_1_num] for i in range(len(val))]  # validation labels
    val_X = [val[i][:feature_num*size+dim_1_num] for i in range(len(val))]  # validation feature matrix
    print u"Test set"
    tests = feature_val_test
    # convert to LightGBM datasets
    lgb_train = lgb.Dataset(X, y, free_raw_data=False)
    lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)
    # start training
    print u'Setting parameters'
    params = {
        'num_threads' : '4',
        'boosting_type': 'gbdt',
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',

        'learning_rate': 0.1,
        'num_leaves': 25,
        'max_depth': 3,

        'max_bin': 10,
        'min_data_in_leaf': 8,

        'feature_fraction': 1,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,

        'lambda_l1': 0,
        'lambda_l2': 0,
        'min_split_gain': 0
        
    }
    print u"開始訓練"
    gbm = lgb.train(params,  # 參數字典
                    lgb_train,  # 訓練集
                    num_boost_round=3000,  # 迭代次數
                    valid_sets=lgb_eval,  # 驗證集
                    early_stopping_rounds=30)  # 早停係數
    # 保存模型
    from sklearn.externals import joblib
    joblib.dump(gbm, 'gbm.pkl')
    print u"預測,並輸出在 outpath"
    preds_offline = gbm.predict(tests, num_iteration=gbm.best_iteration)  # 輸出概率
    np.save('preds.npy', preds_offline)

    if not SUBMIT:
        N = 200
        score_best = 0
        with open('merge_test.csv', 'r') as f1:
            y_true_10 = []
            for eachLine in f1:
                lineno, sen1, sen2, label = eachLine.strip().split('\t')
                a = int(label)
                y_true_10.append(a)

        for thred in range(1,N):  # threshold search: sweep to find the best cut-off
            thred = thred * (np.max(preds_offline) - np.min(preds_offline)) / N + np.min(preds_offline)
            pred = []
            for i in range(len(preds_offline)):
                if preds_offline[i] > thred:
                    pred.append(1)
                else:
                    pred.append(0)
            score = eval_file(y_true_10, pred)
            if score > score_best:
                score_best = score
                thred_best = thred
        print u'Best threshold:', thred_best

        for i in range(len(preds_offline)):
            if preds_offline[i] > thred_best:
                preds_offline[i] = 1
            else:
                preds_offline[i] = 0
        print len(preds_offline)
        f1_score = eval_file(y_true_10, preds_offline)
        print 'F1 score is :' + str(f1_score)
        fout = open(outpath,'w')
        for t in preds_offline:
            fout.write(str(t))
            fout.write('\n')
        fout.write('F1 score is :' + str(f1_score))
        fout.close()
    else:
        with open(inpath, 'r') as fin, open(outpath, 'w') as fout:
            line_id = []
            for line in fin:
                lineno, sen1, sen2 = line.strip().split('\t')
                line_id.append(lineno)
            for i in range(len(line_id)):
                if preds_offline[i] >= 0.246:
                    fout.write(line_id[i] + '\t1\n')
                else:
                    fout.write(line_id[i] + '\t0\n')
    print u'Run finished', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # print the current time

 
