利用skift實現fasttext模型

skift: 用於Python fastText的scikit-learn 包裝器

什麼是 skift?

skift包括幾個scikit-learn兼容包裝器,裏面封裝了fasttext模型。fasttext原理類似於word2vec,主要用於文本快速分類。其優勢在於分類速度快,使用n-gram特徵容易獲得文本句子局部信息、構造新詞。缺點是隨着語料的增長,內存需求也會增長。那麼如何解決內存問題呢?fasttext提出了三種解決方法,包括:

  1. 過濾掉出現次數少的詞;
  2. 使用Hash存儲
  3. 採用word粒度,而非char粒度
    例如句子: 我喜歡去中國, 如果採用char粒度,則使用2-gram的話,產生的特徵爲
    我喜 喜歡 歡去 去中 中國
    如果採用word粒度(分詞爲 我 / 喜歡 / 去 / 中國)的話,產生的特徵爲
    我喜歡 喜歡去 去中國

關於fasttext原理,比較好的參考有《FastText文本分類算法學習筆記》和《FastText的內部機制》,這裏不再詳述。

下面使用skift實現fastText,給出一個用於細粒度情感分析的模板:

from tqdm import tqdm
from skift import FirstColFtClassifier
from sklearn.model_selection import KFold
import numpy as np
import os
import pickle

class BasicModel(object):
    """Base class offering batching and F1-scoring helpers.

    ``create_model`` is a hook that subclasses are expected to override;
    the implementation here does nothing.
    """

    def __init__(self):
        """Create the base model; there is no state to initialise."""
        pass

    def create_model(self, kfold_X_train=None, y_train=None, kfold_X_test=None,
                     y_test=None, test=None):
        """Model-construction hook; this base version returns ``None``.

        Every parameter is optional so the hook can be called with no
        arguments, as well as with the original five positional arguments.
        None of the arguments is used here.
        """
        pass

    def batch_iter(self, data, batch_size, num_epochs=1, shuffle=True):
        """Yield consecutive slices of ``data`` of at most ``batch_size`` items.

        Args:
            data: Sequence of samples; converted with ``np.array``.
            batch_size: Maximum number of samples per yielded batch.
            num_epochs: Number of full passes over ``data``.
            shuffle: When true, a fresh random permutation of ``data`` is
                drawn (via ``np.random``) at the start of every epoch.

        Yields:
            ``numpy.ndarray`` slices; the final batch of an epoch may be
            shorter than ``batch_size``. Empty ``data`` yields nothing.
        """
        data = np.array(data)
        data_size = len(data)
        # Integer ceiling division. The float form ``int((n - 1) / b) + 1``
        # evaluates to 1 for n == 0 (when b > 1), which produced a spurious
        # empty batch.
        num_batches_per_epoch = -(-data_size // batch_size)
        for _ in range(num_epochs):
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min(start_index + batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def get_f1_score(self, x, y, verbose=False):
        """Compute an F1 score in which label ``0`` counts as "negative".

        Args:
            x: Predicted labels (array-like of numbers).
            y: Reference labels (array-like of numbers, same shape as ``x``).
            verbose: When true, print precision, recall and F1.

        Returns:
            The F1 value as a float. A small epsilon (1e-8) in every
            denominator avoids division by zero, so the result is 0.0 when
            there are no true positives.
        """
        # Accept plain lists/tuples as well as arrays.
        x = np.asarray(x)
        y = np.asarray(y)

        # Correct non-zero predictions.
        tp = np.sum(np.logical_and(y > 0, x == y))
        # Over-predicted (reference is 0) or mis-predicted (both share a sign
        # and are non-zero, but differ).
        fp = (np.sum(np.logical_and(x > 0, y == 0))
              + np.sum(np.logical_and(x * y > 0, y != x)))
        # Missed: reference is positive but the prediction is 0.
        fn = np.sum(np.logical_and(y > 0, x == 0))

        P = float(tp) / (float(tp + fp) + 1e-8)
        R = float(tp) / (float(tp + fn) + 1e-8)
        F = 2 * P * R / (P + R + 1e-8)

        if verbose:
            print('P->', P)
            print('R->', R)
            print('F->', F)
        return F


class BasicStaticModel(BasicModel):
    """K-fold trainer that fits one classifier per label column.

    For every fold, one model (obtained from ``self.create_model()``) is
    trained per label column, scored on the held-out part of the fold and
    used to predict the test set. Out-of-fold and test probabilities are
    pickled under ``../data/result-ml``.
    """

    # Sizes of the prediction buffers: number of label columns and number of
    # probability columns per label. Subclasses may override these.
    n_labels = 10
    n_classes = 4

    def __init__(self, config=None, n_folds=5, name='BasicStaticModel'):
        """Store the settings and build the K-fold splitter.

        Args:
            config: Optional object; only its ``is_debug`` attribute is read
                (by ``train_predict``). May be ``None``.
            n_folds: Number of cross-validation folds.
            name: Prefix used for the names of the saved result files.
        """
        self.n_folds = n_folds
        self.name = name
        self.config = config
        # Fixed seed so the fold assignment is reproducible across runs.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)

    def train_predict(self, train, train_y, test, option=None):
        """Run K-fold training, print F1 scores and pickle the predictions.

        Args:
            train: Training samples; must support ``.shape`` and indexing by
                an index array.
            train_y: Label matrix indexed as ``train_y[rows][:, label_idx]``.
            test: Test samples; must support ``.shape``.
            option: Unused; kept for interface compatibility.

        Returns:
            None. Results are written to
            ``../data/result-ml/{name}_oof_f1_{score}.pkl`` and
            ``../data/result-ml/{name}_pre_f1_{score}.pkl``.
        """
        name = self.name
        n_labels, n_classes = self.n_labels, self.n_classes

        predict = np.zeros((test.shape[0], n_labels, n_classes))
        oof_predict = np.zeros((train.shape[0], n_labels, n_classes))
        scores_f1 = []

        # ``config`` defaults to None, so read the flag defensively instead of
        # failing with AttributeError after the first fold.
        is_debug = bool(getattr(self.config, 'is_debug', False))

        for train_index, dev_index in self.kf.split(train):
            kfold_X_train, kfold_X_val = train[train_index], train[dev_index]
            y_train, y_dev = train_y[train_index], train_y[dev_index]

            # One independent model per label column.
            model_dict = {}
            print('start train model:')
            for idx in tqdm(range(n_labels)):
                model = self.create_model()
                model.fit(kfold_X_train, y_train[:, idx])
                model_dict[idx] = model
            print('complete train model')

            print('start validate model')
            f1_scores = []
            for idx in tqdm(range(n_labels)):
                model = model_dict[idx]
                dev_prob = model.predict_proba(kfold_X_val)
                test_prob = model.predict_proba(test)

                oof_predict[dev_index, idx] = dev_prob
                # Average the test probabilities over the folds.
                # NOTE: if the debug break below fires, the stored test
                # predictions stay scaled by 1 / n_folds.
                predict[:, idx] += test_prob / self.n_folds

                # The predicted class is the column with highest probability.
                dev_predict = np.argmax(dev_prob, 1)
                f1_scores.append(self.get_f1_score(dev_predict, y_dev[:, idx]))
            f1_score = np.mean(f1_scores)
            scores_f1.append(f1_score)
            print('f1_scores-> ', f1_scores)
            print('f1_score: ', f1_score)
            if is_debug:
                # Debug mode: stop after the first fold.
                break

        mean_f1 = np.mean(scores_f1)
        print('Total f1->', scores_f1)
        print("Total f1'mean is ", mean_f1)

        # Save the results.
        os.makedirs('../data/result-ml', exist_ok=True)

        with open('../data/result-ml/{}_oof_f1_{}.pkl'.format(name, str(mean_f1)), 'wb') as f:
            pickle.dump(oof_predict, f)

        with open('../data/result-ml/{}_pre_f1_{}.pkl'.format(name, str(mean_f1)), 'wb') as f:
            pickle.dump(predict, f)

        print('done')


class Fasttext(BasicStaticModel):
    """K-fold trainer whose per-label model is a skift fastText classifier."""

    def __init__(self, name='basicModel', n_folds=5, config=None):
        """Forward ``name``, ``n_folds`` and ``config`` to the parent class."""
        super().__init__(config=config, n_folds=n_folds, name=name)

    def create_model(self):
        """Return a new, unfitted ``FirstColFtClassifier``.

        Overrides the base-class hook. The classifier is built with
        lr=1.0, epoch=10, wordNgrams=1, minCount=5 and verbose=2.
        """
        hyper_params = {
            'lr': 1.0,
            'epoch': 10,
            'wordNgrams': 1,
            'minCount': 5,
            'verbose': 2,
        }
        return FirstColFtClassifier(**hyper_params)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章