Text Classification (power 8 Algorithm Challenge, Round 5)

This round of the contest is right up my alley; it finally has something to do with machine learning. My approach is Bayesian and reaches 85.5% accuracy. I'm sharing it here, and I'd be glad to compare notes with anyone who trained by other methods.


First, a brief aside:

The "small-sample theory" I wrote about earlier is finally finished (for several months straight, my brain turned to mush whenever I thought about the problem), but I want to see how other people handle this area before I boast about it, which is why the second half still hasn't appeared. In this text-classification task I used that "theory" as the basis for re-optimizing the statistical probabilities. I'll share a singular value decomposition method some time later.

Does word segmentation matter:

For text classification, many people probably assume that word segmentation matters a lot; but the first time I looked at the task, I guessed that in theory this is nonsense. Accordingly, the way my method splits sentences is also complete nonsense as far as segmentation correctness goes, yet my accuracy is decent, so please believe that segmentation is not that important. (Of course, I would love for someone to segment the text with one of today's best segmenters and then classify with my trained model; if that gains two or three points, please do share. I have never actually verified this.)


Training process:

Training has two main steps:

Step 1: Count every 2-to-4-character fragment that appears in the articles, along with the number of times each fragment occurs in each category.

Step 2: Estimate the per-category probability opinion of every 2-to-4-character fragment (opinions backed by only a single occurrence are dropped). The probabilities are stored as logarithms, which turns the later multiplications into additions and avoids the various problems caused by precision loss.

The first step is simple, so I'll skip it; the second I don't want to describe just yet, so I'll skip it too. I will share the output of both, though.
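Since I'm not describing step 2, here is only a rough, hypothetical sketch of the overall shape (the add-one smoothing in log_opinions is just a stand-in for my actual probability optimization, which is not shown):

# -*- coding: utf-8 -*-
# rough sketch of the two training steps (hypothetical, simplified)
import math
from collections import defaultdict


def count_fragments(docs, n_class=9):
    # docs: list of (unicode_text, class_id) pairs, class_id in 0..n_class-1
    counts = defaultdict(lambda: [0] * n_class)  # fragment -> per-class counts
    for text, cls in docs:
        for n in xrange(2, 5):  # every 2~4 character fragment
            for i in xrange(len(text) - n + 1):
                counts[text[i:i + n]][cls] += 1
    return counts


def log_opinions(counts, n_class=9):
    opinions = {}
    for frag, per_cls in counts.iteritems():
        total = sum(per_cls)
        if total <= 1:  # a single occurrence carries no reliable opinion
            continue
        # store log P(class | fragment) so later votes can simply be added;
        # add-one smoothing here is only a placeholder for my own method
        opinions[frag] = [math.log((c + 1.0) / (total + n_class))
                          for c in per_cls]
    return opinions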

Classification process:

Treat every fragment above as a dictionary word and segment the text by reverse maximum matching (note: although this does "segment" the text, the dictionary entries are not guaranteed to be real words, so judged purely on segmentation validity the result is usually wrong). Add up the opinions of all the matched words and take the category with the highest probability. That's all. For good measure, the classifier source is pasted below. (The source actually contains both a reverse-matching variant, cut_classify, and a forward-matching variant, cut_classify3; the main routine below calls the forward one.)
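Before the full source, a minimal sketch of the matching-and-voting idea, reusing the hypothetical log_opinions dictionary from the training sketch above:

def classify_sentence(sentence, opinions, n_class=9, max_len=4):
    # reverse maximum matching: scan right to left, try the longest fragment first
    score = [0.0] * n_class
    seen = set()
    i = len(sentence)
    while i >= 2:
        for j in xrange(min(max_len, i), 1, -1):
            frag = sentence[i - j:i]
            if frag in opinions:
                if frag not in seen:  # each distinct fragment votes only once
                    score = [s + o for s, o in zip(score, opinions[frag])]
                    seen.add(frag)
                i -= j  # jump over the matched "word"
                break
        else:
            i -= 1  # no fragment ends at position i: step back one character
    return score.index(max(score))  # class with the highest summed log-probability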


# -*- coding: utf-8 -*-
# created by axuanwu 2015.1.25
# key word: hash  count
import numpy as np
import math


def getseed(str1):
    """

    :param str1: 詞條的utf8形式
    :return: 詞條的hash指紋 256的位隨機數
    """
    h = 0
    for x in str1:
        if ord(x) > 256:
            h <<= 12
            h += ord(x)
        else:
            h <<= 6
            h += ord(x)
    while (h >> 256) > 0:
        h = (h & (2 ** 256 - 1)) ^ (h >> 256)  # fold: keep the number within 256 bits
    return h


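# MCard below is a Bloom-filter-like hash table: each key is mapped to M_num
# positions in the flat array MCARD, and each such slot stores the key's
# index into __keys, so membership testing and value lookup share one table.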
class MCard():
    def __init__(self):
        self.M_num = 8  # hash positions per key
        self.N_max = 16777216  # table size, 2 ** nummax2
        self.nummax2 = 24
        self.MCARD = [0]  # the flat hash table
        self.Opath = ""
        self.index = [0] * 8  # scratch positions for reads
        self.__keys = ['first_NULL']
        self.i_key = 1  # a new element is stored at position i_key
        self.index2 = [0] * 8  # scratch positions for updates

    def get_keys(self, iii=-1):
        # return all keys (iii == -1) or the key stored at index iii
        if iii == -1:
            return self.__keys[1:]
        else:
            return self.__keys[iii]

    def flush_key(self, iii):
        self.__keys[iii] = ""  # clear the stored key

    def getindex(self, str1, for_up=False):
        # compute the term's M_num pseudo-random table positions
        seed = getseed(str1)
        for n in range(0, self.M_num):
            a = 0
            k = (n + 1)
            seed1 = seed
            if (seed >> 64) > 0:  # for long seeds, mix n in so each position differs
                seed1 = seed * (n + 15048796327)
            while seed1 > 0:
                a ^= (seed1 & (self.N_max - 1)) + k
                a = ((a << k) & (self.N_max - 1)) | (a >> (self.nummax2 - k))  # circular left shift within nummax2 bits
                seed1 >>= self.nummax2
            if for_up:
                self.index2[n] = a
            else:
                self.index[n] = a

    def update_card(self, str1):
        """
        :param str1: 詞的utf-8編碼形式
        :param num: 該詞需要增加的value值
        """
        if self.read_card(str1, True) == 0:
            # new term: write its key index into every empty slot
            for iii in self.index:
                if self.MCARD[iii] == 0:
                    self.MCARD[iii] = self.i_key
            if self.i_key % 10000 == 0:
                print self.i_key  # progress
            self.i_key += 1
            self.__keys.append(str1)

    def read_card(self, str1, for_up=False):
        """
        :param str1: 詞的utf-8編碼形式
        :return: 輸出該次條對應的value值
        """
        if for_up:
            for i in xrange(0, 10):  # probe at most 10 suffixed variants
                i_str1 = str1 + str(i)
                if i > 5:
                    print i  # long collision chain, worth noticing
                self.getindex(i_str1)
                aaa = min(self.MCARD[self.index])
                if aaa == 0:  # this variant still has a free slot
                    return 0
            return -1
        else:
            for i in xrange(0, 10):  # follow at most 10 collision probes
                i_str1 = str1 + str(i)
                self.getindex(i_str1)
                aaa = max(self.MCARD[self.index])
                if aaa == 0:  # absent
                    return 0
                elif aaa < self.N_max:
                    if str1 == self.__keys[aaa]:
                        return aaa
            # print ("warning: bad case happened, card array may be too short when updating " + str1)  # too few hash buckets
            return 0

    def setbase(self, num1=16777216, num2=8):
        """
        :param num1: desired array length (rounded up to a power of two)
        :param num2: number of hash positions per term
        """
        self.nummax2 = int(math.ceil(math.log(num1, 2)))
        self.N_max = 2 ** self.nummax2  # table size as a power of two
        self.M_num = num2
        self.index = [0] * num2
        self.index2 = [0] * num2

    def set_card(self, kk=-1, dd=8):
        """
        :param kk: array length parameter; -1 keeps the previously set size
        :param dd: number of hash positions per term
        """
        if -1 == kk:
            self.MCARD = np.repeat(0, self.N_max)
        else:
            self.setbase(kk, dd)
            self.MCARD = np.repeat(0, 2 ** self.nummax2)

    def record_num(self):
        """
        :return: number of terms stored in the dictionary
        """
        return self.i_key - 1

    def card_test(self):
        """
        print a hash-collision index for the current table
        """
        aaa = self.record_num()  # number of stored terms
        bbb = self.N_max
        ccc = 0
        for i in self.MCARD:
            ccc += int(i > 0)  # occupied slots
        ddd = self.M_num
        print math.log(1.0 * ccc / bbb, 10) * ddd, math.log((1.0 * aaa * ddd - ccc) / ccc, 10) * ddd


The above is myclass.py, a hash table for fast lookup. It is probably no better than Python's built-in dict, but I wrote it myself in imitation of a Bloom filter, so it feels comfortable to use and I have kept making do with it. The classifier below uses it.
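For orientation, here is a tiny, hypothetical usage sketch of MCard (not part of the original files):

# hypothetical MCard usage
from myclass import MCard

card = MCard()
card.set_card(2 ** 20, 6)  # 2**20 slots, 6 hash positions per key
card.update_card(u"機器學習")  # insert; the stored value is the key's index
print card.read_card(u"機器學習")  # prints 1: the first inserted key
print card.read_card(u"深度學習")  # prints 0 (almost surely): never inserted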


__author__ = 'axuanwu'
# coding=utf8
import re
import sys
import os
import time
import math
import numpy as np
from myclass import MCard


class ReadClassify():
    def __init__(self):
        self.m_card = MCard()
        self.dict_class = {}
        self.classify_tongji = np.zeros((3, 9))  # rows: predicted counts, correct counts, precision
        self.class_str = []
        self.m_card.set_card(2 ** 27, 6)
        self.mat_row = 3000000
        self.i_file = 0
        self.class_tail = np.array([0.0] * self.mat_row)
        self.word_count = np.zeros((3000000, 9), float)  # opinions of up to the 3,000,000 most common fragments
        self.class_score = np.array([0.0] * 9)
        self.root_dir = ""
        self.max_word_length = 5
        self.re_ch = re.compile(u"[\u4E00-\u9FA5]+", re.U)
        self.re_eng = re.compile(u"[a-zA-Z0-9+\._@]+", re.U)
        self.fazhi = 3  # minimum-frequency threshold ("fazhi" = threshold)


    def set_dict_class(self):
        # map each training subdirectory name to a class index (and back)
        file_list = os.listdir(os.path.join(self.root_dir, "train"))
        i = 0
        for i_dir in file_list:
            self.dict_class[i_dir] = i
            self.class_str.append(i_dir)
            i += 1

    def set_fazhi(self):
        # choose the frequency threshold from canshu.txt so the kept fragments fit in mat_row rows
        o_file = open(os.path.join(os.getcwd(), "canshu.txt"), "r")
        count_my = [0] * 200
        i = 0
        for line in o_file:
            count_my[i] = int(line.rstrip())
            i += 1
        o_file.close()
        i = len(count_my) - 1
        a = self.mat_row
        while count_my[i] < a:
            a -= count_my[i]
            i -= 1
        self.fazhi = max([2, i])

    def set_root(self, path="C:\\Users\\01053185\\Desktop\\yuliao\\yuliao"):
        self.root_dir = path

    def load_dict(self):
        print "loading knowledge takes 1~2 min"
        line_dict = max(self.word_count.shape)
        dict_path = open(os.path.join(os.getcwd(), "tong_ji2new.txt"), "r")
        temp_array = np.zeros((1, 9), float)
        for line in dict_path:
            line_s = line.strip().split("\t")
            for j in xrange(1, len(line_s)):
                temp_array[0, j - 1] = float(line_s[j])
            # if sum(temp_array) < self.fazhi:
            #     continue  # too infrequent; not recorded as a feature
            self.m_card.update_card(line_s[0].decode("utf-8", "ignore"))  # every line is a new term
            aaa = self.m_card.read_card(line_s[0].decode("utf-8", "ignore"))
            self.word_count[aaa,] = temp_array
            if aaa == line_dict - 1:
                break  # the opinion matrix is full
        dict_path.close()
        print "loading knowledge done"

    def cut_classify2(self, sentence):
        # variant: every matching fragment votes (no maximum-matching jump)
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = len_blk
            while i >= 2:
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i - j) < 0:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[(i - j):i])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already voted for this file
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        j -= 1
                i -= 1
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if index_word == 0:  # unknown token: no opinion to add
                continue
            if self.i_file != self.class_tail[index_word]:  # not yet voted for this file
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def cut_classify3(self, sentence):
        # forward maximum matching
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = 0
            while i <= (len_blk - 2):  # allow a 2-character match at the very end
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i + j) > len_blk:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[i:(i + j)])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already voted for this file
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        break
                if j < 2:
                    i += 1
                else:
                    i += j
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if index_word == 0:  # unknown token: no opinion to add
                continue
            if self.i_file != self.class_tail[index_word]:  # not yet voted for this file
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def cut_classify(self, sentence):
        # reverse maximum matching (the method described in the write-up)
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = len_blk
            while i >= 2:
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i - j) < 0:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[(i - j):i])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already voted for this file
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        break
                if j < 2:
                    i -= 1
                else:
                    i -= j
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if index_word == 0:  # unknown token: no opinion to add
                continue
            if self.i_file != self.class_tail[index_word]:  # not yet voted for this file
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def classify_read(self):
        # classify every file under train/ and tally per-class statistics
        class_result = os.path.join(os.getcwd(), "class_result.txt")
        o_file = open(class_result, "w")
        dir_path = os.path.join(self.root_dir, "train")
        dir_list = os.listdir(dir_path)
        for sdir in dir_list:
            dir_path = os.path.join(os.path.join(self.root_dir, "train"), sdir)
            # dir_path = "C:/Users/01053185/Desktop/yuliao/yuliao/test/C000024"
            file_list = os.listdir(dir_path)
            for files in file_list:
                self.i_file += 1
                file_path = os.path.join(dir_path, files)
                self.class_score = np.array([0.0] * 9)
                i_file = open(file_path, "r")
                for line in i_file:
                    self.cut_classify3(line.decode("gbk", 'replace').strip())  # forward-matching variant
                max_pro = max(self.class_score)
                for i in xrange(0, 9):
                    if self.class_score[i] == max_pro:
                        self.classify_tongji[0, self.dict_class[self.class_str[i]]] += 1
                        if sdir == self.class_str[i]:
                            o_file.writelines(file_path + "\t" + self.class_str[i] + "\t" + "1\n")
                            self.classify_tongji[1, self.dict_class[self.class_str[i]]] += 1
                        else:
                            o_file.writelines(file_path + "\t" + self.class_str[i] + "\t" + "0\n")
                        break
        o_file.close()
        try:
            self.classify_tongji[2,] = self.classify_tongji[1,] / self.classify_tongji[0,]  # per-class precision
        except Exception:
            print "warning: per-class precision could not be computed"


if __name__ == "__main__":
    my_classify = ReadClassify()
    my_classify.set_root()
    a = time.time()
    my_classify.set_dict_class()
    # my_classify.set_fazhi()
    my_classify.load_dict()
    # my_classify.m_card.read_card(u"實習")
    print "time is :",time.time() - a,"s"
    my_classify.classify_read()
    print "time is :",time.time() - a,"s"
    print my_classify.classify_tongji

You will probably need to change the root directory before running. The results are written to class_result.txt, where each line shows the file, the predicted class, and whether the prediction was correct, which makes them easy to inspect.
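If you just want a single overall number, a tiny helper like this (hypothetical, not part of the original files) tallies class_result.txt:

# summarise class_result.txt; each line is "path<TAB>predicted_class<TAB>1-or-0"
right = total = 0
for line in open("class_result.txt"):
    total += 1
    right += int(line.rstrip("\n").split("\t")[2])
print "accuracy: %.1f%%" % (100.0 * right / total)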

Finally, the two outputs mentioned above (the fragment statistics and the training result) are on Baidu Pan for anyone to download: http://pan.baidu.com/s/1pJHpMJ5
