Text Classification (Power 8 Algorithm Challenge, Round 5)

This round of the competition is right up my alley: it finally has something to do with machine learning. My approach is Bayesian and reaches 85.5% accuracy. I am sharing it here, and I would be glad to compare notes with anyone who used other training methods.


First, a small digression:

The "small-sample theory" I wrote about earlier was finally finished recently (for several months in a row, my brain turned to mush whenever I thought about the problem), but I want to see how other people handle this area before I brag, so the second half still has not been written. In this text-classification task I used that "theory" to re-optimize the statistical probabilities. In a while I will also share a singular-value-decomposition method.

Does word segmentation matter?

When it comes to text classification, many people probably assume that word segmentation is important; but the moment I first saw the task, I guessed that in theory this is nonsense. Accordingly, the way my method splits sentences is, judged by segmentation correctness, also complete nonsense; yet given that my accuracy is decent, trust me that segmentation does not matter much. (Of course, I would very much welcome someone segmenting with a state-of-the-art segmenter first and then classifying with my trained model; if that gains two or three points, please share the result. I have never actually verified this.)


Training process:

Training consists of two main steps:

Step 1: scan all articles and count every fragment of 2 to 4 characters, recording how many times each fragment appears in each class.

Step 2: for every 2~4 character fragment, estimate its class-probability "opinion" (fragments that occur only once are discarded). The probabilities are stored as logarithms, which turns the later multiplications into additions and avoids the problems caused by precision loss.

The first step is fairly simple, so I will not describe it; the second I do not want to describe yet, so I will not either. I will, however, share the output files of both. A toy sketch of what the two steps amount to is given below.
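
Purely for illustration, here is what the two steps could look like in miniature. This is not my actual training code; in particular, the add-one smoothing below is a placeholder for my real probability estimate (which uses the "small-sample theory" mentioned above):

# -*- coding: utf-8 -*-
# toy sketch of the two training steps; add-one smoothing is a placeholder
import math
from collections import defaultdict

NUM_CLASSES = 9


def count_fragments(docs):
    # docs: list of (unicode_text, class_id) pairs
    # counts every fragment of 2~4 characters, per class
    counts = defaultdict(lambda: [0] * NUM_CLASSES)
    for text, class_id in docs:
        for n in (2, 3, 4):
            for i in xrange(len(text) - n + 1):
                counts[text[i:i + n]][class_id] += 1
    return counts


def log_opinions(counts):
    # drop fragments seen only once, then store log class probabilities
    # so that later multiplications become additions
    opinions = {}
    for frag, c in counts.iteritems():
        total = sum(c)
        if total <= 1:
            continue
        opinions[frag] = [math.log((x + 1.0) / (total + NUM_CLASSES)) for x in c]
    return opinions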

Classification process:

Treat all of the fragments above as dictionary words and segment with backward (reverse) maximum matching (note: although this is "segmentation", the dictionary entries are not guaranteed to be real words, so judged purely on segmentation correctness the result is usually wrong). Accumulate the opinions of all the matched words and take the class with the highest probability. That is the whole process. To pad things out, the classifier's full source is pasted further below.
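
Before the full source, here is a minimal standalone sketch of that matching-and-scoring loop, continuing the toy sketch above (it omits the once-per-document de-duplication of fragments and the English-token handling that the real code below has):

def classify(text, opinions, num_classes=9, max_word_len=4):
    # backward maximum matching: walk from the end of the text,
    # always trying the longest dictionary fragment first
    score = [0.0] * num_classes
    i = len(text)
    while i >= 2:
        for j in xrange(max_word_len, 1, -1):
            if i - j < 0:
                continue
            frag = text[i - j:i]
            if frag in opinions:
                # accumulate this fragment's per-class log-probability opinion
                score = [s + o for s, o in zip(score, opinions[frag])]
                i -= j
                break
        else:
            i -= 1  # nothing matched ending at position i; step back one character
    return score.index(max(score))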


# -*- coding: utf-8 -*-
# created by axuanwu 2015.1.25
# key word: hash  count
import numpy as np
import math


def getseed(str1):
    """
    :param str1: the term as a unicode string
    :return: the term's hash fingerprint, an integer of at most 256 bits
    """
    h = 0
    for x in str1:
        if ord(x) > 256:  # wide (e.g. CJK) character: shift in 12 bits
            h <<= 12
            h += ord(x)
        else:  # narrow (ASCII) character: shift in 6 bits
            h <<= 6
            h += ord(x)
    while (h >> 256) > 0:
        h = (h & (2 ** 256 - 1)) ^ (h >> 256)  # fold so the fingerprint stays within 256 bits
    return h


class MCard():
    def __init__(self):
        self.M_num = 8
        self.N_max = 16777216
        self.nummax2 = 24
        self.MCARD = [0]
        self.Opath = ""
        self.index = [0] * 8
        self.__keys = ['first_NULL']
        self.i_key = 1  # 新增元素增加在位置 i_key 处
        self.index2 = [0] * 8

    def get_keys(self, iii=-1):
        if iii == -1:
            return self.__keys[1:]
        else:
            return self.__keys[iii]

    def flush_key(self, iii):
        self.__keys[iii] = ""  # 去掉keys的值

    def getindex(self, str1, for_up=False):
        # compute the term's M_num pseudo-random slot positions in the card array
        seed = getseed(str1)
        for n in range(0, self.M_num):
            a = 0
            k = (n + 1)
            seed1 = seed
            if (seed >> 64) > 0:  # for long fingerprints, mix in a per-position multiplier
                seed1 = seed * (n + 15048796327)
            while seed1 > 0:
                a ^= (seed1 & (self.N_max - 1)) + k
                a = ((a << k) & (self.N_max - 1)) | (a >> (self.nummax2 - k))  # rotate left by k within nummax2 bits
                seed1 >>= self.nummax2
            if for_up:
                self.index2[n] = a
            else:
                self.index[n] = a

    def update_card(self, str1):
        """
        :param str1: the term as a unicode string; it is registered as a new key if absent
        """
        if self.read_card(str1, True) == 0:
            # new term: write its key index into every free slot
            for iii in self.index:
                if self.MCARD[iii] == 0:
                    self.MCARD[iii] = self.i_key
            if self.i_key % 10000 == 0:
                print self.i_key  # progress report
            self.i_key += 1
            self.__keys.append(str1)

    def read_card(self, str1, for_up=False):
        """
        :param str1: the term as a unicode string
        :return: the key index stored for the term, or 0 if it is absent
        """
        if for_up:
            for i in xrange(0, 10):  # probe at most 10 collision variants
                i_str1 = str1 + str(i)
                if i > 5:
                    print i  # debug: unusually deep collision chain
                self.getindex(i_str1)
                aaa = min(self.MCARD[self.index])
                if aaa == 0:
                    return 0
            return -1
        else:
            for i in xrange(0, 10):  # resolve at most 10 consecutive collisions
                i_str1 = str1 + str(i)
                self.getindex(i_str1)
                aaa = max(self.MCARD[self.index])
                if aaa == 0:  # absent
                    return 0
                elif aaa < self.N_max:
                    if str1 == self.__keys[aaa]:
                        return aaa
            # print ("warning : bad case happened , card array maybe too short when update " + str1) # hash 桶太少
            return 0

    def setbase(self, num1=16777216, num2=8):
        """
        :param num1: array-length parameter
        :param num2: number of hash positions per term
        """
        self.nummax2 = int(math.ceil(math.log(num1, 2)))
        self.N_max = 2 ** self.nummax2  # round the array length up to a power of two
        self.M_num = num2
        self.index = [0] * num2
        self.index2 = [0] * num2

    def set_card(self, kk=-1, dd=8):
        """
        :param kk: array-length parameter; -1 keeps the previously set size
        :param dd: number of hash positions per term
        """
        if -1 == kk:
            self.MCARD = np.repeat(0, self.N_max)
        else:
            self.setbase(kk, dd)
            self.MCARD = np.repeat(0, 2 ** self.nummax2)

    def record_num(self):
        """
        :return: the number of dictionary entries
        """
        return self.i_key - 1

    def card_test(self):
        """
        report hash-collision statistics
        """
        aaa = self.record_num()
        bbb = self.N_max
        ccc = 0
        for i in self.MCARD:
            ccc += int(i > 0)  # occupied slots
        ddd = self.M_num
        print math.log(1.0 * ccc / bbb, 10) * ddd, math.log((1.0 * aaa * ddd - ccc) / ccc, 10) * ddd


The code above is myclass.py, a hash structure used for fast lookup. It is probably no better than Python's built-in dict, but I wrote it myself in imitation of a Bloom filter, so it feels familiar and I keep making do with it. The classifier below uses it.
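
If you want to try it in isolation, a minimal usage sketch (the array size here is arbitrary, and the example strings are made up):

# -*- coding: utf-8 -*-
from myclass import MCard

card = MCard()
card.set_card(2 ** 20, 8)        # 2**20-slot array, 8 hash positions per term
card.update_card(u"例子")        # register a new term
idx = card.read_card(u"例子")    # non-zero key index if the term is present
print idx, card.get_keys(idx)
print card.read_card(u"没见过")  # 0 for an unseen term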


# -*- coding: utf-8 -*-
__author__ = 'axuanwu'
import re
import sys
import os
import time
import math
import numpy as np
from myclass import *


class ReadClassify():
    def __init__(self):
        self.m_card = MCard()
        self.dict_class = {}
        self.classify_tongji = np.zeros((3, 9))  # per class: predicted count, correct count, precision
        self.class_str = []
        self.m_card.set_card(2 ** 27, 6)
        self.mat_row = 3000000
        self.i_file = 0
        self.class_tail = np.array([0.0] * self.mat_row)  # id of the last document that scored each fragment
        self.word_count = np.zeros((3000000, 9), float)  # per-class opinions for the 3,000,000 most common fragments
        self.class_score = np.array([0.0] * 9)
        self.root_dir = ""
        self.max_word_length = 5
        self.re_ch = re.compile(u"[\u4E00-\u9FA5]+", re.U)
        self.re_eng = re.compile(u"[a-zA-Z0-9+\._@]+", re.U)
        self.fazhi = 3  # minimum-frequency threshold


    def set_dict_class(self):
        file_list = os.listdir(os.path.join(self.root_dir, "train"))
        i = 0
        for i_dir in file_list:
            self.dict_class[i_dir] = i
            self.class_str.append(i_dir)
            i += 1

    def set_fazhi(self):
        # pick the frequency threshold so that roughly mat_row fragments are kept
        o_file = open(os.path.join(os.getcwd(), "canshu.txt"), "r")
        count_my = [0] * 200
        i = 0
        for line in o_file:
            count_my[i] = int(line.rstrip())
            i += 1
        o_file.close()
        i = len(count_my) - 1
        a = self.mat_row
        while count_my[i] < a:
            a -= count_my[i]
            i -= 1
        self.fazhi = max([2, i])

    def set_root(self, path="C:\\Users\\01053185\\Desktop\\yuliao\\yuliao"):
        self.root_dir = path

    def load_dict(self):
        print "loading knowledge takes 1~2 min"
        line_dict = max(self.word_count.shape)
        dict_path = open(os.path.join(os.getcwd(), "tong_ji2new.txt"), "r")
        temp_array = np.zeros((1, 9), float)
        for line in dict_path:
            line_s = line.strip().split("\t")
            for j in xrange(1, len(line_s)):
                temp_array[0, j - 1] = float(line_s[j])
            # if sum(temp_array) < self.fazhi:
            # continue  # skip fragments that occur too rarely
            self.m_card.update_card(line_s[0].decode("utf-8", "ignore"))  # every line is a new term
            aaa = self.m_card.read_card(line_s[0].decode("utf-8", "ignore"))
            self.word_count[aaa,] = temp_array
            if aaa == line_dict - 1:
                break
                # if aaa == 10000:
                #     break
        dict_path.close()
        print "loading knowledge done"

    def cut_classify2(self, sentence):
        # variant: scores every matched fragment instead of skipping ahead after a match
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = len_blk
            while i >= 2:
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i - j) < 0:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[(i - j):i])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        j -= 1
                i -= 1
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                pass
            else:
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def cut_classify3(self, sentence):
        # forward maximum matching
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = 0
            while i < (len_blk - 2):
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i + j) > len_blk:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[i:(i + j)])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        break
                if j < 2:
                    i += 1
                else:
                    i += j
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                pass
            else:
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def cut_classify(self, sentence):
        # backward (reverse) maximum matching
        blocks = re.findall(self.re_ch, sentence)
        for blk in blocks:
            len_blk = len(blk)
            i = len_blk
            while i >= 2:
                j = self.max_word_length  # maximum word length
                while j >= 2:
                    if (i - j) < 0:
                        j -= 1
                        continue
                    index_word = self.m_card.read_card(blk[(i - j):i])
                    if index_word == 0:
                        j -= 1
                        continue
                    else:
                        if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                            pass
                        else:
                            # print blk[i:(i + j)]
                            self.class_score += self.word_count[index_word,]
                            self.class_tail[index_word] = self.i_file
                        break
                if j < 2:
                    i -= 1
                else:
                    i -= j
        blocks = re.findall(self.re_eng, sentence)
        for blk in blocks:
            index_word = self.m_card.read_card(blk)
            if self.i_file == self.class_tail[index_word]:  # fragment already scored for this document
                pass
            else:
                self.class_score += self.word_count[index_word,]
                self.class_tail[index_word] = self.i_file

    def classify_read(self):
        class_result = os.path.join(os.getcwd(), "class_result.txt")
        o_file = open(class_result, "w")
        class_numbers = self.word_count.shape
        dir_path = os.path.join(self.root_dir, "train")
        dir_list = os.listdir(dir_path)
        for sdir in dir_list:
            dir_path = os.path.join(os.path.join(self.root_dir, "train"), sdir)
            # dir_path = "C:/Users/01053185/Desktop/yuliao/yuliao/test/C000024"
            file_list = os.listdir(dir_path)
            for files in file_list:
                self.i_file += 1
                file_path = os.path.join(dir_path, files)
                self.class_score = np.array([0.0] * 9)
                i_file = open(file_path, "r")
                for line in i_file:
                    self.cut_classify3(line.decode("gbk", 'replace').strip())
                max_pro = max(self.class_score)
                for i in xrange(0, 9):
                    if self.class_score[i] == max_pro:
                        self.classify_tongji[0, self.dict_class[self.class_str[i]]] += 1
                        if sdir == self.class_str[i]:
                            o_file.writelines(file_path + "\t" + self.class_str[i] + "\t" + "1\n")
                            self.classify_tongji[1, self.dict_class[self.class_str[i]]] += 1
                        else:
                            o_file.writelines(file_path + "\t" + self.class_str[i] + "\t" + "0\n")
                        break
        o_file.close()
        try:
            # row 2 = per-class precision: correct / predicted
            self.classify_tongji[2,] = self.classify_tongji[1,] / self.classify_tongji[0,]
        except:
            print "warning: could not compute per-class precision"


if __name__ == "__main__":
    my_classify = ReadClassify()
    my_classify.set_root()
    a = time.time()
    my_classify.set_dict_class()
    # my_classify.set_fazhi()
    my_classify.load_dict()
    # my_classify.m_card.read_card(u"实习")
    print "time is :",time.time() - a,"s"
    my_classify.classify_read()
    print "time is :",time.time() - a,"s"
    print my_classify.classify_tongji

You will probably need to change the root directory before the program runs; the results are also written to class_result.txt, where they are easy to read at a glance.
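
For example, the only change needed on another machine is something like this (the path is purely illustrative):

my_classify.set_root("D:\\data\\yuliao")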

Finally, the two outputs mentioned above (the raw counts and the trained estimates) are on Baidu Pan for anyone to download: http://pan.baidu.com/s/1pJHpMJ5