1.基本流程

1. 1、准备好数据食材、去停用词并利用结巴 (jieba)进行分词处理

jieba分词模块参考官方文档啦～ ```python

本程序用于将搜狗语料库中的文本进行分词，并且去除停用词

coding=utf-8

import jieba import jieba.posseg as pseg import time import os ''' 训练集：1200 测试集：200 '''

文本分词

def cutText(dirname): # dirname数据目录 for category in os.listdir(dirname): catdir = os.path.join(dirname,category) if not os.path.isdir(catdir): continue files = os.listdir(catdir)

    i = 0
    for cur_file in files:
        print("正在处理"+category+"中的第"+str(i)+"个文件.............")
        filename = os.path.join(catdir,cur_file)
        #读取文本
        with open(filename,"r",encoding='utf-8') as f:
            content = f.read()

        #进行分词
        words = pseg.cut(content)
        # 用于剔除停用词的列表
        finalContent = []
        # 停用词列表
        stopWords = [line.strip() for line in open('Chinesestopword.txt', 'r', encoding='utf-8').readlines()]

        for word in words:
            word = str(word.word)
            # 如果该单词非空格、换行符、不在听用词表中就将其添加进入最终分词列表中
            if len(word)  > 1 and word != '\n' and word != '\u3000' and word not in stopWords:
                finalContent.append(word)

        # 组合成最终需要的字符串
        finalStr = " ".join(finalContent)

        # 写入文件
        writeFileName = writeFilePathPrefix+"/"+category+"/"+str(i)+".txt"
        print(writeFileName)
        with open(writeFileName,"w",encoding = 'utf-8') as f:
            f.write(finalStr)
        i = i + 1

        print("成功处理"+category+"中的第"+str(i)+"个文件～哦耶！")

if name == ' main ': # 记录开始时间 t1=time.time()

readFilePathPrefix = "SogouData/ClassFile"
writeFilePathPrefix = "SogouDataCut"

cutText(readFilePathPrefix)

# 记录结束时间
t2=time.time()
#反馈结果
print("您的分词终于完成，耗时："+str(t2-t1)+"秒。")

```

1. **2、利用卡方检验特征选择**

卡方检验： 在构建每个类别的词向量后，对每一类的每一个单词进行其卡方统计值的计算。 1. 首先对卡方检验所需的 a、b、c、d 进行计算。 a 为在这个分类下包含这个词的文档数量; b 为不在该分类下包含这个词的文档数量; c 为在这个分类下不包含这个词的文档数量; d 为不在该分类下，且不包含这个词的文档数量。 2. 然后得到该类中该词的卡方统计值 公式为 float(pow((a d - b c), 2)) /float((a+c) * (a+b) * (b+d) * (c+d))。 3. 对每一类别的所有词按卡方值进行排序，取前 k 个作为该类的特征值，这里我们取 k 为 1000 ```python

使用开方检验选择特征

按UTF-8编码格式读取文件

import codecs import math import sys

ClassCode = [ '财经','房产','股票','家居','科技','时政','娱乐' ]

构建每个类别的词Set

分词后的文件路径

textCutBasePath = "SogouDataCut/"

构建每个类别的词向量

def buildItemSets(classDocCount): termDic = dict()

# 每个类别下的文档集合用list<set>表示, 每个set表示一个文档，整体用一个dict表示
termClassDic = dict()
for eachclass in ClassCode:
    currClassPath = textCutBasePath+eachclass+"/"
    eachClassWordSets = set()
    eachClassWordList = list()
    for i in range(classDocCount):
        eachDocPath = currClassPath+str(i)+".txt"
        eachFileObj = open(eachDocPath, 'r')
        eachFileContent = eachFileObj.read()
        eachFileWords = eachFileContent.split(" ")
        eachFileSet = set()
        for eachword in eachFileWords:
            stripEachWord = eachword.strip(" ")
            if len(stripEachWord) > 0:
                eachFileSet.add(eachword)
                eachClassWordSets.add(eachword)
        eachClassWordList.append(eachFileSet)
    termDic[eachclass] = eachClassWordSets
    termClassDic[eachclass] = eachClassWordList
return termDic, termClassDic

对得到的两个词典进行计算，可以得到a b c d 值

K为每个类别选取的特征个数

卡方计算公式

def ChiCalc(a, b, c, d): result = float(pow((a d - b c), 2)) /float((a+c) * (a+b) * (b+d) * (c+d)) return result

def featureSelection(termDic, termClassDic, K): termCountDic = dict() for key in termDic: # C000008
classWordSets = termDic[key] # print(classWordSets) classTermCountDic = dict() for eachword in classWordSets: # 对某个类别下的每一个单词的 a b c d 进行计算 # 对卡方检验所需的 a b c d 进行计算 # a：在这个分类下包含这个词的文档数量 # b：不在该分类下包含这个词的文档数量 # c：在这个分类下不包含这个词的文档数量 # d：不在该分类下，且不包含这个词的文档数量 a = 0 b = 0 c = 0 d = 0 for eachclass in termClassDic: # C000008 if eachclass == key: #在这个类别下进行处理 for eachdocset in termClassDic[eachclass]: if eachword in eachdocset: a = a + 1 else: c = c + 1 else: # 不在这个类别下进行处理 for eachdocset in termClassDic[eachclass]: if eachword in eachdocset: b = b + 1 else: d = d + 1

        eachwordcount = ChiCalc(a, b, c, d)
        classTermCountDic[eachword] = eachwordcount

    # 对生成的计数进行排序选择前K个
    # 这个排序后返回的是元组的列表
    sortedClassTermCountDic = sorted(classTermCountDic.items(), key=lambda d:d[1], reverse=True)
    count = 0
    subDic = dict()
    for i in range(K):
        subDic[sortedClassTermCountDic[i][0]] = sortedClassTermCountDic[i][1]
    termCountDic[key] = subDic
return termCountDic

def writeFeatureToFile(termCountDic , fileName): featureSet = set() for key in termCountDic: for eachkey in termCountDic[key]: featureSet.add(eachkey)

count = 1
file = open(fileName, 'w')
for feature in featureSet:
    # 判断feature 不为空
    stripfeature = feature.strip(" ")
    if len(stripfeature) > 0 and feature != " " :
        file.write(str(count)+" " +feature+"\n")
        count = count + 1
        print(feature)
file.close()

if name == ' main ': # 调用buildItemSets # buildItemSets形参表示每个类别的文档数目,在这里训练模型时每个类别取前200个文件 termDic, termClassDic = buildItemSets(1200) termCountDic = featureSelection(termDic, termClassDic, 1000) writeFeatureToFile(termCountDic, "SVMFeature.txt")

```

1. **3、利用 TF IDF算法*进行特征权重计算**

TF-IDF算法 ：

全称叫 Term Frequency-Inverse Document Frequency 词频-逆文档频率算法
主要用于关键词抽取
优点：每个词的权重与特征项在文档中出现的频率成正比，与在整个语料中出现该特征项的文档数成反比。
原理解说：训练文本的特征向量表示数据在 train.svm文件中，测试文本的特征向量表示数据在test.svm 中。

```python

import FeatureSelecion

import math import sys

采用TF-IDF 算法对选取得到的特征进行计算权重

documentCount = 200 # 每个类别选取200篇文档

ClassCode = [ '财经','房产','股票','家居','科技','时政','娱乐' ]

构建每个类别的词Set

分词后的文件路径

textCutBasePath = "SogouDataCut/"

def readFeature(featureName): featureFile = open(featureName, 'r') featureContent = featureFile.read().split('\n') featureFile.close() feature = list() for eachfeature in featureContent: eachfeature = eachfeature.split(" ") if (len(eachfeature)==2): feature.append(eachfeature[1]) # print(feature) return feature

读取所有类别的训练样本到字典中,每个文档是一个list

def readFileToList(textCutBasePath, ClassCode, documentCount): dic = dict() for eachclass in ClassCode: currClassPath = textCutBasePath + eachclass + "/" eachclasslist = list() for i in range(documentCount): eachfile = open(currClassPath+str(i)+".txt") eachfilecontent = eachfile.read() eachfilewords = eachfilecontent.split(" ") eachclasslist.append(eachfilewords) # print(eachfilewords) dic[eachclass] = eachclasslist return dic

计算特征的逆文档频率

def featureIDF(dic, feature, dffilename): dffile = open(dffilename, "w") dffile.close() dffile = open(dffilename, "a")

totalDocCount = 0
idffeature = dict()
dffeature = dict()

for eachfeature in feature:
    docFeature = 0
    for key in dic:
        totalDocCount = totalDocCount + len(dic[key])
        classfiles = dic[key]
        for eachfile in classfiles:
            if eachfeature in eachfile:
                docFeature = docFeature + 1
    # 计算特征的逆文档频率
    featurevalue = math.log(float(totalDocCount)/(docFeature+1))
    dffeature[eachfeature] = docFeature
    # 写入文件，特征的文档频率
    dffile.write(eachfeature + " " + str(docFeature)+"\n")
    # print(eachfeature+" "+str(docFeature))
    idffeature[eachfeature] = featurevalue
dffile.close()
return idffeature

计算Feature's TF-IDF 值

def TFIDFCal(feature, dic,idffeature,filename): file = open(filename, 'w') file.close() file = open(filename, 'a') for key in dic: classFiles = dic[key] # 谨记字典的键是无序的 classid = ClassCode.index(key)

    for eachfile in classFiles:
        # 对每个文件进行特征向量转化
        file.write(str(classid)+" ")
        for i in range(len(feature)):
            if feature[i] in eachfile:
                currentfeature = feature[i]
                featurecount = eachfile.count(feature[i])
                tf = float(featurecount)/(len(eachfile))
                # 计算逆文档频率
                featurevalue = idffeature[currentfeature]*tf
                file.write(str(i+1)+":"+str(featurevalue) + " ")
        file.write("\n")

if name == ' main ': dic = readFileToList(textCutBasePath, ClassCode, documentCount) feature = readFeature("SVMFeature.txt") # print(len(feature)) idffeature = featureIDF(dic, feature, "dffeature.txt") TFIDFCal(feature, dic,idffeature, "train.svm")

```

1. **4、基于训练文本的特征向量数据，使用 LIBSVM库训练SVM 模型**

使用libsvm对train.svm进行模型训练，和对test.svm模型进行预测

测试命令：

```sh 对train.svm文件数据进行缩放到[0,1]区间 ./svm-scale -l 0 -u 1 train.svm > trainscale.svm

对test.svm文件数据进行缩放到[0,1]区间 ./svm-scale -l 0 -u 1 test.svm > testscale.svm

对trainscale.svm 文件进行模型训练 ./svm-train -s 1 trainscale.svm trainscale.model

对testscale.svm 文件进行模型预测，得到预测结果，控制台会输出正确率 ./svm-predict testscale.svm trainscale.model testscale.result ```

1. 5、对于测试集进行特征向量表示，代入训练得到的 SVM 模型中进行预测分类

预测结果：92%

目前这个阶段，能够讲到这个程度，以后在补充吧，小明酱撤退了～

参考文献

一种面向文本分类的机器学习平台的设计与实现（北京邮电大学·赵江山）
中文文本自动校对系统设计与实现（西南交通大学·张涛）
基于Spring Boot的多用户博客系统的设计研究（青海师范大学·罗涛）
基于网络用户行为分析的用户推荐反馈系统的设计（北京化工大学·石钊）
文本分析资源与任务管理系统的设计与实现（北京交通大学·宋奕文）
一种面向文本分类的机器学习平台的设计与实现（北京邮电大学·赵江山）
网络新闻分类系统及个性化新闻网站的研究与应用（内蒙古工业大学·王继明）
中文文本自动校对系统设计与实现（西南交通大学·张涛）
基于Spring Boot的多用户博客系统的设计研究（青海师范大学·罗涛）
Text Classification Based on Graph Convolutional Neural Network with Intimacy Matrix and Text Linking（华中师范大学·夏冰）
基于网络爬虫的信息采集分类系统设计与实现（厦门大学·周茜）
基于商品名称的电商平台商品自动分类的研究与实现（西南交通大学·黄超）
财经领域事件抽取技术的研究与应用（北京理工大学·陈贺）
网站运营分析系统设计与实现（电子科技大学·蒋黎）
基于Python的非结构化数据检索系统的设计与实现（南京邮电大学·董海兰）