day05 - Naive Bayes Algorithm


# coding=utf-8
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

def naivebayes():

    # Fetch the 20 newsgroups data
    news = fetch_20newsgroups(subset="all")

    # Split the news data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25)

    # Extract keyword features with TF-IDF
    tf = TfidfVectorizer()

    # Learn the vocabulary from the training set and transform it
    x_train = tf.fit_transform(x_train)

    # Transform the test set with the vocabulary learned from the training set
    x_test = tf.transform(x_test)

    print("Feature names:")
    print(tf.get_feature_names_out())  # get_feature_names() was removed in newer scikit-learn versions

    # Predict with the multinomial Naive Bayes classifier
    mlt = MultinomialNB()

    mlt.fit(x_train, y_train)

    print("Feature values of the test set:")
    print(x_test.toarray())

    print("Predicted classes:", mlt.predict(x_test))

    print("Accuracy on the test set:", mlt.score(x_test, y_test))
    return None

if __name__ == '__main__':
    naivebayes()
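
Note that x_test is transformed with the vocabulary that TfidfVectorizer learned from the training set; calling fit_transform on the test set would build a different vocabulary, and the feature columns would no longer line up. A minimal sketch of this point on a made-up corpus (the sentences below are illustrative only, not from the news data):

# coding=utf-8
from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny made-up corpus to show the fit_transform / transform distinction
train_docs = ["the cat sat on the mat", "the dog barked"]
test_docs = ["the cat barked"]

tf = TfidfVectorizer()
x_train = tf.fit_transform(train_docs)   # learns the vocabulary from the training docs
x_test = tf.transform(test_docs)         # reuses that vocabulary; no new words are added

print(tf.get_feature_names_out())        # vocabulary comes only from train_docs
print(x_train.shape, x_test.shape)       # both matrices share the same number of columns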


The Naive Bayes algorithm computes, for each candidate class, the probability that the input belongs to that class, and predicts the class with the highest probability.
It is well suited to text classification, such as categorizing articles.
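
A minimal sketch of this "highest probability wins" idea using predict_proba; the toy articles and labels below are made up for illustration, not taken from the news dataset:

# coding=utf-8
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Made-up toy articles with two classes: 0 = sports, 1 = tech
docs = ["the team won the game",
        "the players scored a goal",
        "the new cpu is fast",
        "the software update fixed the bug"]
labels = [0, 0, 1, 1]

tf = TfidfVectorizer()
x = tf.fit_transform(docs)

mlt = MultinomialNB()
mlt.fit(x, labels)

new_doc = tf.transform(["the game was won by the team"])
proba = mlt.predict_proba(new_doc)   # probability of each class for the new article
print(proba)
print(np.argmax(proba, axis=1))      # index of the class with the highest probability
print(mlt.predict(new_doc))          # predict() returns that same class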