Python自然语言处理——2.2 条件频率分布
微信公众号:数据运营人
本系列为博主的读书学习笔记,如需转载请注明出处。
第二章 获取文本语料和词汇资源
使用双连词生成随机文本
2.2 条件频率分布
条件和事件
# A plain text is just a sequence of word tokens (the "events").
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
# For a conditional frequency distribution every event is paired with the
# condition it was observed under: (condition, event).
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]
按文体计数词汇
# `nltk` itself must be imported: the listing calls nltk.ConditionalFreqDist,
# and importing only `brown` from nltk.corpus does not bind the name `nltk`.
import nltk
from nltk.corpus import brown

# Count every word of the Brown corpus under the genre it occurs in.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
print(len(cfd))  # number of conditions, i.e. genres

# The same (genre, word) pairing restricted to two genres, built as a list so
# the raw pairs can be inspected before they are counted.
genre_word = [
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)
]
print(len(genre_word))
print(genre_word[:4])
print(genre_word[-4:])

cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd)
print(cfd.conditions())
print(cfd["news"])
print(cfd["romance"])
# Wrap a distribution in list() to see its samples instead of the summary:
# print(list(cfd["romance"]))
print(cfd["romance"]["could"])
返回结果:
绘制分布图和分布表
# Imported here so the listing runs on its own.
import nltk
from nltk.corpus import inaugural

# Condition: the target word stem; sample: the first four characters of the
# file id (the year in this corpus's file names).
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    # Read only the words of the current address. Calling words() with no
    # argument walks the whole corpus once per file, so every year would end
    # up with the same inflated count.
    for w in inaugural.words(fileid)
    for target in ["america", "citizen"]
    if w.lower().startswith(target)
)
print(cfd.conditions())
print(cfd["citizen"])
# Printing a distribution only shows a summary such as
# <FreqDist with N samples and M outcomes>; wrap it in list() to see the samples.
print(list(cfd["citizen"]))
print(list(cfd["america"]))
# Samples are year strings, so look a count up by year (a word such as
# "america" is never a sample here and would always give 0).
print(cfd["citizen"]["1789"])
# Imported here so the listing runs on its own.
import nltk
from nltk.corpus import udhr

languages = ["Chickasaw", "English", "German_Deutsch"]
# Condition: language; sample: word length.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
# tabulate() writes the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
cfd.tabulate(
    conditions=["English", "Chickasaw"],
    samples=range(10),
    cumulative=True,
)
返回结果:
使用双连词生成随机文本
# Imported here so the listing runs on its own.
import nltk

sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven']
# nltk.bigrams() yields consecutive word pairs lazily; list() makes them visible.
print(list(nltk.bigrams(sent)))
def generate_model(cfdist, word, num=15):
    """Print a chain of words, each the most frequent successor of the last.

    Args:
        cfdist: Mapping from a word to a distribution of the words that
            follow it; each distribution must support truth-testing and
            ``.max()`` (e.g. an ``nltk.ConditionalFreqDist`` built from
            bigrams).
        word: The word the chain starts from.
        num: Maximum number of words to print.

    Returns:
        None. The words are written to standard output, one per line.
    """
    for _ in range(num):
        print(word)
        followers = cfdist[word]
        if not followers:
            # Nothing was ever observed after this word; calling .max() on an
            # empty distribution raises, so end the chain here instead.
            break
        word = followers.max()
# Imported here so the listing runs on its own.
import nltk

text = nltk.corpus.genesis.words("english-kjv.txt")
bigrams = nltk.bigrams(text)
# Each bigram (w1, w2) is read as (condition, sample): w2 was seen after w1.
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd["living"])
# generate_model() prints the words itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
generate_model(cfd, "living")
返回结果: