Python自然语言处理——2.4 词典资源
微信公众号:数据运营人
本系列为博主的读书学习笔记,如需转载请注明出处。
第二章 获取文本语料和词汇资源
词汇工具: Toolbox和Shoebox
2.4 词典资源
词汇列表语料库
def unusual_words(text, english_vocab=None):
    """Return the sorted alphabetic words of *text* missing from a vocabulary.

    Tokens are lower-cased before comparison; tokens that are not purely
    alphabetic (numbers, punctuation, hyphenated forms) are ignored.

    Args:
        text: Iterable of word tokens (strings).
        english_vocab: Optional iterable of known words. It is lower-cased
            before comparison. When omitted, the NLTK ``words`` corpus is
            loaded; pass a vocabulary explicitly to avoid reloading that
            corpus on every call.

    Returns:
        A sorted list of the lower-cased words of *text* that do not occur
        in the vocabulary.
    """
    if english_vocab is None:
        # Imported lazily so the function is usable without the corpus
        # when the caller supplies its own vocabulary.
        import nltk

        english_vocab = nltk.corpus.words.words()
    known_words = {w.lower() for w in english_vocab}
    text_vocab = {w.lower() for w in text if w.isalpha()}
    return sorted(text_vocab - known_words)
# The snippets in this section use both the top-level package and the
# stop word corpus reader, so both are imported before first use.
import nltk
from nltk.corpus import stopwords

# Unusual words found in the Gutenberg corpus file "austen-sense.txt".
print(unusual_words(nltk.corpus.gutenberg.words("austen-sense.txt")))

# Stop word corpus: high-frequency words such as "to" and "the" that are
# filtered out of a text.
print(stopwords.words("english"))
11
def content_fraction(text, stop_words=None):
    """Return the fraction of tokens in *text* that are not stop words.

    Each token is lower-cased before it is looked up in the stop word
    collection.

    Args:
        text: Sized sequence of word tokens (strings).
        stop_words: Optional iterable of stop words, used as given. When
            omitted, the NLTK English stop word list is loaded.

    Returns:
        The number of non-stop-word tokens divided by the total number of
        tokens, or ``0.0`` when *text* is empty.
    """
    if stop_words is None:
        # Imported lazily so the function is usable without the corpus
        # when the caller supplies its own stop words.
        import nltk

        stop_words = nltk.corpus.stopwords.words("english")
    # A set gives constant-time lookups; the list form is scanned once per
    # token, which is slow on a large corpus.
    stop_set = set(stop_words)
    total = len(text)
    if not total:
        # Avoid ZeroDivisionError on empty input.
        return 0.0
    content_count = sum(1 for w in text if w.lower() not in stop_set)
    return content_count / total
# Fraction of the Reuters corpus tokens that are not stop words.
reuters_words = nltk.corpus.reuters.words()
print(content_fraction(reuters_words))
18
# Find the ambiguous names, i.e. those that appear in both name files.
names = nltk.corpus.names
print(names.fileids())
male_names = names.words("male.txt")
female_name = names.words("female.txt")
# A set makes each membership test constant-time instead of scanning the
# whole female list once per male name; the printed list is unchanged.
female_lookup = set(female_name)
print([w for w in male_names if w in female_lookup])
25
# Names ending in the letter "a" are almost always female: count the last
# letter of every name, conditioned on the file the name comes from.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
# plot() draws the figure itself; wrapping it in print() only echoed its
# return value after the plot.
cfd.plot()
返回结果:
发音的词典
# Load all entries of the cmudict pronouncing-dictionary corpus and report
# how many there are.
entries = nltk.corpus.cmudict.entries()
print(len(entries))

# Show a sample window of entries, one per line.
first, last = 39900, 39951
for item in entries[first:last]:
    print(item)
返回结果:
比较词表
from nltk.corpus import swadesh

# File ids available in the Swadesh comparative wordlist corpus.
print(swadesh.fileids())
# The wordlist stored under the "en" file id.
print(swadesh.words("en"))

# Entries pairing the "fr" and "en" wordlists.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)

# A simple translator: map the first item of each pair to the second.
translate = {source: target for source, target in fr2en}
print(translate["chien"])
8
# Extend the translator with German-English and Spanish-English pairs.
de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English
# dict.update() modifies the dict in place and returns None, so printing
# its result only emitted "None".
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund'])
print(translate['perro'])
词汇工具: Toolbox和Shoebox
from nltk.corpus import toolbox

# Read the entries of the toolbox corpus file "rotokas.dic" and print them.
rotokas_entries = toolbox.entries("rotokas.dic")
print(rotokas_entries)