[Python] Help needed: how should the code below be modified to handle Chinese documents?
#-*- coding: utf-8 -*-
import re
import numpy as np
from utils import normalize
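# NOTE: the plsa() code below relies on normalize() (imported from utils) rescaling its argument in place so that its entries sum to 1.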
"""
Author:
Alex Kong (https://github.com/hitalex)
Reference:
http://blog.tomtung.com/2011/10/plsa
"""
np.set_printoptions(threshold=np.inf)
class Document(object):
    '''
    Splits a text file into an ordered list of words.
    '''
    # List of punctuation characters to scrub. Omits the single apostrophe,
    # which is handled separately so as to retain contractions.
    PUNCTUATION = ['(', ')', ':', ';', ',', '-', '!', '.', '?', '/', '"', '*']
    # Carriage return strings, on *nix and windows.
    CARRIAGE_RETURNS = ['\n', '\r\n']
    # Final sanity-check regex to run on words before they get
    # pushed onto the core words list.
    WORD_REGEX = "^[a-z']+$"
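    # Note: this regex only accepts lowercase ASCII letters and apostrophes,
    # so any token containing Chinese characters is discarded by _clean_word.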
    def __init__(self, filepath):
        '''
        Set source file location, build contractions list, and initialize empty
        lists for lines and words.
        '''
        self.filepath = filepath
        self.file = open(self.filepath)
        self.lines = []
        self.words = []
    def split(self, STOP_WORDS_SET):
        '''
        Split file into an ordered list of words. Scrub out punctuation;
        lowercase everything; preserve contractions; disallow strings that
        include non-letters.
        '''
        self.lines = [line for line in self.file]
        for line in self.lines:
            words = line.split(' ')
            for word in words:
                clean_word = self._clean_word(word)
                if clean_word and (clean_word not in STOP_WORDS_SET) and (len(clean_word) > 1): # omit stop words
                    self.words.append(clean_word)
    def _clean_word(self, word):
        '''
        Parses a space-delimited string from the text and determines whether or
        not it is a valid word. Scrubs punctuation, retains contraction
        apostrophes. If cleaned word passes final regex, returns the word;
        otherwise, returns None.
        '''
        word = word.lower()
        for punc in Document.PUNCTUATION + Document.CARRIAGE_RETURNS:
            word = word.replace(punc, '').strip("'")
        return word if re.match(Document.WORD_REGEX, word) else None
class Corpus(object):
    '''
    A collection of documents.
    '''
    def __init__(self):
        '''
        Initialize empty document list.
        '''
        self.documents = []
    def add_document(self, document):
        '''
        Add a document to the corpus.
        '''
        self.documents.append(document)
    def build_vocabulary(self):
        '''
        Construct a list of unique words in the corpus.
        '''
        # ** ADD ** #
        # exclude words that appear in 90%+ of the documents
        # exclude words that are too (in)frequent
        discrete_set = set()
        for document in self.documents:
            for word in document.words:
                discrete_set.add(word)
        self.vocabulary = list(discrete_set)
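        # One possible way to implement the TODO above: drop words that appear
        # in 90% or more of the documents (document-frequency filtering).
        document_frequency = {}
        for document in self.documents:
            for word in set(document.words):
                document_frequency[word] = document_frequency.get(word, 0) + 1
        max_df = 0.9 * len(self.documents)
        self.vocabulary = [word for word in self.vocabulary if document_frequency[word] < max_df]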
    def plsa(self, number_of_topics, max_iter):
        '''
        Model topics.
        '''
        print("EM iteration begins...")
        # Get vocabulary and number of documents.
        self.build_vocabulary()
        number_of_documents = len(self.documents)
        vocabulary_size = len(self.vocabulary)
        # build term-doc matrix
        term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=int)
        for d_index, doc in enumerate(self.documents):
            term_count = np.zeros(vocabulary_size, dtype=int)
            for word in doc.words:
                if word in self.vocabulary:
                    w_index = self.vocabulary.index(word)
                    term_count[w_index] = term_count[w_index] + 1
            term_doc_matrix[d_index] = term_count
        # Create the counter arrays.
        self.document_topic_prob = np.zeros([number_of_documents, number_of_topics], dtype=float) # P(z | d)
        self.topic_word_prob = np.zeros([number_of_topics, len(self.vocabulary)], dtype=float) # P(w | z)
        self.topic_prob = np.zeros([number_of_documents, len(self.vocabulary), number_of_topics], dtype=float) # P(z | d, w)
        # Initialize
        print("Initializing...")
        # randomly assign values
        self.document_topic_prob = np.random.random(size=(number_of_documents, number_of_topics))
        for d_index in range(len(self.documents)):
            normalize(self.document_topic_prob[d_index]) # normalize for each document
        self.topic_word_prob = np.random.random(size=(number_of_topics, len(self.vocabulary)))
        for z in range(number_of_topics):
            normalize(self.topic_word_prob[z]) # normalize for each topic
        """
# for test, fixed values are assigned, where number_of_documents = 3, vocabulary_size = 15
self.document_topic_prob = np.array(
[[ 0.19893833, 0.09744287, 0.12717068, 0.23964181, 0.33680632],
[ 0.27681925, 0.22971358, 0.1704416, 0.18248461, 0.14054095],
[ 0.24768207, 0.25136754, 0.14392363, 0.14573845, 0.21128831]])
self.topic_word_prob = np.array(
[[ 0.02963563, 0.11659963, 0.06415405, 0.1291839 , 0.09377842,
0.09317023, 0.06140873, 0.023314 , 0.09486251, 0.01538988,
0.09189075, 0.06957687, 0.05015957, 0.05281074, 0.0140651 ],
[ 0.09746902, 0.12212085, 0.07635703, 0.02799546, 0.0282282 ,
0.03685356, 0.01256655, 0.03931912, 0.09545668, 0.00928434,
0.11392475, 0.12089124, 0.02674909, 0.07219077, 0.12059333],
[ 0.02209806, 0.05870101, 0.12101806, 0.03733935, 0.02550749,
0.09906735, 0.0706651 , 0.05619682, 0.10672434, 0.12259672,
0.04218994, 0.10505831, 0.00315489, 0.03286002, 0.09682255],
[ 0.0428768 , 0.11598272, 0.08636138, 0.10917224, 0.05061344,
0.09974595, 0.01647265, 0.06376147, 0.04468468, 0.01986342,
0.10286377, 0.0117712 , 0.08350884, 0.049046 , 0.10327543],
[ 0.02555784, 0.03718368, 0.10109439, 0.02481489, 0.0208068 ,
0.03544246, 0.11515259, 0.06506528, 0.12720479, 0.07616499,
0.11286584, 0.06550869, 0.0653802 , 0.0157582 , 0.11199935]])
"""
        # Run the EM algorithm
        for iteration in range(max_iter):
            print("Iteration #" + str(iteration + 1) + "...")
            print("E step:")
            for d_index, document in enumerate(self.documents):
                for w_index in range(vocabulary_size):
                    prob = self.document_topic_prob[d_index, :] * self.topic_word_prob[:, w_index]
                    if sum(prob) == 0.0:
                        print("d_index = " + str(d_index) + ", w_index = " + str(w_index))
                        print("self.document_topic_prob[d_index, :] = " + str(self.document_topic_prob[d_index, :]))
                        print("self.topic_word_prob[:, w_index] = " + str(self.topic_word_prob[:, w_index]))
                        print("topic_prob[d_index][w_index] = " + str(prob))
                        exit(0)
                    else:
                        normalize(prob)
                        self.topic_prob[d_index][w_index] = prob
            print("M step:")
            # update P(w | z)
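            # M step: P(w | z) ∝ sum over documents d of n(d, w) * P(z | d, w),
            # where n(d, w) is the term-document count; normalized over the vocabulary.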
            for z in range(number_of_topics):
                for w_index in range(vocabulary_size):
                    s = 0
                    for d_index in range(len(self.documents)):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.topic_word_prob[z][w_index] = s
                normalize(self.topic_word_prob[z])
            # update P(z | d)
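            # M step: P(z | d) ∝ sum over words w of n(d, w) * P(z | d, w),
            # normalized over topics for each document.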
            for d_index in range(len(self.documents)):
                for z in range(number_of_topics):
                    s = 0
                    for w_index in range(vocabulary_size):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.document_topic_prob[d_index][z] = s
                # print self.document_topic_prob[d_index]
                # assert(sum(self.document_topic_prob[d_index]) != 0)
                normalize(self.document_topic_prob[d_index])
------Suggested solution----------------------
Try replacing open with codecs.open, passing encoding='utf-8'.
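For Chinese text, switching to codecs.open only fixes the decoding. Document.split still breaks lines on spaces, and _clean_word's WORD_REGEX (^[a-z']+$) rejects anything that is not ASCII letters, so Chinese tokens never reach self.words. Below is a minimal sketch of a Chinese-aware document class, assuming the third-party jieba package for word segmentation; the class name ChineseDocument and the CJK-only regex are illustrative choices, not part of the original project.

# -*- coding: utf-8 -*-
import codecs
import re

import jieba  # third-party segmenter: pip install jieba

# Keep only tokens made entirely of CJK ideographs (an illustrative choice).
CJK_WORD_REGEX = re.compile(u'^[\u4e00-\u9fff]+$')

class ChineseDocument(object):
    '''
    Reads a UTF-8 encoded Chinese text file and segments it into words with jieba,
    so it can be passed to Corpus.add_document() in place of Document.
    '''
    def __init__(self, filepath):
        self.filepath = filepath
        # codecs.open decodes the file to unicode, as suggested in the reply above.
        self.file = codecs.open(self.filepath, mode='r', encoding='utf-8')
        self.lines = []
        self.words = []

    def split(self, STOP_WORDS_SET):
        self.lines = [line for line in self.file]
        for line in self.lines:
            # jieba.cut segments a Chinese sentence into words; no spaces are required.
            for word in jieba.cut(line.strip()):
                word = word.strip()
                if CJK_WORD_REGEX.match(word) and word not in STOP_WORDS_SET and len(word) > 1:
                    self.words.append(word)

With a class along these lines, the rest of the pipeline can stay unchanged, e.g. corpus.add_document(ChineseDocument('some_chinese_file.txt')) followed by corpus.plsa(number_of_topics, max_iter); the stop-word set would of course need to contain Chinese stop words.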