在做文本分类时,我遇到了一个问题:我需要对俄语文本进行分类。在特征提取阶段,我使用了 scikit-learn 的 TfidfTransformer 和 CountVectorizer,但运行代码后出现了如下错误:
'UnicodeDecodeError: 'utf8' codec can't decode byte 0xc2 in position 0:
invalid continuation byte'。我该如何修复这个错误?下面是我的 Python 代码:
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import os
import nltk
import re
import sys
from nltk import NaiveBayesClassifier
import nltk.classify
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import re
# Root directory of the corpus; each category of texts is addressed as a
# sub-directory of it.
data_path = os.path.abspath(os.path.join('/home/lena/', 'corpus'))

# One directory per category.  The 'fiction' and 'fiction_2' categories are
# currently disabled, so no paths are defined for them.
official_path, official2_path, talk_path, talk2_path = (
    os.path.join(data_path, subdir)
    for subdir in ('official', 'official_2', 'talk', 'talk_2')
)
def get_text(path, encoding=None):
    """Read a whole file and return its contents with LF line endings.

    Args:
        path: Location of the file to read.
        encoding: Name of the codec used to decode the file, for example
            'utf-8', 'cp1251' or 'koi8-r'.  When None (the default) the
            contents are returned undecoded, as a byte string, so that a
            later processing stage can decode them itself.

    Returns:
        The decoded text when ``encoding`` is given, otherwise the raw bytes.
        In both cases CRLF and lone CR line endings are converted to LF.

    Raises:
        UnicodeDecodeError: If ``encoding`` is given and the file content is
            not valid in that encoding.
    """
    # io.open behaves identically on Python 2 and Python 3.  The 'rU' mode
    # used previously was removed from open() in Python 3.11.
    import io

    if encoding is None:
        with io.open(path, 'rb') as handle:
            raw = handle.read()
        # Newlines are normalised on the bytes to keep the universal-newline
        # behaviour of the old 'rU' mode.  This is only safe for
        # ASCII-compatible encodings (not for UTF-16/UTF-32); pass
        # ``encoding`` explicitly for those.
        return raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    # Text mode with the default newline handling translates CRLF/CR to LF.
    with io.open(path, 'r', encoding=encoding) as handle:
        return handle.read()


def get_textdir(path, encoding=None):
    """Read every regular file located directly inside a directory.

    Args:
        path: Directory whose files are read.
        encoding: Forwarded to ``get_text`` for every file; None returns the
            undecoded contents.

    Returns:
        A list with the contents of each file, ordered by file name so that
        the result is reproducible between runs.  Sub-directories and other
        non-file entries are skipped instead of raising an error.
    """
    texts = []
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            texts.append(get_text(full_path, encoding))
    return texts
all_talk = get_textdir(talk_path)
all_official = get_textdir(official_path)
official_2 = get_textdir(official2_path)
talk_2 = get_textdir(talk2_path)
train_set = all_talk
test_set = talk_2
stopWords = stopwords.words('russian')
vectorizer = CountVectorizer(stop_words = stopWords)
print vectorizer
train = vectorizer.fit_transform(train_set).toarray()
test = vectorizer.transform(test_set).toarray()
print 'train set', train
print 'test set', test
transformer.fit(train)
print transformer.transform(train).toarray()
transformer.fit(test)
tfidf = transformer.transform(test)
print tfidf.todense()

发布于 2013-08-02 08:53:02
在向量化器(vectorizer)上设置 charset 参数(在 0.14 及之后的版本中,该参数名为 encoding)。对于俄语文本,可能应该写成
CountVectorizer(charset='koi8r', stop_words=stopWords)(但不要完全相信我的说法,请先用 chardet 或 file 之类的工具检测一下你的文本文件的实际编码)。
https://stackoverflow.com/questions/18011756
复制相似问题