以下脚本用于对给定的文本输入列进行词形还原(lemmatization):
%%time
import pandas as pd
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
# Materialize gensim's STOPWORDS collection as a plain list; it is passed
# to lemmatize() via the `stopwords` argument below.
STOPWORDS = list(STOPWORDS)
# Load the dataset as CSV straight from the pastebin URL (needs network
# access). The script later reads a 'text' column from each row.
data = pd.read_csv('https://pastebin.com/raw/0SEv1RMf')
def lemmatization(s):
    """Return the list of lemmas that gensim's ``lemmatize`` yields for *s*.

    ``lemmatize`` is called with the module-level ``STOPWORDS`` list and
    ``min_length=3``. Each token it returns is decoded from UTF-8 and only
    the part before the first ``'/'`` is kept.

    NOTE(review): presumably each token is a byte string shaped like
    ``b'lemma/TAG'`` -- confirm against the installed gensim version.
    """
    tagged_tokens = lemmatize(s, stopwords=STOPWORDS, min_length=3)
    # Strip the '/'-separated suffix so only the leading part remains.
    return [raw.decode('utf-8').split('/')[0] for raw in tagged_tokens]
# Run lemmatization() once per row (axis=1) on that row's 'text' value and
# collect the per-row results in X_train.
X_train = data.apply(lambda r: lemmatization(r['text']), axis=1)
print(X_train)
问题:
如何打印词形还原(lemmatization)过程的进度?
发布于 2019-01-06 10:56:33
您可以向词形还原函数(lemmatization)传入一个变量,用来记录该函数被调用的次数,然后每调用 1000 次打印一次。在下面的代码中,我把这个计数器放在一个列表里,这样这个 int 就可以按引用而不是按值传递。
%%time
import pandas as pd
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
# Materialize gensim's STOPWORDS collection as a plain list; it is passed
# to lemmatize() via the `stopwords` argument below.
STOPWORDS = list(STOPWORDS)
# Load the dataset as CSV straight from the pastebin URL (needs network
# access). The script later reads a 'text' column from each row.
data = pd.read_csv('https://pastebin.com/raw/0SEv1RMf')
# Call counter held in a one-element list so lemmatization() can increment
# it in place (a bare int argument could not be updated by the callee).
iteration_count = [0]
def lemmatization(s, iteration_count):
    """Lemmatize *s* and report progress every 1000 calls.

    Args:
        s: Text handed to gensim's ``lemmatize`` together with the
            module-level ``STOPWORDS`` list and ``min_length=3``.
        iteration_count: One-element mutable sequence holding the number
            of calls made so far. Slot 0 is incremented in place on each
            call so the caller sees the updated total.

    Returns:
        A list with, for every token returned by ``lemmatize``, the
        UTF-8-decoded text before the first ``'/'``.

    NOTE(review): presumably each token is a byte string shaped like
    ``b'lemma/TAG'`` -- confirm against the installed gensim version.
    """
    tagged_tokens = lemmatize(s, stopwords=STOPWORDS, min_length=3)
    lemmas = [raw.decode('utf-8').split('/')[0] for raw in tagged_tokens]

    # Count this call, then print the running total on every 1000th call.
    iteration_count[0] += 1
    calls_so_far = iteration_count[0]
    if calls_so_far % 1000 == 0:
        print(calls_so_far)

    return lemmas
# Run lemmatization() once per row (axis=1) on that row's 'text' value,
# passing the shared iteration_count list so progress is tracked across rows.
X_train = data.apply(lambda r: lemmatization(r['text'], iteration_count), axis=1)
print(X_train)
https://stackoverflow.com/questions/54060506
复制相似问题