import re
import collections
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Load the raw table. Use an absolute path for the file name to avoid
# problems with the current working directory.
data = pd.read_csv('XXX.csv')

# Build one text document per row by joining the string form of each selected
# cell with a single space.
# NOTE: m and n are placeholders that this snippet does not define; set them
# to the bounds of the column slice (a single column or several columns)
# before running.
trainheadlines = [
    ' '.join(str(x) for x in data.iloc[row, m:n])
    for row in range(len(data.index))
]

# min_df=1 keeps every term: each vocabulary term occurs in at least one
# document, so this selects the same terms as the former integer 0, which
# recent scikit-learn releases reject during parameter validation.
# max_df=1.0 is a proportion (no upper cut-off). The integer 1 would be read
# as an absolute document count and would discard every term that appears in
# more than one document.
advancedvectorizer = TfidfVectorizer(
    min_df=1, max_df=1.0, max_features=20000, ngram_range=(1, 1))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
# Report the shape of the fitted TF-IDF matrix.
print(advancedtrain.shape)
# Note: this answer was recommended by community users and is for reference only.