"""Understand sklearn's CountVectorizer and TfidfVectorizer."""
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentences = ["there is a dog dog", "here is a cat"]

# Raw term counts: one row per sentence, one column per vocabulary word.
count_vec = CountVectorizer()
a = count_vec.fit_transform(sentences)
print(a.toarray())
print(count_vec.vocabulary_)
# Output: {'dog': 1, 'there': 4, 'here': 2, 'cat': 0, 'is': 3}
# i.e. the column index assigned to each vocabulary word.
print("=" * 10)

tf_vec = TfidfVectorizer()
b = tf_vec.fit_transform(sentences)
print(b.toarray())
print(tf_vec.vocabulary_)
print(tf_vec.idf_)  # inverse document frequency
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
# on modern versions use get_feature_names_out() instead.
print(tf_vec.get_feature_names())


def mytf_idf(s, words=None):
    """Hand-rolled TF-IDF matching sklearn's defaults (smooth_idf=True, l2 norm).

    Args:
        s: list of whitespace-tokenizable sentences.
        words: vocabulary (sequence of terms). Defaults to the globally
            fitted ``tf_vec``'s feature names so the output can be compared
            against sklearn's.

    Returns:
        The l2-normalized TF-IDF matrix of shape (len(s), len(words));
        it is also printed, preserving the original behavior.
    """
    if words is None:
        words = tf_vec.get_feature_names()
    n_docs = len(s)
    tf_matrix = np.zeros((n_docs, len(words)), dtype=np.float32)
    smooth = 1  # smoothing: pretend every term occurs in one extra document
    df_matrix = np.ones(len(words), dtype=np.float32) * smooth
    for i, sentence in enumerate(s):
        # Hoisted out of the inner loop: the original rebuilt the Counter
        # once per vocabulary word, making this accidentally quadratic.
        counts = Counter(sentence.split())
        for j, word in enumerate(words):
            cnt = counts.get(word, 0)
            tf_matrix[i][j] = cnt
            if cnt > 0:
                df_matrix[j] += 1
    # With smoothing, idf is always >= 1.
    idf_matrix = np.log((n_docs + smooth) / df_matrix) + 1
    matrix = tf_matrix * idf_matrix
    # l2-normalize each row; guard all-zero rows to avoid 0/0 -> NaN.
    norms = np.linalg.norm(matrix, 2, axis=1).reshape(matrix.shape[0], 1)
    norms[norms == 0] = 1
    matrix = matrix / norms
    print(matrix)
    return matrix


print("=" * 10)
mytf_idf(sentences)

"""
TODO:
* IDF could be learned (e.g. via backpropagation in a neural network)
  instead of computed directly.
* CountVectorizer sometimes only needs binary presence/absence of a word,
  not its count.
"""