| | 类别 class_i | 非类别 class_i |
| --- | --- | --- |
| 包含单词 word_j 的文档数 | A | B |
| 不包含单词 word_j 的文档数 | C | D |
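最终单词 word_j 的 CHI 值计算公式如下,其中 P(class_i) 表示属于类别 class_i 的文档在所有文档中出现的概率,k 为总的类别数,N = A + B + C + D 为文档总数:

$$\mathrm{CHI}(word_j, class_i) = \frac{N\,(AD - BC)^2}{(A + C)(B + D)(A + B)(C + D)}$$

$$\mathrm{CHI}(word_j) = \sum_{i=1}^{k} P(class_i)\,\mathrm{CHI}(word_j, class_i)$$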
def feature_word_select(documents: list, labels: list, percentage: float):
    """Select the highest-ranked fraction of the corpus vocabulary.

    Args:
        documents: Texts of the form ``"word_1 word_2 word_3"``; each one is
            tokenised with ``str.split()``.
        labels: Class labels made up of 0 and 1, passed through to ``chi``
            together with ``documents``.
        percentage: Fraction of the ranked vocabulary to keep.

    Returns:
        The first ``int(percentage * vocabulary_size)`` words of the ranking
        produced by ``chi``.
    """
    # Sort the vocabulary so the order handed to chi() is deterministic.
    vocabulary = sorted({token for text in documents for token in text.split()})
    ranked = chi(vocabulary, documents, labels)
    keep = int(percentage * len(ranked))
    return ranked[:keep]
下面这个函数cal_chi_word_class()用来计算 CHI(word, 0)和CHI(word, 1)。这里的A1表示属于类别1的A,A0表示属于类别0的A。
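此外,由于文档总数 N 是 CHI(word, 0) 和 CHI(word, 1) 分子中的公共因子且保持不变,所以可以不参与计算;又因为 A1+C1=B0+D0,B1+D1=A0+C0,分母中的 (A+C)(B+D) 对两个类别取值相同,同样可以作为公共因子省略,所以 CHI(word, 0) 和 CHI(word, 1) 的分母部分可以进行简化。省略这些公共因子后得到的数值不再是严格意义上的 CHI 值,但由于所有单词都被同一个常数缩放,单词之间的排序保持不变。简化后的代码如下: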
def cal_chi_word_class(word, labels, documents):
    """Return ``(CHI(word, 1), CHI(word, 0))`` using the full chi-square formula.

    For one class the contingency counts are:
        A: documents of the class that contain ``word``
        B: documents of the other class that contain ``word``
        C: documents of the class that do not contain ``word``
        D: documents of the other class that do not contain ``word``
    and ``CHI = N * (A*D - C*B)**2 / ((A+C) * (B+D) * (A+B) * (C+D))``.

    Args:
        word: The token to score.
        labels: Class label per document; ``1`` is class 1, any other value
            is counted as class 0.
        documents: Whitespace-separated texts, indexed in step with ``labels``.

    Returns:
        A ``(chi_word_1, chi_word_0)`` tuple of floats. When a marginal total
        is zero (the word occurs in every document or in none, or only one
        class is present) the statistic is undefined; 0.0 is returned for it
        instead of raising ``ZeroDivisionError``.
    """
    N = len(documents)
    A1 = B1 = C1 = D1 = 0
    for i, document in enumerate(documents):
        contains_word = word in document.split()
        if labels[i] == 1:
            if contains_word:
                A1 += 1
            else:
                C1 += 1
        elif contains_word:
            B1 += 1
        else:
            D1 += 1

    # Seen from class 0 the two table columns swap places.
    A0, B0, C0, D0 = B1, A1, D1, C1

    def chi_square(a, b, c, d):
        denominator = (a + c) * (b + d) * (a + b) * (c + d)
        if denominator == 0:
            # Degenerate table: the word carries no class information.
            return 0.0
        return N * (a * d - c * b) ** 2 / denominator

    chi_word_1 = chi_square(A1, B1, C1, D1)
    chi_word_0 = chi_square(A0, B0, C0, D0)
    return chi_word_1, chi_word_0
def cal_chi_word_class(word, labels, documents):
    """Return simplified ``(CHI(word, 1), CHI(word, 0))`` ranking scores.

    This is the reduced form ``(A*D - C*B)**2 / ((A+B) * (C+D))``: the
    document count N and the class-size product (A+C)*(B+D) are left out of
    the computation. The counts are:
        A: documents of the class that contain ``word``
        B: documents of the other class that contain ``word``
        C: documents of the class that do not contain ``word``
        D: documents of the other class that do not contain ``word``

    Args:
        word: The token to score.
        labels: Class label per document; ``1`` is class 1, any other value
            is counted as class 0.
        documents: Whitespace-separated texts, indexed in step with ``labels``.

    Returns:
        A ``(chi_word_1, chi_word_0)`` tuple of floats. When the word occurs
        in every document or in none, the denominator is zero; 0.0 is
        returned for it instead of raising ``ZeroDivisionError``.
    """
    A1 = B1 = C1 = D1 = 0
    for i, document in enumerate(documents):
        contains_word = word in document.split()
        if labels[i] == 1:
            if contains_word:
                A1 += 1
            else:
                C1 += 1
        elif contains_word:
            B1 += 1
        else:
            D1 += 1

    # Seen from class 0 the two table columns swap places.
    A0, B0, C0, D0 = B1, A1, D1, C1

    def simplified_chi(a, b, c, d):
        denominator = (a + b) * (c + d)
        if denominator == 0:
            # Word present everywhere or nowhere: no class information.
            return 0.0
        return (a * d - c * b) ** 2 / denominator

    chi_word_1 = simplified_chi(A1, B1, C1, D1)
    chi_word_0 = simplified_chi(A0, B0, C0, D0)
    return chi_word_1, chi_word_0
def chi(word_list, documents, labels):
    """Rank words by their class-prior-weighted CHI score, best first.

    Each word's score is ``P0 * CHI(word, 0) + P1 * CHI(word, 1)``, where
    ``P1`` is the share of labels equal to 1 and ``P0 = 1 - P1``.

    Args:
        word_list: Candidate words; its order decides the order of ties.
        documents: Whitespace-separated texts.
        labels: Class label (0 or 1) per document.

    Returns:
        The words of ``word_list`` sorted by descending score. The sort is
        stable, so equally scored words keep their ``word_list`` order.
    """
    # An empty corpus has no class prior; use 0.0 rather than dividing by zero.
    P1 = labels.count(1) / len(documents) if documents else 0.0
    P0 = 1 - P1
    scores = {}
    for word in word_list:
        chi_word_1, chi_word_0 = cal_chi_word_class(word, labels, documents)
        scores[word] = P0 * chi_word_0 + P1 * chi_word_1
    return sorted(scores, key=scores.get, reverse=True)
def main():
    """Print the top 30% feature words of a six-sentence toy corpus."""
    corpus = [
        "today i am happy !",
        "she is not happy at all",
        "let us go shopping !",
        "mike was so sad last night",
        "amy did not love it",
        "it is so amazing !",
    ]
    classes = [1, 0, 1, 0, 0, 1]
    print(feature_word_select(corpus, classes, 0.3))


if __name__ == '__main__':
    main()
['!', 'not', 'all', 'am', 'amazing', 'amy', 'at']
[('!', 9.0), ('not', 4.5), ('all', 1.8), ('am', 1.8), ('amazing', 1.8), ('amy', 1.8), ('at', 1.8), ('did', 1.8), ('go', 1.8), ('i', 1.8), ('last', 1.8), ('let', 1.8), ('love', 1.8), ('mike', 1.8), ...]
def cal_chi_word_class(word, labels, documents):
    """Return simplified ``(CHI(word, 1), CHI(word, 0))`` ranking scores.

    This is the reduced form ``(A*D - C*B)**2 / ((A+B) * (C+D))``: the
    document count N and the class-size product (A+C)*(B+D) are left out of
    the computation. The counts are:
        A: documents of the class that contain ``word``
        B: documents of the other class that contain ``word``
        C: documents of the class that do not contain ``word``
        D: documents of the other class that do not contain ``word``

    Args:
        word: The token to score.
        labels: Class label per document; ``1`` is class 1, any other value
            is counted as class 0.
        documents: Whitespace-separated texts, indexed in step with ``labels``.

    Returns:
        A ``(chi_word_1, chi_word_0)`` tuple of floats. When the word occurs
        in every document or in none, the denominator is zero; 0.0 is
        returned for it instead of raising ``ZeroDivisionError``.
    """
    A1 = B1 = C1 = D1 = 0
    for i, document in enumerate(documents):
        contains_word = word in document.split()
        if labels[i] == 1:
            if contains_word:
                A1 += 1
            else:
                C1 += 1
        elif contains_word:
            B1 += 1
        else:
            D1 += 1

    # Seen from class 0 the two table columns swap places.
    A0, B0, C0, D0 = B1, A1, D1, C1

    def simplified_chi(a, b, c, d):
        denominator = (a + b) * (c + d)
        if denominator == 0:
            # Word present everywhere or nowhere: no class information.
            return 0.0
        return (a * d - c * b) ** 2 / denominator

    chi_word_1 = simplified_chi(A1, B1, C1, D1)
    chi_word_0 = simplified_chi(A0, B0, C0, D0)
    return chi_word_1, chi_word_0


def chi(word_list, documents, labels):
    """Rank words by their class-prior-weighted CHI score, best first.

    Each word's score is ``P0 * CHI(word, 0) + P1 * CHI(word, 1)``, where
    ``P1`` is the share of labels equal to 1 and ``P0 = 1 - P1``.

    Args:
        word_list: Candidate words; its order decides the order of ties.
        documents: Whitespace-separated texts.
        labels: Class label (0 or 1) per document.

    Returns:
        The words of ``word_list`` sorted by descending score. The sort is
        stable, so equally scored words keep their ``word_list`` order.
    """
    # An empty corpus has no class prior; use 0.0 rather than dividing by zero.
    P1 = labels.count(1) / len(documents) if documents else 0.0
    P0 = 1 - P1
    scores = {}
    for word in word_list:
        chi_word_1, chi_word_0 = cal_chi_word_class(word, labels, documents)
        scores[word] = P0 * chi_word_0 + P1 * chi_word_1
    return sorted(scores, key=scores.get, reverse=True)


def feature_word_select(documents: list, labels: list, percentage: float):
    """Select the highest-ranked fraction of the corpus vocabulary.

    Args:
        documents: Texts of the form ``"word_1 word_2 word_3"``; each one is
            tokenised with ``str.split()``.
        labels: Class labels made up of 0 and 1, one per document.
        percentage: Fraction of the ranked vocabulary to keep.

    Returns:
        The first ``int(percentage * vocabulary_size)`` words of the CHI
        ranking.
    """
    # Sort the vocabulary so ties in the ranking resolve alphabetically.
    vocabulary = sorted({token for text in documents for token in text.split()})
    ranked = chi(vocabulary, documents, labels)
    keep = int(percentage * len(ranked))
    return ranked[:keep]


def main():
    """Print the top 30% feature words of a six-sentence toy corpus."""
    documents = [
        "today i am happy !",
        "she is not happy at all",
        "let us go shopping !",
        "mike was so sad last night",
        "amy did not love it",
        "it is so amazing !",
    ]
    labels = [1, 0, 1, 0, 0, 1]
    words = feature_word_select(documents, labels, 0.3)
    print(words)


if __name__ == '__main__':
    main()