転置インデックスを軽く勉強しつつ実装してみてる
ソース
#!/usr/bin/env python # -*- coding: utf-8 -*- import math index ={} num = 0 def inverted_index(no,str): """ 転置インデックス作成 例 {'banana': [(2, 3), (3, 3)]} key : 文字列 val : タプルのリスト (文章ID,文章IDの文字列での文字位置) """ str_arr =str.split(" ") for i in range(len(str_arr)): if not (str_arr[i] in index): index[str_arr[i]] = [] index[str_arr[i]].append((no,i)) def search(str_arr): """ 検索ワードにあった文章IDを返す TF-IDF を使ってキーワード抽出 参考: http://chalow.net/2005-10-12-1.html """ tf = {} score ={} for i in range(len(str_arr)): if (str_arr[i] in tf): tf[str_arr[i]] += 1 else: tf[str_arr[i]] = 1 for key in tf: if key in index: df = len(index[key]) else: df = 0 idf = math.log(num / (df + 1.0)) tfidf = tf[key] * idf if key in index: for idx in index[key]: n = idx[0] if n in score: score[n] += tfidf else: score[n]=tfidf # 検索結果格納 result = [] for k,v in sorted(score.items(), key=lambda x:x[1], reverse=True): result.append((k,v)) return result if __name__ == "__main__": str =[ "it is what it is", "what is it", "it is a banana", "i have a banana boat", "you are apple", "it is a apple", "yes , i am a boy", "post is in the warter", ] num = len (str) # 転置インデックス作成 for i in range(len(str)): inverted_index(i,str[i]) # 検索文字を指定して検索 search_words = "i am hero" result = search(search_words.split(" ")) # 結果を表示 print search_words print "~~~~~~~~~~~~~~~" for l in result: print "ID:%s STRING:%s SCORE:%s" % (l[0],str[l[0]],l[1])
結果
i am hero ~~~~~~~~~~~~~~~ ID:6 STRING:yes , i am a boy SCORE:2.36712361413 ID:3 STRING:i have a banana boat SCORE:0.980829253012