Document Similarity
from src.bag_of_words_model import vocabulary
from src.term_weighing import read_corpus, document_frequencies, tf_idf
if __name__ == '__main__':
    # Build the vocabulary and the document-frequency table from the
    # reference corpus.
    corpus = read_corpus('dat/chronicles_of_narnia.txt')
    vocab = vocabulary(corpus)
    dfs = document_frequencies(vocab, corpus)
    # Corpus size handed to tf_idf; presumably the number of documents
    # returned by read_corpus -- TODO confirm.
    D = len(corpus)

    # Three short documents, tokenized on whitespace.
    documents = [
        'I like this movie very much'.split(),
        'I hate this movie very much'.split(),
        'I love this movie so much'.split(),
    ]

    # One TF-IDF vector per document.
    vs = [tf_idf(vocab, dfs, D, document) for document in documents]

    # Print each vector on its own line. The original called print(vs),
    # which emitted the entire list on every iteration instead of the
    # current vector.
    for v in vs:
        print(v)

    # Sample output shown with the original snippet:
    # {980: 0.31, 7363: 0.52, 7920: 0.70, 11168: 0.51, 11833: 0.51}
{980: 0.31, 6423: 1.24, 7920: 0.70, 10325: 0.53, 11168: 0.51}

Euclidean Similarity
Cosine Similarity

Last updated
Was this helpful?