Term Weighting
Term Frequency
The term frequency (TF) of a term is the number of times it appears in a document. To compute it, the corpus is first read into memory: each line of the input file is treated as one document and tokenized by whitespace unless a custom tokenizer is given.

```python
from typing import Callable

from src.bag_of_words_model import Document

def read_corpus(filename: str, tokenizer: Callable[[str], list[str]] | None = None) -> list[Document]:
    # Each line in the file is one document; split on whitespace by default.
    if tokenizer is None: tokenizer = lambda s: s.split()
    with open(filename) as fin:
        return [tokenizer(line) for line in fin]
```
Read the Chronicles of Narnia corpus and build a vocabulary that maps every word type to a unique index:

```python
from src.bag_of_words_model import vocabulary

corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)
```
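As a quick check, the vocabulary can be inspected directly; it behaves as a dictionary from words to indices. The exact size and entries depend on the corpus and on how `vocabulary()` assigns indices, so the output below is only illustrative:

```python
# Illustrative only: actual values depend on the corpus and on vocabulary().
print(len(vocab))                # number of unique word types
print(list(vocab.items())[:3])   # a few (word, index) pairs
```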
Given the vocabulary, bag_of_words() represents a document as a mapping from term indices to their counts, i.e., its term frequencies. print_tfs() prints every document's terms in decreasing order of frequency:

```python
from src.bag_of_words_model import bag_of_words, Document, Vocab

def print_tfs(vocab: Vocab, documents: list[Document]):
    # One bag-of-words (term index -> count) per document.
    tfs = [bag_of_words(vocab, document) for document in documents]
    # Recover the words in index order so indices can be mapped back to strings.
    words = [word for word, _ in sorted(vocab.items(), key=lambda x: x[1])]
    for tf in tfs:
        print([(words[index], count) for index, count in sorted(tf.items(), key=lambda x: x[1], reverse=True)])
```

Calling print_tfs(vocab, corpus) prints each document's terms from most to least frequent; in raw counts, common function words tend to dominate, which motivates the stopword filtering discussed next.

Stopwords
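Stopwords are high-frequency function words (e.g., "the", "a", "of") that carry little topical content and inflate raw term frequencies. A minimal sketch of filtering them during tokenization, reusing the tokenizer parameter of read_corpus; the file dat/stopwords.txt and its one-word-per-line format are hypothetical, not part of the original course data:

```python
# 'dat/stopwords.txt' is a hypothetical path (one stopword per line), not part of the course data.
with open('dat/stopwords.txt') as fin:
    stopwords = {line.strip().lower() for line in fin if line.strip()}

def drop_stopwords(s: str) -> list[str]:
    # Whitespace tokenization, keeping only tokens that are not stopwords.
    return [token for token in s.split() if token.lower() not in stopwords]

filtered_corpus = read_corpus('dat/chronicles_of_narnia.txt', drop_stopwords)
```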
Document Frequency
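While term frequency counts occurrences within a single document, document frequency (DF) counts the number of documents in which a term appears at least once. A minimal sketch under that definition, reusing the vocab and corpus built above; document_frequencies is an illustrative name, not a function from src.bag_of_words_model:

```python
from collections import Counter
from src.bag_of_words_model import Document, Vocab

def document_frequencies(vocab: Vocab, documents: list[Document]) -> Counter:
    # Count, for each term index, how many documents contain that term at least once.
    dfs = Counter()
    for document in documents:
        # Using a set ignores repeated occurrences within the same document.
        dfs.update({vocab[word] for word in document if word in vocab})
    return dfs

dfs = document_frequencies(vocab, corpus)
```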
TF-IDF
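TF-IDF weights a term's frequency in a document by its inverse document frequency, so terms that are frequent in one document but rare across the corpus receive high weights, while ubiquitous terms receive low ones. One common formulation is tf-idf(t, d) = tf(t, d) * log(N / df(t)), where N is the total number of documents; other variants exist (e.g., smoothed or log-scaled TF), so the version below is an assumption rather than the course's exact definition. A sketch building on bag_of_words and the document_frequencies helper above:

```python
import math

def tf_idf(vocab: Vocab, documents: list[Document]) -> list[dict[int, float]]:
    # One vector per document, mapping term index -> TF-IDF weight.
    dfs = document_frequencies(vocab, documents)
    n = len(documents)
    vectors = []
    for document in documents:
        tf = bag_of_words(vocab, document)
        # Every term seen in a document has df >= 1 when dfs is built from the same documents.
        vectors.append({index: count * math.log(n / dfs[index]) for index, count in tf.items()})
    return vectors

tfidf_vectors = tf_idf(vocab, corpus)
```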
References