import math
from typing import Callable
# NOTE(review): `Document` is only imported further down this file; confirm it
# is already in scope when this definition is executed.
def read_corpus(filename: str, tokenizer: Callable[[str], list[str]] | None = None) -> list[Document]:
    """Read a text file and tokenize it line by line.

    Every line of the file becomes one document.

    :param filename: path of the text file to read.
    :param tokenizer: callable that turns one raw line into a list of tokens;
        when ``None``, each line is split on whitespace with ``str.split``.
    :return: one token list per line, in file order.
    """
    if tokenizer is None:
        tokenizer = str.split

    # The context manager closes the handle even if the tokenizer raises;
    # a bare open() would leave the file open.
    with open(filename) as fin:
        return [tokenizer(line) for line in fin]


from src.bag_of_words_model import vocabulary
# Load the corpus with the default tokenizer and derive its vocabulary.
corpus = read_corpus(filename='dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)

from src.bag_of_words_model import bag_of_words, Document, Vocab
def print_tfs(vocab: Vocab, documents: list[Document]):
    """Print the term counts of every document, highest count first.

    One list of ``(word, count)`` pairs is printed per document.

    :param vocab: mapping from word to integer id. The ids are used as list
        positions below, so they are assumed to run contiguously from 0
        (TODO confirm against ``vocabulary``).
    :param documents: tokenized documents to report on.
    """
    # Vectorize all documents up front, before anything is printed.
    vectors = [bag_of_words(vocab, doc) for doc in documents]

    # Words ordered by their id, so an id can be used as an index.
    id_to_word = sorted(vocab, key=vocab.get)

    for vector in vectors:
        ranked = sorted(vector.items(), key=lambda item: item[1], reverse=True)
        print([(id_to_word[tid], freq) for tid, freq in ranked])


from elit_tokenizer import EnglishTokenizer
# Three sample sentences used as demo documents.
ds = [
    "As dawn broke, the first light kissed the golden mane of Aslan, the rightful king of Narnia.",
    "The White Witch's icy breath froze the once lush meadows, casting a shadow over Narnia.",
    "Lucy's footsteps echoed in the halls of Cair Paravel, where legends were born."
]
etok = EnglishTokenizer()
documents = [etok.decode(d).tokens for d in ds]

# The captured output must stay out of the statement: glued directly behind the
# call it would subscript print_tfs()'s None result and raise TypeError.
print_tfs(vocab, documents)
# Output:
# [('the', 3), (',', 2), ('of', 2), ('.', 1), ('As', 1), ('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('first', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
# [("'s", 1), (',', 1), ('.', 1), ('Narnia', 1), ('The', 1), ('White', 1), ('Witch', 1), ('a', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('once', 1), ('over', 1), ('shadow', 1), ('the', 1)]
# [("'s", 1), (',', 1), ('.', 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('in', 1), ('legends', 1), ('of', 1), ('the', 1), ('were', 1), ('where', 1)]

from string import punctuation
# Load the stopword list once (one entry per line, lower-cased). The context
# manager closes the file handle, which a bare open() call would leak.
with open('dat/stopwords.txt') as _stopwords_file:
    stopwords = {line.strip().lower() for line in _stopwords_file}


def is_stopwords(w: str) -> bool:
    """Return True if ``w`` is a stopword (case-insensitive) or punctuation.

    NOTE(review): ``w in punctuation`` is a substring test against the
    ``string.punctuation`` string, so the empty string and multi-character
    runs that occur in it (e.g. ``'./'``) also match. Confirm this is intended.

    :param w: a single token.
    :return: whether the token should be filtered out.
    """
    return w.lower() in stopwords or w in punctuation


def sw_tokenizer(s: str) -> list[str]:
    """Split ``s`` on whitespace and drop every token flagged by ``is_stopwords``.

    :param s: raw text, e.g. one line of the corpus.
    :return: the remaining tokens, in their original order.
    """
    return [word for word in s.split() if not is_stopwords(word)]
# Rebuild the corpus and vocabulary using the stopword-filtering tokenizer,
# then print the term counts of the demo documents against that vocabulary.
corpus = read_corpus('dat/chronicles_of_narnia.txt', sw_tokenizer)
vocab = vocabulary(corpus)

# Each statement sits on its own line; the captured output is kept as comments
# so it is neither parsed as code nor applied as a subscript to the call.
print_tfs(vocab, documents)
# Output:
# [('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
# [("'s", 1), ('Narnia', 1), ('White', 1), ('Witch', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('shadow', 1)]
# [("'s", 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('legends', 1)]

from collections import Counter
from src.types import SparseVector
def document_frequencies(vocab: Vocab, corpus: list[Document]) -> SparseVector:
    """Count, for each vocabulary word, how many documents contain it.

    :param vocab: mapping from word to term id.
    :param corpus: tokenized documents.
    :return: mapping from term id to the number of documents in which the
        word occurs; words that are not in ``vocab`` are left out.
    """
    # Going through set(document) makes a word count once per document,
    # no matter how often it is repeated inside that document.
    doc_counts = Counter(word for document in corpus for word in set(document))

    return {
        vocab[word]: doc_counts[word]
        for word in sorted(doc_counts)
        if word in vocab
    }


# Reload the corpus with the default tokenizer of read_corpus.
corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)
# Words ordered by their id, so a term id can be used as a list index.
words = sorted(vocab, key=vocab.get)
dfs = document_frequencies(vocab, corpus)

# For every demo document print the sentence, then one row per term:
# word, count in the document, document frequency in the corpus.
for document in documents:
    bow = bag_of_words(vocab, document)
    # Highest term count first; ties are broken by the lower document frequency.
    tf_df = sorted(
        ((words[tid], tf, dfs[tid]) for tid, tf in sorted(bow.items())),
        key=lambda row: (-row[1], row[2]),
    )
    print(' '.join(document))
    print('\n'.join('{:>10} {} {:>5}'.format(*row) for row in tf_df))
# Output (continues on the following lines):
# As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
# the 3 9574
# of 2 5355
# , 2 10578
# rightful 1 1
# dawn 1 6
# kissed 1 26
# broke 1 35
# king 1 40
# mane 1 40
# golden 1 52
# As 1 161
# light 1 203
# first 1 401
# Narnia 1 512
# Aslan 1 706
# . 1 19747
# The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
# casting 1 1
# froze 1 2
# icy 1 3
# shadow 1 38
# White 1 44
# breath 1 86
# Witch 1 246
# once 1 378
# over 1 431
# Narnia 1 512
# The 1 1352
# 's 1 2404
# a 1 5456
# the 1 9574
# , 1 10578
# . 1 19747
# Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
# footsteps 1 1
# legends 1 1
# echoed 1 2
# halls 1 4
# born 1 14
# Paravel 1 84
# Cair 1 86
# where 1 360
# Lucy 1 704
# were 1 1684
# 's 1 2404
# in 1 3513
# of 1 5355
# the 1 9574
# , 1 10578
# (last row of the captured output above)
# . 1 19747


def tf_idf(vocab: Vocab, dfs: SparseVector, D: int, document: Document) -> SparseVector:
    """Compute the TF-IDF weight of every in-vocabulary term of a document.

    The term frequency is the term's count divided by ``len(document)``;
    the inverse document frequency is ``log(D / df)`` (natural logarithm).

    :param vocab: mapping from word to term id, passed on to ``bag_of_words``.
    :param dfs: mapping from term id to its document frequency.
    :param D: total number of documents the frequencies were counted over.
    :param document: the tokenized document to weight.
    :return: mapping from term id to ``tf * idf``.
    :raises KeyError: if a term id of the document has no entry in ``dfs``.
    """
    # Needs the module-level ``import math``.
    doc_len = len(document)
    return {
        tid: (count / doc_len) * math.log(D / dfs[tid])
        for tid, count in bag_of_words(vocab, document).items()
    }


# Print each demo document followed by its terms, highest TF-IDF score first.
for document in documents:
    tfidf = tf_idf(vocab, dfs, len(corpus), document)
    print(' '.join(document))
    print('\n'.join(['{:>10} {:.2f}'.format(words[tid], score) for tid, score in sorted(tfidf.items(), key=lambda x: x[1], reverse=True)]))
# Output (continues on the following lines):
# As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
# rightful 0.50
# dawn 0.41
# kissed 0.34
# broke 0.32
# king 0.32
# mane 0.32
# golden 0.30
# As 0.25
# light 0.24
# first 0.20
# Narnia 0.19
# Aslan 0.17
# of 0.14
# the 0.13
# , 0.08
# . 0.01
# The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
# casting 0.56
# froze 0.52
# icy 0.50
# shadow 0.35
# White 0.35
# breath 0.31
# Witch 0.25
# once 0.23
# over 0.22
# Narnia 0.21
# The 0.16
# 's 0.12
# a 0.08
# the 0.05
# , 0.04
# . 0.01
# Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
# footsteps 0.63
# legends 0.63
# echoed 0.58
# halls 0.54
# born 0.46
# Paravel 0.35
# Cair 0.35
# where 0.26
# Lucy 0.22
# were 0.16
# 's 0.14
# in 0.12
# of 0.09
# the 0.05
# , 0.05
# . 0.01