from src.bag_of_words_model import Document
import glob, os
def collect(dirpath: str) -> dict[int, list[Document]]:
    """Group the *.txt files under `dirpath` into documents keyed by book ID.

    Filenames are expected to look like '<book_id>_<anything>.txt'; each file
    is whitespace-tokenized into one Document.

    :param dirpath: directory containing the .txt files.
    :return: {book_id: list of documents belonging to that book}.
    """
    books: dict[int, list[Document]] = {}
    for filename in glob.glob(os.path.join(dirpath, '*.txt')):
        book_id = int(os.path.basename(filename).split('_')[0])
        # BUG FIX: the original never closed the file handle; `with` closes it.
        with open(filename) as fin:
            books.setdefault(book_id, []).append(fin.read().split())
    return books

def join_documents(dataset: dict[int, list[Document]]) -> list[Document]:
    """Flatten a {book_id: documents} mapping into a single list of documents."""
    return [document for documents in dataset.values() for document in documents]
# Load the train/dev/test splits for document classification.
trn = collect('dat/document_classification/trn')
dev = collect('dat/document_classification/dev')
tst = collect('dat/document_classification/tst')
print(*(len(join_documents(split)) for split in (trn, dev, tst)))
# output: 82 14 14

# Build the vocabulary and document frequencies from the training corpus only.
corpus = join_documents(trn)
vocab = vocabulary(join_documents(trn))
dfs = document_frequencies(vocab, corpus)
D = len(corpus)

def vectorize(vocab: Vocab, dfs: SparseVector, D: int, docset: dict[int, list[Document]]) -> list[tuple[int, SparseVector]]:
    """Turn every document in `docset` into a (book_id, TF-IDF vector) pair.

    :param vocab: word -> term ID mapping.
    :param dfs: term ID -> document frequency.
    :param D: total number of documents used to compute `dfs`.
    :param docset: {book_id: documents} as produced by collect().
    :return: one (label, sparse vector) pair per document.
    """
    return [(book_id, tf_idf(vocab, dfs, D, document))
            for book_id, documents in docset.items()
            for document in documents]

trn_vs = vectorize(vocab, dfs, D, trn)
dev_vs = vectorize(vocab, dfs, D, dev)
tst_vs = vectorize(vocab, dfs, D, tst)

def knn(trn_vs: list[tuple[int, SparseVector]], v: SparseVector, k: int = 1) -> tuple[int, float]:
    """Classify `v` by majority vote among its k most similar training vectors.

    :param trn_vs: (book_id, vector) pairs for the training documents.
    :param v: the vector to classify.
    :param k: number of nearest neighbors to vote (default 1).
    :return: (predicted book_id, best cosine similarity among that book's
             neighbors in the top k).
    """
    sims = [(book_id, cosine_similarity(v, t)) for book_id, t in trn_vs]
    sims.sort(key=lambda x: x[1], reverse=True)
    top = sims[:k]
    # BUG FIX: the original did Counter(sims[:k]) over (book_id, score) PAIRS.
    # Float scores are almost always distinct, so every pair had count 1 and
    # the "majority vote" silently degenerated to 1-NN for any k.
    # Vote on the labels alone; for k == 1 the result is unchanged.
    best_id, _ = Counter(book_id for book_id, _ in top).most_common(1)[0]
    best_score = max(score for book_id, score in top if book_id == best_id)
    return best_id, best_score

correct = 0
# Evaluate 1-NN classification on the development set.
for g_book_id, document in dev_vs:
    p_book_id, p_score = knn(trn_vs, document)
    correct += int(g_book_id == p_book_id)
    print('Gold: {}, Auto: {}, Score: {:.2f}'.format(g_book_id, p_book_id, p_score))
print('Accuracy: {} ({}/{})'.format(100 * correct / len(dev_vs), correct, len(dev_vs)))
# output:
# Gold: 1, Auto: 1, Score: 0.49
Gold: 1, Auto: 1, Score: 0.27
Gold: 3, Auto: 3, Score: 0.36
Gold: 3, Auto: 3, Score: 0.32
Gold: 5, Auto: 5, Score: 0.29
Gold: 5, Auto: 5, Score: 0.54
Gold: 0, Auto: 0, Score: 0.32
Gold: 0, Auto: 0, Score: 0.26
Gold: 6, Auto: 6, Score: 0.48
Gold: 6, Auto: 6, Score: 0.49
Gold: 2, Auto: 2, Score: 0.37
Gold: 2, Auto: 2, Score: 0.31
Gold: 4, Auto: 4, Score: 0.56
Gold: 4, Auto: 4, Score: 0.60
Accuracy: 100.0 (14/14)[Label]\t[Document]
import math
from src.bag_of_words_model import SparseVector
def euclidean_distance(v1: SparseVector, v2: SparseVector) -> float:
    """Euclidean distance between two sparse vectors (dicts of term ID -> value).

    Indices absent from a vector are treated as 0.
    """
    total = 0
    # Terms present in v1 (paired with v2's value, or 0 if absent).
    for k, v in v1.items():
        total += (v - v2.get(k, 0)) ** 2
    # Terms present only in v2.
    for k, v in v2.items():
        if k not in v1:
            total += v ** 2
    return math.sqrt(total)

print(euclidean_distance(vs[0], vs[0]))
print(euclidean_distance(vs[0], vs[1]))
print(euclidean_distance(vs[0], vs[2]))
# output:
# 0.0
1.347450458032576
# output (cont.): 1.3756015678855296

def cosine_similarity(v1: SparseVector, v2: SparseVector) -> float:
    """Cosine similarity between two sparse vectors (dicts of term ID -> value).

    :return: dot(v1, v2) / (||v1|| * ||v2||), or 0.0 if either vector has
             zero norm (BUG FIX: the original raised ZeroDivisionError on,
             e.g., an empty document's vector).
    """
    n = sum(v * v2.get(k, 0) for k, v in v1.items())
    d = math.sqrt(sum(v ** 2 for v in v1.values()))
    d *= math.sqrt(sum(v ** 2 for v in v2.values()))
    return n / d if d else 0.0

print(cosine_similarity(vs[0], vs[0]))
print(cosine_similarity(vs[0], vs[1]))
print(cosine_similarity(vs[0], vs[2]))
# output:
# 0.9999999999999999
0.5775130451716284
0.4826178600593854from typing import Callable
def read_corpus(filename: str, tokenizer: Callable[[str], list[str]] | None = None) -> list[Document]:
    """Read a corpus file with one document per line.

    :param filename: path to the corpus file.
    :param tokenizer: maps a raw line to its tokens; defaults to
                      whitespace splitting.
    :return: one Document (token list) per line.
    """
    if tokenizer is None:
        tokenizer = lambda s: s.split()
    # BUG FIX: the original never closed the file handle; `with` closes it.
    with open(filename) as fin:
        return [tokenizer(line) for line in fin]

from src.bag_of_words_model import vocabulary
corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)

from src.bag_of_words_model import bag_of_words, Document, Vocab

def print_tfs(vocab: Vocab, documents: list[Document]):
    """Print each document's term frequencies as (word, count) pairs, most frequent first."""
    # Invert the vocab: position i holds the word whose term ID is i.
    index_to_word = [word for word, _ in sorted(vocab.items(), key=lambda x: x[1])]
    for document in documents:
        tf = bag_of_words(vocab, document)
        pairs = sorted(tf.items(), key=lambda x: x[1], reverse=True)
        print([(index_to_word[index], count) for index, count in pairs])

from elit_tokenizer import EnglishTokenizer
# Three example sentences from the same domain as the corpus.
ds = [
    "As dawn broke, the first light kissed the golden mane of Aslan, the rightful king of Narnia.",
    "The White Witch's icy breath froze the once lush meadows, casting a shadow over Narnia.",
    "Lucy's footsteps echoed in the halls of Cair Paravel, where legends were born."
]
# Tokenize each sentence with the ELIT English tokenizer.
etok = EnglishTokenizer()
documents = [etok.decode(d).tokens for d in ds]
print_tfs(vocab, documents)
# output:
# [('the', 3), (',', 2), ('of', 2), ('.', 1), ('As', 1), ('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('first', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
# [("'s", 1), (',', 1), ('.', 1), ('Narnia', 1), ('The', 1), ('White', 1), ('Witch', 1), ('a', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('once', 1), ('over', 1), ('shadow', 1), ('the', 1)]
# [("'s", 1), (',', 1), ('.', 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('in', 1), ('legends', 1), ('of', 1), ('the', 1), ('were', 1), ('where', 1)]

from string import punctuation
# BUG FIX: the original opened 'dat/stopwords.txt' without ever closing it.
with open('dat/stopwords.txt') as fin:
    stopwords = {line.strip().lower() for line in fin}

# A token is a stopword if it is in the list (case-insensitive) or is punctuation.
is_stopwords = lambda w: w.lower() in stopwords or w in punctuation
# Whitespace tokenizer that drops stopwords and punctuation.
sw_tokenizer = lambda s: [word for word in s.split() if not is_stopwords(word)]

corpus = read_corpus('dat/chronicles_of_narnia.txt', sw_tokenizer)
vocab = vocabulary(corpus)
print_tfs(vocab, documents)
# output:
# [('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
# [("'s", 1), ('Narnia', 1), ('White', 1), ('Witch', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('shadow', 1)]
# [("'s", 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('legends', 1)]

from collections import Counter
from src.types import SparseVector
def document_frequencies(vocab: Vocab, corpus: list[Document]) -> SparseVector:
    """Count, for each in-vocabulary term, how many documents contain it.

    :param vocab: word -> term ID mapping.
    :param corpus: list of documents (token lists).
    :return: {term ID: document frequency}; out-of-vocabulary words are dropped.
    """
    df = Counter()
    for document in corpus:
        # set() so each word counts at most once per document.
        df.update(set(document))
    items = sorted(df.items())
    return {vocab[word]: count for word, count in items if word in vocab}

corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)
# Position i holds the word whose term ID is i.
words = [word for word, _ in sorted(vocab.items(), key=lambda x: x[1])]
dfs = document_frequencies(vocab, corpus)

# For each example document, print (word, term frequency, document frequency)
# sorted by descending tf, then ascending df.
for document in documents:
    bow = bag_of_words(vocab, document)
    rows = sorted(
        ((words[tid], tf, dfs[tid]) for tid, tf in sorted(bow.items())),
        key=lambda row: (-row[1], row[2]))
    print(' '.join(document))
    print('\n'.join('{:>10} {} {:>5}'.format(*row) for row in rows))
# output:
# As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
the 3 9574
of 2 5355
, 2 10578
rightful 1 1
dawn 1 6
kissed 1 26
broke 1 35
king 1 40
mane 1 40
golden 1 52
As 1 161
light 1 203
first 1 401
Narnia 1 512
Aslan 1 706
. 1 19747
The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
casting 1 1
froze 1 2
icy 1 3
shadow 1 38
White 1 44
breath 1 86
Witch 1 246
once 1 378
over 1 431
Narnia 1 512
The 1 1352
's 1 2404
a 1 5456
the 1 9574
, 1 10578
. 1 19747
Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
footsteps 1 1
legends 1 1
echoed 1 2
halls 1 4
born 1 14
Paravel 1 84
Cair 1 86
where 1 360
Lucy 1 704
were 1 1684
's 1 2404
in 1 3513
of 1 5355
the 1 9574
, 1 10578
# output (cont.): . 1 19747

def tf_idf(vocab: Vocab, dfs: SparseVector, D: int, document: Document) -> SparseVector:
    """TF-IDF vector for `document`.

    tf = count / len(document); idf = log(D / df).

    :param vocab: word -> term ID mapping.
    :param dfs: term ID -> document frequency.
    :param D: total number of documents used to compute `dfs`.
    :return: {term ID: tf * idf} over the document's in-vocabulary terms.
    """
    bow = bag_of_words(vocab, document)
    size = len(document)
    return {tid: (count / size) * math.log(D / dfs[tid]) for tid, count in bow.items()}

for document in documents:
    tfidf = tf_idf(vocab, dfs, len(corpus), document)
    print(' '.join(document))
    print('\n'.join('{:>10} {:.2f}'.format(words[tid], score)
                    for tid, score in sorted(tfidf.items(), key=lambda x: x[1], reverse=True)))
# output:
# As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
rightful 0.50
dawn 0.41
kissed 0.34
broke 0.32
king 0.32
mane 0.32
golden 0.30
As 0.25
light 0.24
first 0.20
Narnia 0.19
Aslan 0.17
of 0.14
the 0.13
, 0.08
. 0.01
The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
casting 0.56
froze 0.52
icy 0.50
shadow 0.35
White 0.35
breath 0.31
Witch 0.25
once 0.23
over 0.22
Narnia 0.21
The 0.16
's 0.12
a 0.08
the 0.05
, 0.04
. 0.01
Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
footsteps 0.63
legends 0.63
echoed 0.58
halls 0.54
born 0.46
Paravel 0.35
Cair 0.35
where 0.26
Lucy 0.22
were 0.16
's 0.14
in 0.12
of 0.09
the 0.05
, 0.05
. 0.01D1 = ['John', 'bought', 'a', 'book', '.', 'The', 'book', 'was', 'funny', '.']
D2 = ['Mary', 'liked', 'the', 'book', '.', 'John', 'gave', 'it', 'to', 'Mary', '.']W = [
'.', # 0
'John', # 1
'Mary', # 2
'The', # 3
'a', # 4
'book', # 5
'bought', # 6
'funny', # 7
'gave', # 8
'it', # 9
'liked', # 10
'the', # 11
'to', # 12
'was' # 13
]# 0 1 2 3 4 5 6 7 8 9 10 11 12 13
v1 = [2, 1, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1]
v2 = [2, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0]v1 = {0:2, 1:1, 3:1, 4:1, 5:2, 6:1, 7:1, 13:1}
v2 = {0:2, 1:1, 2:2, 5:1, 8:1, 9:1, 10:1, 11:1, 12:1}from typing import TypeAlias
Document: TypeAlias = list[str]
Vocab: TypeAlias = dict[str, int]
def vocabulary(documents: list[Document]) -> Vocab:
vocab = set()
for document in documents:
vocab.update(document)
return {word: i for i, word in enumerate(sorted(list(vocab)))}from collections import Counter
SparseVector: TypeAlias = dict[int, int | float]
def bag_of_words(vocab: Vocab, document: Document) -> SparseVector:
counts = Counter(document)
return {vocab[word]: count for word, count in sorted(counts.items()) if word in vocab}documents = [
['John', 'bought', 'a', 'book', '.', 'The', 'book', 'was', 'funny', '.'],
['Mary', 'liked', 'the', 'book', '.', 'John', 'gave', 'it', 'to', 'Mary', '.']
]
# Demo: build the vocabulary and print each document's sparse vector.
vocab = vocabulary(documents)
print(vocab)
for document in documents:
    print(bag_of_words(vocab, document))
# output: {
'.': 0,
'John': 1,
'Mary': 2,
'The': 3,
'a': 4,
'book': 5,
'bought': 6,
'funny': 7,
'gave': 8,
'it': 9,
'liked': 10,
'the': 11,
'to': 12,
'was': 13
}
{0: 2, 1: 1, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 13: 1}
{0: 2, 1: 1, 2: 2, 5: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1}