CS|QTM|LING-329: Computational Linguistics (Spring 2025)
python -m pip install --upgrade pip
pip install setuptools
pip install elit_tokenizer

from elit_tokenizer import EnglishTokenizer
if __name__ == '__main__':
    text = 'Emory NLP is a research lab in Atlanta, GA. It was founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University.'
    tokenizer = EnglishTokenizer()
    sentence = tokenizer.decode(text)
    print(sentence.tokens)
    print(sentence.offsets)

['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'was', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.']
[(0, 5), (6, 9), (10, 12), (13, 14), (15, 23), (24, 27), (28, 30), (31, 38), (38, 39), (40, 42), (42, 43), (44, 46), (47, 50), (51, 58), (59, 61), (62, 67), (68, 70), (71, 75), (76, 78), (79, 83), (83, 84), (85, 88), (89, 93), (94, 96), (97, 98), (99, 108), (109, 111), (112, 117), (118, 128), (128, 129)]
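Each offset pair is the character span of the corresponding token in the original string, so the tokenization can be checked against the input. A minimal sketch, continuing the snippet above:

# Every (start, end) pair indexes back into the original text.
for token, (start, end) in zip(sentence.tokens, sentence.offsets):
    assert text[start:end] == token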
.gitignore
src/__init__.py
src/homework/__init__.py
src/homework/getting_started.py

.idea/
.venv/
Virtualenv


{
    'The Lion , the Witch and the Wardrobe': {
        'title': 'The Lion , the Witch and the Wardrobe',
        'year': 1950,
        'chapters': [
            {
                'number': 1,
                'title': 'Lucy Looks into a Wardrobe',
                'token_count': 1915
            },
            {
                'number': 2,
                'title': 'What Lucy Found There',
                'token_count': 2887
            },
            ...
        ]
    },
    'Prince Caspian : The Return to Narnia': {
        'title': 'Prince Caspian : The Return to Narnia',
        'year': 1951,
        'chapters': [
            ...
        ]
    },
    ...
}
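Once the data is in this shape, per-book statistics fall out of a simple traversal. A minimal sketch, assuming the dictionary above has been saved as JSON (the path dat/chronicles_of_narnia.json is hypothetical):

import json

with open('dat/chronicles_of_narnia.json') as fin:  # hypothetical path
    books = json.load(fin)

for title, book in books.items():
    tokens = sum(chapter['token_count'] for chapter in book['chapters'])
    print(f"{book['year']} {title}: {len(book['chapters'])} chapters, {tokens} tokens")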
from typing import TypeAlias
Unigram: TypeAlias = dict[str, float]

from collections import Counter

def unigram_count(filepath: str) -> Counter:
    unigrams = Counter()
    for line in open(filepath):
        words = line.split()
        unigrams.update(words)
    return unigrams

def unigram_estimation(filepath: str) -> Unigram:
    counts = unigram_count(filepath)
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}

from collections.abc import Callable

def test_unigram(filepath: str, estimator: Callable[[str], Unigram]):
    unigrams = estimator(filepath)
    unigram_list = [(word, prob) for word, prob in sorted(unigrams.items(), key=lambda x: x[1], reverse=True)]
    for word, prob in unigram_list[:300]:
        if word[0].isupper() and word.lower() not in unigrams:
            print(f'{word:>10} {prob:.6f}')

corpus = 'dat/chronicles_of_narnia.txt'
test_unigram(corpus, unigram_estimation)

         I 0.010543
Aslan 0.001850
Lucy 0.001815
Edmund 0.001409
Narnia 0.001379
Caspian 0.001338
Jill 0.001262
Peter 0.001034
Shasta 0.000928
Digory 0.000925
Eustace 0.000877
Susan 0.000654
Tirian 0.000601
Polly 0.000547
Aravis 0.000537
Bree 0.000492
Puddleglum 0.000492
Scrubb 0.000482
    Andrew 0.000406

Bigram: TypeAlias = dict[str, Unigram | float]

from collections import Counter, defaultdict
from src.types import Bigram

def bigram_count(filepath: str) -> dict[str, Counter]:
    bigrams = defaultdict(Counter)
    for line in open(filepath):
        words = line.split()
        for i in range(1, len(words)):
            bigrams[words[i - 1]].update([words[i]])
    return bigrams

from src.types import Bigram
def bigram_estimation(filepath: str) -> Bigram:
    counts = bigram_count(filepath)
    bigrams = dict()
    for prev, ccs in counts.items():
        total = sum(ccs.values())
        bigrams[prev] = {curr: count / total for curr, count in ccs.items()}
    return bigrams

def test_bigram(filepath: str, estimator: Callable[[str], Bigram]):
    bigrams = estimator(filepath)
    for prev in ['I', 'the', 'said']:
        print(prev)
        bigram_list = [(curr, prob) for curr, prob in sorted(bigrams[prev].items(), key=lambda x: x[1], reverse=True)]
        for curr, prob in bigram_list[:10]:
            print("{:>10} {:.6f}".format(curr, prob))

test_bigram(corpus, bigram_estimation)

I
'm 0.081628
do 0.075849
've 0.044065
was 0.041897
have 0.038045
am 0.035878
'll 0.032507
think 0.032025
'd 0.026246
know 0.025765
the
same 0.014846
other 0.013405
King 0.012528
Witch 0.011776
whole 0.009020
others 0.008958
first 0.008770
Dwarf 0.008582
door 0.008519
great 0.008519
said
the 0.157635
, 0.073645
Lucy 0.057635
Edmund 0.045074
Caspian 0.040394
Peter 0.039409
Jill 0.034975
. 0.034729
Digory 0.031034
     Aslan 0.030049

Emory University is a private research university in Atlanta, Georgia. Founded in 1836 as Emory College by the Methodist Episcopal Church and named in honor of Methodist bishop John Emory.[18]
Emory University has nine academic divisions. Emory Healthcare is the largest healthcare system in the state of Georgia[19] and comprises seven major hospitals, including Emory University Hospital and Emory University Hospital Midtown.[20] The university operates the Winship Cancer Institute, Yerkes National Primate Research Center, and many disease and vaccine research centers.[21][22] Emory University is the leading coordinator of the U.S. Health Department's National Ebola Training and Education Center.[23] The university is one of four institutions involved in the NIAID's Tuberculosis Research Units Program.[24] The International Association of National Public Health Institutes is headquartered at the university.[25]
Emory University has the 15th-largest endowment among U.S. colleges and universities.[9] The university is classified among "R1: Doctoral Universities - Very high research activity"[26] and is cited for high scientific performance and citation impact in the CWTS Leiden Ranking.[27] The National Science Foundation ranked the university 36th among academic institutions in the United States for research and development (R&D) expenditures.[28][29] In 1995 Emory University was elected to the Association of American Universities, an association of the 65 leading research universities in the United States and Canada.[5]
Emory faculty and alumni include 2 Prime Ministers, 9 university presidents, 11 members of the United States Congress, 2 Nobel Peace Prize laureates, a Vice President of the United States, a United States Speaker of the House, and a United States Supreme Court Justice. Other notable alumni include 21 Rhodes Scholars and 6 Pulitzer Prize winners, as well as Emmy Award winners, MacArthur Fellows, CEOs of Fortune 500 companies, heads of state and other leaders in foreign government.[30] Emory has more than 149,000 alumni, with 75 alumni clubs established worldwide in 20 countries.[31][32][33]
# of word tokens: 305
# of word types: 180

the 18
and 15
of 12
Emory 11
in 10
University 7
is 7
university 6
United 6
research 5

private 1
Atlanta, 1
Georgia. 1
Founded 1
1836 1
College 1
by 1
Episcopal 1
Church 1
named 1

from collections import Counter
def count_words(corpus: str) -> Counter:
    fin = open(corpus)
    words = fin.read().split()
    return Counter(words)

corpus = 'dat/emory-wiki.txt'
counts = count_words(corpus)
n_tokens = sum(counts.values())
n_types = len(counts)
print(f'# of word tokens: {n_tokens}')
print(f'# of word types: {n_types}')

des = sorted(counts.items(), key=lambda x: x[1], reverse=True)
asc = sorted(counts.items(), key=lambda x: x[1])
for word, count in des[:10]: print(word, count)
for word, count in asc[:10]: print(word, count)

def save_output(counts: Counter, outfile: str):
    fout = open(outfile, 'w')
    for word in sorted(counts.keys()):
        fout.write(f'{word}\n')
    fout.close()

save_output(counts, 'dat/word_types.txt')

import re
re_mr = re.compile(r'M[rs]\.')
m = re_mr.match('Mr. Wayne')
print(m)
if m: print(m.start(), m.end())

<re.Match object; span=(0, 3), match='Mr.'>
0 3

print(m.groups())

()

re_mr = re.compile(r'(M[rs])(\.)')
m = re_mr.match('Ms.')
print(m)
print(m.group())
print(m.groups())
print(m.group(0), m.group(1), m.group(2))

<re.Match object; span=(0, 3), match='Ms.'>
Ms.
('Ms', '.')
Ms. Ms .

m = re_mr.match('Mrs.')
print(m)

None

s1 = 'Mr. and Ms. Wayne are here'
s2 = 'Here are Mr. and Ms. Wayne'
print(re_mr.match(s1))
print(re_mr.match(s2))

<re.Match object; span=(0, 3), match='Mr.'>
None

print(re_mr.search(s1))
print(re_mr.search(s2))

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(9, 12), match='Mr.'>

print(re_mr.findall(s1))
print(re_mr.findall(s2))

[('Mr', '.'), ('Ms', '.')]
[('Mr', '.'), ('Ms', '.')]

ms = re_mr.finditer(s1)
for m in ms: print(m)
ms = re_mr.finditer(s2)
for m in ms: print(m)

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(8, 11), match='Ms.'>
<re.Match object; span=(9, 12), match='Mr.'>
<re.Match object; span=(17, 20), match='Ms.'>

print(re_mr.sub('Dr.', 'I met Mr. Wayne and Ms. Kyle.'))

I met Dr. Wayne and Dr. Kyle.

text = 'Mr. Wayne isn\'t the hero we need, but "the one" we deserve.'
print(tokenize(text))
text = 'Ms. Wayne is "Batgirl" but not "the one".'
print(tokenize(text))

['Ms.', 'Wayne', 'is', '"', 'Batgirl', '"', 'but', 'not', '"', 'the', 'one', '"']
['Ms.', 'Wayne', 'is', '"', 'Batgirl', '"', 'but', 'not', '"', 'the', 'one', '"', '.']

def tokenize(text: str) -> list[str]:
    re_tok = re.compile(r'([",.]|\s+|n\'t)')
    tokens, prev_idx = [], 0
    for m in re_tok.finditer(text):
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        t = m.group().strip()
        if t:
            if tokens and tokens[-1] in {'Mr', 'Ms'} and t == '.':
                tokens[-1] = tokens[-1] + t
            else:
                tokens.append(t)
        prev_idx = m.end()
    t = text[prev_idx:]
    if t: tokens.append(t)
    return tokens
import json
import os
from types import SimpleNamespace

def get_lexica(res_dir: str) -> SimpleNamespace:
    with open(os.path.join(res_dir, 'nouns.txt')) as fin: nouns = {noun.strip() for noun in fin}
    with open(os.path.join(res_dir, 'verbs.txt')) as fin: verbs = {verb.strip() for verb in fin}
    with open(os.path.join(res_dir, 'nouns_irregular.json')) as fin: nouns_irregular = json.load(fin)
    with open(os.path.join(res_dir, 'verbs_irregular.json')) as fin: verbs_irregular = json.load(fin)
    with open(os.path.join(res_dir, 'nouns_rules.json')) as fin: nouns_rules = json.load(fin)
    with open(os.path.join(res_dir, 'verbs_rules.json')) as fin: verbs_rules = json.load(fin)
    return SimpleNamespace(
        nouns=nouns,
        verbs=verbs,
        nouns_irregular=nouns_irregular,
        verbs_irregular=verbs_irregular,
        nouns_rules=nouns_rules,
        verbs_rules=verbs_rules
    )

print(len(lexica.nouns))
print(len(lexica.verbs))
print(lexica.nouns_irregular)
print(lexica.verbs_irregular)
print(lexica.nouns_rules)
print(lexica.verbs_rules)

91
27
{'children': 'child', 'crises': 'crisis', 'mice': 'mouse'}
{'is': 'be', 'was': 'be', 'has': 'have', 'had': 'have', 'bought': 'buy'}
[['ies', 'y'], ['es', ''], ['s', ''], ['men', 'man'], ['ae', 'a'], ['i', 'us']]
[['ies', 'y'], ['ied', 'y'], ['es', ''], ['ed', ''], ['s', ''], ['d', ''], ['ying', 'ie'], ['ing', ''], ['ing', 'e'], ['n', ''], ['ung', 'ing']]

def lemmatize(lexica: SimpleNamespace, word: str) -> str:
    def aux(word: str, vocabs: dict[str, str], irregular: dict[str, str], rules: list[tuple[str, str]]):
        lemma = irregular.get(word, None)
        if lemma is not None: return lemma
        for p, s in rules:
            lemma = word[:-len(p)] + s
            if lemma in vocabs: return lemma
        return None

    word = word.lower()
    lemma = aux(word, lexica.verbs, lexica.verbs_irregular, lexica.verbs_rules)
    if lemma is None:
        lemma = aux(word, lexica.nouns, lexica.nouns_irregular, lexica.nouns_rules)
    return lemma if lemma else word

nouns = ['studies', 'crosses', 'areas', 'gentlemen', 'vertebrae', 'alumni', 'children', 'crises']
nouns_lemmatized = [lemmatize(lexica, word) for word in nouns]
for word, lemma in zip(nouns, nouns_lemmatized): print('{} -> {}'.format(word, lemma))
verbs = ['applies', 'cried', 'pushes', 'entered', 'takes', 'heard', 'lying', 'studying', 'taking', 'drawn', 'clung', 'was', 'bought']
verbs_lemmatized = [lemmatize(lexica, word) for word in verbs]
for word, lemma in zip(verbs, verbs_lemmatized): print('{} -> {}'.format(word, lemma))

studies -> study
crosses -> cross
areas -> area
gentlemen -> gentleman
vertebrae -> vertebra
alumni -> alumnus
children -> child
crises -> crisis

applies -> apply
cried -> cry
pushes -> push
entered -> enter
takes -> take
heard -> hear
lying -> lie
studying -> study
taking -> take
drawn -> draw
clung -> cling
was -> be
bought -> buy

from collections import Counter
from src.tokenization import tokenize
corpus = 'dat/emory-wiki.txt'
delims = {'"', "'", '(', ')', '[', ']', ':', '-', ',', '.'}
words = [lemmatize(lexica, word) for word in tokenize(corpus, delims)]
counts = Counter(words)
print(f'# of word tokens: {len(words)}')
print(f'# of word types: {len(counts)}')
output = 'dat/word_types-token-lemma.txt'
with open(output, 'w') as fout:
    for key in sorted(counts.keys()): fout.write(f'{key}\n')

# of word tokens: 363
# of word types: 177

prompt = "Translate the following English text to Spanish: Hello, how are you?"

prompt = """Classify the sentiment of the following review as positive, negative, or neutral:
Review: "The product arrived late, but the quality exceeded my expectations."
Sentiment:"""prompt = """Classify the sentiment of movie reviews.
Review: "An absolute masterpiece! Every scene was captivating."
Sentiment: Positive
Review: "Terrible pacing and poor acting throughout."
Sentiment: Negative
Review: "It was fine. Nothing special but not bad either."
Sentiment: Neutral
Review: "The cinematography was stunning, though the plot felt rushed."
Sentiment:"""from anthropic import Anthropic
def sentiment_classifier(review: str) -> str:
    client = Anthropic(api_key="your-api-key")
    prompt = f"""Classify the sentiment of movie reviews.
Review: "An absolute masterpiece! Every scene was captivating."
Sentiment: Positive
Review: "Terrible pacing and poor acting throughout."
Sentiment: Negative
Review: "It was fine. Nothing special but not bad either."
Sentiment: Neutral
Review: "{review}"
Sentiment:"""
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=50,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text.strip()

# Test the classifier
review = "The cinematography was stunning, though the plot felt rushed."
result = sentiment_classifier(review)
print(f"Sentiment: {result}")

"R1: -> ['"', 'R1', ':']
(R&D) -> ['(', 'R&D', ')']
15th-largest -> ['15th', '-', 'largest']
Atlanta, -> ['Atlanta', ',']
Department's -> ['Department', "'", 's']
activity"[26] -> ['activity', '"', '[', '26', ']']
centers.[21][22] -> ['centers', '.', '[', '21', ']', '[', '22', ']']
149,000 -> ['149', ',', '000']
U.S. -> ['U', '.', 'S', '.']

def delimit(word: str, delimiters: set[str]) -> list[str]:
    i = next((i for i, c in enumerate(word) if c in delimiters), -1)
    if i < 0: return [word]
    tokens = []
    if i > 0: tokens.append(word[:i])
    tokens.append(word[i])
    if i + 1 < len(word):
        tokens.extend(delimit(word[i + 1:], delimiters))
    return tokens

delims = {'"', "'", '(', ')', '[', ']', ':', '-', ',', '.'}
input = [
'"R1:',
'(R&D)',
'15th-largest',
'Atlanta,',
"Department's",
'activity"[26]',
'centers.[21][22]',
'149,000',
'U.S.'
]
output = [delimit(word, delims) for word in input]
for word, tokens in zip(input, output):
    print('{:<16} -> {}'.format(word, tokens))

def postprocess(tokens: list[str]) -> list[str]:
    i, new_tokens = 0, []
    while i < len(tokens):
        if i + 1 < len(tokens) and tokens[i] == "'" and tokens[i + 1].lower() == 's':
            new_tokens.append(''.join(tokens[i:i + 2]))
            i += 1
        elif i + 2 < len(tokens) and \
                ((tokens[i] == '[' and tokens[i + 1].isnumeric() and tokens[i + 2] == ']') or
                 (tokens[i].isnumeric() and tokens[i + 1] == ',' and tokens[i + 2].isnumeric())):
            new_tokens.append(''.join(tokens[i:i + 3]))
            i += 2
        elif i + 3 < len(tokens) and ''.join(tokens[i:i + 4]) == 'U.S.':
            new_tokens.append(''.join(tokens[i:i + 4]))
            i += 3
        else:
            new_tokens.append(tokens[i])
        i += 1
    return new_tokens

output = [postprocess(delimit(word, delims)) for word in input]
for word, tokens in zip(input, output):
    print('{:<16} -> {}'.format(word, tokens))

"R1: -> ['"', 'R1', ':']
(R&D) -> ['(', 'R&D', ')']
15th-largest -> ['15th', '-', 'largest']
Atlanta, -> ['Atlanta', ',']
Department's -> ['Department', "'s"]
activity"[26] -> ['activity', '"', '[26]']
centers.[21][22] -> ['centers', '.', '[21]', '[22]']
149,000 -> ['149,000']
U.S. -> ['U.S.']

def tokenize(corpus: str, delimiters: set[str]) -> list[str]:
    with open(corpus) as fin:
        words = fin.read().split()
    return [token for word in words for token in postprocess(delimit(word, delimiters))]

from collections import Counter
from src.frequency_analysis import save_output
corpus = 'dat/emory-wiki.txt'
output = 'dat/word_types-token.txt'
words = tokenize(corpus, delims)
counts = Counter(words)
print(f'# of word tokens: {len(words)}')
print(f'# of word types: {len(counts)}')
save_output(counts, output)

# of word tokens: 363
# of word types: 197

from src.ngram_models import unigram_count, Unigram
UNKNOWN = ''
def unigram_smoothing(filepath: str) -> Unigram:
    counts = unigram_count(filepath)
    total = sum(counts.values()) + len(counts)
    unigrams = {word: (count + 1) / total for word, count in counts.items()}
    unigrams[UNKNOWN] = 1 / total
    return unigrams

from src.ngram_models import test_unigram
corpus = 'dat/chronicles_of_narnia.txt'
test_unigram(corpus, unigram_smoothing)

         I 0.010225
Aslan 0.001796
Lucy 0.001762
Edmund 0.001369
Narnia 0.001339
Caspian 0.001300
Jill 0.001226
Peter 0.001005
Shasta 0.000902
Digory 0.000899
Eustace 0.000853
Susan 0.000636
Tirian 0.000585
Polly 0.000533
Aravis 0.000523
Bree 0.000479
Puddleglum 0.000479
Scrubb 0.000469
    Andrew 0.000396

   Unigram  With Smoothing  W/O Smoothing
I 0.010225 0.010543
Aslan 0.001796 0.001850
Lucy 0.001762 0.001815
Edmund 0.001369 0.001409
Narnia 0.001339 0.001379
Caspian 0.001300 0.001338
Jill 0.001226 0.001262
Peter 0.001005 0.001034
Shasta 0.000902 0.000928
Digory 0.000899 0.000925
Eustace 0.000853 0.000877
Susan 0.000636 0.000654
Tirian 0.000585 0.000601
Polly 0.000533 0.000547
Aravis 0.000523 0.000537
Bree 0.000479 0.000492
Puddleglum 0.000479 0.000492
Scrubb 0.000469 0.000482
    Andrew 0.000396 0.000406

def smoothed_unigram(probs: Unigram, word: str) -> float:
    return probs.get(word, probs[UNKNOWN])

unigram = unigram_smoothing(corpus)
for word in ['Aslan', 'Jinho']:
    print(f'{word} {smoothed_unigram(unigram, word):.6f}')

Aslan 0.001796
Jinho 0.000002

from src.ngram_models import bigram_count, Bigram
def bigram_smoothing(filepath: str) -> Bigram:
    counts = bigram_count(filepath)
    vocab = set(counts.keys())
    for _, css in counts.items():
        vocab.update(css.keys())
    bigrams = dict()
    for prev, ccs in counts.items():
        total = sum(ccs.values()) + len(vocab)
        d = {curr: (count + 1) / total for curr, count in ccs.items()}
        d[UNKNOWN] = 1 / total
        bigrams[prev] = d
    bigrams[UNKNOWN] = 1 / len(vocab)
    return bigrams

from src.ngram_models import test_bigram
corpus = 'dat/chronicles_of_narnia.txt'
test_bigram(corpus, bigram_smoothing)

I
'm 0.020590
do 0.019136
've 0.011143
was 0.010598
have 0.009629
am 0.009084
'll 0.008236
think 0.008115
'd 0.006661
know 0.006540
the
same 0.008403
other 0.007591
King 0.007096
Witch 0.006673
whole 0.005119
others 0.005084
first 0.004978
Dwarf 0.004872
door 0.004837
great 0.004837
said
the 0.039038
, 0.018270
Lucy 0.014312
Edmund 0.011206
Caspian 0.010049
Peter 0.009805
Jill 0.008709
. 0.008648
Digory 0.007734
     Aslan 0.007491

def smoothed_bigram(probs: Bigram, prev: str, curr: str) -> float:
    d = probs.get(prev, None)
    return probs[UNKNOWN] if d is None else d.get(curr, d[UNKNOWN])

bigram = bigram_smoothing(corpus)
for word in [('Aslan', 'is'), ('Aslan', 'Jinho'), ('Jinho', 'is')]:
    print(f'{word} {smoothed_bigram(bigram, *word):.6f}')

('Aslan', 'is') 0.001146
('Aslan', 'Jinho') 0.000076
('Jinho', 'is') 0.000081

You are a student
You and I are students
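Both sentences can be scored with the models above by chaining bigram probabilities along the sequence. A minimal sketch; scoring the first word with the smoothed unigram model is an assumption about how the sentence-initial position is handled:

import math

def sequence_probability(sentence: str) -> float:
    # P(w1 .. wn) ~ P(w1) * prod_i P(wi | wi-1), with Laplace-smoothed estimates.
    words = sentence.split()
    logp = math.log(smoothed_unigram(unigram, words[0]))   # first word: assumption
    for prev, curr in zip(words, words[1:]):
        logp += math.log(smoothed_bigram(bigram, prev, curr))
    return math.exp(logp)

for s in ['You are a student', 'You and I are students']:
    print(f'{s}: {sequence_probability(s):.2e}')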
HW3: Vector Space Models
import os
from dotenv import load_dotenv
from openai import OpenAI
from openai.types.chat import ChatCompletion
load_dotenv()
def test_llm_api() -> ChatCompletion:
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "user", "content": "Say 'Hello World'."}
        ]
    )
    return response

if __name__ == "__main__":
    r = test_llm_api()
    print(r.choices[0].message.content)
    print(r)

pip install python-dotenv

OPENAI_API_KEY=your-api-key-here

.idea/
.venv/
.env

from dotenv import load_dotenv
load_dotenv()

pip install openai

Hello World

ChatCompletion(
id='chatcmpl-CfaRxgTbF2CfDMo63LJwICeQBjODl',
object='chat.completion',
created=1764027597,
model='gpt-5-nano-2025-08-07',
service_tier='default',
system_fingerprint=None,
choices=[
Choice(
index=0,
finish_reason='stop',
logprobs=None,
message=ChatCompletionMessage(
content='Hello World',
role='assistant',
refusal=None,
annotations=[],
audio=None,
tool_calls=None))],
usage=CompletionUsage(
prompt_tokens=11,
completion_tokens=203,
total_tokens=214,
prompt_tokens_details=PromptTokensDetails(
audio_tokens=0,
cached_tokens=0),
completion_tokens_details=CompletionTokensDetails(
accepted_prediction_tokens=0,
audio_tokens=0,
reasoning_tokens=192,
rejected_prediction_tokens=0)))

def multi_turn_interactions_0(client: OpenAI) -> ChatCompletion:
    return client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": "You are a calculator."},
            {"role": "user", "content": "What is (2 + 3) * 4?"}
        ]
    )

20

def multi_turn_interactions_1(client: OpenAI) -> ChatCompletion:
    return client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": "You are a calculator."},
            {"role": "user", "content": "What is (2 + 3) * 4?"},
            {"role": "assistant", "content": "20"},
            {"role": "user", "content": "Can you show me the full derivation?"}
        ]
    )

Sure. Here are two clear derivations.
1) Standard order of operations (parentheses first, then multiplication)
- (2 + 3) * 4
- Inside parentheses: 2 + 3 = 5
- Then: 5 * 4 = 20
- Result: 20
2) Using the distributive property (optional check)
- (2 + 3) * 4 = 2*4 + 3*4
- Compute: 2*4 = 8, and 3*4 = 12
- Sum: 8 + 12 = 20
- Result: 20

m1 = [
{"role": "user", "content": "What is (2 + 3) * 4?"}
]
m2 = [
{"role": "user", "content": "You are a calculator. What is (2 + 3) * 4?"}
]
m3 = [
{"role": "system", "content": "You are a calculator."},
{"role": "user", "content": "What is (2 + 3) * 4?"}
]model="gpt-4" # or "claude-sonnet-4-5-20250929", "gemini-pro", etc.max_tokens=500 # Limit response to approximately 500 tokens# For factual tasks
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "What is 2+2?"}],
    temperature=0.0
)

# For creative tasks
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Write a creative story opening."}],
    temperature=0.9
)

top_p=0.9  # Consider tokens that make up 90% of probability mass

frequency_penalty=0.5  # Moderate penalty for repeated tokens

presence_penalty=0.6  # Encourage discussion of new topics

stop=["END", "\n\n"]  # Stop at "END" or double newline

from openai import OpenAI
def analyze_sentiment(text: str, verbose: bool = False) -> str:
    """
    Analyzes the sentiment of given text using GPT-4.

    Args:
        text: The text to analyze
        verbose: If True, includes confidence level in output

    Returns:
        Sentiment classification (Positive/Negative/Neutral)
    """
    client = OpenAI(api_key="your-api-key")

    system_message = """You are a sentiment analysis expert.
    Classify the sentiment as Positive, Negative, or Neutral."""
    if verbose:
        system_message += " Include a confidence level (low/medium/high)."

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": f"Analyze this text: {text}"}
        ],
        max_tokens=100,
        temperature=0.2,  # Low temperature for consistent classification
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=None
    )
    return response.choices[0].message.content.strip()

# Test the function
review = "This product exceeded my expectations, though shipping was slow."
result = analyze_sentiment(review, verbose=True)
print(f"Sentiment: {result}")

response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello!"}]
)
# Access the generated text
content = response.choices[0].message.content
# Access usage information
prompt_tokens = response.usage.prompt_tokens
completion_tokens = response.usage.completion_tokens
total_tokens = response.usage.total_tokens
print(f"Generated text: {content}")
print(f"Tokens used - Prompt: {prompt_tokens}, Completion: {completion_tokens}"){
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": "gpt-4",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "Hello! How can I help you today?"
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}

from anthropic import Anthropic
client = Anthropic(api_key="your-api-key")
response = client.messages.create(
model="claude-sonnet-4-5-20250929",
max_tokens=1024, # Required for Anthropic
messages=[
{"role": "user", "content": "Hello, Claude!"}
]
)
print(response.content[0].text)

import google.generativeai as genai
genai.configure(api_key="your-api-key")
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content("Hello, Gemini!")
print(response.text)

from openai import OpenAI, OpenAIError
import time
def make_api_call_with_retry(client, messages, max_retries=3):
    """Makes an API call with retry logic for handling rate limits."""
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=messages,
                max_tokens=500,
                temperature=0.7
            )
            return response.choices[0].message.content
        except OpenAIError as e:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Error occurred: {e}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {max_retries} attempts: {e}")
                raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

# Usage
client = OpenAI(api_key="your-api-key")
messages = [{"role": "user", "content": "Explain error handling."}]
result = make_api_call_with_retry(client, messages)

# Estimate costs before making requests
def estimate_cost(prompt: str, max_tokens: int, model: str) -> float:
    """Rough cost estimation (prices vary by provider)."""
    # Example rates (not actual current rates - check provider docs)
    rates = {
        "gpt-4": {"input": 0.03, "output": 0.06},  # per 1K tokens
        "gpt-3.5-turbo": {"input": 0.001, "output": 0.002}
    }
    # Rough token estimation (1 token ≈ 4 characters)
    input_tokens = len(prompt) / 4
    rate = rates.get(model, rates["gpt-4"])
    estimated_cost = (
        (input_tokens / 1000) * rate["input"] +
        (max_tokens / 1000) * rate["output"]
    )
    return estimated_cost
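A quick sanity check with the helper above; the figure only reflects the example rates hard-coded in the function:

prompt = "Summarize the plot of The Lion, the Witch and the Wardrobe in one paragraph."
print(f"Estimated cost: ${estimate_cost(prompt, 300, 'gpt-4'):.4f}")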
client = OpenAI(api_key="your-api-key")
prompt = "Write a creative story about a robot learning to paint."
# Try different configurations
configs = [
{"temp": 0.2, "desc": "Low temperature (focused)"},
{"temp": 0.7, "desc": "Medium temperature (balanced)"},
{"temp": 1.5, "desc": "High temperature (creative)"}
]
for config in configs:
    print(f"\n{config['desc']}:")
    print("-" * 50)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
        temperature=config["temp"]
    )
    print(response.choices[0].message.content)

D1 = ['John', 'bought', 'a', 'book', '.', 'The', 'book', 'was', 'funny', '.']
D2 = ['Mary', 'liked', 'the', 'book', '.', 'John', 'gave', 'it', 'to', 'Mary', '.']

W = [
'.', # 0
'John', # 1
'Mary', # 2
'The', # 3
'a', # 4
'book', # 5
'bought', # 6
'funny', # 7
'gave', # 8
'it', # 9
'liked', # 10
'the', # 11
'to', # 12
'was' # 13
]
#     0  1  2  3  4  5  6  7  8  9 10 11 12 13
v1 = [2, 1, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1]
v2 = [2, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0]

v1 = {0:2, 1:1, 3:1, 4:1, 5:2, 6:1, 7:1, 13:1}
v2 = {0:2, 1:1, 2:2, 5:1, 8:1, 9:1, 10:1, 11:1, 12:1}

from typing import TypeAlias
Document: TypeAlias = list[str]
Vocab: TypeAlias = dict[str, int]
def vocabulary(documents: list[Document]) -> Vocab:
    vocab = set()
    for document in documents:
        vocab.update(document)
    return {word: i for i, word in enumerate(sorted(list(vocab)))}

from collections import Counter
SparseVector: TypeAlias = dict[int, int | float]
def bag_of_words(vocab: Vocab, document: Document) -> SparseVector:
    counts = Counter(document)
    return {vocab[word]: count for word, count in sorted(counts.items()) if word in vocab}

documents = [
['John', 'bought', 'a', 'book', '.', 'The', 'book', 'was', 'funny', '.'],
['Mary', 'liked', 'the', 'book', '.', 'John', 'gave', 'it', 'to', 'Mary', '.']
]
vocab = vocabulary(documents)
print(vocab)
print(bag_of_words(vocab, documents[0]))
print(bag_of_words(vocab, documents[1]))

{
'.': 0,
'John': 1,
'Mary': 2,
'The': 3,
'a': 4,
'book': 5,
'bought': 6,
'funny': 7,
'gave': 8,
'it': 9,
'liked': 10,
'the': 11,
'to': 12,
'was': 13
}
{0: 2, 1: 1, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 13: 1}
{0: 2, 1: 1, 2: 2, 5: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1}

[Label]\t[Document]

from src.bag_of_words_model import vocabulary
from src.term_weighing import read_corpus, document_frequencies, tf_idf
if __name__ == '__main__':
    corpus = read_corpus('dat/chronicles_of_narnia.txt')
    vocab = vocabulary(corpus)
    dfs = document_frequencies(vocab, corpus)
    D = len(corpus)

    documents = [
        'I like this movie very much'.split(),
        'I hate this movie very much'.split(),
        'I love this movie so much'.split()
    ]

    vs = [tf_idf(vocab, dfs, D, document) for document in documents]
    for v in vs: print(v)

{980: 0.31, 7363: 0.52, 7920: 0.70, 11168: 0.51, 11833: 0.51}
{980: 0.31, 6423: 1.24, 7920: 0.70, 10325:

print(euclidean_distance(vs[0], vs[0]))
print(euclidean_distance(vs[0], vs[1]))
print(euclidean_distance(vs[0], vs[2]))

0.0
1.347450458032576
1.3756015678855296

print(cosine_similarity(vs[0], vs[0]))
print(cosine_similarity(vs[0], vs[1]))
print(cosine_similarity(vs[0], vs[2]))

0.9999999999999999
0.5775130451716284
0.4826178600593854

import math
from src.bag_of_words_model import SparseVector
def euclidean_distance(v1: SparseVector, v2: SparseVector) -> float:
    d = sum((v - v2.get(k, 0)) ** 2 for k, v in v1.items())
    d += sum(v ** 2 for k, v in v2.items() if k not in v1)
    return math.sqrt(d)

def cosine_similarity(v1: SparseVector, v2: SparseVector) -> float:
    n = sum(v * v2.get(k, 0) for k, v in v1.items())
    d = math.sqrt(sum(v ** 2 for k, v in v1.items()))
    d *= math.sqrt(sum(v ** 2 for k, v in v2.items()))
    return n / d

from elit_tokenizer import EnglishTokenizer
ds = [
"As dawn broke, the first light kissed the golden mane of Aslan, the rightful king of Narnia.",
"The White Witch's icy breath froze the once lush meadows, casting a shadow over Narnia.",
"Lucy's footsteps echoed in the halls of Cair Paravel, where legends were born."
]
etok = EnglishTokenizer()
documents = [etok.decode(d).tokens for d in ds]
print_tfs(vocab, documents)

[('the', 3), (',', 2), ('of', 2), ('.', 1), ('As', 1), ('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('first', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
[("'s", 1), (',', 1), ('.', 1), ('Narnia', 1), ('The', 1), ('White', 1), ('Witch', 1), ('a', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('once', 1), ('over', 1), ('shadow', 1), ('the', 1)]
[("'s", 1), (',', 1), ('.', 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('in', 1), ('legends', 1), ('of', 1), ('the', 1), ('were', 1), ('where', 1)]print_tfs(vocab, documents)[('Aslan', 1), ('Narnia', 1), ('broke', 1), ('dawn', 1), ('golden', 1), ('king', 1), ('kissed', 1), ('light', 1), ('mane', 1), ('rightful', 1)]
[("'s", 1), ('Narnia', 1), ('White', 1), ('Witch', 1), ('breath', 1), ('casting', 1), ('froze', 1), ('icy', 1), ('shadow', 1)]
[("'s", 1), ('Cair', 1), ('Lucy', 1), ('Paravel', 1), ('born', 1), ('echoed', 1), ('footsteps', 1), ('halls', 1), ('legends', 1)]corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)
words = [word for word, _ in sorted(vocab.items(), key=lambda x: x[1])]
dfs = document_frequencies(vocab, corpus)
for document in documents:
    bow = bag_of_words(vocab, document)
    tf_df = [(words[tid], tf, dfs[tid]) for tid, tf in sorted(bow.items())]
    tf_df = sorted(tf_df, key=lambda x: (-x[1], x[2]))
    print(' '.join(document))
    print('\n'.join(['{:>10} {} {:>5}'.format(*t) for t in tf_df]))

As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
the 3 9574
of 2 5355
, 2 10578
rightful 1 1
dawn 1 6
kissed 1 26
broke 1 35
king 1 40
mane 1 40
golden 1 52
As 1 161
light 1 203
first 1 401
Narnia 1 512
Aslan 1 706
. 1 19747
The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
casting 1 1
froze 1 2
icy 1 3
shadow 1 38
White 1 44
breath 1 86
Witch 1 246
once 1 378
over 1 431
Narnia 1 512
The 1 1352
's 1 2404
a 1 5456
the 1 9574
, 1 10578
. 1 19747
Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
footsteps 1 1
legends 1 1
echoed 1 2
halls 1 4
born 1 14
Paravel 1 84
Cair 1 86
where 1 360
Lucy 1 704
were 1 1684
's 1 2404
in 1 3513
of 1 5355
the 1 9574
, 1 10578
         . 1 19747

for document in documents:
    tfidf = tf_idf(vocab, dfs, len(corpus), document)
    print(' '.join(document))
    print('\n'.join(['{:>10} {:.2f}'.format(words[tid], score) for tid, score in sorted(tfidf.items(), key=lambda x: x[1], reverse=True)]))

As dawn broke , the first light kissed the golden mane of Aslan , the rightful king of Narnia .
rightful 0.50
dawn 0.41
kissed 0.34
broke 0.32
king 0.32
mane 0.32
golden 0.30
As 0.25
light 0.24
first 0.20
Narnia 0.19
Aslan 0.17
of 0.14
the 0.13
, 0.08
. 0.01
The White Witch 's icy breath froze the once lush meadows , casting a shadow over Narnia .
casting 0.56
froze 0.52
icy 0.50
shadow 0.35
White 0.35
breath 0.31
Witch 0.25
once 0.23
over 0.22
Narnia 0.21
The 0.16
's 0.12
a 0.08
the 0.05
, 0.04
. 0.01
Lucy 's footsteps echoed in the halls of Cair Paravel , where legends were born .
footsteps 0.63
legends 0.63
echoed 0.58
halls 0.54
born 0.46
Paravel 0.35
Cair 0.35
where 0.26
Lucy 0.22
were 0.16
's 0.14
in 0.12
of 0.09
the 0.05
, 0.05
         . 0.01

from typing import Callable
def read_corpus(filename: str, tokenizer: Callable[[str], list[str]] | None = None) -> list[Document]:
    fin = open(filename)
    if tokenizer is None: tokenizer = lambda s: s.split()
    return [tokenizer(line) for line in fin]

from src.bag_of_words_model import vocabulary
corpus = read_corpus('dat/chronicles_of_narnia.txt')
vocab = vocabulary(corpus)

from src.bag_of_words_model import bag_of_words, Document, Vocab
def print_tfs(vocab: Vocab, documents: list[Document]):
    tfs = [bag_of_words(vocab, document) for document in documents]
    words = [word for word, _ in sorted(vocab.items(), key=lambda x: x[1])]
    for tf in tfs:
        print([(words[index], count) for index, count in sorted(tf.items(), key=lambda x: x[1], reverse=True)])

from string import punctuation
stopwords = {line.strip().lower() for line in open('dat/stopwords.txt')}
is_stopwords = lambda w: w.lower() in stopwords or w in punctuation

sw_tokenizer = lambda s: [word for word in s.split() if not is_stopwords(word)]
corpus = read_corpus('dat/chronicles_of_narnia.txt', sw_tokenizer)
vocab = vocabulary(corpus)

from collections import Counter
from src.types import SparseVector
def document_frequencies(vocab: Vocab, corpus: list[Document]) -> SparseVector:
    counts = Counter()
    for document in corpus:
        counts.update(set(document))
    return {vocab[word]: count for word, count in sorted(counts.items()) if word in vocab}

def tf_idf(vocab: Vocab, dfs: SparseVector, D: int, document: Document) -> SparseVector:
    tf = lambda count: count / len(document)
    idf = lambda tid: math.log(D / dfs[tid])
    return {tid: tf(count) * idf(tid) for tid, count in bag_of_words(vocab, document).items()}

V = [
'king', # 0
'man', # 1
'woman', # 2
'queen' # 3
]

king = [1, 0, 0, 0]
man = [0, 1, 0, 0]
woman = [0, 0, 1, 0]
queen = [0, 0, 0, 1]
king = [0.5, 0.0, 0.5, 0.0]
man = [0.0, 0.5, 0.5, 0.0]
woman = [0.0, 0.5, 0.0, 0.5]

queen = king - man + woman
      = [0.5, 0.0, 0.0, 0.5]
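The analogy can be checked directly with a few lines of numpy; the vector values below are the toy numbers from this example, not learned embeddings:

import numpy as np

king  = np.array([0.5, 0.0, 0.5, 0.0])
man   = np.array([0.0, 0.5, 0.5, 0.0])
woman = np.array([0.0, 0.5, 0.0, 0.5])

queen = king - man + woman
print(queen)  # [0.5 0.  0.  0.5]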
from src.bag_of_words_model import Document
import glob, os
def collect(dirpath: str) -> dict[int, list[Document]]:
    books = dict()
    for filename in glob.glob(os.path.join(dirpath, '*.txt')):
        t = os.path.basename(filename).split('_')
        book_id = int(t[0])
        fin = open(filename)
        books.setdefault(book_id, list()).append(fin.read().split())
    return books

def join_documents(dataset: dict[int, list[Document]]) -> list[Document]:
    return [document for documents in dataset.values() for document in documents]

trn = collect('dat/document_classification/trn')
dev = collect('dat/document_classification/dev')
tst = collect('dat/document_classification/tst')
print(len(join_documents(trn)), len(join_documents(dev)), len(join_documents(tst)))

82 14 14

corpus = join_documents(trn)
vocab = vocabulary(join_documents(trn))
dfs = document_frequencies(vocab, corpus)
D = len(corpus)

def vectorize(vocab: Vocab, dfs: SparseVector, D: int, docset: dict[int, list[Document]]) -> list[tuple[int, SparseVector]]:
    vs = []
    for book_id, documents in docset.items():
        for document in documents:
            vs.append((book_id, tf_idf(vocab, dfs, D, document)))
    return vs

trn_vs = vectorize(vocab, dfs, D, trn)
dev_vs = vectorize(vocab, dfs, D, dev)
tst_vs = vectorize(vocab, dfs, D, tst)

def knn(trn_vs: list[tuple[int, SparseVector]], v: SparseVector, k: int = 1) -> tuple[int, float]:
    sims = [(book_id, cosine_similarity(v, t)) for book_id, t in trn_vs]
    sims.sort(key=lambda x: x[1], reverse=True)
    return Counter(sims[:k]).most_common(1)[0][0]

correct = 0
for g_book_id, document in dev_vs:
    p_book_id, p_score = knn(trn_vs, document)
    if g_book_id == p_book_id: correct += 1
    print('Gold: {}, Auto: {}, Score: {:.2f}'.format(g_book_id, p_book_id, p_score))
print('Accuracy: {} ({}/{})'.format(100 * correct / len(dev_vs), correct, len(dev_vs)))

Gold: 1, Auto: 1, Score: 0.49
Gold: 1, Auto: 1, Score: 0.27
Gold: 3, Auto: 3, Score: 0.36
Gold: 3, Auto: 3, Score: 0.32
Gold: 5, Auto: 5, Score: 0.29
Gold: 5, Auto: 5, Score: 0.54
Gold: 0, Auto: 0, Score: 0.32
Gold: 0, Auto: 0, Score: 0.26
Gold: 6, Auto: 6, Score: 0.48
Gold: 6, Auto: 6, Score: 0.49
Gold: 2, Auto: 2, Score: 0.37
Gold: 2, Auto: 2, Score: 0.31
Gold: 4, Auto: 4, Score: 0.56
Gold: 4, Auto: 4, Score: 0.60
Accuracy: 100.0 (14/14)
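The held-out test split can be scored the same way; a minimal sketch that simply reruns the evaluation loop above on tst_vs:

# Same evaluation loop, applied to the test vectors.
correct = 0
for g_book_id, document in tst_vs:
    p_book_id, p_score = knn(trn_vs, document)
    if g_book_id == p_book_id: correct += 1
print('Accuracy: {} ({}/{})'.format(100 * correct / len(tst_vs), correct, len(tst_vs)))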
V = {0: "I", 1: "love", 2: "hate", 3: "this", 4: "movie"}

x1 = [1, 1, 0, 1, 1]
x2 = [1, 0, 1, 1, 1]

w = [0.0, 1.5, -1.5, 0.0, 0.0]
b = 0
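Scoring a document is then a dot product plus the bias; a minimal sketch, assuming the usual linear decision rule where a positive score maps to the positive class:

# Linear score: w . x + b; the sign is assumed to decide the sentiment.
score = lambda x: sum(wi * xi for wi, xi in zip(w, x)) + b
print(score(x1))  #  1.5 -> positive ("I love this movie")
print(score(x2))  # -1.5 -> negative ("I hate this movie")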
V = {0: "I", 1: "love", 2: "hate", 3: "this", 4: "movie", 5: "watched"}
x1 = [1, 1, 0, 1, 1, 0]
x2 = [1, 0, 1, 1, 1, 0]
x3 = [1, 0, 0, 1, 1, 1]

w1 = [0.0, 1.5, -1.0, 0.0, 0.0, 0.0]
w2 = [0.0, -1.0, 1.5, 0.0, 0.0, 0.0]
w3 = [0.0, -1.0, -1.0, 0.0, 0.0, 1.5]
b1 = b2 = b3 = 0

X = {0: "I", 1: "love", 2: "hate", 3: "this", 4: "movie", 5: "watched", 6: "truly"}
Y = {0: "positive", 1: "negative", 2: "neutral", 3: "very positive", 4: "very negative"}
x1 = [1, 1, 0, 1, 1, 0, 0]
x2 = [1, 0, 1, 1, 1, 0, 0]
x3 = [1, 0, 0, 1, 1, 1, 0]
x4 = [1, 1, 0, 1, 1, 0, 1]
x5 = [1, 0, 1, 1, 1, 0, 1]
y1, y2, y3, y4, y5 = 0, 1, 2, 3, 4

Wx = [
[0.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.5, 0.0],
[0.0, 1.0, 0.0, 0.0, 0.5],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.5, 0.5]
]

g1 = [1.0, 0.0, 0.0, 0.5, 0.0]
g2 = [0.0, 1.0, 0.0, 0.0, 0.5]
g3 = [0.0, 0.0, 1.0, 0.0, 0.0]
g4 = [1.0, 0.0, 0.0, 1.0, 0.5]
g5 = [0.0, 1.0, 0.0, 0.5, 1.0]

h1 = activation(g1) = [1.0, 0.0, 0.0, 0.0, 0.0]
h2 = activation(g2) = [0.0, 1.0, 0.0, 0.0, 0.0]
h3 = activation(g3) = [0.0, 0.0, 1.0, 0.0, 0.0]
h4 = activation(g4) = [1.0, 0.0, 0.0, 1.0, 0.0]
h5 = activation(g5) = [0.0, 1.0, 0.0, 0.0, 1.0]

Wh = [
[ 1.0, -1.0, 0.0, -0.5, -1.0],
[-1.0, 1.0, 0.0, -1.0, -0.5],
[-1.0, -1.0, 1.0, -1.0, -1.0],
[ 0.0, -1.0, 0.0, 1.0, -1.0],
[-1.0, 0.0, 0.0, -1.0, 1.0]
]

o1 = [ 1.0, -1.0, -1.0, 0.0, -1.0]
o2 = [-1.0, 1.0, -1.0, -1.0, 0.0]
o3 = [ 0.0, 0.0, 1.0, 0.0, 0.0]
o4 = [ 0.5, -2.0, -2.0, 1.0, -2.0]
o5 = [-2.0, 0.5, -2.0, -2.0, 1.0]

y1 = softmax(o1) = [0.56, 0.08, 0.08, 0.21, 0.08]
y2 = softmax(o2) = [0.08, 0.56, 0.08, 0.08, 0.21]
y3 = softmax(o3) = [0.15, 0.15, 0.40, 0.15, 0.15]
y4 = softmax(o4) = [0.35, 0.03, 0.03, 0.57, 0.03]
y5 = softmax(o5) = [0.03, 0.35, 0.03, 0.03, 0.57]
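The whole walkthrough can be reproduced in a few lines of numpy. The sketch below assumes the hidden activation is a hard threshold at 1 (which reproduces h1-h5 above) and that the layers are applied as g = x @ Wx and o = Wh @ h; both are read off the numbers in this example rather than prescribed:

import numpy as np

WX = np.array(Wx)                      # 7 x 5: input features -> hidden units
WH = np.array(Wh)                      # 5 x 5: hidden units -> output classes
softmax = lambda o: np.exp(o) / np.exp(o).sum()

for x in [x1, x2, x3, x4, x5]:
    g = np.array(x) @ WX               # hidden scores, matches g1..g5
    h = (g >= 1.0).astype(float)       # assumed threshold activation, matches h1..h5
    o = WH @ h                         # output scores, matches o1..o5
    print(np.round(softmax(o), 2))     # matches y1..y5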



from src.types import Document, Vocab
def retrieve(filename: str) -> tuple[list[Document], Vocab]:
    documents = [line.split() for line in open(filename)]
    t = {word for document in documents for word in document}
    terms = {term: j for j, term in enumerate(sorted(list(t)))}
    return documents, terms

import numpy as np
def document_term_matrix(documents: list[Document], terms: Vocab) -> np.array:
    def doc_vector(document: list[str]) -> list[int]:
        v = [0] * len(terms)
        for term in document:
            v[terms[term]] += 1
        return v
    return np.array([doc_vector(document) for document in documents])

import time
D, T = retrieve('dat/chronicles_of_narnia.txt')
st = time.time()
X = document_term_matrix(D, T)
et = time.time()
print("|D| = {}, |T| = {}, Process Time (sec) = {:.2f}".format(len(X), len(X[0]), et - st))|D| = 22603, |T| = 12361, Process Time (sec) = 17.87def document_term_matrix_np(documents: list[Document], terms: Vocab) -> np.array:
    X = np.zeros((len(documents), len(terms)), dtype=int)
    for i, document in enumerate(documents):
        for term in document:
            X[i, terms[term]] += 1
    return X

st = time.time()
X = document_term_matrix_np(D, T)
et = time.time()
print("|D| = {}, |T| = {}, Process Time (sec) = {:.2f}".format(len(X), len(X[0]), et - st))|D| = 22603, |T| = 12361, Process Time (sec) = 0.55love white cat
love black cat
hate white cat
hate black cat
love white dog
love black dog
hate white dog
hate black dog

D, T = retrieve('dat/latent_semantic_analysis.txt')
X = document_term_matrix_np(D, T)
U, S, Vt = np.linalg.svd(X, full_matrices=False)
S = np.diag(S)

def print_np(matrix: np.array):
    print(matrix.shape)
    for row in matrix:
        print(' '.join(['{:8.4f}'.format(c) for c in row]))

print_np(U)
print_np(S)
print_np(Vt)

(8, 6)
-0.3536 -0.4969 -0.0552 0.3536 -0.6648 -0.0396
-0.3536 -0.4969 -0.0552 -0.3536 0.3750 0.5911
-0.3536 -0.0552 0.4969 0.3536 0.4847 -0.3598
-0.3536 -0.0552 0.4969 -0.3536 -0.1949 -0.1917
-0.3536 0.0552 -0.4969 0.3536 0.3187 -0.1450
-0.3536 0.0552 -0.4969 -0.3536 -0.0289 -0.4065
-0.3536 0.4969 0.0552 0.3536 -0.1386 0.5444
-0.3536 0.4969 0.0552 -0.3536 -0.1512 0.0071

(6, 6)
3.4641 0.0000 0.0000 0.0000 0.0000 0.0000
0.0000 2.0000 0.0000 0.0000 0.0000 0.0000
0.0000 0.0000 2.0000 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 2.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000 0.0000 0.0000

(6, 6)
-0.4082 -0.4082 -0.4082 -0.4082 -0.4082 -0.4082
0.0000 -0.5522 0.5522 0.4417 -0.4417 0.0000
0.0000 0.4417 -0.4417 0.5522 -0.5522 0.0000
-0.7071 -0.0000 -0.0000 -0.0000 -0.0000 0.7071
-0.2614 -0.3151 -0.3151 0.5765 0.5765 -0.2614
-0.5148 0.4838 0.4838 0.0310 0.0310 -0.5148

k = 4
U = U[:, :k]
S = S[:k, :k]
Vt = Vt[:k, :]

for i, document in enumerate(D):
    t = np.dot(U[i], S)
    print('{}: {}'.format(' '.join(document), ['{:5.2f}'.format(f) for f in t]))

love white cat: [-1.22, -0.99, -0.11, 0.71]
love black cat: [-1.22, -0.99, -0.11, -0.71]
hate white cat: [-1.22, -0.11, 0.99, 0.71]
hate black cat: [-1.22, -0.11, 0.99, -0.71]
love white dog: [-1.22, 0.11, -0.99, 0.71]
love black dog: [-1.22, 0.11, -0.99, -0.71]
hate white dog: [-1.22, 0.99, 0.11, 0.71]
hate black dog: [-1.22, 0.99, 0.11, -0.71]

V = Vt.transpose()
for term, j in sorted(T.items(), key=lambda x: x[1]):
t = np.dot(V[j], S)
print('{:>5}: {}'.format(term, ['{:5.2f}'.format(f) for f in t]))black: [-1.41, 0.00, 0.00, -1.41]
cat: [-1.41, -1.10, 0.88, -0.00]
dog: [-1.41, 1.10, -0.88, -0.00]
hate: [-1.41, 0.88, 1.10, -0.00]
love: [-1.41, -0.88, -1.10, -0.00]
white: [-1.41, 0.00, 0.00, 1.41]

read_word_embeddings() that takes a path to the file consisting of word embeddings, .
import collections
import re

from src.types import WordCount, PairCount
EOW = '[EoW]'
word_counts = {
'high': 12,
'higher': 14,
'highest': 10,
'low': 12,
'lower': 11,
'lowest': 13
}

def initialize(word_counts: WordCount) -> WordCount:
    return {' '.join(list(word) + [EOW]): count for word, count in word_counts.items()}

def expect(vocab: WordCount) -> PairCount:
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs

def maximize(vocab: WordCount, pairs: PairCount) -> WordCount:
    best = max(pairs, key=pairs.get)
    p = re.compile(r'(?<!\S)' + re.escape(' '.join(best)) + r'(?!\S)')
    return {p.sub(''.join(best), word): freq for word, freq in vocab.items()}

def bpe_vocab(word_counts: WordCount, max_iter: int):
    vocab = initialize(word_counts)
    for i in range(max_iter):
        pairs = expect(vocab)
        vocab = maximize(vocab, pairs)
        # print(vocab)
    return vocab

bpe_vocab(word_counts, 10)

{'hi g h [EoW]': 12, 'hi g h e r [EoW]': 14, 'hi g h e s t [EoW]': 10, 'l o w [EoW]': 12, 'l o w e r [EoW]': 11, 'l o w e s t [EoW]': 13}
{'hig h [EoW]': 12, 'hig h e r [EoW]': 14, 'hig h e s t [EoW]': 10, 'l o w [EoW]': 12, 'l o w e r [EoW]': 11, 'l o w e s t [EoW]': 13}
{'high [EoW]': 12, 'high e r [EoW]': 14, 'high e s t [EoW]': 10, 'l o w [EoW]': 12, 'l o w e r [EoW]': 11, 'l o w e s t [EoW]': 13}
{'high [EoW]': 12, 'high e r [EoW]': 14, 'high e s t [EoW]': 10, 'lo w [EoW]': 12, 'lo w e r [EoW]': 11, 'lo w e s t [EoW]': 13}
{'high [EoW]': 12, 'high e r [EoW]': 14, 'high e s t [EoW]': 10, 'low [EoW]': 12, 'low e r [EoW]': 11, 'low e s t [EoW]': 13}
{'high [EoW]': 12, 'high er [EoW]': 14, 'high e s t [EoW]': 10, 'low [EoW]': 12, 'low er [EoW]': 11, 'low e s t [EoW]': 13}
{'high [EoW]': 12, 'high er[EoW]': 14, 'high e s t [EoW]': 10, 'low [EoW]': 12, 'low er[EoW]': 11, 'low e s t [EoW]': 13}
{'high [EoW]': 12, 'high er[EoW]': 14, 'high es t [EoW]': 10, 'low [EoW]': 12, 'low er[EoW]': 11, 'low es t [EoW]': 13}
{'high [EoW]': 12, 'high er[EoW]': 14, 'high est [EoW]': 10, 'low [EoW]': 12, 'low er[EoW]': 11, 'low est [EoW]': 13}
{'high [EoW]': 12, 'high er[EoW]': 14, 'high est[EoW]': 10, 'low [EoW]': 12, 'low er[EoW]': 11, 'low est[EoW]': 13}
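The dictionary returned after the final iteration still maps whole words to counts; the actual subword inventory is the set of space-separated symbols it contains. A small helper for collecting it (this helper is an addition, not part of the course code):

def subword_vocab(vocab: WordCount) -> set[str]:
    # Collect every space-separated symbol produced by the merges.
    return {symbol for word in vocab for symbol in word.split()}

print(sorted(subword_vocab(bpe_vocab(word_counts, 10))))
# ['[EoW]', 'er[EoW]', 'est[EoW]', 'high', 'low']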


[WORD](\t[FLOAT]){50}
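Given this format (a word followed by 50 tab-separated floats per line), a minimal sketch of read_word_embeddings(); the return type and anything beyond straightforward parsing are assumptions:

def read_word_embeddings(filepath: str) -> dict[str, list[float]]:
    # Each line: a word followed by 50 tab-separated floats.
    embeddings = {}
    for line in open(filepath):
        fields = line.split('\t')
        embeddings[fields[0]] = [float(f) for f in fields[1:]]
    return embeddings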

