import json
import os
from types import SimpleNamespace

def get_lexica(res_dir: str) -> SimpleNamespace:
    with open(os.path.join(res_dir, 'nouns.txt')) as fin: nouns = {noun.strip() for noun in fin}
    with open(os.path.join(res_dir, 'verbs.txt')) as fin: verbs = {verb.strip() for verb in fin}
    with open(os.path.join(res_dir, 'nouns_irregular.json')) as fin: nouns_irregular = json.load(fin)
    with open(os.path.join(res_dir, 'verbs_irregular.json')) as fin: verbs_irregular = json.load(fin)
    with open(os.path.join(res_dir, 'nouns_rules.json')) as fin: nouns_rules = json.load(fin)
    with open(os.path.join(res_dir, 'verbs_rules.json')) as fin: verbs_rules = json.load(fin)
    return SimpleNamespace(
        nouns=nouns,
        verbs=verbs,
        nouns_irregular=nouns_irregular,
        verbs_irregular=verbs_irregular,
        nouns_rules=nouns_rules,
        verbs_rules=verbs_rules
    )

lexica = get_lexica('res')  # the resource directory is an assumption; point it at the folder holding the lexicon files
print(len(lexica.nouns))
print(len(lexica.verbs))
print(lexica.nouns_irregular)
print(lexica.verbs_irregular)
print(lexica.nouns_rules)
print(lexica.verbs_rules)

91
27
{'children': 'child', 'crises': 'crisis', 'mice': 'mouse'}
{'is': 'be', 'was': 'be', 'has': 'have', 'had': 'have', 'bought': 'buy'}
[['ies', 'y'], ['es', ''], ['s', ''], ['men', 'man'], ['ae', 'a'], ['i', 'us']]
[['ies', 'y'], ['ied', 'y'], ['es', ''], ['ed', ''], ['s', ''], ['d', ''], ['ying', 'ie'], ['ing', ''], ['ing', 'e'], ['n', ''], ['ung', 'ing']]

def lemmatize(lexica: SimpleNamespace, word: str) -> str:
    def aux(word: str, vocabs: set[str], irregular: dict[str, str], rules: list[tuple[str, str]]):
        # irregular inflections are looked up directly
        lemma = irregular.get(word, None)
        if lemma is not None: return lemma
        # otherwise strip the suffix p, attach s, and accept the candidate only if it is a known word
        for p, s in rules:
            lemma = word[:-len(p)] + s
            if lemma in vocabs: return lemma
        return None

    # try the verb lexicon first, then the noun lexicon; fall back to the word itself
    word = word.lower()
    lemma = aux(word, lexica.verbs, lexica.verbs_irregular, lexica.verbs_rules)
    if lemma is None:
        lemma = aux(word, lexica.nouns, lexica.nouns_irregular, lexica.nouns_rules)
    return lemma if lemma else word

nouns = ['studies', 'crosses', 'areas', 'gentlemen', 'vertebrae', 'alumni', 'children', 'crises']
nouns_lemmatized = [lemmatize(lexica, word) for word in nouns]
for word, lemma in zip(nouns, nouns_lemmatized): print('{} -> {}'.format(word, lemma))
verbs = ['applies', 'cried', 'pushes', 'entered', 'takes', 'heard', 'lying', 'studying', 'taking', 'drawn', 'clung', 'was', 'bought']
verbs_lemmatized = [lemmatize(lexica, word) for word in verbs]
for word, lemma in zip(verbs, verbs_lemmatized): print('{} -> {}'.format(word, lemma))

studies -> study
crosses -> cross
areas -> area
gentlemen -> gentleman
vertebrae -> vertebra
alumni -> alumnus
children -> child
crises -> crisis

applies -> apply
cried -> cry
pushes -> push
entered -> enter
takes -> take
heard -> hear
lying -> lie
studying -> study
taking -> take
drawn -> draw
clung -> cling
was -> be
bought -> buy

from collections import Counter
from src.tokenization import tokenize

corpus = 'dat/emory-wiki.txt'
delims = {'"', "'", '(', ')', '[', ']', ':', '-', ',', '.'}
words = [lemmatize(lexica, word) for word in tokenize(corpus, delims)]
counts = Counter(words)
print(f'# of word tokens: {len(words)}')
print(f'# of word types: {len(counts)}')

output = 'dat/word_types-token-lemma.txt'
with open(output, 'w') as fout:
    for key in sorted(counts.keys()): fout.write(f'{key}\n')

# of word tokens: 363
# of word types: 177

The corpus dat/emory-wiki.txt contains the following excerpt:

Emory University is a private research university in Atlanta, Georgia. Founded in 1836 as Emory College by the Methodist Episcopal Church and named in honor of Methodist bishop John Emory.[18]
Emory University has nine academic divisions. Emory Healthcare is the largest healthcare system in the state of Georgia[19] and comprises seven major hospitals, including Emory University Hospital and Emory University Hospital Midtown.[20] The university operates the Winship Cancer Institute, Yerkes National Primate Research Center, and many disease and vaccine research centers.[21][22] Emory University is the leading coordinator of the U.S. Health Department's National Ebola Training and Education Center.[23] The university is one of four institutions involved in the NIAID's Tuberculosis Research Units Program.[24] The International Association of National Public Health Institutes is headquartered at the university.[25]
Emory University has the 15th-largest endowment among U.S. colleges and universities.[9] The university is classified among "R1: Doctoral Universities - Very high research activity"[26] and is cited for high scientific performance and citation impact in the CWTS Leiden Ranking.[27] The National Science Foundation ranked the university 36th among academic institutions in the United States for research and development (R&D) expenditures.[28][29] In 1995 Emory University was elected to the Association of American Universities, an association of the 65 leading research universities in the United States and Canada.[5]
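The cells below count the words of this excerpt with collections.Counter. As a quick, minimal sketch of the Counter behaviour relied on below (the toy sentence here is made up purely for illustration):

from collections import Counter

toy = Counter('the cat saw the dog and the cat'.split())
print(toy)                # Counter({'the': 3, 'cat': 2, 'saw': 1, 'dog': 1, 'and': 1})
print(sum(toy.values()))  # 8 -> number of word tokens
print(len(toy))           # 5 -> number of word types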
corpus = 'dat/emory-wiki.txt'
counts = count_words(corpus)
n_tokens = sum(counts.values())
n_types = len(counts)
print(f'# of word tokens: {n_tokens}')
print(f'# of word types: {n_types}')

des = sorted(counts.items(), key=lambda x: x[1], reverse=True)
asc = sorted(counts.items(), key=lambda x: x[1])
for word, count in des[:10]: print(word, count)
for word, count in asc[:10]: print(word, count)

save_output(counts, 'dat/word_types.txt')

count_words and save_output are defined as follows:

from collections import Counter

def count_words(corpus: str) -> Counter:
    fin = open(corpus)
    words = fin.read().split()
    return Counter(words)

def save_output(counts: Counter, outfile: str):
    fout = open(outfile, 'w')
    for word in sorted(counts.keys()):
        fout.write(f'{word}\n')
    fout.close()

# of word tokens: 305
# of word types: 180

the 18
and 15
of 12
Emory 11
in 10
University 7
is 7
university 6
United 6
research 5

private 1
Atlanta, 1
Georgia. 1
Founded 1
1836 1
College 1
by 1
Episcopal 1
Church 1
named 1
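The counts above show why further processing is needed: punctuation stays attached to its word ('Atlanta,', 'Georgia.') and case variants ('University', 'university') are counted as separate types. The next sections use regular expressions and a delimiter-based tokenizer to deal with this. As a minimal warm-up sketch of the \d+ behaviour described just below (the pattern and test strings mirror that example):

import re

# \d+ matches one or more consecutive digits
print(re.search(r'\d+', '90s'))   # <re.Match object; span=(0, 2), match='90'>
print(re.search(r'\d+', 'ABC'))   # None: the string contains no digits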
\d+ matches "90" in "90s", but there is no match in "ABC".
The tokenizer developed below must also keep abbreviations such as "Mr." and "Ms." intact and split off the contraction 'n't' (e.g., "can't").

import re
re_mr = re.compile(r'M[rs]\.')
m = re_mr.match('Mr. Wayne')
print(m)
if m: print(m.start(), m.end())

<re.Match object; span=(0, 3), match='Mr.'>
0 3

print(m.groups())

()

re_mr = re.compile(r'(M[rs])(\.)')
m = re_mr.match('Ms.')
print(m)
print(m.group())
print(m.groups())
print(m.group(0), m.group(1), m.group(2))

<re.Match object; span=(0, 3), match='Ms.'>
Ms.
('Ms', '.')
Ms. Ms .

m = re_mr.match('Mrs.')
print(m)

None

s1 = 'Mr. and Ms. Wayne are here'
s2 = 'Here are Mr. and Ms. Wayne'
print(re_mr.match(s1))
print(re_mr.match(s2))

<re.Match object; span=(0, 3), match='Mr.'>
None

print(re_mr.search(s1))
print(re_mr.search(s2))

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(9, 12), match='Mr.'>

print(re_mr.findall(s1))
print(re_mr.findall(s2))

[('Mr', '.'), ('Ms', '.')]
[('Mr', '.'), ('Ms', '.')]

ms = re_mr.finditer(s1)
for m in ms: print(m)
ms = re_mr.finditer(s2)
for m in ms: print(m)

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(8, 11), match='Ms.'>
<re.Match object; span=(9, 12), match='Mr.'>
<re.Match object; span=(17, 20), match='Ms.'>

print(re_mr.sub('Dr.', 'I met Mr. Wayne and Ms. Kyle.'))

I met Dr. Wayne and Dr. Kyle.

def tokenize(text: str) -> list[str]:
    re_tok = re.compile(r'([",.]|\s+|n\'t)')
    tokens, prev_idx = [], 0
    for m in re_tok.finditer(text):
        # text between the previous match and this one
        t = text[prev_idx:m.start()].strip()
        if t: tokens.append(t)
        # the matched delimiter itself (whitespace is dropped by strip)
        t = m.group().strip()
        if t:
            if tokens and tokens[-1] in {'Mr', 'Ms'} and t == '.':
                # keep abbreviations such as "Mr." and "Ms." as one token
                tokens[-1] = tokens[-1] + t
            else:
                tokens.append(t)
        prev_idx = m.end()
    t = text[prev_idx:]
    if t: tokens.append(t)
    return tokens

text = 'Mr. Wayne isn\'t the hero we need, but "the one" we deserve.'
print(tokenize(text))
text = 'Ms. Wayne is "Batgirl" but not "the one".'
print(tokenize(text))

['Mr.', 'Wayne', 'is', "n't", 'the', 'hero', 'we', 'need', ',', 'but', '"', 'the', 'one', '"', 'we', 'deserve', '.']
['Ms.', 'Wayne', 'is', '"', 'Batgirl', '"', 'but', 'not', '"', 'the', 'one', '"', '.']

The structure below maps each book title to its title, year, and chapters, where every chapter records its number, title, and token count:

{
    'The Lion , the Witch and the Wardrobe': {
        'title': 'The Lion , the Witch and the Wardrobe',
        'year': 1950,
        'chapters': [
            {
                'number': 1,
                'title': 'Lucy Looks into a Wardrobe',
                'token_count': 1915
            },
            {
                'number': 2,
                'title': 'What Lucy Found There',
                'token_count': 2887
            },
            ...
        ]
    },
    'Prince Caspian : The Return to Narnia': {
        'title': 'Prince Caspian : The Return to Narnia',
        'year': 1951,
        'chapters': [
            ...
        ]
    },
    ...
}

def delimit(word: str, delimiters: set[str]) -> list[str]:
    # index of the first delimiter character in the word, or -1 if there is none
    i = next((i for i, c in enumerate(word) if c in delimiters), -1)
    if i < 0: return [word]
    tokens = []
    if i > 0: tokens.append(word[:i])
    tokens.append(word[i])
    if i + 1 < len(word):
        tokens.extend(delimit(word[i + 1:], delimiters))
    return tokens

delims = {'"', "'", '(', ')', '[', ']', ':', '-', ',', '.'}
input = [
    '"R1:',
    '(R&D)',
    '15th-largest',
    'Atlanta,',
    "Department's",
    'activity"[26]',
    'centers.[21][22]',
    '149,000',
    'U.S.'
]
output = [delimit(word, delims) for word in input]
for word, tokens in zip(input, output):
    print('{:<16} -> {}'.format(word, tokens))

"R1: -> ['"', 'R1', ':']
(R&D) -> ['(', 'R&D', ')']
15th-largest -> ['15th', '-', 'largest']
Atlanta, -> ['Atlanta', ',']
Department's -> ['Department', "'", 's']
activity"[26] -> ['activity', '"', '[', '26', ']']
centers.[21][22] -> ['centers', '.', '[', '21', ']', '[', '22', ']']
149,000 -> ['149', ',', '000']
U.S. -> ['U', '.', 'S', '.']

Some of these splits go too far: "'s", "[26]", "149,000", and "U.S." should each be kept as a single token. The postprocess function below merges such sequences back together:

def postprocess(tokens: list[str]) -> list[str]:
    i, new_tokens = 0, []
    while i < len(tokens):
        # possessive clitic: "'" + "s" -> "'s"
        if i + 1 < len(tokens) and tokens[i] == "'" and tokens[i + 1].lower() == 's':
            new_tokens.append(''.join(tokens[i:i + 2]))
            i += 1
        # citation brackets such as "[26]" and numbers with a thousands separator such as "149,000"
        elif i + 2 < len(tokens) and \
                ((tokens[i] == '[' and tokens[i + 1].isnumeric() and tokens[i + 2] == ']') or
                 (tokens[i].isnumeric() and tokens[i + 1] == ',' and tokens[i + 2].isnumeric())):
            new_tokens.append(''.join(tokens[i:i + 3]))
            i += 2
        # the abbreviation "U.S."
        elif i + 3 < len(tokens) and ''.join(tokens[i:i + 4]) == 'U.S.':
            new_tokens.append(''.join(tokens[i:i + 4]))
            i += 3
        else:
            new_tokens.append(tokens[i])
        i += 1
    return new_tokens

output = [postprocess(delimit(word, delims)) for word in input]
for word, tokens in zip(input, output):
    print('{:<16} -> {}'.format(word, tokens))

"R1: -> ['"', 'R1', ':']
(R&D) -> ['(', 'R&D', ')']
15th-largest -> ['15th', '-', 'largest']
Atlanta, -> ['Atlanta', ',']
Department's -> ['Department', "'s"]
activity"[26] -> ['activity', '"', '[26]']
centers.[21][22] -> ['centers', '.', '[21]', '[22]']
149,000 -> ['149,000']
U.S. -> ['U.S.']

def tokenize(corpus: str, delimiters: set[str]) -> list[str]:
    with open(corpus) as fin:
        words = fin.read().split()
    return [token for word in words for token in postprocess(delimit(word, delimiters))]

from collections import Counter
from src.frequency_analysis import save_output
corpus = 'dat/emory-wiki.txt'
output = 'dat/word_types-token.txt'
words = tokenize(corpus, delims)
counts = Counter(words)
print(f'# of word tokens: {len(words)}')
print(f'# of word types: {len(counts)}')
save_output(counts, output)

# of word tokens: 363
# of word types: 197
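As a wrap-up, here is a minimal sketch that puts the three pipelines side by side. It assumes everything defined above (count_words, tokenize, lemmatize, corpus, delims, lexica, Counter) is still in scope and that the resource paths remain whatever they were set to earlier:

raw = count_words(corpus)
tokens = tokenize(corpus, delims)
lemmas = [lemmatize(lexica, token) for token in tokens]

print(f'whitespace : {sum(raw.values())} tokens, {len(raw)} types')
print(f'tokenized  : {len(tokens)} tokens, {len(Counter(tokens))} types')
print(f'lemmatized : {len(lemmas)} tokens, {len(Counter(lemmas))} types')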