Document Classification
Supervised Learning
Data Split
from src.bag_of_words_model import Document
import glob, os
def collect(dirpath: str) -> dict[int, list[Document]]:
    """Group the tokenized ``*.txt`` files in a directory by book ID.

    The book ID is the integer before the first underscore in each file's
    base name (e.g. ``12_chapter3.txt`` -> ``12``). Each file is read in
    full and split on whitespace into a list of tokens.

    NOTE(review): the return annotation uses ``Document``, while each stored
    item is the ``list[str]`` produced by ``str.split()`` -- presumably
    ``Document`` is an alias compatible with that; confirm against
    ``src.bag_of_words_model``.

    :param dirpath: directory searched (non-recursively) for ``*.txt`` files.
    :return: mapping from book ID to the token lists of that book's files,
        in the order ``glob`` yields them; empty if nothing matches.
    :raises ValueError: if a file name does not start with an integer.
    """
    books = dict()

    for filename in glob.glob(os.path.join(dirpath, '*.txt')):
        book_id = int(os.path.basename(filename).split('_')[0])
        # Close every file deterministically instead of leaking the handle.
        with open(filename) as fin:
            tokens = fin.read().split()
        books.setdefault(book_id, list()).append(tokens)

    return books

# Vectorization
Classification
References
Last updated
Was this helpful?