2 Apr
2022
2 Apr
'22
10:13 p.m.
# Karl wants to learn around this first block but [doesn't understand that
# his working memory doesn't have space for it under these conditions right
# now], so it is commented out.
#from tokenizers.pre_tokenizers import Whitespace
#
#tokenizer.pre_tokenizer = Whitespace()

# This is the training bit for using the example code.
# NOTE(review): `tokenizer` and `trainer` are not defined in this snippet;
# presumably they are created earlier in the file -- confirm before running.

# One raw-text path per split name ("test", "train", "valid"); the paths are
# relative, so they resolve against the current working directory.
files = [
    f"data/wikitext-103-raw/wiki.{split}.raw"
    for split in ["test", "train", "valid"]
]

# Hand the list of file paths and the trainer to the tokenizer's train() call.
tokenizer.train(files, trainer)