2 Apr
2022
2 Apr
'22
10:52 p.m.
# This goes in a Colab notebook, referenced at [git notes url in a thread].
#
# Trains a small SentencePiece model entirely in memory: the training text is
# streamed from a URL, the trained model is written to a BytesIO buffer, and a
# processor is then loaded straight from those bytes.

import io
import urllib.request

import sentencepiece as spm

# Training corpus from the sentencepiece repository's data directory.
CORPUS_URL = (
    'https://raw.githubusercontent.com/google/sentencepiece/master/data/'
    'botchan.txt'
)

# Loads the corpus from the URL as an iterator and stores the model to BytesIO.
model = io.BytesIO()
with urllib.request.urlopen(CORPUS_URL) as response:
    # The HTTP response object is passed directly as the sentence iterator,
    # and the trainer writes the serialized model into `model`.
    spm.SentencePieceTrainer.train(
        sentence_iterator=response,
        model_writer=model,
        vocab_size=1000,
    )

# Optional: serialize the model to a file (uncomment to enable).
# with open('out.model', 'wb') as f:
#     f.write(model.getvalue())

# Directly load the processor from the serialized model bytes.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))