# Cross-posts happened to the other thread. This looks useful (see the
# training section of the SentencePiece Python README):
# https://github.com/google/sentencepiece/blob/master/python/README.md
"""Train a SentencePiece model without touching the local filesystem.

The training corpus is streamed from a URL, the trained model is written
into an in-memory buffer, and a processor is loaded directly from the
serialized bytes in that buffer.
"""

import io
import urllib.request

import sentencepiece as spm

# Stream the training corpus from the URL (the HTTP response is passed as
# the sentence iterator) and collect the trained model in a BytesIO buffer
# instead of a file on disk.
model = io.BytesIO()
with urllib.request.urlopen(
    'https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt'
) as response:
    spm.SentencePieceTrainer.train(
        sentence_iterator=response, model_writer=model, vocab_size=1000)

# Optional: serialize the model to a file.
# with open('out.model', 'wb') as f:
#     f.write(model.getvalue())

# Load the processor directly from the serialized model bytes.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))