I'm thinking a perceiver decoder would work better here than a tokenizer, since it could produce sequences of characters that aren't in the example data. Anyway, I trained the detokenizer on the file. Below is the current content, but it doesn't use the detokenizer yet. The next step is maybe to try finetuning the model to use the detokenizer. This will run into issues because the detokenizer doesn't represent most words in whatever data I use for finetuning. It's nice to get this experience with a mainstream software process: finetuning a transformer model.

!wget -c https://xkcd.com/2601/radio.mp3
!wget -c https://raw.githubusercontent.com/theinternetftw/xkcd2601/main/xkcd.lgo
!pip3 install transformers[speech,sentencepiece] datasets librosa soundfile

print('importing libraries ...')
import os

import librosa as lb
import numpy as np
import sentencepiece as spm
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

class CustomTokenizer:
    def __init__(self, datafilename, vocab_size):
        self.fn = datafilename
        self.vocab_size = vocab_size

    def load(self):
        # train a sentencepiece model on the data file, unless one is already cached
        modelpfx = f'{self.fn}.{self.vocab_size}'
        modelfn = f'{modelpfx}.model'
        if not os.path.exists(modelfn):
            def data(chunksize):
                with open(self.fn, 'rt') as datafile:
                    while True:
                        chunk = datafile.read(chunksize)
                        if not chunk:
                            break
                        yield chunk
            spm.SentencePieceTrainer.train(
                sentence_iterator=data(1024),
                model_prefix=modelpfx,
                vocab_size=self.vocab_size)
        self.model = spm.SentencePieceProcessor(model_file=modelfn)

    def tokenize(self, inputs):
        return self.model.encode(inputs)

    def detokenize(self, ids):
        return self.model.decode(ids)

class Data:
    def __init__(self, src='radio.mp3', chunksize=80 * 6000, sr=16_000, dtype=np.float32):
        self.src = src
        self.chunksize = chunksize  # samples per chunk; 480000 samples = 30 s at 16 kHz
        self.sr = sr
        self.length = lb.get_duration(filename=self.src)  # seconds
        self.dtype = dtype

    def read_one(self, offset, chunksize=None):
        if chunksize is None:
            chunksize = self.chunksize
        duration = chunksize / self.sr
        print(f'reading {duration}s at {offset}s ...')
        data, sr = lb.load(self.src, sr=self.sr, offset=offset,
                           duration=duration, dtype=self.dtype)
        print(f'read {data.shape} samples at {sr}')
        return data

    def read_random(self, ct=1):
        # pick chunk offsets uniformly at random within the file
        duration = self.chunksize / self.sr
        return np.stack([
            self.read_one(np.random.random() * (self.length - duration))
            for idx in range(ct)])

    def read_chunks(self, ct=1, offset=0):
        chunksize = self.chunksize
        data = self.read_one(offset, chunksize * ct)
        return data.reshape((ct, chunksize))

class S2T:
    def __init__(self, model="facebook/s2t-small-librispeech-asr", sr=16_000):
        self.sr = sr
        self.model = Speech2TextForConditionalGeneration.from_pretrained(model)
        self.processor = Speech2TextProcessor.from_pretrained(model)

    @property
    def vocab_size(self):
        return self.model.config.vocab_size

    def tokenize(self, inputs):
        print('tokenizing ...')
        input_ids = self.processor(inputs, sampling_rate=self.sr, return_tensors='pt')
        return input_ids['input_features'], input_ids['attention_mask']

    def forward(self, feature_ids, attention_mask):
        print('passing data thru model ...')
        return self.model.generate(inputs=feature_ids, attention_mask=attention_mask)

    def detokenize(self, generated_ids):
        print('detokenizing output ...')
        return self.processor.batch_decode(generated_ids)

print('constructing structures ...')
data = Data()
s2t = S2T()
detokenizer = CustomTokenizer('xkcd.lgo', vocab_size=1100)  # or s2t.vocab_size
detokenizer.load()

feature_ids, attention_mask = s2t.tokenize(data.read_chunks(1)[0])
generated_ids = s2t.forward(feature_ids, attention_mask)
outputs = s2t.detokenize(generated_ids)
print(outputs)
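
For the finetuning step mentioned above, here's a rough, untested sketch of how the model could be made to emit the custom detokenizer's ids. attach_detokenizer and finetune_step are hypothetical helpers I made up, and the special-token wiring is my assumption based on SentencePiece's default ids (unk=0, bos=1, eos=2, pad disabled), not anything the script above establishes.

import torch

def attach_detokenizer(s2t, detokenizer):
    # Resize the decoder embeddings and output head to the custom vocab.
    s2t.model.resize_token_embeddings(detokenizer.model.vocab_size())
    # Map the generation config onto sentencepiece's default special ids.
    # pad_id() is -1 (disabled) by default, so unk is reused for padding
    # here; that's an assumption, adjust to taste.
    cfg = s2t.model.config
    cfg.bos_token_id = detokenizer.model.bos_id()
    cfg.eos_token_id = detokenizer.model.eos_id()
    cfg.decoder_start_token_id = cfg.eos_token_id
    cfg.pad_token_id = detokenizer.model.unk_id()

def finetune_step(s2t, detokenizer, audio_chunk, transcript, optimizer):
    # Encode the target text with the custom tokenizer and train the model
    # to emit those ids given the audio features.
    features, attention_mask = s2t.tokenize(audio_chunk)
    labels = torch.tensor([detokenizer.tokenize(transcript)
                           + [s2t.model.config.eos_token_id]])
    loss = s2t.model(input_features=features,
                     attention_mask=attention_mask,
                     labels=labels).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

Usage would be something like attach_detokenizer(s2t, detokenizer) followed by finetune_step(...) in a loop with e.g. torch.optim.Adam(s2t.model.parameters(), lr=1e-5). The detokenizer-doesn't-represent-most-words problem shows up here: transcripts from other finetuning data would mostly encode to unk, so the loop would only really learn text drawn from the same distribution as xkcd.lgo.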