Here's what I have right now. It decodes the text better than I thought it would. I'm worried it might just output the result without needing any further finetuning.

Shell commands:

$ wget -c https://xkcd.com/2601/radio.mp3
$ pip3 install transformers[speech,sentencepiece] datasets librosa soundfile

Python input:

print('importing libraries ...')
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration, Wav2Vec2Tokenizer, Wav2Vec2ForCTC
import librosa as lb
import numpy as np

class Data:
    # streams chunks of the mp3 as raw samples at a fixed sample rate
    def __init__(self, src='radio.mp3', chunksize=1024 * 128, sr=16_000, dtype=np.float32):
        self.src = src
        self.chunksize = chunksize          # chunk length in samples
        self.sr = sr
        self.length = lb.get_duration(filename=self.src)   # total length in seconds
        self.dtype = dtype

    def read_one(self, offset, chunksize=None):
        # read one chunk starting at `offset` seconds
        if chunksize is None:
            chunksize = self.chunksize
        duration = chunksize / self.sr
        print(f'reading {duration}s at {offset}s ...')
        data, sr = lb.load(self.src, sr=self.sr, offset=offset, duration=duration, dtype=self.dtype)
        return data

    def read_random(self, ct=1):
        # pick ct random offsets, leaving room for a full chunk before the end of the file
        return np.stack([self.read_one(np.random.random() * (self.length - self.chunksize / self.sr)) for idx in range(ct)])

    def read_chunks(self, ct=1, offset=0):
        # read ct consecutive chunks starting at `offset` seconds
        chunksize = self.chunksize
        data = self.read_one(offset, chunksize * ct)
        return data.reshape((ct, chunksize))

class S2T:
    # wraps the pretrained speech-to-text model and its feature processor
    def __init__(self, model="facebook/s2t-small-librispeech-asr", sr=16_000):
        self.sr = sr
        self.model = Speech2TextForConditionalGeneration.from_pretrained(model)
        self.processor = Speech2TextProcessor.from_pretrained(model)

    def tokenize(self, inputs):
        print('tokenizing ...')
        input_ids = self.processor(inputs, sampling_rate=self.sr, return_tensors='pt')
        return input_ids['input_features'], input_ids['attention_mask']

    def forward(self, feature_ids, attention_mask):
        print('passing data thru model ...')
        return self.model.generate(inputs=feature_ids, attention_mask=attention_mask)

    def detokenize(self, generated_ids):
        print('detokenizing output ...')
        return self.processor.batch_decode(generated_ids)

print('constructing structures...')
data = Data()
s2t = S2T()
feature_ids, attention_mask = s2t.tokenize(data.read_chunks(1)[0])
generated_ids = s2t.forward(feature_ids, attention_mask)
outputs = s2t.detokenize(generated_ids)
print(outputs)

Python output:

importing libraries ...
constructing structures...
reading 8.192s at 0s ...
/usr/local/lib/python3.7/dist-packages/librosa/core/audio.py:165: UserWarning: PySoundFile failed. Trying audioread instead.
  warnings.warn("PySoundFile failed. Trying audioread instead.")
tokenizing ...
passing data thru model ...
/usr/local/lib/python3.7/dist-packages/transformers/models/speech_to_text/modeling_speech_to_text.py:559: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
  input_lengths = (input_lengths - 1) // 2 + 1
detokenizing output ...
["and here we want to show you that you can program a picture right along with us we'll use a single color some unorthodox fine fun'll use a single color some unorthodox fine fine fun'd to show you that you can program a picture and here we want to show you that you that you can program a picture right along to show you that you that you can program of picture right along with us we want to show you that you that you that you that you that you that you that you can program a picture right along with a picture right along with us"]

Okay, it does indeed start messing up, as I'd planned for. Strangely, it repeats the same text over and over: I'm guessing that means I'm giving it a longer input sequence than it was trained on, so it loses track of where it is in the input (the Fourier position embeddings behave a bit like bitwise overflow, wrapping around).
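If that's really what's going on, the obvious next test is to feed it shorter chunks and see whether the looping goes away. Here's a rough sketch of that, reusing the Data and S2T classes above; I haven't run it, and the 5-second chunk length is just a guess on my part, not something from the model card.

Python sketch (untested):

# decode the first ~20 seconds as several short chunks instead of one long one
print('max source positions per the model config:', s2t.model.config.max_source_positions)

chunk_seconds = 5.0                           # arbitrary guess at a "safe" chunk length
chunk_samples = int(chunk_seconds * data.sr)

transcripts = []
for idx in range(4):                          # four consecutive 5-second chunks
    chunk = data.read_one(idx * chunk_seconds, chunk_samples)
    feature_ids, attention_mask = s2t.tokenize(chunk)
    generated_ids = s2t.forward(feature_ids, attention_mask)
    transcripts.extend(s2t.detokenize(generated_ids))

print(transcripts)

If the short chunks come back clean, that would support the overlong-input theory; if they still loop, the repetition is probably coming from the model or the audio itself rather than the chunk length.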