[ot][spam][crazy] Quickly autotranscribing xkcd 4/1 correctly
Undiscussed Horrific Abuse, One Victim of Many
gmkarl at gmail.com
Sat Apr 2 07:50:56 PDT 2022
I'm thinking a perceiver decoder would work better here than a tokenizer.
Then it can produce sequences of characters that aren't in the example data.
Anyway, I trained the detokenizer on the file. Below is the current
content, but it doesn't use the detokenizer yet. The next step may be to
try finetuning the model to use the detokenizer.
This will run into issues because the detokenizer doesn't represent
most words in whatever data I use for finetuning.
It's nice to get this experience using a mainstream software process:
finetuning a transformer model.
!wget -c https://xkcd.com/2601/radio.mp3
!wget -c https://raw.githubusercontent.com/theinternetftw/xkcd2601/main/xkcd.lgo
!pip3 install transformers[speech,sentencepiece] datasets librosa soundfile
print('importing libraries ...')
import torch
# The name list is parenthesised so it can span several lines; a bare
# trailing comma followed by a line break is a syntax error.
from transformers import (
    Speech2TextProcessor,
    Speech2TextForConditionalGeneration,
    Wav2Vec2Tokenizer,
    Wav2Vec2ForCTC,
)
import librosa as lb
import numpy as np
import inspect
import os
import sentencepiece as spm
class CustomTokenizer:
    """SentencePiece tokenizer trained on demand from a single text file.

    The trained model is stored under a path derived from the data file name
    and the vocabulary size, and is reused on later runs if it already exists.
    """

    def __init__(self, datafilename, vocab_size):
        """Remember the training data path and the requested vocabulary size.

        Nothing is read or trained here; call load() before tokenize() or
        detokenize().
        """
        self.fn = datafilename
        self.vocab_size = vocab_size

    def _chunks(self, chunksize):
        """Yield the data file's text in pieces of at most chunksize characters.

        Every non-empty read is yielded, including the final short one, so
        the tail of the file (or a file smaller than chunksize) still
        reaches the trainer.
        """
        with open(self.fn, 'rt') as datafile:
            while True:
                chunk = datafile.read(chunksize)
                if not chunk:
                    break
                yield chunk

    def load(self):
        """Train the model if its file is missing, then load it into self.model."""
        modelpfx = f'{self.fn}.{self.vocab_size}.model'
        # The trainer is given modelpfx as its prefix and the processor is
        # pointed at '<prefix>.model', hence the doubled suffix.
        modelfn = f'{modelpfx}.model'
        if not os.path.exists(modelfn):
            spm.SentencePieceTrainer.train(
                sentence_iterator=self._chunks(1024),
                model_prefix=modelpfx,
                vocab_size=self.vocab_size,
            )
        self.model = spm.SentencePieceProcessor(model_file=modelfn)

    def tokenize(self, inputs):
        """Encode inputs with the loaded model and return the result."""
        return self.model.encode(inputs)

    def detokenize(self, ids):
        """Decode ids with the loaded model and return the result."""
        return self.model.decode(ids)
class Data:
    """Reads fixed-size chunks of samples from an audio file via librosa."""

    def __init__(self, src='radio.mp3', chunksize=80 * 6000, sr=16_000,
                 dtype=np.float32):
        """Store the read settings and look up the total length of src.

        src:       path of the audio file to read.
        chunksize: number of samples in one chunk.
        sr:        sample rate passed to librosa when loading.
        dtype:     sample dtype passed to librosa when loading.
        """
        self.src = src
        self.chunksize = chunksize
        self.sr = sr
        self.length = lb.get_duration(filename=self.src)
        self.dtype = dtype

    @property
    def duration(self):
        """Length of one default-sized chunk in seconds (chunksize / sr)."""
        return self.chunksize / self.sr

    def read_one(self, offset, chunksize=None):
        """Load chunksize samples starting offset seconds into the file.

        chunksize defaults to self.chunksize. Returns the sample array that
        librosa produced.
        """
        if chunksize is None:
            chunksize = self.chunksize
        duration = chunksize / self.sr
        print(f'reading {duration}s at {offset}s ...')
        data, sr = lb.load(self.src, sr=self.sr, offset=offset,
                           duration=duration, dtype=self.dtype)
        print(f'read {data.shape} samples at {sr}')
        return data

    def read_random(self, ct=1):
        """Return ct default-sized chunks read at random offsets, stacked.

        Offsets are drawn from [0, length - duration) so that a full chunk
        fits before the end of the file.
        """
        latest_start = self.length - self.duration
        return np.stack([
            self.read_one(np.random.random() * latest_start)
            for idx in range(ct)
        ])

    def read_chunks(self, ct=1, offset=0):
        """Return ct consecutive chunks starting at offset as a (ct, chunksize) array.

        The samples are read in one call and then reshaped; the reshape
        raises if fewer than ct * chunksize samples came back.
        """
        chunksize = self.chunksize
        data = self.read_one(offset, chunksize * ct)
        return data.reshape((ct, chunksize))
class S2T:
    """Wraps a pretrained Speech2Text model together with its processor."""

    def __init__(self, model="facebook/s2t-small-librispeech-asr", sr=16_000):
        """Load the generation model and the processor for the given model name.

        model: name passed to both from_pretrained() calls.
        sr:    sampling rate reported to the processor in tokenize().
        """
        self.sr = sr
        self.model = Speech2TextForConditionalGeneration.from_pretrained(model)
        self.processor = Speech2TextProcessor.from_pretrained(model)

    @property
    def vocab_size(self):
        """Vocabulary size taken from the loaded model's config."""
        return self.model.config.vocab_size

    def tokenize(self, inputs):
        """Run the processor over inputs and return (input_features, attention_mask)."""
        print('tokenizing ...')
        input_ids = self.processor(inputs, sampling_rate=self.sr,
                                   return_tensors='pt')
        return input_ids['input_features'], input_ids['attention_mask']

    def forward(self, feature_ids, attention_mask):
        """Generate output ids from the features and mask returned by tokenize()."""
        print('passing data thru model ...')
        return self.model.generate(inputs=feature_ids,
                                   attention_mask=attention_mask)

    def detokenize(self, generated_ids):
        """Decode generated ids with the processor's batch_decode()."""
        print('detokenizing output ...')
        return self.processor.batch_decode(generated_ids)
# Driver: build the audio reader, the speech-to-text wrapper and the custom
# detokenizer, then transcribe the first chunk of the audio file.
print('constructing structures...')
data = Data()
s2t = S2T()

# The detokenizer is trained/loaded here but not yet used in the pipeline
# below. Its vocabulary size is fixed at 1100; s2t.vocab_size was the
# alternative left commented out in the original.
detokenizer = CustomTokenizer('xkcd.lgo', vocab_size=1100)
detokenizer.load()

# Take the first (and only) chunk of a one-chunk read.
first_chunk = data.read_chunks(1)[0]
feature_ids, attention_mask = s2t.tokenize(first_chunk)
generated_ids = s2t.forward(
    feature_ids=feature_ids,
    attention_mask=attention_mask,
)
outputs = s2t.detokenize(generated_ids)
print(outputs)
More information about the cypherpunks
mailing list