[ot][spam][crazy] Quickly autotranscribing xkcd 4/1 correctly

Undiscussed Horrific Abuse, One Victim of Many gmkarl at gmail.com
Sat Apr 2 07:50:56 PDT 2022


I'm thinking a Perceiver decoder would work better here than a tokenizer.

Then it can produce sequences of characters that aren't in the example data.
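For comparison, a character-level decode is trivial to write and can emit
any string at all; here's a tiny sketch (the charset and helper are
hypothetical, not part of the script below, and not the Perceiver itself):

import string

charset = list(string.printable)  # hypothetical output alphabet
id_to_char = {i: c for i, c in enumerate(charset)}

def decode_chars(ids):
  # map per-step class ids straight to characters
  return ''.join(id_to_char[i] for i in ids)

# any word is reachable, even ones that never appear in the training file
print(decode_chars([charset.index(c) for c in 'unseenword']))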

Anyway, I trained the detokenizer on the file. Below is the current
content, but it doesn't use the detokenizer yet. The next step might be
to try finetuning the model to use the detokenizer.

This will run into issues because the detokenizer doesn't represent
most words in whatever data I use for finetuning.
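A quick way to gauge that (just a sketch: it assumes the SentencePiece
model trained by the script below ends up at xkcd.lgo.1100.model, and
the sample sentence is made up) is to count how many pieces of the
finetuning text fall back to <unk>:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file='xkcd.lgo.1100.model')
sample = 'an example sentence from the finetuning data'
ids = sp.encode(sample)
unk = sum(1 for i in ids if i == sp.unk_id())
print(f'{unk}/{len(ids)} pieces map to <unk>')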

It's nice to get this experience using a mainstream software process:
finetuning a transformer model.
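For the finetuning step itself, the rough shape would be to point the
decoder at the custom vocabulary and train on (audio, detokenizer-encoded
text) pairs. A sketch under assumptions (I haven't verified that
resize_token_embeddings handles this architecture's decoder correctly):

from transformers import Speech2TextForConditionalGeneration

model = Speech2TextForConditionalGeneration.from_pretrained(
    'facebook/s2t-small-librispeech-asr')
# shrink the decoder embeddings / output projection to the CustomTokenizer
# vocab size used further down (assumption: behaves as for text-only models)
model.resize_token_embeddings(1100)
# training labels would then come from CustomTokenizer.tokenize(...)
# instead of the original processor's text tokenizer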

!wget -c https://xkcd.com/2601/radio.mp3
!wget -c https://raw.githubusercontent.com/theinternetftw/xkcd2601/main/xkcd.lgo
!pip3 install transformers[speech,sentencepiece] datasets librosa soundfile

print('importing libraries ...')
import torch
from transformers import (Speech2TextProcessor,
    Speech2TextForConditionalGeneration, Wav2Vec2Tokenizer, Wav2Vec2ForCTC)
import librosa as lb
import numpy as np
import inspect

import os
import sentencepiece as spm

class CustomTokenizer:
  def __init__(self, datafilename, vocab_size):
    self.fn = datafilename
    self.vocab_size = vocab_size
  def load(self):
    # model files are named after the data file and vocab size
    modelpfx = f'{self.fn}.{self.vocab_size}'
    modelfn = f'{modelpfx}.model'
    if not os.path.exists(modelfn):
      def data(chunksize):
        with open(self.fn, 'rt') as datafile:
          while True:
            chunk = datafile.read(chunksize)
            if chunk:
              yield chunk
            if len(chunk) < chunksize:
              # end of file; the final partial chunk has been yielded
              break
      spm.SentencePieceTrainer.train(sentence_iterator=data(1024),
                                     model_prefix=modelpfx,
                                     vocab_size=self.vocab_size)
    self.model = spm.SentencePieceProcessor(model_file=modelfn)
  def tokenize(self, inputs):
    return self.model.encode(inputs)
  def detokenize(self, ids):
    return self.model.decode(ids)

class Data:
  def __init__(self, src = 'radio.mp3', chunksize = 80 * 6000, sr = 16_000,
               dtype = np.float32):
    self.src = src
    self.chunksize = chunksize
    self.sr = sr
    self.length = lb.get_duration(filename = self.src)
    self.dtype = dtype
  def read_one(self, offset, chunksize = None):
    if chunksize is None:
      chunksize = self.chunksize
    duration = chunksize / self.sr
    print(f'reading {duration}s at {offset}s ...')
    data, sr = lb.load(self.src, sr = self.sr, offset = offset,
                       duration = duration, dtype = self.dtype)
    print(f'read {data.shape} samples at {sr}')
    return data
  def read_random(self, ct=1):
    # choose offsets that leave room for a full chunk before the end of file
    max_offset = self.length - self.chunksize / self.sr
    return np.stack([self.read_one(np.random.random() * max_offset)
                     for idx in range(ct)])
  def read_chunks(self, ct=1, offset=0):
    chunksize = self.chunksize
    data = self.read_one(offset, chunksize * ct)
    return data.reshape((ct, chunksize))

class S2T:
  def __init__(self, model = "facebook/s2t-small-librispeech-asr", sr = 16_000):
    self.sr = sr
    self.model = Speech2TextForConditionalGeneration.from_pretrained(model)
    self.processor = Speech2TextProcessor.from_pretrained(model)
  @property
  def vocab_size(self):
    return self.model.config.vocab_size
  def tokenize(self, inputs):
    print('tokenizing ...')
    input_ids = self.processor(inputs, sampling_rate=self.sr,
                               return_tensors='pt')
    return input_ids['input_features'], input_ids['attention_mask']
  def forward(self, feature_ids, attention_mask):
    print('passing data thru model ...')
    return self.model.generate(inputs=feature_ids,
                               attention_mask=attention_mask)
  def detokenize(self, generated_ids):
    print('detokenizing output ...')
    return self.processor.batch_decode(generated_ids)

print('constructing structures...')
data = Data()
s2t = S2T()
detokenizer = CustomTokenizer('xkcd.lgo', vocab_size=1100)  # could be s2t.vocab_size

detokenizer.load()
feature_ids, attention_mask = s2t.tokenize(data.read_chunks(1)[0])
generated_ids = s2t.forward(feature_ids, attention_mask)
outputs = s2t.detokenize(generated_ids)
print(outputs)

