[ot][spam][crazy] Quickly autotranscribing xkcd 4/1 correctly

Undiscussed Horrific Abuse, One Victim of Many gmkarl at gmail.com
Sat Apr 2 02:22:57 PDT 2022


here is example code for speech transcription from the huggingface
docs. note that this approach is meant for short chunks of speech,
not a long recording. the dataset wrapper obscures how the audio is
handed to the model, but underneath it is just an array of sample
values.
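
concretely, reading a wav with soundfile (the filename below is just
a placeholder) yields a plain numpy float array of samples, which is
all the model ultimately consumes:

import soundfile as sf

speech, sample_rate = sf.read("example.wav")  # placeholder path
# e.g. <class 'numpy.ndarray'> float64 (n_samples,) for mono audio
print(type(speech), speech.dtype, speech.shape)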

import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf

# pretrained speech-to-text model and its matching processor
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def map_to_array(batch):
    # read each audio file into a plain float array of samples
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


# tiny librispeech split used by the docs for demonstration
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

# the processor turns raw samples into log-mel input features plus an attention mask
inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])

# decode the generated token ids back into text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
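
to make the short-chunk limitation concrete, here is a minimal
untested sketch that skips the dataset entirely: it reads a local
file (the path "recording.wav" is a placeholder, assumed to already
be 16 kHz mono) and transcribes it in fixed 20-second chunks, which
is the crude way to handle a longer recording with this model.

import soundfile as sf
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

# placeholder path; assumed to be 16 kHz mono already
speech, sample_rate = sf.read("recording.wav")
assert sample_rate == 16_000, "model expects 16 kHz audio"

# naive fixed-size chunking; a real long-form pipeline would overlap chunks
# or split on silence so words are not cut in half
chunk_samples = 20 * sample_rate
pieces = []
for start in range(0, len(speech), chunk_samples):
    chunk = speech[start:start + chunk_samples]
    inputs = processor(chunk, sampling_rate=sample_rate, return_tensors="pt")
    generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])
    pieces.extend(processor.batch_decode(generated_ids, skip_special_tokens=True))

transcription = " ".join(pieces)
print(transcription)

cutting on fixed boundaries will mangle words at the seams; the point
is only that the model needs arrays of samples, not the dataset
machinery.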

