here is example code for speech transcription from the huggingface docs. note that this approach is for processing short chunks of speech, not a long recording. the dataset obscures how the data is provided to the model, but it is going to be just some kind of array of numbers. import torch from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration from datasets import load_dataset import soundfile as sf model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") def map_to_array(batch): speech, _ = sf.read(batch["file"]) batch["speech"] = speech return batch ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.map(map_to_array) inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"]) transcription = processor.batch_decode(generated_ids)