practice training models

this time i thought a little about separating the inputs from the outputs, as well as how to do things in general when it's emotionally a little hard

this doesn't seem to be supported out of the box, but you could concatenate the inputs and outputs and mask the input tokens as -100 when passing to cross entropy (there's a small sketch of that after the code)

i've shut down the lax server :D

import transformers, torch

pipe = transformers.pipeline('text-generation', 'meta-llama/Meta-Llama-3.1-8B', device_map='auto')
model = pipe.model
tokenizer = pipe.tokenizer

# train on the lines of this very script
lines = open(__file__).read().split('\n')

# if we were separating input from output we might want to paste the token ids together after tokenizing,
# and left-pad the input while right-padding the output?
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(lines, return_tensors='pt', padding=True)#, padding_side='left')
# outputs = tokenizer(lines[1:] + lines[:1], return_tensors='pt', padding=True, padding_side='right')
# inputs['labels'] = torch.cat( ... concatenate inputs and outputs, but set the input positions to -100

# plain causal-lm labels: copy the input ids and ignore padding positions in the loss
inputs['labels'] = inputs['input_ids'].detach().clone()
inputs['labels'][inputs['attention_mask'] == 0] = -100
print(inputs['input_ids'].min(), inputs['input_ids'].max())

# probably redundant with the device_map='auto' hooks, but harmless
inputs = inputs.to(model.device)

model.train()
loss = 10.0
last_loss = 0.0
lr = 1e-5
optim = torch.optim.Adam(model.parameters(), lr=lr)
while loss > 0.1:
    optim.zero_grad()
    # this passes the whole epoch as one batch, which may generalize less well than minibatches
    # and would need different code for larger data, but it's faster here
    print('forward')
    last_loss = loss
    output = model(**inputs)
    loss = output.loss.item()
    print(loss)
    # crude schedule: halve the learning rate whenever the loss goes up
    if loss > last_loss:
        lr /= 2
        optim = torch.optim.Adam(model.parameters(), lr=lr)
    print('backward')
    output.loss.backward()
    optim.step()

# don't forget to save the model, e.g. model.save_pretrained('finetuned') and tokenizer.save_pretrained('finetuned')
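
here's a minimal sketch of that input/output masking idea, separate from the script above. it reuses the model and tokenizer from the script; the make_example helper and the prompt/completion strings are made up just for illustration. the idea: tokenize the prompt and the completion separately, concatenate the token ids, and set the label positions that belong to the prompt to -100 so cross entropy ignores them (the model shifts the labels internally, so labels stay aligned with input_ids).

import torch

def make_example(tokenizer, prompt, completion):
    # you might also want to prepend the BOS token to the prompt here
    prompt_ids = tokenizer(prompt, add_special_tokens=False)['input_ids']
    completion_ids = tokenizer(completion, add_special_tokens=False)['input_ids'] + [tokenizer.eos_token_id]
    input_ids = torch.tensor([prompt_ids + completion_ids])
    # -100 over the prompt so only the completion tokens contribute to the loss
    labels = torch.tensor([[-100] * len(prompt_ids) + completion_ids])
    attention_mask = torch.ones_like(input_ids)
    return {'input_ids': input_ids, 'labels': labels, 'attention_mask': attention_mask}

example = make_example(tokenizer, 'print hello in python\n', "print('hello')\n")
example = {k: v.to(model.device) for k, v in example.items()}
loss = model(**example).loss

to batch several pairs you'd pad input_ids with the pad token, pad labels with -100, and zero the attention mask over the padding; i believe transformers.DataCollatorForSeq2Seq does that label padding for you if the examples already carry a 'labels' field.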