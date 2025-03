I am training a model called BigManGPT, which uses Qwen-7B to train Squad dataset.

import torch as t import transformers as tf import datasets as ds import peft as p def mainProgram(): print("---- BigManGPT Training PRogram ----") # QLoRA Quantization (4-bit) quantConfig = tf.BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=t.bfloat16 ) modelID = "Qwen/Qwen-7B" print("Preparing tokenizer...") tokenizer = tf.AutoTokenizer.from_pretrained( modelID, use_fast=False, padding_side="right", trust_remote_code=True ) tokenizer.pad_token = tokenizer.eos_token print("Loading model...") # Load quantized model explicitly on GPU model = tf.AutoModelForCausalLM.from_pretrained( modelID, quantization_config=quantConfig, device_map="auto", trust_remote_code=True ) print("Loading model for LORA...") model = p.prepare_model_for_kbit_training(model) peftCfg = p.LoraConfig( r=4, lora_alpha=8, target_modules=["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], lora_dropout=0.01, bias="none", task_type="CAUSAL_LM" ) model = p.get_peft_model(model, peftCfg) model.print_trainable_parameters() print("Loading Dataset...") dataset = ds.load_dataset("squad", split="train") # For faster training, reduce dataset size dataset = dataset.select(range(3000)) def tokenizeFunction(examples): return tokenizer( examples["question"], examples["context"], truncation=True, max_length=512, padding=False # ) # Tokenization optimized for speed print("Tokenizing dataset (16 parallel processes, batched)...") tokenizedDS = dataset.map( tokenizeFunction, batched=True, batch_size=1000, num_proc=16, remove_columns=dataset.column_names, writer_batch_size=1000 ) # se dynamic padding collator dataCollator = tf.DataCollatorWithPadding(tokenizer) # Optimized GPU training arguments trainingArgs = tf.TrainingArguments( output_dir="./bigman_gpt_qwen_squad", per_device_train_batch_size=16, gradient_accumulation_steps=1, num_train_epochs=2, learning_rate=4e-4, bf16=True, logging_steps=50, save_steps=500, optim="paged_adamw_8bit", gradient_checkpointing=False, report_to="none" ) trainer = tf.Trainer( model=model, args=trainingArgs, train_dataset=tokenizedDS, data_collator=dataCollator ) model.config.use_cache = False print("Starting to train...") trainer.train() # Save trained model trainer.save_model("./BigManGPT_Qwen_SQuAD") tokenizer.save_pretrained("./BigManGPT_Qwen_SQuAD") print("Trained and saved successfully.") if __name__ == "__main__": mainProgram()

However, I get this error:

Traceback (most recent call last): File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 111, in <module> mainProgram() File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 102, in mainProgram trainer.train() File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2241, in train return inner_training_loop( File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2500, in _inner_training_loop batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches) File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 5180, in get_batch_samples batch_samples += [next(epoch_iterator)] File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\accelerate\data_loader.py", line 564, in __iter__ current_batch = next(dataloader_iter) File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 732, in __next__ data = self._next_data() File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 788, in _next_data data = self._dataset_fetcher.fetch(index) # may raise StopIteration File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch return self.collate_fn(data) File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 271, in __call__ batch = pad_without_fast_tokenizer_warning( File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning padded = tokenizer.pad(*pad_args, **pad_kwargs) File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 3355, in pad padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 2779, in _get_padding_truncation_strategies raise ValueError( ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`. 0%|

How can I fix this?