Training on Qwen-7B gives: ValueError: Asking to pad but the tokenizer does not have a padding token

I am training a model called BigManGPT, which fine-tunes Qwen-7B on the SQuAD dataset. Here is the full script:

import torch as t
import transformers as tf
import datasets as ds
import peft as p

def mainProgram():
    print("---- BigManGPT Training PRogram ----")

    # QLoRA Quantization (4-bit)
    quantConfig = tf.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=t.bfloat16
    )

    modelID = "Qwen/Qwen-7B"  

    print("Preparing tokenizer...")
    tokenizer = tf.AutoTokenizer.from_pretrained(
        modelID, use_fast=False, padding_side="right", trust_remote_code=True
    )
    tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token for padding

    print("Loading model...")
    # Load the quantized model; device_map="auto" places it on available GPUs
    model = tf.AutoModelForCausalLM.from_pretrained(
        modelID,
        quantization_config=quantConfig,
        device_map="auto",
        trust_remote_code=True
    )

    print("Loading model for LORA...")
    model = p.prepare_model_for_kbit_training(model)

    peftCfg = p.LoraConfig(
        r=4,
        lora_alpha=8,
        target_modules=["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"],  
        lora_dropout=0.01,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = p.get_peft_model(model, peftCfg)
    model.print_trainable_parameters()

    print("Loading Dataset...")
    dataset = ds.load_dataset("squad", split="train")

    # For faster training, reduce dataset size
    dataset = dataset.select(range(3000))

    def tokenizeFunction(examples):
        return tokenizer(
            examples["question"],
            examples["context"],
            truncation=True,
            max_length=512,
            padding=False  # no padding here; the collator pads each batch dynamically
        )

    # Tokenization optimized for speed
    print("Tokenizing dataset (16 parallel processes, batched)...")
    tokenizedDS = dataset.map(
        tokenizeFunction,
        batched=True,
        batch_size=1000,
        num_proc=16,
        remove_columns=dataset.column_names,
        writer_batch_size=1000
    )

    # Use a dynamic padding collator
    dataCollator = tf.DataCollatorWithPadding(tokenizer)

    # Optimized GPU training arguments
    trainingArgs = tf.TrainingArguments(
        output_dir="./bigman_gpt_qwen_squad",
        per_device_train_batch_size=16,  
        gradient_accumulation_steps=1,
        num_train_epochs=2,  
        learning_rate=4e-4,
        bf16=True,  
        logging_steps=50,
        save_steps=500,
        optim="paged_adamw_8bit",
        gradient_checkpointing=False,  
        report_to="none"
    )

    trainer = tf.Trainer(
        model=model,
        args=trainingArgs,
        train_dataset=tokenizedDS,
        data_collator=dataCollator  
    )

    model.config.use_cache = False  # disable the KV cache during training

    print("Starting to train...")
    trainer.train()

    # Save trained model
    trainer.save_model("./BigManGPT_Qwen_SQuAD")
    tokenizer.save_pretrained("./BigManGPT_Qwen_SQuAD")

    print("Trained and saved successfully.")

if __name__ == "__main__":
    mainProgram()

However, I get this error:

Traceback (most recent call last):
  File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 111, in <module>
    mainProgram()
  File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 102, in mainProgram
    trainer.train()
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2241, in train
    return inner_training_loop(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2500, in _inner_training_loop
    batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 5180, in get_batch_samples
    batch_samples += [next(epoch_iterator)]
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\accelerate\data_loader.py", line 564, in __iter__
    current_batch = next(dataloader_iter)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 732, in __next__
    data = self._next_data()
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 788, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 271, in __call__
    batch = pad_without_fast_tokenizer_warning(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 3355, in pad
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 2779, in _get_padding_truncation_strategies
    raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
  0%|  

How can I fix this?

The error message tells you what to do?

Your code seems confused about whether you’re supposed to be padding (line 23) or not (line 61)…
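
That is, the same script sets

tokenizer.pad_token = tokenizer.eos_token

but then tokenizes with

padding=False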

Ok, I meant to comment out one of the lines. I tried commenting out line 23:

tokenizer.pad_token = tokenizer.eos_token

and got the same error. I then tried

def tokenizeFunction(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=512,
        padding=False  # no padding here; the collator pads each batch dynamically
    )

And still the same thing. Nothing changed either way.

Well, what happens if you try to do it the way the error message suggests…
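
That is, literally:

tokenizer.add_special_tokens({'pad_token': '[PAD]'})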

Gives me

Traceback (most recent call last):
  File "c:\Users\User\BigManGPT\QWENTrainer.py", line 111, in <module>
    mainProgram()
  File "c:\Users\User\BigManGPT\QWENTrainer.py", line 24, in mainProgram
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 1000, in add_special_tokens
    added_tokens = self.add_tokens(added_tokens, special_tokens=True)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 1050, in add_tokens
    return self._add_tokens(new_tokens, special_tokens=special_tokens)
  File "C:\Users\User\.cache\huggingface\modules\transformers_modules\Qwen\Qwen-7B\ef3c5c9c57b252f3149c1408daf4d649ec8b6c85\tokenization_qwen.py", line 165, in _add_tokens
    raise ValueError("Adding unknown special tokens is not supported")
ValueError: Adding unknown special tokens is not supported

And what happens if you aren't quite so literal, and logically combine the error message's hint with an actual token, like the one you tried to use before?
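
For example, assuming <|endoftext|> is a special token Qwen's tokenizer already defines, something like:

tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})  # assumes <|endoftext|> is already known to the tokenizer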

Either I get “Asking to pad”, or “Adding unknown special tokens is not supported”, so I am stuck either way.

Well, if the library is rejecting its own error advice, I'm not sure there's much I can suggest other than talking to the library's developers?