Training on Qwen-7B gives: ValueError: Asking to pad but the tokenizer does not have a padding token

I am training a model called BigManGPT, which fine-tunes Qwen-7B on the SQuAD dataset. Here is the full script:

import torch as t
import transformers as tf
import datasets as ds
import peft as p

def mainProgram():
    print("---- BigManGPT Training PRogram ----")

    # QLoRA Quantization (4-bit)
    quantConfig = tf.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=t.bfloat16
    )

    modelID = "Qwen/Qwen-7B"  

    print("Preparing tokenizer...")
    tokenizer = tf.AutoTokenizer.from_pretrained(
        modelID, use_fast=False, padding_side="right", trust_remote_code=True
    )
    tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token for padding

    print("Loading model...")
    # Load the quantized model; device_map="auto" places it on available GPUs
    model = tf.AutoModelForCausalLM.from_pretrained(
        modelID,
        quantization_config=quantConfig,
        device_map="auto",
        trust_remote_code=True
    )

    print("Loading model for LORA...")
    model = p.prepare_model_for_kbit_training(model)

    peftCfg = p.LoraConfig(
        r=4,
        lora_alpha=8,
        target_modules=["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"],  
        lora_dropout=0.01,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = p.get_peft_model(model, peftCfg)
    model.print_trainable_parameters()

    print("Loading Dataset...")
    dataset = ds.load_dataset("squad", split="train")

    # For faster training, reduce dataset size
    dataset = dataset.select(range(3000))

    def tokenizeFunction(examples):
        return tokenizer(
            examples["question"],
            examples["context"],
            truncation=True,
            max_length=512,
            padding=False  # no padding here; the collator pads each batch dynamically
        )

    # Tokenization optimized for speed
    print("Tokenizing dataset (16 parallel processes, batched)...")
    tokenizedDS = dataset.map(
        tokenizeFunction,
        batched=True,
        batch_size=1000,
        num_proc=16,
        remove_columns=dataset.column_names,
        writer_batch_size=1000
    )

    # Use a dynamic padding collator
    dataCollator = tf.DataCollatorWithPadding(tokenizer)

    # Optimized GPU training arguments
    trainingArgs = tf.TrainingArguments(
        output_dir="./bigman_gpt_qwen_squad",
        per_device_train_batch_size=16,  
        gradient_accumulation_steps=1,
        num_train_epochs=2,  
        learning_rate=4e-4,
        bf16=True,  
        logging_steps=50,
        save_steps=500,
        optim="paged_adamw_8bit",
        gradient_checkpointing=False,  
        report_to="none"
    )

    trainer = tf.Trainer(
        model=model,
        args=trainingArgs,
        train_dataset=tokenizedDS,
        data_collator=dataCollator  
    )

    model.config.use_cache = False  # disable the KV cache during training

    print("Starting to train...")
    trainer.train()

    # Save trained model
    trainer.save_model("./BigManGPT_Qwen_SQuAD")
    tokenizer.save_pretrained("./BigManGPT_Qwen_SQuAD")

    print("Trained and saved successfully.")

if __name__ == "__main__":
    mainProgram()

However, I get this error:

Traceback (most recent call last):
  File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 111, in <module>
    mainProgram()
  File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 102, in mainProgram
    trainer.train()
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2241, in train
    return inner_training_loop(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2500, in _inner_training_loop
    batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 5180, in get_batch_samples
    batch_samples += [next(epoch_iterator)]
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\accelerate\data_loader.py", line 564, in __iter__
    current_batch = next(dataloader_iter)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 732, in __next__
    data = self._next_data()
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 788, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 271, in __call__
    batch = pad_without_fast_tokenizer_warning(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 3355, in pad
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 2779, in _get_padding_truncation_strategies
    raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
  0%|  

How can I fix this?

The error message tells you what to do?

Your code seems confused about whether you’re supposed to be padding (line 23) or not (line 61)…
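
That is, the same script sets

tokenizer.pad_token = tokenizer.eos_token

but then tokenizes with

padding=False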

Ok, I meant to comment out one of the lines. I tried commenting out line 23:

tokenizer.pad_token = tokenizer.eos_token

and got the same error. I then tried

def tokenizeFunction(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=512,
        padding=False  # no padding here; the collator pads each batch dynamically
    )

And still the same thing. Nothing changed either way.

Well, what happens if you try to do it the way the error message suggests…
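
That is, literally:

tokenizer.add_special_tokens({'pad_token': '[PAD]'})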

Gives me

Traceback (most recent call last):
  File "c:\Users\User\BigManGPT\QWENTrainer.py", line 111, in <module>
    mainProgram()
  File "c:\Users\User\BigManGPT\QWENTrainer.py", line 24, in mainProgram
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 1000, in add_special_tokens
    added_tokens = self.add_tokens(added_tokens, special_tokens=True)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 1050, in add_tokens
    return self._add_tokens(new_tokens, special_tokens=special_tokens)
  File "C:\Users\User\.cache\huggingface\modules\transformers_modules\Qwen\Qwen-7B\ef3c5c9c57b252f3149c1408daf4d649ec8b6c85\tokenization_qwen.py", line 165, in _add_tokens
    raise ValueError("Adding unknown special tokens is not supported")
ValueError: Adding unknown special tokens is not supported

And what happens if you aren't quite so literal, and logically combine the error message's hint with an actual token, like the one you tried to use before?
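
For example, assuming <|endoftext|> is a special token Qwen's tokenizer already defines, something like:

tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})  # assumes <|endoftext|> is already known to the tokenizer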

Either I get “Asking to pad”, or “Adding unknown special tokens is not supported”, so I am stuck either way.

Well, if the library is rejecting its own error advice, I'm not sure there's much I can suggest other than talking to the library's developers?