I am training a model I call BigManGPT by fine-tuning Qwen-7B on the SQuAD dataset. Here is my training script:
import torch as t
import transformers as tf
import datasets as ds
import peft as p

def mainProgram():
    print("---- BigManGPT Training Program ----")

    # QLoRA quantization (4-bit)
    quantConfig = tf.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=t.bfloat16
    )

    modelID = "Qwen/Qwen-7B"

    print("Preparing tokenizer...")
    tokenizer = tf.AutoTokenizer.from_pretrained(
        modelID, use_fast=False, padding_side="right", trust_remote_code=True
    )
    tokenizer.pad_token = tokenizer.eos_token
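    # NOTE: I assumed the line above gives the tokenizer a pad token; if Qwen's
    # tokenizer has no eos_token, this may just assign None (see the error below)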
print("Loading model...")
# Load quantized model explicitly on GPU
model = tf.AutoModelForCausalLM.from_pretrained(
modelID,
quantization_config=quantConfig,
device_map="auto",
trust_remote_code=True
)
print("Loading model for LORA...")
model = p.prepare_model_for_kbit_training(model)
peftCfg = p.LoraConfig(
r=4,
lora_alpha=8,
target_modules=["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"],
lora_dropout=0.01,
bias="none",
task_type="CAUSAL_LM"
)
model = p.get_peft_model(model, peftCfg)
model.print_trainable_parameters()
print("Loading Dataset...")
dataset = ds.load_dataset("squad", split="train")
# For faster training, reduce dataset size
dataset = dataset.select(range(3000))
def tokenizeFunction(examples):
return tokenizer(
examples["question"],
examples["context"],
truncation=True,
max_length=512,
padding=False #
)
    # Tokenization optimized for speed
    print("Tokenizing dataset (16 parallel processes, batched)...")
    tokenizedDS = dataset.map(
        tokenizeFunction,
        batched=True,
        batch_size=1000,
        num_proc=16,
        remove_columns=dataset.column_names,
        writer_batch_size=1000
    )

    # Use a dynamic padding collator
    dataCollator = tf.DataCollatorWithPadding(tokenizer)
    # Optimized GPU training arguments
    trainingArgs = tf.TrainingArguments(
        output_dir="./bigman_gpt_qwen_squad",
        per_device_train_batch_size=16,
        gradient_accumulation_steps=1,
        num_train_epochs=2,
        learning_rate=4e-4,
        bf16=True,
        logging_steps=50,
        save_steps=500,
        optim="paged_adamw_8bit",
        gradient_checkpointing=False,
        report_to="none"
    )

    trainer = tf.Trainer(
        model=model,
        args=trainingArgs,
        train_dataset=tokenizedDS,
        data_collator=dataCollator
    )
    model.config.use_cache = False

    print("Starting to train...")
    trainer.train()

    # Save trained model
    trainer.save_model("./BigManGPT_Qwen_SQuAD")
    tokenizer.save_pretrained("./BigManGPT_Qwen_SQuAD")
    print("Trained and saved successfully.")

if __name__ == "__main__":
    mainProgram()
However, I get this error:
Traceback (most recent call last):
File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 111, in <module>
mainProgram()
File "c:\Users\User\BigManGPT\BigManGPT_Squad+QWEN7B_Trainer.py", line 102, in mainProgram
trainer.train()
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2241, in train
return inner_training_loop(
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 2500, in _inner_training_loop
batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\trainer.py", line 5180, in get_batch_samples
batch_samples += [next(epoch_iterator)]
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\accelerate\data_loader.py", line 564, in __iter__
current_batch = next(dataloader_iter)
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 732, in __next__
data = self._next_data()
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\dataloader.py", line 788, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
return self.collate_fn(data)
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 271, in __call__
batch = pad_without_fast_tokenizer_warning(
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
padded = tokenizer.pad(*pad_args, **pad_kwargs)
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 3355, in pad
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\transformers\tokenization_utils_base.py", line 2779, in _get_padding_truncation_strategies
raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
How can I fix this?
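I assumed tokenizer.pad_token = tokenizer.eos_token would take care of this, so my best guess is that Qwen's remote-code tokenizer does not define an eos_token, which would mean I actually assigned None as the pad token. Here is the check I plan to add right after loading the tokenizer, plus the explicit registration that the error message itself suggests (I have not verified that "<|endoftext|>" is the right pad token choice for Qwen):

# Sanity check: does this tokenizer define eos/pad tokens at all?
print("eos_token:", tokenizer.eos_token)
print("pad_token:", tokenizer.pad_token)

# Candidate fix, per the error message's own suggestion; I'm guessing
# "<|endoftext|>" since that token exists in Qwen's vocabulary
tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})

Is this the right approach, or is something else going on?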