Traceback:
Traceback (most recent call last):
  File "c:\Users\Philip Chen\Documents\AICrowd\amazon-kdd-cup-2024-starter-kit\models\QnAModel.py", line 83, in <module>
    trainer.train()
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train
    return inner_training_loop(
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__
    current_batch = next(dataloader_iter)
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
    data = self._next_data()
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 92, in default_data_collator
    return torch_default_data_collator(features)
  File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 141, in torch_default_data_collator
    batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
ValueError: too many dimensions 'str'
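If I'm reading the traceback right, torch_default_data_collator ends up calling torch.tensor() on my "label" column, which holds strings. I believe this snippet reproduces the same failure in isolation (my assumption, not verified against these exact versions):

import torch

# PyTorch cannot build a tensor out of Python strings; as far as I can
# tell, this raises the same "ValueError: too many dimensions 'str'".
torch.tensor(["some answer text", "another answer"])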
Offending line:
trainer.train()
Code:
import datasets as ds
import pandas as pd
import transformers as tf
import evaluate
import numpy as np

myData = pd.read_json("labels.json", lines=True)
inputField = myData["input_field"]
taskType = myData["task_type"]
outputField = myData["output_field"]
isMCQ = myData["is_multiple_choice"]

# Collect the non-multiple-choice generation examples as question/answer pairs
qArr = []
aArr = []
for i in range(len(myData)):
    if isMCQ[i] == False:
        if taskType[i] == "generation":
            qArr.append(inputField[i])
            aArr.append(outputField[i])

trainDF = pd.DataFrame({
    "label": aArr,  # answers from output_field (strings)
    "text": qArr,
})
testDF = pd.DataFrame({
    "label": aArr,
    "text": qArr,
})

trainDS = ds.Dataset.from_dict(trainDF)
testDS = ds.Dataset.from_dict(testDF)
myDataSetDict = ds.DatasetDict({"train": trainDS, "test": testDS})
print(myDataSetDict)

tokenizer = tf.AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenizedDatasets = myDataSetDict.map(tokenize_function, batched=True)
print(type(tokenizedDatasets))

small_train_dataset = tokenizedDatasets["train"].shuffle(seed=42).select(range(10))
small_eval_dataset = tokenizedDatasets["test"].shuffle(seed=42).select(range(10))

model = tf.AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = tf.TrainingArguments(
    output_dir='./',       # output directory
    num_train_epochs=10,   # total number of training epochs
    warmup_steps=500,      # number of warmup steps for learning rate scheduler
    weight_decay=0.01,     # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)

print(f"TokenizedDataset Type: {type(tokenizedDatasets)}")

trainer = tf.Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
Did I miss something?
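My current guess: the "label" column holds the raw answer strings from output_field, but AutoModelForSequenceClassification together with the default data collator expects integer class ids in [0, num_labels). A sketch of the change I have in mind (untested; uniqueLabels and label2id are names I made up):

# Hypothetical sketch: encode each distinct answer string as an integer id
# so the default collator can build a label tensor.
uniqueLabels = sorted(set(aArr))
label2id = {lab: i for i, lab in enumerate(uniqueLabels)}
trainDF = pd.DataFrame({
    "label": [label2id[a] for a in aArr],
    "text": qArr,
})
# num_labels would presumably need to be len(uniqueLabels) rather than a hard-coded 5.

Though if every answer is unique free text, I wonder whether a sequence-classification head is the right fit for a "generation" task at all.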