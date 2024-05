Traceback:

Traceback (most recent call last): File "c:\Users\Philip Chen\Documents\AICrowd\amazon-kdd-cup-2024-starter-kit\models\QnAModel.py", line 83, in <module> trainer.train() File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train return inner_training_loop( File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop for step, inputs in enumerate(epoch_iterator): File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__ current_batch = next(dataloader_iter) File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__ data = self._next_data() File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data data = self._dataset_fetcher.fetch(index) # may raise StopIteration File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch return self.collate_fn(data) File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 92, in default_data_collator return torch_default_data_collator(features) File "C:\Users\Philip Chen\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 141, in torch_default_data_collator batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype) ValueError: too many dimensions 'str'

Offending line:

trainer.train()

Code:

import datasets as ds import pandas as pd import transformers as tf import evaluate import numpy as np myData=pd.read_json("labels.json", lines=True) inputField=myData["input_field"] taskType=myData["task_type"] outputField=myData["output_field"] isMCQ=myData["is_multiple_choice"] qArr=[] aArr=[] for i in range (len(myData)): if isMCQ[i]==False: if taskType[i] == "generation": qArr.append(inputField[i]) aArr.append(outputField[i]) trainDF = pd.DataFrame({ "label" : aArr, "text" : qArr, }) testDF = pd.DataFrame({ "label" : aArr, "text" : qArr, }) trainDS = ds.Dataset.from_dict(trainDF) testDS = ds.Dataset.from_dict(testDF) myDataSetDict=ds.DatasetDict({"train": trainDS, "test":testDS}) print(myDataSetDict) tokenizer = tf.AutoTokenizer.from_pretrained("google-bert/bert-base-cased") def tokenize_function(examples): return tokenizer(examples["text"], padding="max_length", truncation=True) tokenizedDatasets = myDataSetDict.map(tokenize_function, batched=True) print(type(tokenizedDatasets)) small_train_dataset = tokenizedDatasets["train"].shuffle(seed=42).select(range(10)) small_eval_dataset = tokenizedDatasets["test"].shuffle(seed=42).select(range(10)) model = tf.AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) metric = evaluate.load("accuracy") def compute_metrics(eval_pred): logits, labels = eval_pred predictions = np.argmax(logits, axis=-1) return metric.compute(predictions=predictions, references=labels) training_args = tf.TrainingArguments( output_dir='./', # output directory num_train_epochs=10, # total number of training epochs warmup_steps=500, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=10, ) print(f"TokenizedDataset Type: {type(tokenizedDatasets)}") trainer = tf.Trainer( model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset, compute_metrics=compute_metrics, ) trainer.train()

Did I miss something?