I followed the fine-tuning tutorials linked in the comments at the top of the script (the LinkedIn advice page and the Hugging Face training docs), and this is my code:
# https://www.linkedin.com/advice/1/how-do-you-use-hugging-face-natural-language-processing-q4gve
# https://huggingface.co/docs/transformers/training
# Models
# microsoft/phi-1_5
# distilbert/distilbert-base-uncased
# google-t5/t5-base
# deberta-v3-base
# google-bert/bert-base-cased
import pandas as pd
import torch
# import transformers as tm #import BertLMHeadModel, AutoModelForMaskedLM, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPT2LMHeadModel,PhiForCausalLM, GPT2Tokenizer, AutoTokenizer, GPT2TokenizerFast
from transformers import BertLMHeadModel, AutoModelForMaskedLM, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPT2LMHeadModel, PhiForCausalLM, GPT2Tokenizer, AutoTokenizer, GPT2TokenizerFast
import os
import keras
from datasets import load_dataset
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
import evaluate
import numpy as np
from transformers import TFAutoModelForSequenceClassification
# import transformers as tm
model_name = "google-bert/bert-base-cased"
fileName = "data2.json"
dataset = load_dataset("json", data_files=fileName)
dataset = dataset["train"]
file = open(fileName)
myData = pd.read_json(fileName, lines=True)
# model = tm.BertLMHeadModel.from_pretrained(model_name)
# model = tm.AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model = tm.AutoModelForCausalLM.from_pretrained(model_name)
# model = tm.GPT2LMHeadModel.from_pretrained(model_name) # for gpt2
# model = PhiForCausalLM.from_pretrained(model_name) # for phi-1_5
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ======================================
# Fine tuning the Model
# ======================================
numberOfData = len(myData["input"])
tokenized_data = tokenizer(dataset["input"], return_tensors="np")
tokenized_data = dict(tokenized_data)
def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["input"])
dataset = dataset.map(tokenize_dataset)
# labels = np.array(dataset["label"])
tf_dataset = model.prepare_tf_dataset(dataset["input"], batch_size=16, shuffle=True, tokenizer=tokenizer)  # <<-- OFFENDING LINE
model.compile(optimizer=Adam(3e-5)) # No loss argument!
model.fit(tokenized_data)
I get this error:
TypeError: Dataset argument should be a datasets.Dataset!
from this line:
tf_dataset = model.prepare_tf_dataset(dataset["input"], batch_size=16, shuffle=True, tokenizer=tokenizer)
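From the error text, my guess is that prepare_tf_dataset wants the datasets.Dataset object itself (the one returned by dataset.map(tokenize_dataset)), not a single column like dataset["input"], which is just a plain Python list. A rough sketch of what I think the call should look like, in case that helps narrow down whatever else is wrong:

# My assumption: pass the mapped datasets.Dataset, not one column of it
tf_dataset = model.prepare_tf_dataset(
    dataset,              # the tokenized datasets.Dataset from dataset.map(...)
    batch_size=16,
    shuffle=True,
    tokenizer=tokenizer,  # so the batches get padded/collated for me
)
# ...and then presumably model.fit(tf_dataset) instead of model.fit(tokenized_data)

But I'm not sure that's the whole story, hence this question.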
This is my JSON file, data2.json (one more note about it after the listing):
{"input":"What is the tallest mountain in the world?"}
{"input":"What is the oldest civilization in history?"}
{"input":"What was Marie Antoinette's most famouse quote?"}
{"input":"Extract the keyphrase from this review.\n Review: The food in Lau Pa Sat is delicious, albeit expensive."}
{"input":"Where is Paul McCartney now?"}
{"input":"What are the benefits of orange?"}
{"input":"I have cockroach problems at home. How do I solve this?"}
{"input":"Who assassinated John F. Kennedy?"}
{"input":"Summarize Matthew 4:3-12."}
{"input":"Extract the keyphrase from this review.\n Review: The Singapore Chilli Crab in this seafood restaurant is amazing! Definetely recommend!"}
{"input":"List some good places to jog in Singapore."}
{"input":"Compare the A350 to the 777."}
{"input":"Tell me the stocks of Intel and AMD."}
{"input":"Any good places to hike in Japan?"}
{"input":"When was the Tower of London built?"}
{"input":"Extract the keyphrase from this review.\n Review: This Z790 motherboard is a total beauty, but extremely overpriced!"}
{"input":"Where is the largest archipelago located"}
{"input":"When was the Statue of Liberty built?"}
{"input":"Who wrote Les Miserables?"}
{"input":"Who sang 'American Pie'?"}
{"input":"What is the current population in Singapore?"}
{"input":"When is Sakura season in Japan?"}
{"input":"How old is the Westminster Abbey?"}
{"input":"Where is the best place to celebrate Oktoberfest in Germany?"}
{"input":"How to go from Hong Kong International Airport to the city?"}
{"input":"How long does it take to go from Tokyo to Fukuoka by train?"}
Any fixes for this? I've spent a whole day trying to solve this problem and I'm running out of patience. Suggestions are highly appreciated.