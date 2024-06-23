I have two codes: one is a training code, and the other one is an inference code.

The Training code, uses a 3000-sample dataset to train a new model called myNewLLMModel based on distilbert. This is the training code:

import json import torch from datasets import Dataset import transformers as tf from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score # Load your dataset dataFile='development3000.json' with open(dataFile) as f: data = json.load(f) # Split into train and test sets train_test_split = int(0.8 * len(data)) trainData = data[:train_test_split] testData = data[train_test_split:] # Convert output_field to integer labels and rename to labels labelMapping = {label: idx for idx, label in enumerate(set(item['output_field'] for item in data))} for item in trainData: item['labels'] = labelMapping[item['output_field']] for item in testData: item['labels'] = labelMapping[item['output_field']] # Convert to Hugging Face Dataset trainDataset = Dataset.from_list(trainData) testDataset = Dataset.from_list(testData) # Load tokenizer and model modelName = "./distilbert-base-uncased" config = tf.AutoConfig.from_pretrained(modelName+"/config.json", num_labels=len(labelMapping)) tokenizer = tf.AutoTokenizer.from_pretrained(modelName, config=config) model = tf.AutoModelForSequenceClassification.from_pretrained(modelName, config=config) # Tokenize the dataset def tokenize_function(examples): return tokenizer(examples['input_field'], padding="max_length", truncation=True) trainDataset = trainDataset.map(tokenize_function, batched=True) testDataset = testDataset.map(tokenize_function, batched=True) # Set format for PyTorch trainDataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) testDataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels']) # Training arguments training_args = tf.TrainingArguments( output_dir="./results", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=16, per_device_eval_batch_size=16, num_train_epochs=10, weight_decay=0.01, logging_dir="./logs", ) def computeMetrics(pred): labels = pred.label_ids preds = pred.predictions.argmax(-1) acc = accuracy_score(labels, preds) f1 = f1_score(labels, preds, average='weighted') precision = precision_score(labels, preds, average='weighted') recall = recall_score(labels, preds, average='weighted') return { 'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, } trainer = tf.Trainer( model=model, args=training_args, train_dataset=trainDataset, eval_dataset=testDataset, compute_metrics=computeMetrics, ) trainer.train() results = trainer.evaluate() model.save_pretrained("./myNewLLMModel") tokenizer.save_pretrained("./myNewLLMModel") print(f"Evaluation results:") print(f"Accuracy: {results['eval_accuracy']:.4f}") print(f"F1: {results['eval_f1']:.4f}") print(f"Precision: {results['eval_precision']:.4f}") print(f"Recall: {results['eval_recall']:.4f}") # Save the model and tokenizer model.save_pretrained("./eCommerceLLM") tokenizer.save_pretrained("./eCommerceLLM")

Inference code:

import json import torch import transformers as tf from rouge_score import rouge_scorer from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score import numpy as np # Load your dataset dataFile = 'development3000.json' with open(dataFile) as f: data = json.load(f) # Split into train and test sets train_test_split = int(0.8 * len(data)) testData = data[train_test_split:] # Convert output_field to integer labels and rename to labels labelMapping = {label: idx for idx, label in enumerate(set(item['output_field'] for item in data))} for item in testData: item['labels'] = labelMapping[item['output_field']] # Load trained model and tokenizer model = tf.AutoModelForSequenceClassification.from_pretrained("./eCommerceLLM") tokenizer = tf.AutoTokenizer.from_pretrained("./eCommerceLLM") def infer_model(model, tokenizer, input_texts): # Tokenize the inputs inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt") # Move tensors to the same device as the model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) inputs = {key: val.to(device) for key, val in inputs.items()} # Get model predictions model.eval() with torch.no_grad(): outputs = model(**inputs) # Get predicted labels predictions = torch.argmax(outputs.logits, dim=-1) return predictions.cpu().numpy() # Select at least 100 samples for inference inference_samples = testData[:100] # Prepare input texts for inference input_texts = [item['input_field'] for item in inference_samples] trueLabels = [item['labels'] for item in inference_samples] # Perform inference predictions = infer_model(model, tokenizer, input_texts) # Map predictions back to their labels inverse_labelMapping = {v: k for k, v in labelMapping.items()} predictedLabels = [inverse_labelMapping[pred] for pred in predictions] trueLabels_text = [inverse_labelMapping[label] for label in trueLabels] # Compute and print metrics accuracy = accuracy_score(trueLabels, predictions) f1 = f1_score(trueLabels, predictions, average='weighted') precision = precision_score(trueLabels, predictions, average='weighted') recall = recall_score(trueLabels, predictions, average='weighted') print(f"Test Accuracy: {accuracy:.4f}") print(f"Test F1 Score: {f1:.4f}") print(f"Test Precision: {precision:.4f}") print(f"Test Recall: {recall:.4f}") def computeRougeL(predictions, trueLabels): scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) scores = [scorer.score(t, p)['rougeL'].fmeasure for t, p in zip(trueLabels, predictions)] return sum(scores) / len(scores) rouge_l = computeRougeL(trueLabels_text, predictedLabels) print(f"ROUGE-L: {rouge_l:.4f}")

What is the problem?

So the evaluation score is good, as per the output from the training code:

Evaluation results: Accuracy: 1.0000 F1: 1.0000 Precision: 1.0000 Recall: 1.0000

However, when I ran the inference, I get a sorely low score, sometimes 0. When I saw the data that the inference app predicted, most of the answers are far cry from the actual reference results. Below is the inference score (it’s always from 0.00 to 0.30):

Test Accuracy: 0.1300 Test F1 Score: 0.1300 Test Precision: 0.1300 Test Recall: 0.1300 ROUGE-L: 0.1300

The dataset is called “development3000.json”. I used GPT to generate data for the dataset, and to save time, it generated a code for me to make my own dataset here (you can change the dataset filename with whatever you like for the codes above):

import json import random input_fields = [ "generate a text review of a product", "extract keywords of a product", "translate a customer review", "named entity recognition", "multiple choice" ] reviews = [ "Generate a review for the product 'XYZ Smartphone'.", "Generate a review for the product 'ABC Laptop'.", "Generate a review for the product '123 Headphones'." ] keywords = [ "Extract keywords from the product description: 'This is a high-performance smartphone with a sleek design.'", "Extract keywords from the product description: 'This laptop features a powerful processor and long battery life.'", "Extract keywords from the product description: 'These headphones provide excellent sound quality and noise cancellation.'" ] translations = [ "Translate the review 'This product is amazing!' to French.", "Translate the review 'I am very satisfied with my purchase.' to Spanish.", "Translate the review 'The quality could be better.' to German." ] ner_sentences = [ "Identify entities in the sentence: 'Barack Obama was born in Hawaii.'", "Identify entities in the sentence: 'Amazon is a major ecommerce company.'", "Identify entities in the sentence: 'The Eiffel Tower is in Paris.'" ] mcq_questions = [ { "question": "What is the capital of France?", "options": ["A) Berlin", "B) Madrid", "C) Paris", "D) Rome"], "answer": "C" }, { "question": "Which planet is known as the Red Planet?", "options": ["A) Earth", "B) Mars", "C) Jupiter", "D) Venus"], "answer": "B" }, { "question": "What is the largest mammal?", "options": ["A) Elephant", "B) Blue Whale", "C) Giraffe"], "answer": "B" } ] dataset = [] numRecords=3000 for _ in range(numRecords): input_type = random.choice(input_fields) if input_type == "generate a text review of a product": input_field = random.choice(reviews) output_field = "This is a sample review." is_multiple_choice = False elif input_type == "extract keywords of a product": input_field = random.choice(keywords) output_field = "sample, keywords" is_multiple_choice = False elif input_type == "translate a customer review": input_field = random.choice(translations) output_field = "Ceci est un exemple de traduction." is_multiple_choice = False elif input_type == "named entity recognition": input_field = random.choice(ner_sentences) output_field = "Barack Obama, Hawaii" is_multiple_choice = False elif input_type == "multiple choice": mcq = random.choice(mcq_questions) input_field = f"Question: {mcq['question']} Options: {', '.join(mcq['options'])}" output_field = mcq["answer"] is_multiple_choice = True record = { "input_field": input_field, "output_field": output_field, } dataset.append(record) # Save the dataset to a JSON file filename=f'development{numRecords}.json' with open(filename, 'w') as f: json.dump(dataset, f, indent=4)

Any suggestions on my parameters to make it more accurate with the scores up to 0.7? Thanks much.