ML: Inference score is very low (want to make it .7 or higher.)

I have two codes: one is a training code, and the other one is an inference code.

The Training code, uses a 3000-sample dataset to train a new model called myNewLLMModel based on distilbert. This is the training code:

import json
import torch
from datasets import Dataset
import transformers as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load your dataset
dataFile='development3000.json'
with open(dataFile) as f:
    data = json.load(f)

# Split into train and test sets
train_test_split = int(0.8 * len(data))
trainData = data[:train_test_split]
testData = data[train_test_split:]

# Convert output_field to integer labels and rename to labels
labelMapping = {label: idx for idx, label in enumerate(set(item['output_field'] for item in data))}
for item in trainData:
    item['labels'] = labelMapping[item['output_field']]
for item in testData:
    item['labels'] = labelMapping[item['output_field']]

# Convert to Hugging Face Dataset
trainDataset = Dataset.from_list(trainData)
testDataset = Dataset.from_list(testData)

# Load tokenizer and model
modelName = "./distilbert-base-uncased"
config = tf.AutoConfig.from_pretrained(modelName+"/config.json", num_labels=len(labelMapping))
tokenizer = tf.AutoTokenizer.from_pretrained(modelName, config=config)
model = tf.AutoModelForSequenceClassification.from_pretrained(modelName,  config=config)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['input_field'], padding="max_length", truncation=True)

trainDataset = trainDataset.map(tokenize_function, batched=True)
testDataset = testDataset.map(tokenize_function, batched=True)

# Set format for PyTorch
trainDataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
testDataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training arguments
training_args = tf.TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
)

def computeMetrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,

    }

trainer = tf.Trainer(
    model=model,
    args=training_args,
    train_dataset=trainDataset,
    eval_dataset=testDataset,
    compute_metrics=computeMetrics,
)

trainer.train()
results = trainer.evaluate()

model.save_pretrained("./myNewLLMModel")
tokenizer.save_pretrained("./myNewLLMModel")

print(f"Evaluation results:")
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1: {results['eval_f1']:.4f}")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")


# Save the model and tokenizer
model.save_pretrained("./eCommerceLLM")
tokenizer.save_pretrained("./eCommerceLLM")


Inference code:

import json
import torch
import transformers as tf
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Load your dataset
dataFile = 'development3000.json'
with open(dataFile) as f:
    data = json.load(f)

# Split into train and test sets
train_test_split = int(0.8 * len(data))
testData = data[train_test_split:]

# Convert output_field to integer labels and rename to labels
labelMapping = {label: idx for idx, label in enumerate(set(item['output_field'] for item in data))}
for item in testData:
    item['labels'] = labelMapping[item['output_field']]

# Load trained model and tokenizer
model = tf.AutoModelForSequenceClassification.from_pretrained("./eCommerceLLM")
tokenizer = tf.AutoTokenizer.from_pretrained("./eCommerceLLM")

def infer_model(model, tokenizer, input_texts):
    # Tokenize the inputs
    inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

    # Move tensors to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted labels
    predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions.cpu().numpy()

# Select at least 100 samples for inference
inference_samples = testData[:100]

# Prepare input texts for inference
input_texts = [item['input_field'] for item in inference_samples]
trueLabels = [item['labels'] for item in inference_samples]

# Perform inference
predictions = infer_model(model, tokenizer, input_texts)

# Map predictions back to their labels
inverse_labelMapping = {v: k for k, v in labelMapping.items()}
predictedLabels = [inverse_labelMapping[pred] for pred in predictions]
trueLabels_text = [inverse_labelMapping[label] for label in trueLabels]

# Compute and print metrics
accuracy = accuracy_score(trueLabels, predictions)
f1 = f1_score(trueLabels, predictions, average='weighted')
precision = precision_score(trueLabels, predictions, average='weighted')
recall = recall_score(trueLabels, predictions, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")


def computeRougeL(predictions, trueLabels):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(t, p)['rougeL'].fmeasure for t, p in zip(trueLabels, predictions)]
    return sum(scores) / len(scores)

rouge_l = computeRougeL(trueLabels_text, predictedLabels)

print(f"ROUGE-L: {rouge_l:.4f}")

What is the problem?
So the evaluation score is good, as per the output from the training code:

Evaluation results:
Accuracy: 1.0000
F1: 1.0000
Precision: 1.0000
Recall: 1.0000

However, when I ran the inference, I get a sorely low score, sometimes 0. When I saw the data that the inference app predicted, most of the answers are far cry from the actual reference results. Below is the inference score (it’s always from 0.00 to 0.30):

Test Accuracy: 0.1300
Test F1 Score: 0.1300
Test Precision: 0.1300
Test Recall: 0.1300
ROUGE-L: 0.1300

The dataset is called “development3000.json”. I used GPT to generate data for the dataset, and to save time, it generated a code for me to make my own dataset here (you can change the dataset filename with whatever you like for the codes above):

import json
import random

input_fields = [
    "generate a text review of a product",
    "extract keywords of a product",
    "translate a customer review",
    "named entity recognition",
    "multiple choice"
]

reviews = [
    "Generate a review for the product 'XYZ Smartphone'.",
    "Generate a review for the product 'ABC Laptop'.",
    "Generate a review for the product '123 Headphones'."
]

keywords = [
    "Extract keywords from the product description: 'This is a high-performance smartphone with a sleek design.'",
    "Extract keywords from the product description: 'This laptop features a powerful processor and long battery life.'",
    "Extract keywords from the product description: 'These headphones provide excellent sound quality and noise cancellation.'"
]

translations = [
    "Translate the review 'This product is amazing!' to French.",
    "Translate the review 'I am very satisfied with my purchase.' to Spanish.",
    "Translate the review 'The quality could be better.' to German."
]

ner_sentences = [
    "Identify entities in the sentence: 'Barack Obama was born in Hawaii.'",
    "Identify entities in the sentence: 'Amazon is a major ecommerce company.'",
    "Identify entities in the sentence: 'The Eiffel Tower is in Paris.'"
]

mcq_questions = [
    {
        "question": "What is the capital of France?",
        "options": ["A) Berlin", "B) Madrid", "C) Paris", "D) Rome"],
        "answer": "C"
    },
    {
        "question": "Which planet is known as the Red Planet?",
        "options": ["A) Earth", "B) Mars", "C) Jupiter", "D) Venus"],
        "answer": "B"
    },
    {
        "question": "What is the largest mammal?",
        "options": ["A) Elephant", "B) Blue Whale", "C) Giraffe"],
        "answer": "B"
    }
]

dataset = []

numRecords=3000
for _ in range(numRecords):
    input_type = random.choice(input_fields)
    if input_type == "generate a text review of a product":
        input_field = random.choice(reviews)
        output_field = "This is a sample review."
        is_multiple_choice = False
    elif input_type == "extract keywords of a product":
        input_field = random.choice(keywords)
        output_field = "sample, keywords"
        is_multiple_choice = False
    elif input_type == "translate a customer review":
        input_field = random.choice(translations)
        output_field = "Ceci est un exemple de traduction."
        is_multiple_choice = False
    elif input_type == "named entity recognition":
        input_field = random.choice(ner_sentences)
        output_field = "Barack Obama, Hawaii"
        is_multiple_choice = False
    elif input_type == "multiple choice":
        mcq = random.choice(mcq_questions)
        input_field = f"Question: {mcq['question']} Options: {', '.join(mcq['options'])}"
        output_field = mcq["answer"]
        is_multiple_choice = True
    
    record = {
        "input_field": input_field,
        "output_field": output_field,
    }
    
    dataset.append(record)

# Save the dataset to a JSON file
filename=f'development{numRecords}.json'
with open(filename, 'w') as f:
    json.dump(dataset, f, indent=4)

Any suggestions on my parameters to make it more accurate with the scores up to 0.7? Thanks much.

  • Define a custom dataset class (CustomDataset) to load your dataset, tokenize text data using DistilBertTokenizer, and prepare it for training.

Does this help increase the inference score?