GPT-2 training on a local GPU for a custom database

Hi,
I have a database with the corresponding data. I am trying to train GPT-2 using this code:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
import json

# Define your custom dataset
class SpiderDataset(Dataset):
    def __init__(self, json_path, tokenizer):
        self.data = self.load_data(json_path)
        self.tokenizer = tokenizer

    def load_data(self, json_path):
        dataset = []
        with open(json_path, 'r') as f:
            json_data = json.load(f)            
            for example in json_data:
                print(example)
                question = example['question']
                query = example['query']
                dataset.append((question, query))
        print(dataset)
        return dataset


    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        question, query = self.data[index]

        encoded_input = self.tokenizer.encode_plus(
            question,
            query,
            add_special_tokens=True,
            padding="longest",
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoded_input["input_ids"].squeeze()
        attention_mask = encoded_input["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

# Define a custom collate function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]

    # Pad sequences to the maximum length in the batch, using the pad token id
    # of the module-level tokenizer for the inputs and 0 for the attention masks
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_masks = torch.nn.utils.rnn.pad_sequence(
        attention_masks, batch_first=True, padding_value=0
    )

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks
    }

# Define your training parameters
json_path = "train_gpt_newdata.json"
model_name = "gpt2"
batch_size = 8
learning_rate = 1e-5
num_epochs = 5

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# GPT-2 has no pad token by default; add one and resize the embeddings to match
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Prepare your dataset
dataset = SpiderDataset(json_path, tokenizer)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Fine-tuning settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Save the fine-tuned model and tokenizer (the tokenizer now contains the added pad token)
model.save_pretrained("gpt2-finetuned")
tokenizer.save_pretrained("gpt2-finetuned")

# Inference
question = "What is the average salary of employees?"
query = "SELECT AVG(salary) FROM employees"

model.eval()
encoded_input = tokenizer.encode_plus(question, query, add_special_tokens=True, return_tensors="pt")
input_ids = encoded_input["input_ids"].to(device)
attention_mask = encoded_input["attention_mask"].to(device)

output = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                        max_length=128, num_beams=5, pad_token_id=tokenizer.pad_token_id)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated SQL query:", decoded_output)

But can this kind of training produce output that lies outside the training set? Is there any code snippet I can use to train on my custom DB?

Hi,

As this is quite an involved topic, I would recommend taking a look at one of the many videos that cover GPT-2 training from scratch; give this one a look:
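In the meantime, here is a rough, untested sketch of what "from scratch" means in code: you build the model from a GPT2Config with randomly initialised weights instead of loading the pretrained checkpoint, and reuse the rest of your dataset and training loop unchanged. The config sizes below are just example values, not recommendations.

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

# Reuse the pretrained tokenizer, but not the pretrained weights
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Build a small GPT-2 with random weights (no pretrained knowledge)
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=512,
    n_embd=256,
    n_layer=4,
    n_head=4,
)
model = GPT2LMHeadModel(config)

# The SpiderDataset / DataLoader / optimizer loop from your post can be reused as-is;
# only the model construction changes.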

Hi, even if the model is trained from scratch, can I give it the relations between the tables, such as which column is a primary key and which is a foreign key?
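For example (just to make the question concrete; the table and column names below are made-up placeholders, not my real database), I mean flattening the schema into the text the model sees:

# Placeholder schema string illustrating the idea: serialise the table
# relations (primary keys, foreign keys) into the prompt alongside the question
schema = (
    "table employees (id [PK], name, salary, dept_id [FK -> departments.id]) | "
    "table departments (id [PK], name)"
)
question = "What is the average salary of employees?"
prompt = f"schema: {schema} question: {question} query:"
# This prompt would then be tokenised and used as the model input instead of the bare question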