Hi,
I have a database with the corresponding data (question/SQL-query pairs) and I'm trying to fine-tune GPT-2 on it with the code below.
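The JSON file looks roughly like this (made-up rows, but the real file has the same 'question' and 'query' fields the loader reads):

[
  {"question": "How many singers do we have?", "query": "SELECT count(*) FROM singer"},
  {"question": "List the names of all concerts.", "query": "SELECT name FROM concert"}
]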
import json

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define the custom dataset
class SpiderDataset(Dataset):
    def __init__(self, json_path, tokenizer):
        self.data = self.load_data(json_path)
        self.tokenizer = tokenizer

    def load_data(self, json_path):
        dataset = []
        with open(json_path, 'r') as f:
            json_data = json.load(f)
        for example in json_data:
            question = example['question']
            query = example['query']
            dataset.append((question, query))
        return dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        question, query = self.data[index]
        # Encode question and query as one sequence; padding is handled
        # later in the collate function, so none is needed here.
        encoded_input = self.tokenizer.encode_plus(
            question,
            query,
            add_special_tokens=True,
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": encoded_input["attention_mask"].squeeze(0)
        }
# Define a custom collate function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    # Pad sequences to the maximum length in the batch. Pad input_ids with
    # the tokenizer's pad token id (not 0, which is a real GPT-2 token) and
    # the attention masks with 0.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = torch.nn.utils.rnn.pad_sequence(
        attention_masks, batch_first=True, padding_value=0)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks
    }
# Define the training parameters
json_path = "train_gpt_newdata.json"
model_name = "gpt2"
batch_size = 8
learning_rate = 1e-5
num_epochs = 5

# Load the GPT-2 tokenizer and model. GPT-2 has no pad token, so add one
# and resize the embeddings to match; otherwise the new pad token id is
# out of range for the embedding layer.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Prepare the dataset
dataset = SpiderDataset(json_path, tokenizer)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
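# Optional sanity check: pull one batch and confirm that collate_fn pads
# everything to the same length. `sample_batch` is just a throwaway name
# I use for this check.
sample_batch = next(iter(data_loader))
print(sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape)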
# Fine-tuning settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Use the inputs as labels, but mask the padding positions with
        # -100 so they don't contribute to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{num_epochs} - loss: {loss.item():.4f}")

# Save the fine-tuned model and the tokenizer (which now has the pad token)
model.save_pretrained("gpt2-finetuned")
tokenizer.save_pretrained("gpt2-finetuned")
# Inference: prompt with the question only and let the model generate the
# query (encoding the gold query into the prompt, as I did before, just
# makes the model continue past the answer it was already given).
model.eval()
question = "What is the average salary of employees?"
encoded_input = tokenizer(question, return_tensors="pt").to(device)
output = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=128,
    num_beams=5,
    pad_token_id=tokenizer.pad_token_id
)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated SQL query:", decoded_output)
But can this kind of training produce output that goes beyond the training set? And is there a code snippet I can use to train on my own custom DB?