Hi,
I have a database with the corresponding data (question/SQL-query pairs) and I'm trying to fine-tune GPT-2 on it with the code below.
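The JSON file looks roughly like this (made-up rows, but the real file has the same 'question' and 'query' fields the loader reads):

[
  {"question": "How many singers do we have?", "query": "SELECT count(*) FROM singer"},
  {"question": "List the names of all concerts.", "query": "SELECT name FROM concert"}
]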
import json

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define the custom dataset
class SpiderDataset(Dataset):
    def __init__(self, json_path, tokenizer):
        self.data = self.load_data(json_path)
        self.tokenizer = tokenizer

    def load_data(self, json_path):
        dataset = []
        with open(json_path, 'r') as f:
            json_data = json.load(f)
        for example in json_data:
            question = example['question']
            query = example['query']
            dataset.append((question, query))
        return dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        question, query = self.data[index]
        # Encode question and query as one sequence; padding is handled
        # later in the collate function, so none is needed here.
        encoded_input = self.tokenizer.encode_plus(
            question,
            query,
            add_special_tokens=True,
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": encoded_input["attention_mask"].squeeze(0)
        }
# Define a custom collate function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    # Pad sequences to the maximum length in the batch. Pad input_ids with
    # the tokenizer's pad token id (not 0, which is a real GPT-2 token) and
    # the attention masks with 0.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = torch.nn.utils.rnn.pad_sequence(
        attention_masks, batch_first=True, padding_value=0)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks
    }
# Define the training parameters
json_path = "train_gpt_newdata.json"
model_name = "gpt2"
batch_size = 8
learning_rate = 1e-5
num_epochs = 5

# Load the GPT-2 tokenizer and model. GPT-2 has no pad token, so add one
# and resize the embeddings to match; otherwise the new pad token id is
# out of range for the embedding layer.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Prepare the dataset
dataset = SpiderDataset(json_path, tokenizer)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
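# Optional sanity check: pull one batch and confirm that collate_fn pads
# everything to the same length. `sample_batch` is just a throwaway name
# I use for this check.
sample_batch = next(iter(data_loader))
print(sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape)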
# Fine-tuning settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Use the inputs as labels, but mask the padding positions with
        # -100 so they don't contribute to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{num_epochs} - loss: {loss.item():.4f}")

# Save the fine-tuned model and the tokenizer (which now has the pad token)
model.save_pretrained("gpt2-finetuned")
tokenizer.save_pretrained("gpt2-finetuned")
# Inference: prompt with the question only and let the model generate the
# query (encoding the gold query into the prompt, as I did before, just
# makes the model continue past the answer it was already given).
model.eval()
question = "What is the average salary of employees?"
encoded_input = tokenizer(question, return_tensors="pt").to(device)
output = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=128,
    num_beams=5,
    pad_token_id=tokenizer.pad_token_id
)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated SQL query:", decoded_output)
But can this kind of training produce output that goes beyond the training set? And is there a code snippet I can use to train on my own custom DB?