Error while training a ChatGPT-style model

Hello AIs,

I'm using the code below to train a model, but unfortunately num_samples always comes out as zero no matter what I try. I have checked the file location and its content, with no luck. Any pointers would be appreciated.

ValueError: num_samples should be a positive integer value, but got num_samples=0

from torch.utils.data import DataLoader, RandomSampler
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    GPT2TokenizerFast,
    TextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Load the GPT-2 tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Single source of truth for the training corpus. The text inspected below
# and the text handed to TextDataset must be the same file; previously a
# different file ('input_text.txt') was read here, so checking its content
# said nothing about what the dataset actually saw.
file_path = 'C:/ChatGPT/data/test.txt'

# Read and tokenize the corpus so its size can be reported if the dataset
# turns out to be empty. UTF-8 is stated explicitly so the result does not
# depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()
tokenized_text = tokenizer.encode(text)

# Number of tokens per training example.
block_size = 128

# TextDataset subtracts the special-token count from block_size itself, so
# the value is only computed here for the diagnostic below; subtracting it
# again before the call would shrink every block twice.
num_special_tokens = tokenizer.num_special_tokens_to_add(pair=False)

# overwrite_cache=True forces the features to be rebuilt from the file.
# Without it a cache written during an earlier run (for example when the
# file was still empty) is silently reused, and editing the file has no
# effect on the dataset.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=block_size,
    overwrite_cache=True,
)

# TextDataset keeps only complete blocks, so a corpus shorter than one block
# produces zero examples. Fail here with an actionable message instead of
# letting training fail later with "num_samples should be a positive integer
# value, but got num_samples=0".
if len(dataset) == 0:
    raise ValueError(
        f"No training examples were built from {file_path!r}: the file has "
        f"{len(tokenized_text)} tokens but one example needs "
        f"{block_size - num_special_tokens}. Provide more text or lower "
        f"block_size."
    )

# Collate examples for causal language modeling (mlm=False disables masking).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training hyperparameters. Checkpoints are written to ./model every
# `save_steps` optimizer steps and at most `save_total_limit` are kept.
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_total_limit=2,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    warmup_steps=500,
    logging_steps=500,
    save_steps=1000,
)

def model_init():
    """Return a freshly loaded pre-trained GPT-2 causal language model."""
    return AutoModelForCausalLM.from_pretrained('gpt2')


# Load the pre-trained weights once and hand that instance to the Trainer.
# Passing `model_init` to the Trainer as well would load the model a second
# time and leave this instance unused.
model = model_init()

# The Trainer builds its own DataLoader and sampler from `train_dataset`, so
# no hand-made RandomSampler/DataLoader is needed. If `dataset` is empty, the
# Trainer's sampler is what raises "num_samples should be a positive integer
# value, but got num_samples=0".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Fine-tune the model.
trainer.train()

# Save the fine-tuned model.
trainer.save_model('./model')

# Load the fine-tuned model into a text-generation pipeline. `num_samples`
# is not a generation argument (it would be forwarded to `generate` and
# rejected), so it is not passed here or in the call below.
model_path = './model'
chatbot = pipeline('text-generation', model=model_path, tokenizer='gpt2', num_return_sequences=1)

# Greeting shown before each user turn.
prompt_message = 'Hello, how can I help you today?'

# Read user input and print the model's continuation until the user quits.
while True:
    try:
        user_input = input(prompt_message + '\nYou: ')
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat without a traceback.
        print()
        break

    # Nothing to continue from; ask again rather than call the model.
    if not user_input.strip():
        continue

    # Greedy decoding (do_sample=False); `temperature` only applies when
    # sampling, so it is omitted. 'generated_text' includes the prompt.
    response = chatbot(user_input, max_length=1000, do_sample=False)[0]['generated_text']

    print('Chatbot: ' + response)