Hello AIs,
I’m using the below code to train a model , unfortunately num-sample always come as zero no matter what. Check the file location and content. No luck. Any pointers will be appreciated
ValueError: num_samples should be a positive integer value, but got num_samples=0
from transformers import GPT2TokenizerFast, TextDataset, DataCollatorForLanguageModeling,
TrainingArguments, Trainer, AutoModelForCausalLM, pipeline
from torch.utils.data import DataLoader, RandomSampler
Load the GPT2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(‘gpt2’)
Load the text data
with open(‘input_text.txt’, ‘r’) as f:
text = f.read()
Tokenize the text
tokenized_text = tokenizer.encode(text)
Define the block size for the TextDataset
block_size = 128
Calculate the number of special tokens to add
num_special_tokens = tokenizer.num_special_tokens_to_add(pair=False)
Adjust the block size to account for the special tokens
block_size = block_size - num_special_tokens
Create a TextDataset from the tokenized text and block size
file_path = ‘C:/ChatGPT/data/test.txt’
dataset = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)
Create a DataCollatorForLanguageModeling to collate the data
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
Define the training arguments
training_args = TrainingArguments(
output_dir=‘./model’,
num_train_epochs=3,
per_device_train_batch_size=16,
save_total_limit=2,
learning_rate=5e-5,
adam_epsilon=1e-8,
max_grad_norm=1.0,
warmup_steps=500,
logging_steps=500,
save_steps=1000,
)
Load the pre-trained GPT2 model
model_init = lambda: AutoModelForCausalLM.from_pretrained(‘gpt2’)
model = model_init()
Define the Trainer with a DataLoader and RandomSampler
train_sampler = RandomSampler(dataset, num_samples=100000, replacement=True)
dataloader = DataLoader(dataset, batch_size=training_args.per_device_train_batch_size, sampler=train_sampler)
trainer = Trainer(
model_init=model_init,
args=training_args,
train_dataset=dataloader.dataset, # Use dataset, not dataloader
data_collator=data_collator,
)
Train the model
trainer.train()
Save the trained model
trainer.save_model(‘./model’)
Load the trained ChatGPT model
model_path = ‘./model’
#chatbot = pipeline(‘text-generation’, model=model_path, tokenizer=‘gpt2’, num_return_sequences=1, num_samples=1)
chatbot = pipeline(‘text-generation’, model=model_path, tokenizer=‘gpt2’, num_return_sequences=1, num_samples=1)
Define a prompt message for the user
prompt_message = ‘Hello, how can I help you today?’
Prompt the user for input and generate a response
while True:
# Get user input
user_input = input(prompt_message + '\nYou: ')
# Generate response from the ChatGPT model
response = chatbot(user_input, max_length=1000, do_sample=False, temperature=0.7, num_samples=1)[0]['generated_text']
# Print the response
print('Chatbot: ' + response)