If you have a PC with an NVIDIA GPU, I would appreciate it if you could test this script and report how long the training takes.
Just replace "fivetech_forums_20231222.sql" with any large text file you have available. Thanks!
train.py
Code: Select all | Expand
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
# Fine-tune the pre-trained GPT-2 language model on a local text file and
# save the resulting model and tokenizer to ./fine-tuned-model.

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token. Registering '[PAD]' adds a new id to
# the vocabulary, so the model's embedding matrix must be resized to match;
# otherwise any batch containing the pad id would index past the end of the
# embedding table.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Path of the additional training data (any large plain-text file works)
file_path = "fivetech_forums_20231222.sql"

# Create a PyTorch Dataset.
# TextDataset reads and tokenizes `file_path` itself and cuts the token stream
# into fixed-size blocks, so the file does not need to be loaded or tokenized
# separately beforehand.
# NOTE(review): TextDataset is flagged as deprecated in recent transformers
# releases -- confirm against the installed version before upgrading.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=file_path,
    block_size=128,  # Adjust the block size based on your dataset
)

# Create a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM objective; True would enable masked language modeling
)

# Configure training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-model",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust the number of epochs based on your dataset
    per_device_train_batch_size=4,
    save_steps=10_000,  # Write a checkpoint every 10,000 optimizer steps
    save_total_limit=2,  # Keep only the two most recent checkpoints
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer so both can be
# reloaded from the same directory
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")