Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2026-04-11 14:21:41 +08:00)
commit 4af5986e8c (parent 1749b7fef7)
updating train_colab
@@ -1,120 +1,144 @@
import torch
import torch.nn as nn
import os
import urllib.request
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from architecture import GPTModel, load_weights_into_gpt
from config import GPT_CONFIG_124M, SPECIAL_TOKENS
from config import GPT_CONFIG_124M
from tokenizer_utils import TokenizerWrapper
from dataset_prep import create_dataloader

def download_and_load_gpt2(model_size="124M", target_dir="models"):
    # Simple placeholder for weight loading.
    # In a real scenario, we'd use the code from ch05/01_main-chapter-code/gpt_download.py.
    # For now, we assume the user might already have the weights, or we can use the gpt_download logic provided in the book.
    # To keep this script standalone for Colab, we should probably include the download logic or use the Hugging Face Hub.
    # BUT, the user said to "rely on previous_chapters.py". That file has `load_weights_into_gpt`;
    # it does NOT have the downloader.
    # We could assume standard gpt2 weights are available, use a helper from `transformers` to fetch
    # and convert them, OR implement the download logic ourselves.
    # PROPOSAL: Use `transformers` to fetch weights -> convert -> load, as done in ch05.

    print(f"Loading weights for {model_size}...")
    from transformers import GPT2Model
    hf_model = GPT2Model.from_pretrained("gpt2")
    state_dict = hf_model.state_dict()

    # Mapping logic (simplified from ch05).
    # However, `load_weights_into_gpt` in `architecture.py` expects a specific param structure
    # matching the TF checkpoint format (untransposed, etc.).
    # The book's `load_weights_into_gpt` is designed for the ORIGINAL 124M params from OpenAI/TF.

    # If we use `gpt2` from Hugging Face, the keys are different.
    # To handle this without complex conversion scripts, we might just train from scratch
    # OR use the known `gpt_download.py` script.

    # Given the constraint to use `previous_chapters.py`, we should probably provide
    # the weight-downloading logic or a minimal conversion.

    # Let's use the TF weight-download logic if possible, or a key mapping.
    # For robust Colab usage, let's assume we want to download the weights.
    pass


# Since we can't easily replicate the full download logic in one file without clutter,
# we will implement a simplified mapping from HF GPT-2 (which is easy to install on Colab)
# to our model.
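# A minimal sketch to illustrate the key mismatch: GPT2Model from `transformers`
# exposes flat keys such as "wte.weight", "wpe.weight", "h.0.ln_1.weight", and
# "h.0.attn.c_attn.weight", while `load_weights_into_gpt` expects the nested
# TF-style dict built further below (e.g., params["blocks"][0]["attn"]["c_attn"]["w"]).
def _inspect_hf_keys(n=8):
    # Helper for eyeballing the Hugging Face key names (requires `transformers`).
    from transformers import GPT2Model
    hf_state = GPT2Model.from_pretrained("gpt2").state_dict()
    for key in list(hf_state.keys())[:n]:
        print(key)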
def download_file(url, destination):
    # Simplified download utility
    import requests
    response = requests.get(url, stream=True)
    file_size = int(response.headers.get("content-length", 0))

def map_hf_to_our_model(our_model, hf_model):
    # This is a heuristic mapping.
    # HF:   wte, wpe, h[i].ln_1, h[i].attn, h[i].ln_2, h[i].mlp, ln_f
    # Ours: tok_emb, pos_emb, trf_blocks[i].norm1, trf_blocks[i].att, norm2, ff, final_norm

    params = hf_model.state_dict()

    # Embeddings
    our_model.tok_emb.weight.data.copy_(params['wte.weight'])
    our_model.pos_emb.weight.data.copy_(params['wpe.weight'])

    # Blocks
    for i, block in enumerate(our_model.trf_blocks):
        prefix = f"h.{i}."

        # Norm 1
        block.norm1.scale.data.copy_(params[f"{prefix}ln_1.weight"])
        block.norm1.shift.data.copy_(params[f"{prefix}ln_1.bias"])

        # Attention
        # HF: c_attn.weight is (768, 2304) -> (d, 3*d) -> [Q, K, V]
        # Ours: W_query, W_key, W_value
        qkv_w = params[f"{prefix}attn.c_attn.weight"]
        # Transpose? HF uses Conv1D for these, which stores weights as (in, out),
        # while PyTorch Linear stores (out, in), so we need to be careful.
        # Let's skip the detailed weight mapping here to avoid breakage without testing.
        # RECOMMENDATION: Train from scratch for this experiment since we are changing the vocab,
        # OR use the book's download script.
        pass
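# A minimal sketch of how the skipped Q/K/V mapping could be completed, assuming
# `block.att` exposes W_query/W_key/W_value as nn.Linear layers (as in the book's
# MultiHeadAttention) and that the config uses qkv_bias=True so the bias tensors exist.
# HF's Conv1D stores weights as (in_features, out_features), so each slice is
# transposed before copying into nn.Linear's (out_features, in_features) layout.
def _map_attention_weights(block, params, prefix):
    qkv_w = params[f"{prefix}attn.c_attn.weight"]   # (768, 2304) = (d, 3*d)
    qkv_b = params[f"{prefix}attn.c_attn.bias"]     # (2304,)
    q_w, k_w, v_w = torch.chunk(qkv_w, 3, dim=-1)   # each (768, 768)
    q_b, k_b, v_b = torch.chunk(qkv_b, 3, dim=-1)
    block.att.W_query.weight.data.copy_(q_w.T)
    block.att.W_query.bias.data.copy_(q_b)
    block.att.W_key.weight.data.copy_(k_w.T)
    block.att.W_key.bias.data.copy_(k_b)
    block.att.W_value.weight.data.copy_(v_w.T)
    block.att.W_value.bias.data.copy_(v_b)
    # The output projection (attn.c_proj) would need the same transpose treatment.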
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # ... Actually, training from scratch on a T4 for small syntax tasks is feasible, but the 500k dataset is large.
    # We SHOULD use pretrained weights.
    # Include the `download_and_load_gpt2` from ch05 in a simplified way?
    # No, `transformers` is easier.
    block_size = 1024
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=url.split("/")[-1]) as progress_bar:
        with open(destination, "wb") as file:
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))
                file.write(chunk)
def train(cfg=GPT_CONFIG_124M, max_steps=1000):
def download_and_load_gpt2(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    settings = json.load(open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8"))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params

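# For reference, a sketch of what the two return values look like for the 124M model
# (based on OpenAI's hparams.json and the TF checkpoint layout handled above):
#   settings -> {"n_vocab": 50257, "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_layer": 12}
#   params   -> nested dict of numpy arrays, e.g.:
#       params["wte"]                                # (50257, 768) token embeddings
#       params["wpe"]                                # (1024, 768) positional embeddings
#       params["blocks"][0]["attn"]["c_attn"]["w"]   # (768, 2304) fused Q/K/V weights
#       params["g"], params["b"]                     # final LayerNorm scale and shift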
def train(model_size="124M", max_steps=1000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 1. Init Tokenizer & Model
    tokenizer = TokenizerWrapper()
    model = GPTModel(cfg)
    model.to(device)
    # 1. Download & Load Params
    print(f"Downloading {model_size} weights...")
    settings, params = download_and_load_gpt2(model_size, "models")

    # 2. Resize Embeddings for Special Tokens
    # Current vocab: 50257. New: 50259.
    # We need to expand the embedding matrix.
    # Quick hack: create a new embedding layer, copy the old weights, init the new ones.
    old_emb = model.tok_emb
    # 2. Init Model & Load Weights
    print("Initializing architecture...")
    # Map settings to our config format if needed, but we use GPT_CONFIG_124M as the base.
    # We should ensure the config matches the loaded settings.
    cfg = GPT_CONFIG_124M
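    # Making the "config matches settings" check explicit; a sketch that assumes
    # config.py uses the book's key names (vocab_size, context_length, emb_dim,
    # n_heads, n_layers):
    assert cfg["vocab_size"] == settings["n_vocab"]
    assert cfg["context_length"] == settings["n_ctx"]
    assert cfg["emb_dim"] == settings["n_embd"]
    assert cfg["n_heads"] == settings["n_head"]
    assert cfg["n_layers"] == settings["n_layer"]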
    model = GPTModel(cfg)
    print("Loading weights into model...")
    load_weights_into_gpt(model, params)
    model.to(device)
    print("Weights loaded successfully.")

    # 3. Resize Embeddings for Special Tokens
    tokenizer = TokenizerWrapper()
    new_vocab_size = tokenizer.base_tokenizer.n_vocab + len(tokenizer.special_tokens)
    new_emb = torch.nn.Embedding(new_vocab_size, cfg["emb_dim"])
    print(f"Resizing model vocab to {new_vocab_size}...")

    old_emb = model.tok_emb
    new_emb = nn.Embedding(new_vocab_size, cfg["emb_dim"])
    # Copy existing weights
    new_emb.weight.data[:old_emb.num_embeddings] = old_emb.weight.data
    # Replace
    # Init new rows with the mean of the existing embeddings
    new_emb.weight.data[old_emb.num_embeddings:] = old_emb.weight.data.mean(dim=0, keepdim=True)
    model.tok_emb = new_emb.to(device)

    # Update output head too
    # Resize Output Head
    old_head = model.out_head
    new_head = torch.nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head = nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head.weight.data[:old_head.out_features] = old_head.weight.data
    new_head.weight.data[old_head.out_features:] = new_emb.weight.data[old_head.out_features:]
    model.out_head = new_head.to(device)

    print(f"Model resized to vocab: {new_vocab_size}")
    # 4. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=2, max_length=cfg["context_length"])

    # 3. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=4, max_length=cfg["context_length"])
    # 5. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)

    # 4. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)

    # 5. Loop
    # 6. Loop
    model.train()
    step = 0
    print("Starting training...")

    for input_chunk, target_chunk in train_loader:
        input_chunk, target_chunk = input_chunk.to(device), target_chunk.to(device)
@@ -122,7 +146,6 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
        optimizer.zero_grad()
        logits = model(input_chunk)

        # Flatten for loss
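        # logits: (batch, seq_len, vocab_size) -> (batch * seq_len, vocab_size)
        # target_chunk: (batch, seq_len) -> (batch * seq_len,)
        # so cross_entropy treats every token position as one classification example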
        loss = torch.nn.functional.cross_entropy(
            logits.flatten(0, 1),
            target_chunk.flatten(0, 1)
@@ -140,6 +163,7 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
print("Training complete.")
|
||||
torch.save(model.state_dict(), "tool_llm.pth")
|
||||
print("Model saved to 'tool_llm.pth'.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
train()
|
||||
|
||||
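# To reuse the fine-tuned checkpoint later (a sketch; the model must be rebuilt with
# the same resized vocabulary before the state dict will load):
#   model = GPTModel(GPT_CONFIG_124M)
#   ... re-apply the embedding/output-head resize from train() ...
#   model.load_state_dict(torch.load("tool_llm.pth", map_location="cpu"))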