updating train_colab

This commit is contained in:
Shahar Dickstein 2026-02-15 13:22:15 +02:00
parent 1749b7fef7
commit 4af5986e8c

View File

@@ -1,120 +1,144 @@
import torch
import torch.nn as nn
import os
import urllib.request
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from architecture import GPTModel, load_weights_into_gpt
from config import GPT_CONFIG_124M, SPECIAL_TOKENS
from config import GPT_CONFIG_124M
from tokenizer_utils import TokenizerWrapper
from dataset_prep import create_dataloader
def download_and_load_gpt2(model_size="124M", target_dir="models"):
    # NOTE(review): stale pre-diff version of this function, superseded by the
    # `download_and_load_gpt2(model_size, models_dir)` definition further down
    # in this file. It loads a HuggingFace GPT2Model and then discards the
    # result (`state_dict` is unused; the function returns None).
    #
    # Summary of the author's original design notes, kept for context:
    # `architecture.load_weights_into_gpt` expects the OpenAI TF-checkpoint
    # parameter layout, not HuggingFace's key names / Conv1D (in, out) weight
    # layout, so mapping HF weights would need a conversion step. The
    # alternatives considered were (a) converting HF weights, (b) training
    # from scratch, or (c) downloading the original TF checkpoint as in the
    # book's ch05 `gpt_download.py` — option (c) was chosen (see below).
    print(f"Loading weights for {model_size}...")
    from transformers import GPT2Model
    hf_model = GPT2Model.from_pretrained("gpt2")
    state_dict = hf_model.state_dict()
    # NOTE(review): dead code from here on — safe to delete this whole function
    # once the new TF-checkpoint loader below is confirmed working.
    pass
def download_file(url, destination):
    """Download `url` to `destination`, streaming with a tqdm progress bar.

    Skips the download when a local file already exists whose size matches
    the server-reported Content-Length (0 when the header is absent).

    FIX(review): in the pasted diff this function's body was split in two by
    the interleaved `map_hf_to_our_model` definition; the two halves are
    reassembled here into one coherent function.
    """
    # Local import keeps the module importable when `requests` is absent.
    import requests

    response = requests.get(url, stream=True)
    file_size = int(response.headers.get("content-length", 0))

    # Skip re-downloading an up-to-date local copy.
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    block_size = 1024  # stream in 1 KiB chunks
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=url.split("/")[-1]) as progress_bar:
        with open(destination, "wb") as file:
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))
                file.write(chunk)


def map_hf_to_our_model(our_model, hf_model):
    """Partially copy HuggingFace GPT-2 weights into our GPTModel (heuristic).

    Copies only the token/position embeddings and each block's first
    LayerNorm. The attention/MLP weights are deliberately NOT mapped: HF
    stores them in Conv1D modules with (in, out) layout (e.g. c_attn.weight
    is (768, 2304) = (d, 3*d) packing [Q, K, V]), whereas torch.nn.Linear
    expects (out, in), so a correct mapping needs transposing and splitting.
    Left unimplemented — use the TF-checkpoint loader below instead.
    """
    # HF keys:  wte, wpe, h[i].ln_1, h[i].attn, h[i].ln_2, h[i].mlp, ln_f
    # Our keys: tok_emb, pos_emb, trf_blocks[i].norm1/.att/.norm2/.ff, final_norm
    params = hf_model.state_dict()

    # Embeddings
    our_model.tok_emb.weight.data.copy_(params['wte.weight'])
    our_model.pos_emb.weight.data.copy_(params['wpe.weight'])

    # Blocks
    for i, block in enumerate(our_model.trf_blocks):
        prefix = f"h.{i}."
        # LayerNorm 1
        block.norm1.scale.data.copy_(params[f"{prefix}ln_1.weight"])
        block.norm1.shift.data.copy_(params[f"{prefix}ln_1.bias"])
        # Attention weights fetched but intentionally not loaded (transpose /
        # QKV-split needed — see docstring).
        qkv_w = params[f"{prefix}attn.c_attn.weight"]
        pass
def download_and_load_gpt2(model_size, models_dir):
    """Download the OpenAI GPT-2 TF checkpoint files and load them.

    Parameters
    ----------
    model_size : one of "124M", "355M", "774M", "1558M".
    models_dir : directory under which "<models_dir>/<model_size>/" is created.

    Returns
    -------
    (settings, params) : the hparams.json dict and the nested dict of
    checkpoint weight arrays from `load_gpt2_params_from_tf_ckpt`.

    Raises
    ------
    ValueError : if `model_size` is not one of the known sizes.

    FIX(review): also removes a stray, body-less `def train(cfg=...)` header
    left above this function by the diff rendering.
    """
    # Validate model size before touching the network or filesystem.
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # FIX: os.path.join uses "\" on Windows, which corrupts URLs —
        # URLs always use "/", so build them with an f-string instead.
        file_url = f"{base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # FIX: close hparams.json deterministically instead of leaking the handle.
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as f:
        settings = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    return settings, params
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable from a TF GPT-2 checkpoint into nested dicts.

    Returns a dict with top-level entries (e.g. embeddings, final norm) plus
    a "blocks" list holding one dict per transformer layer
    (settings["n_layer"] of them), mirroring the checkpoint's variable paths.
    """
    params = {"blocks": [dict() for _ in range(settings["n_layer"])]}

    for var_name, _ in tf.train.list_variables(ckpt_path):
        # Load the tensor and drop singleton dimensions.
        values = np.squeeze(tf.train.load_variable(ckpt_path, var_name))

        # Drop the leading "model/" component of the variable path.
        path = var_name.split("/")[1:]

        # Variables named "h<idx>/..." belong to transformer block <idx>;
        # everything else lives at the top level of `params`.
        if path[0].startswith("h"):
            dest = params["blocks"][int(path[0][1:])]
        else:
            dest = params

        # Walk (creating as needed) the intermediate nesting, then store the
        # array under the final path component.
        for part in path[1:-1]:
            dest = dest.setdefault(part, {})
        dest[path[-1]] = values

    return params
def train(model_size="124M", max_steps=1000):
    # NOTE(review): this block is a corrupted diff render — it interleaves the
    # OLD (pre-commit) and NEW (post-commit) versions of train() without +/-
    # markers, and two raw hunk headers ("@ -…") below mark places where
    # unchanged context lines were elided (the backward/step part of the
    # training loop is missing). Stale lines are flagged individually;
    # reconcile against the real repository file before running.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    # --- stale pre-diff lines (superseded by section 2 below); `cfg` is not
    # defined yet at this point, so as written this would raise NameError ---
    # 1. Init Tokenizer & Model
    tokenizer = TokenizerWrapper()
    model = GPTModel(cfg)
    model.to(device)
    # --- current version resumes ---
    # 1. Download & Load Params
    print(f"Downloading {model_size} weights...")
    settings, params = download_and_load_gpt2(model_size, "models")
    # --- stale pre-diff lines (old resize-before-load ordering) ---
    # 2. Resize Embeddings for Special Tokens
    # Current vocab: 50257. New: 50259.
    # We need to expand the embedding matrix.
    # Quick hack: create new embedding layer, copy old weights, init new ones.
    old_emb = model.tok_emb
    # --- current version resumes ---
    # 2. Init Model & Load Weights
    print("Initializing architecture...")
    # Map settings to our config format if needed, but we used GPT_CONFIG_124M as base.
    # We should ensure config matches loaded settings.
    cfg = GPT_CONFIG_124M
    model = GPTModel(cfg)
    print("Loading weights into model...")
    load_weights_into_gpt(model, params)
    model.to(device)
    print("Weights loaded successfully.")
    # 3. Resize Embeddings for Special Tokens
    tokenizer = TokenizerWrapper()
    new_vocab_size = tokenizer.base_tokenizer.n_vocab + len(tokenizer.special_tokens)
    # stale pre-diff statement (torch.nn alias), rebound a few lines below:
    new_emb = torch.nn.Embedding(new_vocab_size, cfg["emb_dim"])
    print(f"Resizing model vocab to {new_vocab_size}...")
    old_emb = model.tok_emb
    # current version of the same statement (nn alias):
    new_emb = nn.Embedding(new_vocab_size, cfg["emb_dim"])
    # Copy existing
    new_emb.weight.data[:old_emb.num_embeddings] = old_emb.weight.data
    # Replace
    # Init new (mean)
    new_emb.weight.data[old_emb.num_embeddings:] = old_emb.weight.data.mean(dim=0, keepdim=True)
    model.tok_emb = new_emb.to(device)
    # Update output head too
    # Resize Output Head
    old_head = model.out_head
    # stale pre-diff statement, immediately overwritten by the next line:
    new_head = torch.nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head = nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head.weight.data[:old_head.out_features] = old_head.weight.data
    # New head rows reuse the mean-initialized new embedding rows so input
    # and output slots for the special tokens start consistent.
    new_head.weight.data[old_head.out_features:] = new_emb.weight.data[old_head.out_features:]
    model.out_head = new_head.to(device)
    print(f"Model resized to vocab: {new_vocab_size}")
    # stale pre-diff loader (batch_size=2), overwritten just below:
    # 4. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=2, max_length=cfg["context_length"])
    # 3. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=4, max_length=cfg["context_length"])
    # stale pre-diff optimizer (lr=1e-4, wd=0.01), overwritten just below:
    # 5. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)
    # 4. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
    # 5. Loop
    # 6. Loop
    model.train()
    step = 0
    print("Starting training...")
    for input_chunk, target_chunk in train_loader:
        input_chunk, target_chunk = input_chunk.to(device), target_chunk.to(device)
        # NOTE(review): raw diff hunk header — context lines elided here.
        @ -122,7 +146,6 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
        optimizer.zero_grad()
        logits = model(input_chunk)
        # Flatten for loss
        loss = torch.nn.functional.cross_entropy(
            logits.flatten(0, 1),
            target_chunk.flatten(0, 1)
        # NOTE(review): second raw hunk header — loss.backward()/optimizer.step()
        # and the max_steps stopping logic were elided between these hunks.
        @ -140,6 +163,7 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
    print("Training complete.")
    torch.save(model.state_dict(), "tool_llm.pth")
    print("Model saved to 'tool_llm.pth'.")


if __name__ == "__main__":
    train()