diff --git a/tool_calling_experiment/train_colab.py b/tool_calling_experiment/train_colab.py index cfc16d5..2d38897 100644 --- a/tool_calling_experiment/train_colab.py +++ b/tool_calling_experiment/train_colab.py @@ -1,120 +1,144 @@ import torch +import torch.nn as nn import os import urllib.request +import json +import numpy as np +import tensorflow as tf +from tqdm import tqdm from architecture import GPTModel, load_weights_into_gpt -from config import GPT_CONFIG_124M, SPECIAL_TOKENS +from config import GPT_CONFIG_124M from tokenizer_utils import TokenizerWrapper from dataset_prep import create_dataloader -def download_and_load_gpt2(model_size="124M", target_dir="models"): - # Simple placeholder for weight loading. - # In a real scenario, we'd use the code from ch05/01_main-chapter-code/gpt_download.py - # For now, we assume the user might have them or we can use the gpt_download logic provided in the book. - # To keep this script standalone for Colab, we should probably include the download logic or use HfHub. - # BUT, the user said "rely on previous_chapters.py". That file has `load_weights_into_gpt`. - # It does NOT have the downloader. - # We will assume standard gpt2 weights are available or use a helper from `transformers` to get them - # and convert, OR implement the download logic. - # PROPOSAL: Use `transformers` to fetch weights -> convert -> load, as done in ch05. - - print(f"Loading weights for {model_size}...") - from transformers import GPT2Model - hf_model = GPT2Model.from_pretrained("gpt2") - state_dict = hf_model.state_dict() - - # Mapping logic (simplified from ch05) - # Actually, `load_weights_into_gpt` in `architecture.py` expects a specific param structure - # matching the TF checkpoint format (untransposed etc). - # The book's `load_weights_into_gpt` is designed for the ORIGINAL 124M params from OpenAI/TF. - - # If we use `gpt2` from HuggingFace, the keys are different. 
def download_file(url, destination, timeout=30):
    """Download ``url`` to ``destination`` while showing a progress bar.

    The download is skipped when ``destination`` already exists and its size
    equals the ``Content-Length`` reported by the server.

    Args:
        url: Address of the file to fetch.
        destination: Local path the response body is written to.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Kept as a function-scope import: the file's top-level imports do not
    # include requests.
    import requests

    # The context manager releases the streamed connection on every exit
    # path, including the early return below.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTTP error page as a model file.
        response.raise_for_status()
        file_size = int(response.headers.get("content-length", 0))

        # A missing Content-Length is read as 0, which says nothing about the
        # local copy, so the size comparison is only trusted for a known size.
        if file_size and os.path.exists(destination):
            if os.path.getsize(destination) == file_size:
                print(f"File already exists and is up-to-date: {destination}")
                return

        block_size = 1024
        with tqdm(
            total=file_size,
            unit="iB",
            unit_scale=True,
            desc=url.split("/")[-1],
        ) as progress_bar:
            with open(destination, "wb") as file:
                for chunk in response.iter_content(block_size):
                    progress_bar.update(len(chunk))
                    file.write(chunk)


def download_and_load_gpt2(model_size, models_dir):
    """Download the GPT-2 checkpoint files for ``model_size`` and load them.

    Args:
        model_size: One of ``"124M"``, ``"355M"``, ``"774M"``, ``"1558M"``.
        models_dir: Directory under which a ``<model_size>`` sub-directory is
            created to hold the downloaded files.

    Returns:
        A ``(settings, params)`` tuple: the parsed ``hparams.json`` and the
        nested parameter dictionary built by
        ``load_gpt2_params_from_tf_ckpt``.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
        FileNotFoundError: If no TensorFlow checkpoint is found in the model
            directory after downloading.
    """
    # Validate before touching the filesystem or the network.
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = (
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe",
    )

    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # URLs always use "/"; os.path.join would insert "\\" on Windows.
        file_url = "/".join((base_url, model_size, filename))
        download_file(file_url, os.path.join(model_dir, filename))

    # latest_checkpoint returns None when it cannot find a checkpoint; stop
    # here with a clear message rather than failing later on a None path.
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if tf_ckpt_path is None:
        raise FileNotFoundError(f"No TensorFlow checkpoint found in {model_dir}")

    # Close the settings file deterministically instead of leaking the handle.
    hparams_path = os.path.join(model_dir, "hparams.json")
    with open(hparams_path, "r", encoding="utf-8") as hparams_file:
        settings = json.load(hparams_file)

    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    return settings, params


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into a nested dict.

    The variable name is split on ``/`` and its first component is dropped.
    If the next component starts with ``h``, the rest of it is parsed as a
    layer index ``N`` and the variable is stored under
    ``params["blocks"][N]``; otherwise it is stored in the top-level dict.
    The components between that one and the last become nested dictionary
    keys, and the last component is the key holding the array.

    The component that selects the target dictionary is never used as a key
    itself: a variable named ``model/ln_f/g`` would be stored as
    ``params["g"]``. This layout is preserved exactly as-is.
    NOTE(review): presumably ``load_weights_into_gpt`` reads these keys;
    confirm before changing the layout.

    Args:
        ckpt_path: Checkpoint path handed to ``tf.train.list_variables`` and
            ``tf.train.load_variable``.
        settings: Mapping that provides ``"n_layer"``, the number of block
            dictionaries to pre-allocate.

    Returns:
        Dictionary with a ``"blocks"`` list (one dict per layer) plus the
        non-block variables at the top level.
    """
    # One empty dictionary per layer, filled in while walking the checkpoint.
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for name, _ in tf.train.list_variables(ckpt_path):
        # Singleton dimensions are removed from every loaded array.
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Everything after the first "/"-separated component.
        path_parts = name.split("/")[1:]
        scope = path_parts[0]

        if scope.startswith("h"):
            container = params["blocks"][int(scope[1:])]
        else:
            container = params

        # Descend through (creating as needed) the intermediate dictionaries.
        for key in path_parts[1:-1]:
            container = container.setdefault(key, {})

        container[path_parts[-1]] = variable_array

    return params
+ cfg = GPT_CONFIG_124M + + model = GPTModel(cfg) + print("Loading weights into model...") + load_weights_into_gpt(model, params) + model.to(device) + print("Weights loaded successfully.") + + # 3. Resize Embeddings for Special Tokens + tokenizer = TokenizerWrapper() new_vocab_size = tokenizer.base_tokenizer.n_vocab + len(tokenizer.special_tokens) - new_emb = torch.nn.Embedding(new_vocab_size, cfg["emb_dim"]) + print(f"Resizing model vocab to {new_vocab_size}...") + + old_emb = model.tok_emb + new_emb = nn.Embedding(new_vocab_size, cfg["emb_dim"]) # Copy existing new_emb.weight.data[:old_emb.num_embeddings] = old_emb.weight.data - # Replace + # Init new (mean) + new_emb.weight.data[old_emb.num_embeddings:] = old_emb.weight.data.mean(dim=0, keepdim=True) model.tok_emb = new_emb.to(device) - # Update output head too + # Resize Output Head old_head = model.out_head - new_head = torch.nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False) + new_head = nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False) new_head.weight.data[:old_head.out_features] = old_head.weight.data + new_head.weight.data[old_head.out_features:] = new_emb.weight.data[old_head.out_features:] model.out_head = new_head.to(device) - print(f"Model resized to vocab: {new_vocab_size}") + # 4. Data Loader + train_loader = create_dataloader(tokenizer, batch_size=2, max_length=cfg["context_length"]) - # 3. Data Loader - train_loader = create_dataloader(tokenizer, batch_size=4, max_length=cfg["context_length"]) + # 5. Optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01) - # 4. Optimizer - optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1) - - # 5. Loop + # 6. 
Loop model.train() step = 0 + print("Starting training...") for input_chunk, target_chunk in train_loader: input_chunk, target_chunk = input_chunk.to(device), target_chunk.to(device) @@ -122,7 +146,6 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000): optimizer.zero_grad() logits = model(input_chunk) - # Flatten for loss loss = torch.nn.functional.cross_entropy( logits.flatten(0, 1), target_chunk.flatten(0, 1) @@ -140,6 +163,7 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000): print("Training complete.") torch.save(model.state_dict(), "tool_llm.pth") + print("Model saved to 'tool_llm.pth'.") if __name__ == "__main__": train()