updating train_colab

This commit is contained in:
Shahar Dickstein 2026-02-15 13:22:15 +02:00
parent 1749b7fef7
commit 4af5986e8c

View File

@@ -1,120 +1,144 @@
import torch
import torch.nn as nn
import os
import urllib.request
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from architecture import GPTModel, load_weights_into_gpt
from config import GPT_CONFIG_124M, SPECIAL_TOKENS
from config import GPT_CONFIG_124M
from tokenizer_utils import TokenizerWrapper
from dataset_prep import create_dataloader
def download_and_load_gpt2(model_size="124M", target_dir="models"):
    # NOTE(review): stale pre-diff version of this function, superseded by the
    # `download_and_load_gpt2(model_size, models_dir)` definition further down
    # in this file. It loads a HuggingFace GPT2Model and then discards the
    # result (`state_dict` is unused; the function returns None).
    #
    # Summary of the author's original design notes, kept for context:
    # `architecture.load_weights_into_gpt` expects the OpenAI TF-checkpoint
    # parameter layout, not HuggingFace's key names / Conv1D (in, out) weight
    # layout, so mapping HF weights would need a conversion step. The
    # alternatives considered were (a) converting HF weights, (b) training
    # from scratch, or (c) downloading the original TF checkpoint as in the
    # book's ch05 `gpt_download.py` — option (c) was chosen (see below).
    print(f"Loading weights for {model_size}...")
    from transformers import GPT2Model
    hf_model = GPT2Model.from_pretrained("gpt2")
    state_dict = hf_model.state_dict()
    # NOTE(review): dead code from here on — safe to delete this whole function
    # once the new TF-checkpoint loader below is confirmed working.
    pass
def download_file(url, destination):
    """Download `url` to `destination`, streaming with a tqdm progress bar.

    Skips the download when a local file already exists whose size matches
    the server-reported Content-Length (0 when the header is absent).

    FIX(review): in the pasted diff this function's body was split in two by
    the interleaved `map_hf_to_our_model` definition; the two halves are
    reassembled here into one coherent function.
    """
    # Local import keeps the module importable when `requests` is absent.
    import requests

    response = requests.get(url, stream=True)
    file_size = int(response.headers.get("content-length", 0))

    # Skip re-downloading an up-to-date local copy.
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    block_size = 1024  # stream in 1 KiB chunks
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=url.split("/")[-1]) as progress_bar:
        with open(destination, "wb") as file:
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))
                file.write(chunk)


def map_hf_to_our_model(our_model, hf_model):
    """Partially copy HuggingFace GPT-2 weights into our GPTModel (heuristic).

    Copies only the token/position embeddings and each block's first
    LayerNorm. The attention/MLP weights are deliberately NOT mapped: HF
    stores them in Conv1D modules with (in, out) layout (e.g. c_attn.weight
    is (768, 2304) = (d, 3*d) packing [Q, K, V]), whereas torch.nn.Linear
    expects (out, in), so a correct mapping needs transposing and splitting.
    Left unimplemented — use the TF-checkpoint loader below instead.
    """
    # HF keys:  wte, wpe, h[i].ln_1, h[i].attn, h[i].ln_2, h[i].mlp, ln_f
    # Our keys: tok_emb, pos_emb, trf_blocks[i].norm1/.att/.norm2/.ff, final_norm
    params = hf_model.state_dict()

    # Embeddings
    our_model.tok_emb.weight.data.copy_(params['wte.weight'])
    our_model.pos_emb.weight.data.copy_(params['wpe.weight'])

    # Blocks
    for i, block in enumerate(our_model.trf_blocks):
        prefix = f"h.{i}."
        # LayerNorm 1
        block.norm1.scale.data.copy_(params[f"{prefix}ln_1.weight"])
        block.norm1.shift.data.copy_(params[f"{prefix}ln_1.bias"])
        # Attention weights fetched but intentionally not loaded (transpose /
        # QKV-split needed — see docstring).
        qkv_w = params[f"{prefix}attn.c_attn.weight"]
        pass
def download_and_load_gpt2(model_size, models_dir):
    """Download the OpenAI GPT-2 TF checkpoint files and load them.

    Parameters
    ----------
    model_size : one of "124M", "355M", "774M", "1558M".
    models_dir : directory under which "<models_dir>/<model_size>/" is created.

    Returns
    -------
    (settings, params) : the hparams.json dict and the nested dict of
    checkpoint weight arrays from `load_gpt2_params_from_tf_ckpt`.

    Raises
    ------
    ValueError : if `model_size` is not one of the known sizes.

    FIX(review): also removes a stray, body-less `def train(cfg=...)` header
    left above this function by the diff rendering.
    """
    # Validate model size before touching the network or filesystem.
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # FIX: os.path.join uses "\" on Windows, which corrupts URLs —
        # URLs always use "/", so build them with an f-string instead.
        file_url = f"{base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # FIX: close hparams.json deterministically instead of leaking the handle.
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as f:
        settings = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    return settings, params
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable from a TF GPT-2 checkpoint into nested dicts.

    Returns a dict with top-level entries (e.g. embeddings, final norm) plus
    a "blocks" list holding one dict per transformer layer
    (settings["n_layer"] of them), mirroring the checkpoint's variable paths.
    """
    params = {"blocks": [dict() for _ in range(settings["n_layer"])]}

    for var_name, _ in tf.train.list_variables(ckpt_path):
        # Load the tensor and drop singleton dimensions.
        values = np.squeeze(tf.train.load_variable(ckpt_path, var_name))

        # Drop the leading "model/" component of the variable path.
        path = var_name.split("/")[1:]

        # Variables named "h<idx>/..." belong to transformer block <idx>;
        # everything else lives at the top level of `params`.
        if path[0].startswith("h"):
            dest = params["blocks"][int(path[0][1:])]
        else:
            dest = params

        # Walk (creating as needed) the intermediate nesting, then store the
        # array under the final path component.
        for part in path[1:-1]:
            dest = dest.setdefault(part, {})
        dest[path[-1]] = values

    return params
def train(model_size="124M", max_steps=1000):
    # NOTE(review): this block is a corrupted diff render — it interleaves the
    # OLD (pre-commit) and NEW (post-commit) versions of train() without +/-
    # markers, and two raw hunk headers ("@ -…") below mark places where
    # unchanged context lines were elided (the backward/step part of the
    # training loop is missing). Stale lines are flagged individually;
    # reconcile against the real repository file before running.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    # --- stale pre-diff lines (superseded by section 2 below); `cfg` is not
    # defined yet at this point, so as written this would raise NameError ---
    # 1. Init Tokenizer & Model
    tokenizer = TokenizerWrapper()
    model = GPTModel(cfg)
    model.to(device)
    # --- current version resumes ---
    # 1. Download & Load Params
    print(f"Downloading {model_size} weights...")
    settings, params = download_and_load_gpt2(model_size, "models")
    # --- stale pre-diff lines (old resize-before-load ordering) ---
    # 2. Resize Embeddings for Special Tokens
    # Current vocab: 50257. New: 50259.
    # We need to expand the embedding matrix.
    # Quick hack: create new embedding layer, copy old weights, init new ones.
    old_emb = model.tok_emb
    # --- current version resumes ---
    # 2. Init Model & Load Weights
    print("Initializing architecture...")
    # Map settings to our config format if needed, but we used GPT_CONFIG_124M as base.
    # We should ensure config matches loaded settings.
    cfg = GPT_CONFIG_124M
    model = GPTModel(cfg)
    print("Loading weights into model...")
    load_weights_into_gpt(model, params)
    model.to(device)
    print("Weights loaded successfully.")
    # 3. Resize Embeddings for Special Tokens
    tokenizer = TokenizerWrapper()
    new_vocab_size = tokenizer.base_tokenizer.n_vocab + len(tokenizer.special_tokens)
    # stale pre-diff statement (torch.nn alias), rebound a few lines below:
    new_emb = torch.nn.Embedding(new_vocab_size, cfg["emb_dim"])
    print(f"Resizing model vocab to {new_vocab_size}...")
    old_emb = model.tok_emb
    # current version of the same statement (nn alias):
    new_emb = nn.Embedding(new_vocab_size, cfg["emb_dim"])
    # Copy existing
    new_emb.weight.data[:old_emb.num_embeddings] = old_emb.weight.data
    # Replace
    # Init new (mean)
    new_emb.weight.data[old_emb.num_embeddings:] = old_emb.weight.data.mean(dim=0, keepdim=True)
    model.tok_emb = new_emb.to(device)
    # Update output head too
    # Resize Output Head
    old_head = model.out_head
    # stale pre-diff statement, immediately overwritten by the next line:
    new_head = torch.nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head = nn.Linear(cfg["emb_dim"], new_vocab_size, bias=False)
    new_head.weight.data[:old_head.out_features] = old_head.weight.data
    # New head rows reuse the mean-initialized new embedding rows so input
    # and output slots for the special tokens start consistent.
    new_head.weight.data[old_head.out_features:] = new_emb.weight.data[old_head.out_features:]
    model.out_head = new_head.to(device)
    print(f"Model resized to vocab: {new_vocab_size}")
    # stale pre-diff loader (batch_size=2), overwritten just below:
    # 4. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=2, max_length=cfg["context_length"])
    # 3. Data Loader
    train_loader = create_dataloader(tokenizer, batch_size=4, max_length=cfg["context_length"])
    # stale pre-diff optimizer (lr=1e-4, wd=0.01), overwritten just below:
    # 5. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)
    # 4. Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
    # 5. Loop
    # 6. Loop
    model.train()
    step = 0
    print("Starting training...")
    for input_chunk, target_chunk in train_loader:
        input_chunk, target_chunk = input_chunk.to(device), target_chunk.to(device)
        # NOTE(review): raw diff hunk header — context lines elided here.
        @ -122,7 +146,6 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
        optimizer.zero_grad()
        logits = model(input_chunk)
        # Flatten for loss
        loss = torch.nn.functional.cross_entropy(
            logits.flatten(0, 1),
            target_chunk.flatten(0, 1)
        # NOTE(review): second raw hunk header — loss.backward()/optimizer.step()
        # and the max_steps stopping logic were elided between these hunks.
        @ -140,6 +163,7 @@ def train(cfg=GPT_CONFIG_124M, max_steps=1000):
    print("Training complete.")
    torch.save(model.state_dict(), "tool_llm.pth")
    print("Model saved to 'tool_llm.pth'.")


if __name__ == "__main__":
    train()