graph TD
    %% Top-down flowchart. It appears to depict the word2vec CBOW model:
    %% context words -> averaged embedding -> vocabulary scores -> softmax -> loss -> update.
    %% Input layer: four example context words, each drawn as a one-hot vector
    %% of length V (a single 1, at a different index for each word).
    subgraph Input["Input Layer: One-Hot Encoded Context Words"]
        A1["Context Word 1<br/>'the'<br/>[0,0,0,1,0,...,0]<br/>V-dimensional"]
        A2["Context Word 2<br/>'economy'<br/>[0,0,0,0,1,...,0]<br/>V-dimensional"]
        A3["Context Word 3<br/>'strong'<br/>[0,1,0,0,0,...,0]<br/>V-dimensional"]
        A4["Context Word 4<br/>'and'<br/>[0,0,1,0,0,...,0]<br/>V-dimensional"]
    end
    
    %% Input-side weight matrix, drawn as a single node W1 that all context words feed into.
    %% Shape is V x N with one embedding per row, so W_in^T times a one-hot x
    %% picks out the row belonging to that word.
    subgraph Weights1["Weight Matrix W_in (V × N)"]
        W1["Each row = input embedding<br/>for one word<br/>Dimensions: V × N<br/>(V = vocab size, N = embedding dim)"]
    end
    
    %% One edge per context word; each label names the embedding lookup v_i for that word.
    A1 -->|"v₁ = W_in^T × x₁"| W1
    A2 -->|"v₂ = W_in^T × x₂"| W1
    A3 -->|"v₃ = W_in^T × x₃"| W1
    A4 -->|"v₄ = W_in^T × x₄"| W1
    
    %% Hidden layer: the N-dimensional mean of the context embeddings v_1..v_C.
    %% The label defines C explicitly because the symbol is used in the formula
    %% (and again in the gradient node) but was not defined anywhere in the diagram.
    subgraph Hidden["Hidden Layer: Average Context Vector"]
        H["h = (1/C) Σ vᵢ, i = 1..C<br/>C = number of context words<br/>Average of context embeddings<br/>N-dimensional vector<br/>(No activation function)"]
    end
    
    %% W1 -> H: the looked-up word vectors are averaged to form h.
    W1 -->|"Extract & average<br/>word vectors"| H
    
    %% Output-side weight matrix, shape N x V, one output embedding per column.
    subgraph Weights2["Weight Matrix W_out (N × V)"]
        W2["Each column = output embedding<br/>for one word<br/>Dimensions: N × V"]
    end
    
    %% H -> W2: projecting h through W_out^T gives a V-dimensional score vector u.
    H -->|"u = W_out^T × h"| W2
    
    %% Score layer: one raw score u_w per vocabulary word, the dot product of
    %% column w of W_out with h (per-word form of u = W_out^T x h).
    subgraph Scores["Score Layer"]
        S["u_w for each word w<br/>V scores (one per vocab word)<br/>u_w = W_out[:,w]^T × h"]
    end
    
    %% Unlabelled edge: the scores are the output of the W_out projection.
    W2 --> S
    
    %% Softmax layer: normalises the V scores into a probability distribution.
    %% In the label, w' is the summation index over the vocabulary, distinct from w.
    subgraph Softmax["Softmax Layer"]
        SM["P(w|context) = exp(u_w) / Σ exp(u_w')<br/>Convert scores to probabilities<br/>All probabilities sum to 1"]
    end
    
    %% Scores feed the softmax.
    S --> SM
    
    %% Output: illustrative probabilities only. The four values add up to 1.00;
    %% O4 presumably stands for the rest of the vocabulary lumped together (TODO confirm).
    subgraph Output["Output: Predicted Word Probabilities"]
        O1["P('is') = 0.45"]
        O2["P('very') = 0.20"]
        O3["P('remains') = 0.15"]
        O4["P('...') = 0.20"]
    end
    
    %% The softmax node fans out to every example output word.
    SM --> O1
    SM --> O2
    SM --> O3
    SM --> O4
    
    %% Loss: negative log-likelihood of the target word. The second line of the label
    %% is the same quantity expanded through the softmax definition.
    subgraph Loss["Loss Function"]
        L["L = -log P(w_target|context)<br/>= -u_w_target + log Σ exp(u_w)<br/><br/>Minimize negative log likelihood<br/>of correct target word"]
    end
    
    %% Dotted edge marks the training-time comparison rather than forward data flow.
    %% NOTE(review): only O1 is wired to the loss, which presumably means 'is' is the
    %% target word in this example - confirm.
    O1 -.->|"Compare to<br/>actual target"| L
    
    %% Gradients of the loss with respect to both weight matrices.
    %% G1: gradient for one column of W_out is the prediction error for word w times h.
    %% G2: the error-weighted sum of W_out columns (the gradient with respect to h) is
    %%     pushed back to the rows of W_in selected by each one-hot context vector x_c,
    %%     scaled by 1/C because h is an average.
    %% Both sums in G2 carry explicit indices (c over context words, w over vocabulary)
    %% and G2 uses the same P(w|context) notation as G1.
    subgraph Gradient["Gradient Computation"]
        G1["∂L/∂W_out[:,w] = (P(w|context) - y_w) × h<br/>y_w = 1 if w=target, else 0"]
        G2["∂L/∂W_in = (1/C) Σ_c x_c × (Σ_w (P(w|context) - y_w) × W_out[:,w])^T<br/>c = 1..C (context words), w = 1..V (vocabulary)"]
    end
    
    %% Both gradients are derived from the same loss node.
    L --> G1
    L --> G2
    
    %% Weight update: plain gradient-descent step applied to both matrices.
    %% NOTE(review): the reference word2vec tool is documented as using a 0.05 starting
    %% rate for CBOW (0.025 for skip-gram) with decay - confirm that the 0.01-0.025
    %% range in the label is the intended guidance.
    subgraph Update["Weight Update (Gradient Descent)"]
        U["W_new = W_old - η × ∂L/∂W<br/>η = learning rate (typically 0.01-0.025)<br/><br/>Repeat for all training examples"]
    end
    
    %% Each gradient feeds the update rule.
    G1 --> U
    G2 --> U
    
    %% Dotted feedback edges close the training loop back to both weight matrices.
    U -.->|"Update weights<br/>iteratively"| W1
    U -.->|"Update weights<br/>iteratively"| W2
    
    %% Background fill colour per subgraph; both weight-matrix subgraphs share one grey.
    %% NOTE(review): the Scores and Softmax subgraphs have no style line and so keep
    %% the theme default - confirm that this is intentional.
    style Input fill:#e1f5ff
    style Hidden fill:#fff4e1
    style Output fill:#e8f5e9
    style Loss fill:#ffebee
    style Gradient fill:#f3e5f5
    style Update fill:#fff9c4
    style Weights1 fill:#e0e0e0
    style Weights2 fill:#e0e0e0