Claude
graph TD
A[Input Text] --> B[Tokenization]
B --> C[Embedding]
C --> D[Transformer Encoder]
D --> E[Transformer Decoder]
E --> F[Output Text]
subgraph Transformer
D[Transformer Encoder]
E[Transformer Decoder]
end
subgraph AttentionMechanism["Attention Mechanism"]
G[Multi-Head Attention]
H[Feed Forward]
I[Add & Norm]
end
D --> G
G --> H
H --> I
I --> D
I --> E
classDef input fill:#f9f,stroke:#333,stroke-width:4px
classDef output fill:#9ff,stroke:#333,stroke-width:4px
classDef transformer fill:#ff9,stroke:#333,stroke-width:2px
classDef attention fill:#f90,stroke:#333,stroke-width:2px
class A input
class F output
class Transformer transformer
class AttentionMechanism attention
zeroshot
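
For comparison with the diagram above, here is a minimal runnable sketch of the same pipeline: input text, tokenization, embedding, a transformer encoder/decoder (whose layers contain the multi-head attention, feed-forward, and add & norm sub-blocks shown), and output logits. It assumes PyTorch; the whitespace "tokenizer", vocabulary size, and model width are placeholder choices, not part of the diagram.

# Minimal sketch of the encoder-decoder pipeline in the diagram above (assumes PyTorch).
import torch
import torch.nn as nn

VOCAB_SIZE, D_MODEL = 1000, 64  # placeholder sizes

def tokenize(text: str) -> torch.Tensor:
    # Tokenization: map each whitespace-separated word to a toy integer id.
    return torch.tensor([[hash(w) % VOCAB_SIZE for w in text.split()]])

embedding = nn.Embedding(VOCAB_SIZE, D_MODEL)            # Embedding
transformer = nn.Transformer(d_model=D_MODEL, nhead=4,   # Transformer Encoder + Decoder;
                             num_encoder_layers=2,       # each layer contains multi-head
                             num_decoder_layers=2,       # attention, feed-forward,
                             batch_first=True)           # and add & norm
to_vocab = nn.Linear(D_MODEL, VOCAB_SIZE)                # Output Text (next-token logits)

src = embedding(tokenize("the input text"))
tgt = embedding(tokenize("the output so far"))
logits = to_vocab(transformer(src, tgt))
print(logits.shape)  # (1, target_length, VOCAB_SIZE)
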
ChatGPT
Graph 1
graph TD;
A[Input Layer: Text Tokens] --> B[Embedding Layer];
B --> C[Transformer Block 1];
C --> D[Transformer Block 2];
D --> E[...];
E --> F[Transformer Block N];
F --> G[Output Layer: Predicted Next Token];
Graph 2
graph TD;
A[Input from Previous Layer] --> B[Self-Attention Mechanism];
B --> C[Add & Normalize];
C --> D[Feed-Forward Neural Network];
D --> E[Add & Normalize];
E --> F[Output to Next Layer];
A -->|Skip Connection| C;
C -->|Skip Connection| E;
zeroshot
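
A minimal runnable sketch of what the two graphs above describe: a stack of transformer blocks between an embedding layer and a next-token output layer (Graph 1), where each block applies self-attention and a feed-forward network, each wrapped in a skip connection followed by Add & Normalize (Graph 2). It assumes PyTorch; the block count, model width, and head count are placeholder values, not part of the diagrams.

# Minimal sketch of Graphs 1 and 2 above (assumes PyTorch; sizes are placeholders).
import torch
import torch.nn as nn

class Block(nn.Module):
    """Graph 2: self-attention and feed-forward sub-layers, each with a
    skip connection followed by Add & Normalize."""
    def __init__(self, d_model=64, n_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d_model, 4 * d_model),
                                 nn.ReLU(),
                                 nn.Linear(4 * d_model, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = self.norm1(x + self.attn(x, x, x, need_weights=False)[0])  # skip + add & normalize
        x = self.norm2(x + self.ffn(x))                                # skip + add & normalize
        return x

class TinyLM(nn.Module):
    """Graph 1: embedding layer, a stack of N transformer blocks, output layer."""
    def __init__(self, vocab=1000, d_model=64, n_blocks=3):
        super().__init__()
        self.embed = nn.Embedding(vocab, d_model)
        self.blocks = nn.ModuleList(Block(d_model) for _ in range(n_blocks))
        self.out = nn.Linear(d_model, vocab)

    def forward(self, tokens):
        x = self.embed(tokens)
        for block in self.blocks:
            x = block(x)
        return self.out(x)  # logits for the predicted next token at each position

logits = TinyLM()(torch.randint(0, 1000, (1, 8)))
print(logits.shape)  # (1, 8, 1000)
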