Modules#
Module#
class Module() -> 'None':
Base class for imperative neural-network modules.
Modules are registered as pytree nodes, enabling direct use with transforms.
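A minimal usage sketch, assuming submodules assigned as attributes are registered automatically (as the pytree registration suggests) and that the classes live under a package namespace called `mypkg.nn` purely for illustration. `Linear` and `Dropout` are documented later in this section; the activation between the two linear layers is omitted because the Tensor math API is not covered here.

```python
# Hypothetical import path; substitute the real package name.
from mypkg.nn import Module, Linear, Dropout


class MLP(Module):
    """A small feed-forward block built from modules documented below."""

    def __init__(self, d_in: int, d_hidden: int, d_out: int) -> None:
        super().__init__()
        self.fc1 = Linear(d_in, d_hidden)
        self.drop = Dropout(p=0.1)
        self.fc2 = Linear(d_hidden, d_out)

    def forward(self, x):
        # An activation would normally sit between fc1 and fc2; it is left out
        # here because the Tensor operations are not part of this section.
        h = self.drop.forward(self.fc1.forward(x))
        return self.fc2.forward(h)
```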
Methods#
backward#
def backward(self, loss: 'Tensor', gradient: 'Tensor | None' = None, retain_graph: 'bool' = False, create_graph: 'bool' = False, *, realize_grads: 'bool | None' = None) -> 'None':
A PyTorch-style backward() convenience method attached to Module.
Optionally realizes all parameter gradients after backward.
buffers#
def buffers(self) -> 'Iterator[Tensor]':
eval#
def eval(self) -> 'Module':
extra_repr#
def extra_repr(self) -> 'str':
forward#
def forward(self, *args: 'Any', **kwargs: 'Any') -> 'Any':
load_state_dict#
def load_state_dict(self, state_dict: 'OrderedDict[str, Tensor]') -> 'None':
modules#
def modules(self) -> 'Iterator[Module]':
named_buffers#
def named_buffers(self, prefix: 'str' = '') -> 'Iterator[tuple[str, Tensor]]':
named_parameters#
def named_parameters(self, prefix: 'str' = '') -> 'Iterator[tuple[str, Tensor]]':
parameters#
def parameters(self) -> 'Iterator[Tensor]':
register_buffer#
def register_buffer(self, name: 'str', tensor: 'Tensor | None') -> 'None':
state_dict#
def state_dict(self) -> 'OrderedDict[str, Tensor]':
train#
def train(self) -> 'Module':
zero_grad#
def zero_grad(self) -> 'None':
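The sketch below ties several of these methods together in a single training step and a checkpoint round-trip. It uses only the methods documented above and assumes the loss Tensor has already been computed from the model's output.

```python
def train_step(model, loss):
    """One training step using the Module conveniences above."""
    model.train()                              # training-mode behaviour (e.g. Dropout active)
    model.zero_grad()                          # clear gradients from the previous step
    model.backward(loss, realize_grads=True)   # backprop; optionally realize parameter grads
    # An optimizer would now read model.parameters() and update the weights.


def checkpoint_roundtrip(model, fresh_model):
    """Copy weights from one model into another via a state dict."""
    state = model.state_dict()                 # OrderedDict[str, Tensor]
    fresh_model.load_state_dict(state)
    fresh_model.eval()                         # switch to evaluation mode
```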
Linear#
class Linear(in_features: 'int', out_features: 'int', bias: 'bool' = True, *, dtype: 'DType' = float32) -> 'None':
Applies y = x @ W + b.
Methods#
extra_repr#
def extra_repr(self) -> 'str':
forward#
def forward(self, x: 'Tensor') -> 'Tensor':
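A brief construction example; the parameter names and the exact `extra_repr` output are not specified here, so they are only printed for inspection, and the import path is hypothetical.

```python
from mypkg.nn import Linear   # hypothetical import path

lin = Linear(in_features=128, out_features=64, bias=True)

# Inspect the learnable tensors registered on the module.
for name, param in lin.named_parameters():
    print(name, param)

print(lin.extra_repr())   # human-readable summary of the layer's configuration
```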
LayerNorm#
class LayerNorm(normalized_shape: 'int | tuple[int, ...]', eps: 'float' = 1e-05, elementwise_affine: 'bool' = True, *, dtype: 'DType' = float32) -> 'None':
Applies layer normalization over the last normalized_shape dims.
Methods#
extra_repr#
def extra_repr(self) -> 'str':
forward#
def forward(self, x: 'Tensor') -> 'Tensor':
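For reference, the computation over the normalized dimensions looks like the NumPy sketch below. This illustrates the math only, not the module's implementation; `gamma` and `beta` stand in for the affine parameters used when `elementwise_affine=True`, and only the last axis is normalized for simplicity.

```python
import numpy as np


def layer_norm_reference(x, gamma, beta, eps=1e-5):
    """Reference math only; normalizes over the last axis."""
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)          # biased variance, as is standard
    x_hat = (x - mean) / np.sqrt(var + eps)
    return gamma * x_hat + beta                  # elementwise affine transform


x = np.random.randn(2, 8).astype(np.float32)
out = layer_norm_reference(x, gamma=np.ones(8, np.float32), beta=np.zeros(8, np.float32))
```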
Dropout#
class Dropout(p: 'float' = 0.5) -> 'None':
Applies dropout during training.
Uses inverted-dropout scaling so no adjustment is needed at test time.
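A NumPy illustration of the inverted-dropout idea described above (not the module's implementation): surviving activations are rescaled by 1/(1-p) at training time so the expected activation matches evaluation mode.

```python
import numpy as np


def inverted_dropout_reference(x, p=0.5, training=True, rng=None):
    """Illustration of inverted dropout; not the module's implementation."""
    if not training or p == 0.0:
        return x                                 # identity at evaluation time
    rng = rng if rng is not None else np.random.default_rng()
    keep = rng.random(x.shape) >= p              # keep each element with prob. 1 - p
    return x * keep / (1.0 - p)                  # rescale so the expectation is unchanged
```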
Embedding#
class Embedding(num_embeddings: 'int', embedding_dim: 'int', *, dtype: 'DType' = float32) -> 'None':
A learnable lookup table mapping integer indices to dense vectors.
Parameters
- num_embeddings: int – Size of the vocabulary (number of rows).
- embedding_dim: int – Dimensionality of each embedding vector.
Methods#
extra_repr#
def extra_repr(self) -> 'str':
forward#
def forward(self, indices: 'Tensor') -> 'Tensor':
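A short construction sketch; the import path is hypothetical, and the output shape is inferred from the lookup-table description (one embedding row per input index).

```python
from mypkg.nn import Embedding   # hypothetical import path

emb = Embedding(num_embeddings=32_000, embedding_dim=512)


def embed_tokens(token_ids):
    # token_ids: integer Tensor, e.g. shape (batch, seq_len).
    # Expected output: (batch, seq_len, 512) -- one embedding row per index.
    return emb.forward(token_ids)
```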
MultiHeadAttention#
class MultiHeadAttention(d_model: 'int', num_heads: 'int', dropout: 'float' = 0.0, bias: 'bool' = True, *, dtype: 'DType' = float32) -> 'None':
Multi-head attention as described in Attention Is All You Need.
Parameters
- d_model: int – Total model dimensionality.
- num_heads: int – Number of parallel attention heads. d_model must be divisible by num_heads.
- dropout: float – Dropout probability on attention weights (applied during training).
- bias: bool – Whether the linear projections include bias terms.
Methods#
extra_repr#
def extra_repr(self) -> 'str':
forward#
def forward(self, query: 'Tensor', key: 'Tensor', value: 'Tensor', attn_mask: 'Tensor | None' = None, is_causal: 'bool' = False) -> 'Tensor':
Run multi-head attention.
Parameters
- query, key, value: Tensor of shape (batch, seq_*, d_model).
- attn_mask: optional additive mask of shape (…, seq_q, seq_k).
- is_causal: if True, apply a causal mask.
Returns
Tensor of shape (batch, seq_q, d_model).
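A self-attention sketch that passes the same tensor as query, key, and value and relies on `is_causal` for masking; the import path is hypothetical.

```python
from mypkg.nn import MultiHeadAttention   # hypothetical import path

mha = MultiHeadAttention(d_model=512, num_heads=8, dropout=0.1)


def causal_self_attention(x):
    # x: (batch, seq, 512); the output has the same shape.
    return mha.forward(x, x, x, is_causal=True)
```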
TransformerEncoderLayer#
class TransformerEncoderLayer(d_model: 'int', num_heads: 'int', dim_feedforward: 'int' = 2048, dropout: 'float' = 0.1, *, dtype: 'DType' = float32) -> 'None':
A single Transformer encoder layer (pre-norm variant).
Structure::
x ─→ LayerNorm ─→ MultiHeadAttention ─→ Dropout ─→ + ─→
│                                                  ↑
└──────────────────────────────────────────────────┘

x ─→ LayerNorm ─→ FFN ─→ Dropout ─→ + ─→
│                                   ↑
└───────────────────────────────────┘
Parameters
- d_model: int – Model dimensionality.
- num_heads: int – Number of attention heads.
- dim_feedforward: int – Hidden size of the position-wise feed-forward network.
- dropout: float – Dropout probability.
Methods#
forward#
def forward(self, src: 'Tensor', src_mask: 'Tensor | None' = None, is_causal: 'bool' = False) -> 'Tensor':
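Stacking layers is repeated application, as in the sketch below. The layers are kept in a plain Python list for illustration only; for a trainable stack they would normally be attributes of a Module subclass so their parameters are registered. The import path is hypothetical.

```python
from mypkg.nn import TransformerEncoderLayer   # hypothetical import path

layers = [
    TransformerEncoderLayer(d_model=512, num_heads=8, dim_feedforward=2048)
    for _ in range(6)
]


def encode(src, src_mask=None):
    # src: (batch, seq, 512); each layer preserves the shape.
    for layer in layers:
        src = layer.forward(src, src_mask=src_mask)
    return src
```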
TransformerDecoderLayer#
class TransformerDecoderLayer(d_model: 'int', num_heads: 'int', dim_feedforward: 'int' = 2048, dropout: 'float' = 0.1, *, dtype: 'DType' = float32) -> 'None':
A single Transformer decoder layer (pre-norm variant).
Structure::
tgt ─→ LayerNorm ─→ Masked-Self-Attention ─→ Dropout ─→ + ─→
tgt ─→ LayerNorm ─→ Cross-Attention(tgt, memory) ─→ Dropout ─→ + ─→
tgt ─→ LayerNorm ─→ FFN ─→ Dropout ─→ + ─→
Parameters
- d_model, num_heads, dim_feedforward, dropout: same as the encoder layer.
Methods#
forward#
def forward(self, tgt: 'Tensor', memory: 'Tensor', tgt_mask: 'Tensor | None' = None, memory_mask: 'Tensor | None' = None, is_causal: 'bool' = False) -> 'Tensor':
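A decoding sketch; `memory` is the encoder output consumed by the cross-attention sub-layer, and `is_causal=True` presumably masks the self-attention sub-layer. The import path is hypothetical.

```python
from mypkg.nn import TransformerDecoderLayer   # hypothetical import path

dec = TransformerDecoderLayer(d_model=512, num_heads=8)


def decode(tgt, memory):
    # tgt:    (batch, tgt_seq, 512) decoder-side input
    # memory: (batch, src_seq, 512) encoder output, used by cross-attention
    return dec.forward(tgt, memory, is_causal=True)
```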
Sequential#
class Sequential(*args: 'Any') -> 'None':
A sequential container of Modules.
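A typical use is chaining the layers documented above into one block. The import path is hypothetical, and it is assumed that Sequential's forward applies each contained module to its input in order.

```python
from mypkg.nn import Sequential, Linear, Dropout   # hypothetical import path

mlp = Sequential(
    Linear(512, 2048),
    Dropout(p=0.1),
    Linear(2048, 512),
)


def project(x):
    # Assumed behaviour: each contained module is applied to x in order.
    return mlp.forward(x)
```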