# ===----------------------------------------------------------------------=== #
# Nabla 2025
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #
"""Regularization techniques for neural networks."""
import numpy as np
import nabla as nb
def l1_regularization(params: list[nb.Array], weight: float = 0.01) -> nb.Array:
"""Compute L1 (Lasso) regularization loss.
L1 regularization adds a penalty equal to the sum of absolute values of parameters.
This encourages sparsity in the model parameters.
Args:
params: List of parameter arrays (typically weights)
weight: Regularization strength
Returns:
Scalar L1 regularization loss
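
    Example:
        A minimal sketch (not from the original docs) of adding the penalty to
        a task loss; assumes ``nb.array`` accepts a nested Python list and that
        ``data_loss`` is computed elsewhere::

            params = [nb.array([[0.5, -1.0], [2.0, 0.0]])]
            penalty = l1_regularization(params, weight=1e-4)
            total_loss = data_loss + penalty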
"""
l1_loss = nb.array([0.0])
for param in params:
l1_loss = l1_loss + nb.sum(nb.abs(param))
return weight * l1_loss
def l2_regularization(params: list[nb.Array], weight: float = 0.01) -> nb.Array:
"""Compute L2 (Ridge) regularization loss.
L2 regularization adds a penalty equal to the sum of squares of parameters.
This encourages small parameter values and helps prevent overfitting.
Args:
params: List of parameter arrays (typically weights)
weight: Regularization strength
Returns:
Scalar L2 regularization loss
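
    Example:
        A minimal sketch (not from the original docs); assumes a single weight
        matrix built with ``nb.array`` and a ``data_loss`` computed elsewhere::

            weights = [nb.array([[1.0, 2.0], [3.0, 4.0]])]
            penalty = l2_regularization(weights, weight=5e-4)
            total_loss = data_loss + penalty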
"""
l2_loss = nb.array([0.0])
for param in params:
l2_loss = l2_loss + nb.sum(param * param)
return weight * l2_loss
def elastic_net_regularization(
    params: list[nb.Array],
    l1_weight: float = 0.01,
    l2_weight: float = 0.01,
    l1_ratio: float = 0.5,
) -> nb.Array:
"""Compute Elastic Net regularization loss.
Elastic Net combines L1 and L2 regularization:
ElasticNet = l1_ratio * L1 + (1 - l1_ratio) * L2
Args:
params: List of parameter arrays (typically weights)
l1_weight: L1 regularization strength
l2_weight: L2 regularization strength
l1_ratio: Ratio of L1 to L2 regularization (0 = pure L2, 1 = pure L1)
Returns:
Scalar Elastic Net regularization loss
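
    Example:
        A minimal sketch (not from the original docs) showing how the ratio
        blends the two penalties; ``params`` is assumed to hold weight arrays
        built with ``nb.array``::

            params = [nb.array([[0.5, -1.0], [2.0, 0.0]])]
            # l1_ratio=0.7 puts more emphasis on the sparsity-inducing L1 term
            penalty = elastic_net_regularization(
                params, l1_weight=0.01, l2_weight=0.01, l1_ratio=0.7
            )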
"""
l1_loss = l1_regularization(params, weight=1.0) # Unweighted
l2_loss = l2_regularization(params, weight=1.0) # Unweighted
combined_loss = (
l1_ratio * l1_weight * l1_loss + (1 - l1_ratio) * l2_weight * l2_loss
)
return combined_loss
def dropout(
    x: nb.Array, p: float = 0.5, training: bool = True, seed: int | None = None
) -> nb.Array:
"""Apply dropout regularization.
During training, randomly sets elements to zero with probability p.
During inference, scales all elements by (1-p) to maintain expected values.
Args:
x: Input array
p: Dropout probability (fraction of elements to set to zero)
training: Whether in training mode (apply dropout) or inference mode
seed: Random seed for reproducibility
Returns:
Array with dropout applied
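
    Example:
        A minimal sketch (not from the original docs); assumes ``nb.array``
        accepts a nested Python list::

            activations = nb.array([[1.0, 2.0, 3.0, 4.0]])
            # Training mode: each entry is dropped with probability 0.5 and the
            # survivors are scaled by 1 / (1 - 0.5) = 2.0
            dropped = dropout(activations, p=0.5, training=True, seed=0)
            # Inference mode: the input is returned unchanged
            unchanged = dropout(activations, p=0.5, training=False)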
"""
if not training or p == 0.0:
return x
if p >= 1.0:
return nb.zeros_like(x)
# Generate random mask
if seed is not None:
np.random.seed(seed)
keep_prob = 1.0 - p
mask_np = (np.random.random(x.shape) < keep_prob).astype(np.float32)
mask = nb.Array.from_numpy(mask_np)
# Apply mask and scale
return (x * mask) / keep_prob
def spectral_normalization(
    weight: nb.Array, u: nb.Array | None = None, n_iterations: int = 1
) -> tuple[nb.Array, nb.Array]:
"""Apply spectral normalization to weight matrix.
Spectral normalization constrains the spectral norm (largest singular value)
of weight matrices to be at most 1. This stabilizes training of GANs.
Args:
weight: Weight matrix to normalize [out_features, in_features]
u: Left singular vector estimate (updated during training)
n_iterations: Number of power iterations to approximate largest singular value
Returns:
Tuple of (normalized_weight, updated_u)
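
    Example:
        A minimal sketch (not from the original docs) of the typical usage
        pattern: pass the returned ``u`` back in on later steps so the power
        iteration warm-starts; assumes ``nb.array`` builds a 2D matrix from a
        nested list::

            w = nb.array([[2.0, 0.0], [0.0, 1.0]])
            w_sn, u = spectral_normalization(w, n_iterations=1)
            # On the next training step, reuse the returned u estimate
            w_sn, u = spectral_normalization(w, u=u, n_iterations=1)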
"""
weight_shape = weight.shape
# Reshape weight to 2D if needed
if len(weight_shape) > 2:
weight_2d = weight.reshape((weight_shape[0], -1))
else:
weight_2d = weight
out_features, in_features = weight_2d.shape
# Initialize u if not provided
if u is None:
u_np = np.random.normal(0, 1, (out_features,)).astype(np.float32)
u_init = nb.Array.from_numpy(u_np)
else:
u_init = u
# Power iteration to find largest singular value
for _ in range(n_iterations):
# v = W^T @ u / ||W^T @ u||
weight_t = nb.transpose(weight_2d)
u_reshaped = nb.reshape(u_init, (-1, 1))
v_temp = nb.matmul(weight_t, u_reshaped)
v = nb.reshape(v_temp, (-1,))
v = v / (nb.sqrt(nb.sum(v * v)) + 1e-8)
# u = W @ v / ||W @ v||
v_reshaped = nb.reshape(v, (-1, 1))
u_temp = nb.matmul(weight_2d, v_reshaped)
u_init = nb.reshape(u_temp, (-1,))
u_init = u_init / (nb.sqrt(nb.sum(u_init * u_init)) + 1e-8)
# Compute spectral norm: sigma = u^T @ W @ v
weight_t = nb.transpose(weight_2d)
u_reshaped = nb.reshape(u_init, (-1, 1))
v_temp = nb.matmul(weight_t, u_reshaped)
v = nb.reshape(v_temp, (-1,))
v = v / (nb.sqrt(nb.sum(v * v)) + 1e-8)
v_reshaped = nb.reshape(v, (-1, 1))
sigma_temp = nb.matmul(weight_2d, v_reshaped)
sigma_vec = nb.reshape(sigma_temp, (-1,))
sigma = nb.sum(u_init * sigma_vec)
# Normalize weight by spectral norm
normalized_weight_2d = weight_2d / (sigma + 1e-8)
# Reshape back to original shape
if len(weight_shape) > 2:
normalized_weight = normalized_weight_2d.reshape(weight_shape)
else:
normalized_weight = normalized_weight_2d
return normalized_weight, u_init
def gradient_clipping(
    gradients: list[nb.Array], max_norm: float = 1.0, norm_type: str = "l2"
) -> tuple[list[nb.Array], nb.Array]:
"""Apply gradient clipping to prevent exploding gradients.
Args:
gradients: List of gradient arrays
max_norm: Maximum allowed gradient norm
norm_type: Type of norm to use ("l2" or "l1")
Returns:
Tuple of (clipped_gradients, total_norm)
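
    Example:
        A minimal sketch (not from the original docs); ``grads`` stands in for
        gradients produced by a backward pass::

            grads = [nb.array([3.0, 4.0]), nb.array([0.0])]
            clipped, total_norm = gradient_clipping(grads, max_norm=1.0)
            # total_norm is 5.0 here, so each gradient is scaled by roughly 1/5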
"""
# Compute total gradient norm
if norm_type == "l2":
total_norm_sq = nb.array([0.0])
for grad in gradients:
total_norm_sq = total_norm_sq + nb.sum(grad * grad)
total_norm = nb.sqrt(total_norm_sq)
elif norm_type == "l1":
total_norm = nb.array([0.0])
for grad in gradients:
total_norm = total_norm + nb.sum(nb.abs(grad))
else:
raise ValueError(f"Unsupported norm_type: {norm_type}")
# Clip gradients if norm exceeds threshold
max_norm_tensor = nb.array([max_norm])
clip_coeff = nb.minimum(max_norm_tensor / (total_norm + 1e-8), nb.array([1.0]))
clipped_gradients = []
for grad in gradients:
clipped_gradients.append(grad * clip_coeff)
return clipped_gradients, total_norm
__all__ = [
    "l1_regularization",
    "l2_regularization",
    "elastic_net_regularization",
    "dropout",
    "spectral_normalization",
    "gradient_clipping",
]