# ===----------------------------------------------------------------------=== #
# Nabla 2025
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #
"""Regularization techniques for neural networks."""
import numpy as np
import nabla as nb
def l1_regularization(params: list[nb.Array], weight: float = 0.01) -> nb.Array:
"""Compute L1 (Lasso) regularization loss.
L1 regularization adds a penalty equal to the sum of absolute values of parameters.
This encourages sparsity in the model parameters.
Args:
params: List of parameter arrays (typically weights)
weight: Regularization strength
Returns:
Scalar L1 regularization loss
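
    Example:
        A minimal sketch (not from the original docs) of adding the penalty to
        a task loss; assumes ``nb.array`` accepts a nested Python list and that
        ``data_loss`` is computed elsewhere::

            params = [nb.array([[0.5, -1.0], [2.0, 0.0]])]
            penalty = l1_regularization(params, weight=1e-4)
            total_loss = data_loss + penalty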
"""
l1_loss = nb.array([0.0])
for param in params:
l1_loss = l1_loss + nb.sum(nb.abs(param))
return weight * l1_loss
def l2_regularization(params: list[nb.Array], weight: float = 0.01) -> nb.Array:
"""Compute L2 (Ridge) regularization loss.
L2 regularization adds a penalty equal to the sum of squares of parameters.
This encourages small parameter values and helps prevent overfitting.
Args:
params: List of parameter arrays (typically weights)
weight: Regularization strength
Returns:
Scalar L2 regularization loss
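
    Example:
        A minimal sketch (not from the original docs); assumes a single weight
        matrix built with ``nb.array`` and a ``data_loss`` computed elsewhere::

            weights = [nb.array([[1.0, 2.0], [3.0, 4.0]])]
            penalty = l2_regularization(weights, weight=5e-4)
            total_loss = data_loss + penalty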
"""
l2_loss = nb.array([0.0])
for param in params:
l2_loss = l2_loss + nb.sum(param * param)
return weight * l2_loss
def elastic_net_regularization(
    params: list[nb.Array],
    l1_weight: float = 0.01,
    l2_weight: float = 0.01,
    l1_ratio: float = 0.5,
) -> nb.Array:
"""Compute Elastic Net regularization loss.
Elastic Net combines L1 and L2 regularization:
ElasticNet = l1_ratio * L1 + (1 - l1_ratio) * L2
Args:
params: List of parameter arrays (typically weights)
l1_weight: L1 regularization strength
l2_weight: L2 regularization strength
l1_ratio: Ratio of L1 to L2 regularization (0 = pure L2, 1 = pure L1)
Returns:
Scalar Elastic Net regularization loss
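
    Example:
        A minimal sketch (not from the original docs) showing how the ratio
        blends the two penalties; ``params`` is assumed to hold weight arrays
        built with ``nb.array``::

            params = [nb.array([[0.5, -1.0], [2.0, 0.0]])]
            # l1_ratio=0.7 puts more emphasis on the sparsity-inducing L1 term
            penalty = elastic_net_regularization(
                params, l1_weight=0.01, l2_weight=0.01, l1_ratio=0.7
            )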
"""
l1_loss = l1_regularization(params, weight=1.0) # Unweighted
l2_loss = l2_regularization(params, weight=1.0) # Unweighted
combined_loss = (
l1_ratio * l1_weight * l1_loss + (1 - l1_ratio) * l2_weight * l2_loss
)
return combined_loss
def dropout(
    x: nb.Array, p: float = 0.5, training: bool = True, seed: int | None = None
) -> nb.Array:
"""Apply dropout regularization.
During training, randomly sets elements to zero with probability p.
During inference, scales all elements by (1-p) to maintain expected values.
Args:
x: Input array
p: Dropout probability (fraction of elements to set to zero)
training: Whether in training mode (apply dropout) or inference mode
seed: Random seed for reproducibility
Returns:
Array with dropout applied
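
    Example:
        A minimal sketch (not from the original docs); assumes ``nb.array``
        accepts a nested Python list::

            activations = nb.array([[1.0, 2.0, 3.0, 4.0]])
            # Training mode: each entry is dropped with probability 0.5 and the
            # survivors are scaled by 1 / (1 - 0.5) = 2.0
            dropped = dropout(activations, p=0.5, training=True, seed=0)
            # Inference mode: the input is returned unchanged
            unchanged = dropout(activations, p=0.5, training=False)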
"""
if not training or p == 0.0:
return x
if p >= 1.0:
return nb.zeros_like(x)
# Generate random mask
if seed is not None:
np.random.seed(seed)
keep_prob = 1.0 - p
mask_np = (np.random.random(x.shape) < keep_prob).astype(np.float32)
mask = nb.Array.from_numpy(mask_np)
# Apply mask and scale
return (x * mask) / keep_prob
def spectral_normalization(
    weight: nb.Array, u: nb.Array | None = None, n_iterations: int = 1
) -> tuple[nb.Array, nb.Array]:
"""Apply spectral normalization to weight matrix.
Spectral normalization constrains the spectral norm (largest singular value)
of weight matrices to be at most 1. This stabilizes training of GANs.
Args:
weight: Weight matrix to normalize [out_features, in_features]
u: Left singular vector estimate (updated during training)
n_iterations: Number of power iterations to approximate largest singular value
Returns:
Tuple of (normalized_weight, updated_u)
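
    Example:
        A minimal sketch (not from the original docs) of the typical usage
        pattern: pass the returned ``u`` back in on later steps so the power
        iteration warm-starts; assumes ``nb.array`` builds a 2D matrix from a
        nested list::

            w = nb.array([[2.0, 0.0], [0.0, 1.0]])
            w_sn, u = spectral_normalization(w, n_iterations=1)
            # On the next training step, reuse the returned u estimate
            w_sn, u = spectral_normalization(w, u=u, n_iterations=1)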
"""
weight_shape = weight.shape
# Reshape weight to 2D if needed
if len(weight_shape) > 2:
weight_2d = weight.reshape((weight_shape[0], -1))
else:
weight_2d = weight
out_features, in_features = weight_2d.shape
# Initialize u if not provided
if u is None:
u_np = np.random.normal(0, 1, (out_features,)).astype(np.float32)
u_init = nb.Array.from_numpy(u_np)
else:
u_init = u
# Power iteration to find largest singular value
for _ in range(n_iterations):
# v = W^T @ u / ||W^T @ u||
weight_t = nb.transpose(weight_2d)
u_reshaped = nb.reshape(u_init, (-1, 1))
v_temp = nb.matmul(weight_t, u_reshaped)
v = nb.reshape(v_temp, (-1,))
v = v / (nb.sqrt(nb.sum(v * v)) + 1e-8)
# u = W @ v / ||W @ v||
v_reshaped = nb.reshape(v, (-1, 1))
u_temp = nb.matmul(weight_2d, v_reshaped)
u_init = nb.reshape(u_temp, (-1,))
u_init = u_init / (nb.sqrt(nb.sum(u_init * u_init)) + 1e-8)
# Compute spectral norm: sigma = u^T @ W @ v
weight_t = nb.transpose(weight_2d)
u_reshaped = nb.reshape(u_init, (-1, 1))
v_temp = nb.matmul(weight_t, u_reshaped)
v = nb.reshape(v_temp, (-1,))
v = v / (nb.sqrt(nb.sum(v * v)) + 1e-8)
v_reshaped = nb.reshape(v, (-1, 1))
sigma_temp = nb.matmul(weight_2d, v_reshaped)
sigma_vec = nb.reshape(sigma_temp, (-1,))
sigma = nb.sum(u_init * sigma_vec)
# Normalize weight by spectral norm
normalized_weight_2d = weight_2d / (sigma + 1e-8)
# Reshape back to original shape
if len(weight_shape) > 2:
normalized_weight = normalized_weight_2d.reshape(weight_shape)
else:
normalized_weight = normalized_weight_2d
return normalized_weight, u_init
def gradient_clipping(
    gradients: list[nb.Array], max_norm: float = 1.0, norm_type: str = "l2"
) -> tuple[list[nb.Array], nb.Array]:
"""Apply gradient clipping to prevent exploding gradients.
Args:
gradients: List of gradient arrays
max_norm: Maximum allowed gradient norm
norm_type: Type of norm to use ("l2" or "l1")
Returns:
Tuple of (clipped_gradients, total_norm)
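
    Example:
        A minimal sketch (not from the original docs); ``grads`` stands in for
        gradients produced by a backward pass::

            grads = [nb.array([3.0, 4.0]), nb.array([0.0])]
            clipped, total_norm = gradient_clipping(grads, max_norm=1.0)
            # total_norm is 5.0 here, so each gradient is scaled by roughly 1/5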
"""
# Compute total gradient norm
if norm_type == "l2":
total_norm_sq = nb.array([0.0])
for grad in gradients:
total_norm_sq = total_norm_sq + nb.sum(grad * grad)
total_norm = nb.sqrt(total_norm_sq)
elif norm_type == "l1":
total_norm = nb.array([0.0])
for grad in gradients:
total_norm = total_norm + nb.sum(nb.abs(grad))
else:
raise ValueError(f"Unsupported norm_type: {norm_type}")
# Clip gradients if norm exceeds threshold
max_norm_tensor = nb.array([max_norm])
clip_coeff = nb.minimum(max_norm_tensor / (total_norm + 1e-8), nb.array([1.0]))
clipped_gradients = []
for grad in gradients:
clipped_gradients.append(grad * clip_coeff)
return clipped_gradients, total_norm
__all__ = [
    "l1_regularization",
    "l2_regularization",
    "elastic_net_regularization",
    "dropout",
    "spectral_normalization",
    "gradient_clipping",
]