Source code for nabla.nn.init.variance_scaling

# ===----------------------------------------------------------------------=== #
# Nabla 2025
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #

"""Variance scaling parameter initialization methods."""

import numpy as np

import nabla as nb


def he_normal(shape: tuple[int, ...], seed: int | None = None) -> nb.Array:
    """He normal initialization for ReLU networks.

    Uses a normal distribution with std = sqrt(2/fan_in), which is
    optimal for ReLU activations.

    Args:
        shape: Shape of the parameter tensor
        seed: Random seed for reproducibility

    Returns:
        Initialized parameter array
    """
    if seed is not None:
        np.random.seed(seed)

    # Handle edge case of empty shape
    if len(shape) == 0:
        fan_in = 1
    else:
        fan_in = shape[0]

    std = (2.0 / fan_in) ** 0.5
    weights = np.random.normal(0.0, std, shape).astype(np.float32)
    return nb.Array.from_numpy(weights)


def he_uniform(shape: tuple[int, ...], seed: int | None = None) -> nb.Array:
    """He uniform initialization for ReLU networks.

    Uses a uniform distribution with bound = sqrt(6/fan_in), which is
    optimal for ReLU activations.

    Args:
        shape: Shape of the parameter tensor
        seed: Random seed for reproducibility

    Returns:
        Initialized parameter array
    """
    if seed is not None:
        np.random.seed(seed)

    # Handle edge case of empty shape
    if len(shape) == 0:
        fan_in = 1
    else:
        fan_in = shape[0]

    bound = (6.0 / fan_in) ** 0.5
    weights = np.random.uniform(-bound, bound, shape).astype(np.float32)
    return nb.Array.from_numpy(weights)
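

# Usage sketch (illustrative, not part of the module): both He initializers
# scale by fan_in only, so for a hypothetical (784, 256) weight matrix
#   he_normal  -> std   = sqrt(2 / 784) ~= 0.0505
#   he_uniform -> bound = sqrt(6 / 784) ~= 0.0875
# and each call returns a float32 nb.Array of that shape:
#
#   w = he_normal((784, 256), seed=0)
#   u = he_uniform((784, 256), seed=0)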


def xavier_normal(shape: tuple[int, ...], seed: int | None = None) -> nb.Array:
    """Xavier/Glorot normal initialization.

    Uses a normal distribution with std = sqrt(2/(fan_in + fan_out)), which is
    optimal for sigmoid/tanh activations.

    Args:
        shape: Shape of the parameter tensor
        seed: Random seed for reproducibility

    Returns:
        Initialized parameter array
    """
    if seed is not None:
        np.random.seed(seed)

    # Handle different shape configurations
    if len(shape) == 0:
        fan_in = fan_out = 1
    elif len(shape) >= 2:
        fan_in, fan_out = shape[0], shape[1]
    else:
        fan_in = fan_out = shape[0]

    std = (2.0 / (fan_in + fan_out)) ** 0.5
    weights = np.random.normal(0.0, std, shape).astype(np.float32)
    return nb.Array.from_numpy(weights)


def xavier_uniform(shape: tuple[int, ...], seed: int | None = None) -> nb.Array:
    """Xavier/Glorot uniform initialization.

    Uses a uniform distribution with bound = sqrt(6/(fan_in + fan_out)), which
    is optimal for sigmoid/tanh activations.

    Args:
        shape: Shape of the parameter tensor
        seed: Random seed for reproducibility

    Returns:
        Initialized parameter array
    """
    if seed is not None:
        np.random.seed(seed)

    # Handle different shape configurations
    if len(shape) == 0:
        fan_in = fan_out = 1
    elif len(shape) >= 2:
        fan_in, fan_out = shape[0], shape[1]
    else:
        fan_in = fan_out = shape[0]

    bound = (6.0 / (fan_in + fan_out)) ** 0.5
    weights = np.random.uniform(-bound, bound, shape).astype(np.float32)
    return nb.Array.from_numpy(weights)
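

# Usage sketch (illustrative, not part of the module): Xavier scaling uses
# both fans, so for the same hypothetical (784, 256) matrix,
# fan_in + fan_out = 1040 and
#   xavier_normal  -> std   = sqrt(2 / 1040) ~= 0.0439
#   xavier_uniform -> bound = sqrt(6 / 1040) ~= 0.0760
#
#   w = xavier_uniform((784, 256), seed=0)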


def lecun_normal(shape: tuple[int, ...], seed: int | None = None) -> nb.Array:
    """LeCun normal initialization.

    Uses a normal distribution with std = sqrt(1/fan_in), which is
    optimal for SELU activations.

    Args:
        shape: Shape of the parameter tensor
        seed: Random seed for reproducibility

    Returns:
        Initialized parameter array
    """
    if seed is not None:
        np.random.seed(seed)

    # Handle edge case of empty shape
    if len(shape) == 0:
        fan_in = 1
    else:
        fan_in = shape[0]

    std = (1.0 / fan_in) ** 0.5
    weights = np.random.normal(0.0, std, shape).astype(np.float32)
    return nb.Array.from_numpy(weights)
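

# Usage sketch (illustrative, not part of the module): LeCun scaling drops the
# factor of 2, so for a hypothetical (784, 256) matrix the std is
# sqrt(1 / 784) ~= 0.0357, matching the unit-variance assumption behind SELU's
# self-normalizing behavior.
#
#   w = lecun_normal((784, 256), seed=0)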


def initialize_mlp_params(layers: list[int], seed: int = 42) -> list[nb.Array]:
    """Initialize MLP parameters with a specialized strategy for complex functions.

    This is the original initialization strategy from mlp_train_jit.py,
    optimized for learning high-frequency functions.

    Args:
        layers: List of layer sizes [input, hidden1, hidden2, ..., output]
        seed: Random seed for reproducibility

    Returns:
        List of parameter arrays [W1, b1, W2, b2, ...]
    """
    np.random.seed(seed)
    params = []

    for i in range(len(layers) - 1):
        fan_in, fan_out = layers[i], layers[i + 1]

        if i == 0:  # First layer - needs to capture high frequency
            # Larger weights for first layer to capture high frequency patterns
            std = (4.0 / fan_in) ** 0.5
        elif i == len(layers) - 2:  # Output layer
            # Conservative output layer
            std = (0.5 / fan_in) ** 0.5
        else:  # Hidden layers
            # Standard He initialization
            std = (2.0 / fan_in) ** 0.5

        w_np = np.random.normal(0.0, std, (fan_in, fan_out)).astype(np.float32)

        # Bias initialization strategy
        if i < len(layers) - 2:  # Hidden layers
            # Small positive bias to help with ReLU
            b_np = np.ones((1, fan_out), dtype=np.float32) * 0.05
        else:  # Output layer
            # Initialize output bias to middle of target range
            b_np = np.ones((1, fan_out), dtype=np.float32) * 0.5

        w = nb.Array.from_numpy(w_np)
        b = nb.Array.from_numpy(b_np)
        params.extend([w, b])

    return params
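

# Usage sketch (illustrative, not part of the module): for a hypothetical
# 3-64-64-1 regressor, the call below returns 2 * (len(layers) - 1) = 6 arrays,
# [W1, b1, W2, b2, W3, b3], with shapes
# (3, 64), (1, 64), (64, 64), (1, 64), (64, 1), (1, 1).
#
#   params = initialize_mlp_params([3, 64, 64, 1], seed=42)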