MTL¶

`minnormsolver` ¶

This script includes code adapted from the 'impartial-vaes' repository with minor modifications. The original code can be found at: https://github.com/adrianjav/impartial-vaes

Credit to the original authors: Adrian Javaloy, Maryam Meghdadi, and Isabel Valera for their valuable work.

`MinNormLinearSolver` ¶

Bases: Module

Solves the min norm problem in case of 2 vectors (lies on a line).

Source code in vambn/modelling/mtl/minnormsolver.py

class MinNormLinearSolver(nn.Module):
    """Closed-form min-norm solver for a pair of vectors.

    Works purely from the three pairwise scalar products of V1 and V2 and
    yields the convex weight ``gamma`` of the combination
    ``gamma * V1 + (1 - gamma) * V2``.
    """

    def __init__(self):
        super().__init__()

    @torch.no_grad()
    def forward(self, v1v1, v1v2, v2v2):
        """
        Solve the two-vector min-norm problem from scalar products.

        Args:
            v1v1 (float): Scalar product <V1, V1>.
            v1v2 (float): Scalar product <V1, V2>.
            v2v2 (float): Scalar product <V2, V2>.

        Returns:
            tuple: ``(gamma, cost)`` where ``gamma`` is the weight of V1 in
            the solution c = (gamma, 1. - gamma) and ``cost`` is the value
            associated with that solution (<V1, V1> or <V2, V2> when the
            solution sits at an endpoint).
        """
        if v1v2 >= v1v1:
            # All weight goes to V1.
            gamma, cost = 1.0, v1v1
        elif v1v2 >= v2v2:
            # All weight goes to V2.
            gamma, cost = 0.0, v2v2
        else:
            # Interior solution; the 1e-8 keeps the division well defined
            # when the denominator would otherwise be zero.
            numerator = v2v2 - v1v2
            denominator = v1v1 + v2v2 - 2 * v1v2 + 1e-8
            gamma = numerator / denominator
            cost = v2v2 + gamma * (v1v2 - v2v2)
        return gamma, cost

`forward(v1v1, v1v2, v2v2)` ¶

Solver execution on scalar products of 2 vectors.

Parameters:

Name	Type	Description	Default
`v1v1`	`float`	Scalar product <V1, V1>.	required
`v1v2`	`float`	Scalar product <V1, V2>.	required
`v2v2`	`float`	Scalar product <V2, V2>.	required

Returns:

Name	Type	Description
`tuple`		A tuple containing: - gamma (float): Min-norm solution c = (gamma, 1. - gamma). - cost (float): The norm of min-norm point.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def forward(self, v1v1, v1v2, v2v2):
    """
    Solve the two-vector min-norm problem from scalar products.

    Args:
        v1v1 (float): Scalar product <V1, V1>.
        v1v2 (float): Scalar product <V1, V2>.
        v2v2 (float): Scalar product <V2, V2>.

    Returns:
        tuple: ``(gamma, cost)`` where ``gamma`` is the weight of V1 in the
        solution c = (gamma, 1. - gamma) and ``cost`` is the value
        associated with that solution (<V1, V1> or <V2, V2> when the
        solution sits at an endpoint).
    """
    if v1v2 >= v1v1:
        # All weight goes to V1.
        gamma, cost = 1.0, v1v1
    elif v1v2 >= v2v2:
        # All weight goes to V2.
        gamma, cost = 0.0, v2v2
    else:
        # Interior solution; the 1e-8 keeps the division well defined when
        # the denominator would otherwise be zero.
        numerator = v2v2 - v1v2
        denominator = v1v1 + v2v2 - 2 * v1v2 + 1e-8
        gamma = numerator / denominator
        cost = v2v2 + gamma * (v1v2 - v2v2)
    return gamma, cost

`MinNormPlanarSolver` ¶

Bases: Module

Solves the min norm problem in case the vectors lie on the same plane.

Source code in vambn/modelling/mtl/minnormsolver.py

class MinNormPlanarSolver(nn.Module):
    """Solves the min norm problem in case the vectors lie on the same plane.

    Every pair (Vi, Vj) with i < j is solved in closed form and the pair
    with the lowest cost determines the returned coefficients.
    """

    def __init__(self, n_tasks):
        """
        Initializes the MinNormPlanarSolver.

        Args:
            n_tasks (int): Number of tasks/vectors.
        """
        super().__init__()
        i_triu, j_triu = np.triu_indices(n_tasks, 1)
        i_triu = torch.from_numpy(i_triu)
        j_triu = torch.from_numpy(j_triu)

        self.register_buffer("n", torch.tensor(n_tasks))
        self.register_buffer("i_triu", i_triu)
        self.register_buffer("j_triu", j_triu)
        # A row/column index grid evaluated at (i, j) is simply (i, j), so the
        # pair indices are copied directly rather than gathered from a
        # torch.meshgrid (which also warns when called without `indexing`).
        self.register_buffer("ii_triu", i_triu.long().clone())
        self.register_buffer("jj_triu", j_triu.long().clone())
        # `one` / `zero` are no longer read by this class; they stay
        # registered so the module's state_dict keys do not change.
        self.register_buffer("one", torch.ones(self.ii_triu.shape))
        self.register_buffer("zero", torch.zeros(self.ii_triu.shape))

    @torch.no_grad()
    def line_solver_vectorized(self, v1v1, v1v2, v2v2):
        """
        Linear case solver, but for collection of vector pairs (Vi, Vj).

        Args:
            v1v1 (Tensor): Vector of scalar products <Vi, Vi>.
            v1v2 (Tensor): Vector of scalar products <Vi, Vj>.
            v2v2 (Tensor): Vector of scalar products <Vj, Vj>.

        Returns:
            tuple: A tuple containing:
                - gamma (Tensor): Vector of min-norm solution c = (gamma, 1. - gamma).
                - cost (Tensor): Vector of the cost of each pair's solution.
        """
        gamma = (v2v2 - v1v2) / (v1v1 + v2v2 - 2 * v1v2 + 1e-8)
        # Fill values are built from `gamma` so they always share its dtype,
        # device and shape.
        gamma = gamma.where(v1v2 < v2v2, torch.zeros_like(gamma))
        gamma = gamma.where(v1v2 < v1v1, torch.ones_like(gamma))

        cost = v2v2 + gamma * (v1v2 - v2v2)
        cost = cost.where(v1v2 < v2v2, v2v2)
        cost = cost.where(v1v2 < v1v1, v1v1)
        return gamma, cost

    @torch.no_grad()
    def forward(self, grammian):
        """
        Planar case solver, when Vi lies on the same plane.

        Args:
            grammian (Tensor): Grammian matrix G[i, j] = [<Vi, Vj>], G is a nxn tensor.

        Returns:
            Tensor: Coefficients c = [c1, ... cn]; only the two entries of the
            lowest-cost pair are non-zero.
        """
        vivj = grammian[self.ii_triu, self.jj_triu]
        vivi = grammian[self.ii_triu, self.ii_triu]
        vjvj = grammian[self.jj_triu, self.jj_triu]

        gamma, cost = self.line_solver_vectorized(vivi, vivj, vjvj)
        # Keep only the pair with the smallest cost.
        offset = torch.argmin(cost)
        i_min, j_min = self.i_triu[offset], self.j_triu[offset]
        sol = torch.zeros(self.n, device=grammian.device)
        sol[i_min], sol[j_min] = gamma[offset], 1.0 - gamma[offset]
        return sol

`__init__(n_tasks)` ¶

Initializes the MinNormPlanarSolver.

Parameters:

Name	Type	Description	Default
`n_tasks`	`int`	Number of tasks/vectors.	required

Source code in vambn/modelling/mtl/minnormsolver.py

def __init__(self, n_tasks):
    """
    Initializes the MinNormPlanarSolver.

    Args:
        n_tasks (int): Number of tasks/vectors.
    """
    super().__init__()
    i_triu, j_triu = np.triu_indices(n_tasks, 1)
    i_triu = torch.from_numpy(i_triu)
    j_triu = torch.from_numpy(j_triu)

    self.register_buffer("n", torch.tensor(n_tasks))
    self.register_buffer("i_triu", i_triu)
    self.register_buffer("j_triu", j_triu)
    # A row/column index grid evaluated at (i, j) is simply (i, j), so the
    # pair indices are copied directly rather than gathered from a
    # torch.meshgrid (which also warns when called without `indexing`).
    self.register_buffer("ii_triu", i_triu.long().clone())
    self.register_buffer("jj_triu", j_triu.long().clone())
    self.register_buffer("one", torch.ones(self.ii_triu.shape))
    self.register_buffer("zero", torch.zeros(self.ii_triu.shape))

`forward(grammian)` ¶

Planar case solver, when Vi lies on the same plane.

Parameters:

Name	Type	Description	Default
`grammian`	`Tensor`	Grammian matrix G[i, j] = <Vi, Vj>, G is a nxn tensor.	required

Returns:

Name	Type	Description
`Tensor`		Coefficients c = [c1, ... cn] that solves the min-norm problem.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def forward(self, grammian):
    """
    Pick the best two-vector solution among all pairs (Vi, Vj), i < j.

    Args:
        grammian (Tensor): Grammian matrix G[i, j] = [<Vi, Vj>], G is a nxn tensor.

    Returns:
        Tensor: Coefficients c = [c1, ... cn]; only the two entries of the
        lowest-cost pair are non-zero.
    """
    rows, cols = self.ii_triu, self.jj_triu
    cross = grammian[rows, cols]
    norm_i = grammian[rows, rows]
    norm_j = grammian[cols, cols]

    gamma, cost = self.line_solver_vectorized(norm_i, cross, norm_j)

    # Index of the pair with the smallest cost.
    best = torch.argmin(cost)
    best_gamma = gamma[best]

    sol = torch.zeros(self.n, device=grammian.device)
    sol[self.i_triu[best]] = best_gamma
    sol[self.j_triu[best]] = 1.0 - best_gamma
    return sol

`line_solver_vectorized(v1v1, v1v2, v2v2)` ¶

Linear case solver, but for collection of vector pairs (Vi, Vj).

Parameters:

Name	Type	Description	Default
`v1v1`	`Tensor`	Vector of scalar products <Vi, Vi>.	required
`v1v2`	`Tensor`	Vector of scalar products <Vi, Vj>.	required
`v2v2`	`Tensor`	Vector of scalar products <Vj, Vj>.	required

Returns:

Name	Type	Description
`tuple`		A tuple containing: - gamma (Tensor): Vector of min-norm solution c = (gamma, 1. - gamma). - cost (Tensor): Vector of the norm of min-norm point.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def line_solver_vectorized(self, v1v1, v1v2, v2v2):
    """
    Linear case solver, but for collection of vector pairs (Vi, Vj).

    Args:
        v1v1 (Tensor): Vector of scalar products <Vi, Vi>.
        v1v2 (Tensor): Vector of scalar products <Vi, Vj>.
        v2v2 (Tensor): Vector of scalar products <Vj, Vj>.

    Returns:
        tuple: A tuple containing:
            - gamma (Tensor): Vector of min-norm solution c = (gamma, 1. - gamma).
            - cost (Tensor): Vector of the cost of each pair's solution.
    """
    gamma = (v2v2 - v1v2) / (v1v1 + v2v2 - 2 * v1v2 + 1e-8)
    # Fill values are built from `gamma` so they always share its dtype,
    # device and shape instead of relying on fixed float32 buffers.
    gamma = gamma.where(v1v2 < v2v2, torch.zeros_like(gamma))
    gamma = gamma.where(v1v2 < v1v1, torch.ones_like(gamma))

    cost = v2v2 + gamma * (v1v2 - v2v2)
    cost = cost.where(v1v2 < v2v2, v2v2)
    cost = cost.where(v1v2 < v1v1, v1v1)
    return gamma, cost

`MinNormSolver` ¶

Bases: Module

Solves the min norm problem in the general case.

Source code in vambn/modelling/mtl/minnormsolver.py

class MinNormSolver(nn.Module):
    """Solves the min norm problem in the general case.

    One vector is answered directly, two vectors use the closed-form linear
    solver, and three or more start from the planar solution and refine it
    iteratively with simplex projections.
    """

    def __init__(self, n_tasks, max_iter=250, stop_crit=1e-6):
        """
        Initializes the MinNormSolver.

        Args:
            n_tasks (int): Number of tasks/vectors.
            max_iter (int, optional): Maximum number of iterations. Defaults to 250.
            stop_crit (float, optional): Stopping criterion. Defaults to 1e-6.
        """
        super().__init__()
        self.n = n_tasks
        self.linear_solver = MinNormLinearSolver()
        self.planar_solver = MinNormPlanarSolver(n_tasks)

        n_grid = torch.arange(n_tasks)
        i_grid = torch.arange(n_tasks, dtype=torch.float32) + 1
        ii_grid, jj_grid = torch.meshgrid(n_grid, n_grid)

        self.register_buffer("n_ts", torch.tensor(n_tasks))
        self.register_buffer("i_grid", i_grid)
        self.register_buffer("ii_grid", ii_grid)
        self.register_buffer("jj_grid", jj_grid)
        self.register_buffer("zero", torch.zeros(n_tasks))
        self.register_buffer("stop_crit", torch.tensor(stop_crit))

        self.max_iter = max_iter
        # Scratch storage for the two-task solution. The attribute is spelled
        # `requires_grad`; the previous `require_grad = False` only created
        # an unrelated attribute and left the parameter trainable.
        self.two_sol = nn.Parameter(torch.zeros(2), requires_grad=False)

    @torch.no_grad()
    def projection_to_simplex(self, gamma):
        """
        Projects gamma to the simplex.

        Args:
            gamma (Tensor): The input tensor to project.

        Returns:
            Tensor: The projected tensor.
        """
        sorted_gamma, indices = torch.sort(gamma, descending=True)
        tmp_sum = torch.cumsum(sorted_gamma, 0)
        tmp_max = (tmp_sum - 1.0) / self.i_grid

        # First position where the running threshold exceeds the next sorted
        # entry; fall back to the last threshold if there is none.
        non_zeros = torch.nonzero(tmp_max[:-1] > sorted_gamma[1:])
        if non_zeros.shape[0] > 0:
            tmax_f = tmp_max[:-1][non_zeros[0][0]]
        else:
            tmax_f = tmp_max[-1]
        return torch.max(gamma - tmax_f, self.zero)

    @torch.no_grad()
    def next_point(self, cur_val, grad):
        """
        Computes the next point in the optimization.

        Args:
            cur_val (Tensor): Current value.
            grad (Tensor): Gradient.

        Returns:
            Tensor: The next point.
        """
        # Remove the mean so the step direction sums to zero.
        proj_grad = grad - (torch.sum(grad) / self.n_ts)
        lt_zero = torch.nonzero(proj_grad < 0)
        lt_zero = lt_zero.view(lt_zero.numel())
        gt_zero = torch.nonzero(proj_grad > 0)
        gt_zero = gt_zero.view(gt_zero.numel())
        # Step sizes at which a coordinate would reach 0 (tm1) or 1 (tm2).
        tm1 = -cur_val[lt_zero] / proj_grad[lt_zero]
        tm2 = (1.0 - cur_val[gt_zero]) / proj_grad[gt_zero]

        t = torch.tensor(1.0, device=grad.device)
        tm1_gt_zero = torch.nonzero(tm1 > 1e-7)
        tm1_gt_zero = tm1_gt_zero.view(tm1_gt_zero.numel())
        if tm1_gt_zero.shape[0] > 0:
            t = torch.min(tm1[tm1_gt_zero])

        tm2_gt_zero = torch.nonzero(tm2 > 1e-7)
        tm2_gt_zero = tm2_gt_zero.view(tm2_gt_zero.numel())
        if tm2_gt_zero.shape[0] > 0:
            t = torch.min(t, torch.min(tm2[tm2_gt_zero]))

        next_point = proj_grad * t + cur_val
        next_point = self.projection_to_simplex(next_point)
        return next_point

    @torch.no_grad()
    def forward(self, vecs):
        """
        General case solver using simplex projection algorithm.

        Args:
            vecs (Tensor): 2D tensor V, where each row is a vector Vi.

        Returns:
            Tensor: Coefficients c = [c1, ... cn] that solves the min-norm problem.
        """
        if self.n == 1:
            # A single vector carries the whole weight. Return the coefficient
            # vector [1.] as documented, not the input vector itself.
            return vecs[0].new_ones(1)
        if self.n == 2:
            v1v1 = torch.dot(vecs[0], vecs[0])
            v1v2 = torch.dot(vecs[0], vecs[1])
            v2v2 = torch.dot(vecs[1], vecs[1])
            self.two_sol[0], cost = self.linear_solver(v1v1, v1v2, v2v2)
            self.two_sol[1] = 1.0 - self.two_sol[0]
            return self.two_sol.clone()

        grammian = torch.mm(vecs, vecs.t())
        sol_vec = self.planar_solver(grammian)

        ii, jj = self.ii_grid, self.jj_grid
        for iter_count in range(self.max_iter):
            grad_dir = -torch.mv(grammian, sol_vec)
            new_point = self.next_point(sol_vec, grad_dir)

            # Scalar products of the two candidate combinations, expressed
            # through the Gram matrix.
            v1v1 = (sol_vec[ii] * sol_vec[jj] * grammian[ii, jj]).sum()
            v1v2 = (sol_vec[ii] * new_point[jj] * grammian[ii, jj]).sum()
            v2v2 = (new_point[ii] * new_point[jj] * grammian[ii, jj]).sum()

            gamma, cost = self.linear_solver(v1v1, v1v2, v2v2)
            new_sol_vec = gamma * sol_vec + (1 - gamma) * new_point
            change = new_sol_vec - sol_vec
            if torch.sum(torch.abs(change)) < self.stop_crit:
                return sol_vec
            sol_vec = new_sol_vec
        return sol_vec

`__init__(n_tasks, max_iter=250, stop_crit=1e-06)` ¶

Initializes the MinNormSolver.

Parameters:

Name	Type	Description	Default
`n_tasks`	`int`	Number of tasks/vectors.	required
`max_iter`	`int`	Maximum number of iterations. Defaults to 250.	`250`
`stop_crit`	`float`	Stopping criterion. Defaults to 1e-6.	`1e-06`

Source code in vambn/modelling/mtl/minnormsolver.py

def __init__(self, n_tasks, max_iter=250, stop_crit=1e-6):
    """
    Initializes the MinNormSolver.

    Args:
        n_tasks (int): Number of tasks/vectors.
        max_iter (int, optional): Maximum number of iterations. Defaults to 250.
        stop_crit (float, optional): Stopping criterion. Defaults to 1e-6.
    """
    super().__init__()
    self.n = n_tasks
    self.linear_solver = MinNormLinearSolver()
    self.planar_solver = MinNormPlanarSolver(n_tasks)

    n_grid = torch.arange(n_tasks)
    i_grid = torch.arange(n_tasks, dtype=torch.float32) + 1
    ii_grid, jj_grid = torch.meshgrid(n_grid, n_grid)

    self.register_buffer("n_ts", torch.tensor(n_tasks))
    self.register_buffer("i_grid", i_grid)
    self.register_buffer("ii_grid", ii_grid)
    self.register_buffer("jj_grid", jj_grid)
    self.register_buffer("zero", torch.zeros(n_tasks))
    self.register_buffer("stop_crit", torch.tensor(stop_crit))

    self.max_iter = max_iter
    # Scratch storage for the two-task solution. The attribute is spelled
    # `requires_grad`; the previous `require_grad = False` only created an
    # unrelated attribute and left the parameter trainable.
    self.two_sol = nn.Parameter(torch.zeros(2), requires_grad=False)

`forward(vecs)` ¶

General case solver using simplex projection algorithm.

Parameters:

Name	Type	Description	Default
`vecs`	`Tensor`	2D tensor V, where each row is a vector Vi.	required

Returns:

Name	Type	Description
`Tensor`		Coefficients c = [c1, ... cn] that solves the min-norm problem.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def forward(self, vecs):
    """
    General case solver using simplex projection algorithm.

    Args:
        vecs (Tensor): 2D tensor V, where each row is a vector Vi.

    Returns:
        Tensor: Coefficients c = [c1, ... cn] that solves the min-norm problem.
    """
    if self.n == 1:
        # A single vector carries the whole weight. Return the coefficient
        # vector [1.] as documented, not the input vector itself.
        return vecs[0].new_ones(1)
    if self.n == 2:
        v1v1 = torch.dot(vecs[0], vecs[0])
        v1v2 = torch.dot(vecs[0], vecs[1])
        v2v2 = torch.dot(vecs[1], vecs[1])
        self.two_sol[0], cost = self.linear_solver(v1v1, v1v2, v2v2)
        self.two_sol[1] = 1.0 - self.two_sol[0]
        return self.two_sol.clone()

    grammian = torch.mm(vecs, vecs.t())
    sol_vec = self.planar_solver(grammian)

    ii, jj = self.ii_grid, self.jj_grid
    for iter_count in range(self.max_iter):
        grad_dir = -torch.mv(grammian, sol_vec)
        new_point = self.next_point(sol_vec, grad_dir)

        # Scalar products of the two candidate combinations, expressed
        # through the Gram matrix.
        v1v1 = (sol_vec[ii] * sol_vec[jj] * grammian[ii, jj]).sum()
        v1v2 = (sol_vec[ii] * new_point[jj] * grammian[ii, jj]).sum()
        v2v2 = (new_point[ii] * new_point[jj] * grammian[ii, jj]).sum()

        gamma, cost = self.linear_solver(v1v1, v1v2, v2v2)
        new_sol_vec = gamma * sol_vec + (1 - gamma) * new_point
        change = new_sol_vec - sol_vec
        if torch.sum(torch.abs(change)) < self.stop_crit:
            return sol_vec
        sol_vec = new_sol_vec
    return sol_vec

`next_point(cur_val, grad)` ¶

Computes the next point in the optimization.

Parameters:

Name	Type	Description	Default
`cur_val`	`Tensor`	Current value.	required
`grad`	`Tensor`	Gradient.	required

Returns:

Name	Type	Description
`Tensor`		The next point.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def next_point(self, cur_val, grad):
    """
    Take one step from ``cur_val`` along the mean-centred gradient.

    Args:
        cur_val (Tensor): Current value.
        grad (Tensor): Gradient.

    Returns:
        Tensor: The stepped point after ``projection_to_simplex``.
    """
    # Remove the mean so the step direction sums to zero.
    proj_grad = grad - (torch.sum(grad) / self.n_ts)

    decreasing = proj_grad < 0
    increasing = proj_grad > 0
    # Step sizes at which a coordinate would reach 0 (tm1) or 1 (tm2).
    tm1 = -cur_val[decreasing] / proj_grad[decreasing]
    tm2 = (1.0 - cur_val[increasing]) / proj_grad[increasing]

    t = torch.tensor(1.0, device=grad.device)

    tm1_valid = tm1[tm1 > 1e-7]
    if tm1_valid.numel() > 0:
        t = torch.min(tm1_valid)

    tm2_valid = tm2[tm2 > 1e-7]
    if tm2_valid.numel() > 0:
        t = torch.min(t, torch.min(tm2_valid))

    return self.projection_to_simplex(proj_grad * t + cur_val)

`projection_to_simplex(gamma)` ¶

Projects gamma to the simplex.

Parameters:

Name	Type	Description	Default
`gamma`	`Tensor`	The input tensor to project.	required

Returns:

Name	Type	Description
`Tensor`		The projected tensor.

Source code in vambn/modelling/mtl/minnormsolver.py

@torch.no_grad()
def projection_to_simplex(self, gamma):
    """
    Shift ``gamma`` by a data-dependent threshold and clip at zero.

    Args:
        gamma (Tensor): The input tensor to project.

    Returns:
        Tensor: The projected tensor.
    """
    sorted_gamma, _ = gamma.sort(descending=True)
    running_total = sorted_gamma.cumsum(0)
    thresholds = (running_total - 1.0) / self.i_grid

    # Positions where the threshold exceeds the next sorted entry; the first
    # such position wins, otherwise the last threshold is used.
    head = thresholds[:-1]
    crossings = (head > sorted_gamma[1:]).nonzero()
    if crossings.shape[0] > 0:
        tmax_f = head[crossings[0][0]]
    else:
        tmax_f = thresholds[-1]
    return torch.max(gamma - tmax_f, self.zero)

`moo` ¶

This script includes code adapted from the 'impartial-vaes' repository with minor modifications. The original code can be found at: https://github.com/adrianjav/impartial-vaes

Credit to the original authors: Adrian Javaloy, Maryam Meghdadi, and Isabel Valera for their valuable work.

`MOOForLoop` ¶

Bases: Module

A PyTorch Module for Multiple Objective Optimization (MOO) within a loop.

Source code in vambn/modelling/mtl/moo.py

class MOOForLoop(nn.Module):
    """A PyTorch Module for Multiple Objective Optimization (MOO) within a loop.

    The forward pass broadcasts the input along a new leading "head"
    dimension. When a MOO method is supplied, a full backward hook replaces
    the per-head gradients with the directions computed by that method.
    """

    inputs: Optional[torch.Tensor]

    def __init__(self, num_heads: int, moo_method: Optional[nn.Module] = None):
        """
        Initialize the MOOForLoop module.

        Args:
            num_heads (int): Number of heads for extending the input.
            moo_method (nn.Module, optional): The MOO method to be used. Default is None.
        """
        super().__init__()

        # Held inside a list; presumably so nn.Module does not register the
        # method as a submodule -- TODO confirm intent.
        self._moo_method = [moo_method]
        self.num_heads = num_heads
        self.inputs = None
        self.outputs = None

        if self.moo_method is not None:
            self.register_full_backward_hook(MOOForLoop._hook)

    @property
    def moo_method(self):
        """Get the MOO method."""
        return self._moo_method[0]

    def _hook(
        self, grads_input: Tuple[torch.Tensor], grads_output: Any
    ) -> Tuple[torch.Tensor]:
        """
        Hook function to replace gradients with MOO directions.

        Args:
            grads_input (Tuple[torch.Tensor]): Gradients of the module's inputs.
            grads_output (Any): Gradients of the module's outputs.

        Returns:
            Tuple[torch.Tensor]: Modified gradients.
        """
        moo_directions = self.moo_method(
            grads_output[0], self.inputs, self.outputs
        )
        self.outputs = None

        # Rescale so the summed direction keeps the norm of the summed
        # original gradient; the clamp avoids dividing by zero.
        original_norm = grads_output[0].sum(dim=0).norm(p=2)
        moo_norm = moo_directions.sum(dim=0).norm(p=2).clamp_min(1e-10)
        moo_directions.mul_(original_norm / moo_norm)

        return (moo_directions.sum(dim=0),)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """
        Forward pass. Extend the input to the number of heads and store it.

        Args:
            z (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Extended input tensor.
        """
        extended_shape = [self.num_heads] + [-1 for _ in range(z.ndim)]
        method = self.moo_method
        # `moo_method` defaults to None, so check it before reading
        # `requires_input`; with no method there is nothing to store.
        if method is not None and method.requires_input and z.requires_grad:
            self.inputs = z.detach()
        extended_z = z.unsqueeze(0).expand(extended_shape)
        return extended_z

    def __str__(self) -> str:
        return f"MOOForLoop({self.moo_method})"

`moo_method` `property` ¶

Get the MOO method.

`__init__(num_heads, moo_method=None)` ¶

Initialize the MOOForLoop module.

Parameters:

Name	Type	Description	Default
`num_heads`	`int`	Number of heads for extending the input.	required
`moo_method`	`Module`	The MOO method to be used. Default is None.	`None`

Source code in vambn/modelling/mtl/moo.py

def __init__(self, num_heads: int, moo_method: Optional[nn.Module] = None):
    """
    Set up the head count, empty input/output slots and the optional hook.

    Args:
        num_heads (int): Number of heads for extending the input.
        moo_method (nn.Module, optional): The MOO method to be used. Default is None.
    """
    super().__init__()

    self.inputs = None
    self.outputs = None
    self.num_heads = num_heads
    # Held inside a list; presumably so nn.Module does not register the
    # method as a submodule -- TODO confirm intent.
    self._moo_method = [moo_method]

    # The gradient-rewriting hook is only installed when a method is given.
    if moo_method is not None:
        self.register_full_backward_hook(MOOForLoop._hook)

`forward(z)` ¶

Forward pass. Extend the input to the number of heads and store it.

Parameters:

Name	Type	Description	Default
`z`	`Tensor`	Input tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: Extended input tensor.

Source code in vambn/modelling/mtl/moo.py

def forward(self, z: torch.Tensor) -> torch.Tensor:
    """
    Forward pass. Extend the input to the number of heads and store it.

    Args:
        z (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Extended input tensor.
    """
    extended_shape = [self.num_heads] + [-1 for _ in range(z.ndim)]
    method = self.moo_method
    # `moo_method` defaults to None, so check it before reading
    # `requires_input`; with no method there is nothing to store.
    if method is not None and method.requires_input and z.requires_grad:
        self.inputs = z.detach()
    extended_z = z.unsqueeze(0).expand(extended_shape)
    return extended_z

`MooMulti` ¶

Bases: Module

A PyTorch Module for Multiple Objective Optimization (MOO) within a loop.

Source code in vambn/modelling/mtl/moo.py

class MooMulti(nn.Module):
    """Pass-through module whose backward hook swaps in MOO directions.

    The forward pass returns its input untouched. When a MOO method is
    supplied, a full backward hook replaces the incoming gradient with the
    rescaled output of that method.
    """

    inputs: Optional[torch.Tensor]

    def __init__(
        self, num_modules: int, moo_method: Optional[nn.Module] = None
    ):
        """
        Initialize the MooMulti module.

        Args:
            num_modules (int): Stored as ``num_heads``.
            moo_method (nn.Module, optional): The MOO method to be used. Default is None.
        """
        super().__init__()

        self.inputs = None
        self.outputs = None
        self.num_heads = num_modules
        # Held inside a list; presumably so nn.Module does not register the
        # method as a submodule -- TODO confirm intent.
        self._moo_method = [moo_method]

        # The gradient-rewriting hook is only installed when a method is given.
        if moo_method is not None:
            self.register_full_backward_hook(MooMulti._hook)

    @property
    def moo_method(self):
        """Return the wrapped MOO method (or None)."""
        return self._moo_method[0]

    def _hook(
        self, grads_input: Tuple[torch.Tensor], grads_output: Any
    ) -> Tuple[torch.Tensor]:
        """
        Backward hook that substitutes the gradient with MOO directions.

        Args:
            grads_input (Tuple[torch.Tensor]): Gradients of the module's inputs.
            grads_output (Any): Gradients of the module's outputs.

        Returns:
            Tuple[torch.Tensor]: One-element tuple with the rescaled directions.

        Raises:
            ValueError: If the MOO directions do not have the shape of
                ``grads_output[0]`` or of ``grads_input[0]``.
        """
        incoming = grads_output[0]
        directions = self.moo_method(incoming, self.inputs, self.outputs)
        self.outputs = None

        if incoming.shape != directions.shape:
            raise ValueError(
                f"MOO directions shape {directions.shape} does not match grads_output shape {incoming.shape}"
            )

        # Give the directions the same L2 norm as the original gradient; the
        # clamp avoids dividing by zero.
        scale = incoming.norm(p=2) / directions.norm(p=2).clamp_min(1e-10)
        rescaled = directions * scale

        expected_shape = grads_input[0].shape
        if expected_shape != rescaled.shape:
            raise ValueError(
                f"Scaled MOO directions shape {rescaled.shape} does not match grads_input shape {expected_shape}"
            )
        return (rescaled,)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """
        Forward pass. Returns the input unchanged.

        Args:
            z (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: The same tensor ``z``.
        """
        return z

    def __str__(self) -> str:
        return f"MooMulti({self.moo_method})"

`moo_method` `property` ¶

Get the MOO method.

`__init__(num_modules, moo_method=None)` ¶

Initialize the MooMulti module.

Parameters:

Name	Type	Description	Default
`num_modules`	`int`	Number of heads for extending the input.	required
`moo_method`	`Module`	The MOO method to be used. Default is None.	`None`

Source code in vambn/modelling/mtl/moo.py

def __init__(
    self, num_modules: int, moo_method: Optional[nn.Module] = None
):
    """
    Set up the stored count, empty input/output slots and the optional hook.

    Args:
        num_modules (int): Stored as ``num_heads``.
        moo_method (nn.Module, optional): The MOO method to be used. Default is None.
    """
    super().__init__()

    self.inputs = None
    self.outputs = None
    self.num_heads = num_modules
    # Held inside a list; presumably so nn.Module does not register the
    # method as a submodule -- TODO confirm intent.
    self._moo_method = [moo_method]

    # The gradient-rewriting hook is only installed when a method is given.
    if moo_method is not None:
        self.register_full_backward_hook(MooMulti._hook)

`forward(z)` ¶

Forward pass. Returns the input tensor unchanged; MooMulti only acts during the backward pass, through its gradient hook.

Parameters:

Name	Type	Description	Default
`z`	`Tensor`	Input tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: The input tensor, unchanged.

Source code in vambn/modelling/mtl/moo.py

def forward(self, z: torch.Tensor) -> torch.Tensor:
    """
    Forward pass. Returns the input unchanged.

    Nothing is extended or stored here; the tensor is handed back as is.

    Args:
        z (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: The same tensor ``z``.
    """
    return z

`MultiMOOForLoop` ¶

Bases: Module

A PyTorch Module for applying multiple MOOForLoop modules in parallel.

Source code in vambn/modelling/mtl/moo.py

class MultiMOOForLoop(nn.Module):
    """Bundle of MOOForLoop modules, one per input tensor."""

    def __init__(self, num_heads: int, moo_methods: Sequence[nn.Module]):
        """
        Build one MOOForLoop per supplied MOO method.

        Args:
            num_heads (int): Number of heads for each MOOForLoop.
            moo_methods (Sequence[nn.Module]): List of MOO methods to be used.
        """
        super().__init__()

        self.num_inputs = len(moo_methods)
        loops = []
        for method in moo_methods:
            loops.append(MOOForLoop(num_heads, method))
        # Kept as a plain Python list, exactly as before.
        self.loops = loops

    def forward(self, *args) -> Generator[torch.Tensor, None, None]:
        """
        Pair each input with its MOOForLoop and apply it lazily.

        Args:
            *args (torch.Tensor): Variable number of input tensors.

        Returns:
            Generator: A generator of extended input tensors; each loop runs
            only when its item is consumed.

        Raises:
            ValueError: If the number of inputs differs from the number of
                MOO methods given at construction.
        """
        received = len(args)
        if received != self.num_inputs:
            raise ValueError(
                f"Expected {self.num_inputs} inputs, got {received} instead."
            )
        return (loop(z) for loop, z in zip(self.loops, args))

`__init__(num_heads, moo_methods)` ¶

Initialize the MultiMOOForLoop module.

Parameters:

Name	Type	Description	Default
`num_heads`	`int`	Number of heads for each MOOForLoop.	required
`moo_methods`	`Sequence[Module]`	List of MOO methods to be used.	required

Source code in vambn/modelling/mtl/moo.py

def __init__(self, num_heads: int, moo_methods: Sequence[nn.Module]):
    """
    Build one MOOForLoop per supplied MOO method.

    Args:
        num_heads (int): Number of heads for each MOOForLoop.
        moo_methods (Sequence[nn.Module]): List of MOO methods to be used.
    """
    super().__init__()

    self.num_inputs = len(moo_methods)
    loops = []
    for method in moo_methods:
        loops.append(MOOForLoop(num_heads, method))
    # Kept as a plain Python list, exactly as before.
    self.loops = loops

`forward(*args)` ¶

Forward pass. Applies each MOOForLoop to its corresponding input.

Parameters:

Name	Type	Description	Default
`*args`	`Tensor`	Variable number of input tensors.	`()`

Returns:

Name	Type	Description
`Generator`	`None`	A generator of extended input tensors after applying MOOForLoop.

Source code in vambn/modelling/mtl/moo.py

def forward(self, *args) -> Generator[torch.Tensor, None, None]:
    """
    Pair each input with its MOOForLoop and apply it lazily.

    Args:
        *args (torch.Tensor): Variable number of input tensors.

    Returns:
        Generator: A generator of extended input tensors; each loop runs only
        when its item is consumed.

    Raises:
        ValueError: If the number of inputs differs from ``self.num_inputs``.
    """
    received = len(args)
    if received != self.num_inputs:
        raise ValueError(
            f"Expected {self.num_inputs} inputs, got {received} instead."
        )
    return (loop(z) for loop, z in zip(self.loops, args))

`setup_moo(hparams, num_tasks)` ¶

Setup the multi-task learning module.

Parameters:

Name	Type	Description	Default
`hparams`	`List[MtlMethodParams]`	MTL method parameters.	required
`num_tasks`	`int`	Number of tasks to perform.	required

Raises:

Type	Description
`ValueError`	If invalid method name is provided.

Returns:

Type	Description
`Module`	nn.Module: Module for MTL objective.

Source code in vambn/modelling/mtl/moo.py

def setup_moo(hparams: List[MtlMethodParams], num_tasks: int) -> nn.Module:
    """
    Setup the multi-task learning module.

    Each entry of ``hparams`` is looked up by name in ``mtl.MtlMethods`` and
    instantiated with the arguments that method takes; the results are
    combined with ``mtl.Compose``.

    Args:
        hparams (List[MtlMethodParams]): MTL method parameters.
        num_tasks (int): Number of tasks to perform.

    Raises:
        ValueError: If invalid method name is provided.

    Returns:
        nn.Module: ``mtl.Identity()`` when ``hparams`` is empty, otherwise the
        composed MTL module.
    """
    if len(hparams) == 0:
        return mtl.Identity()

    modules = []
    for obj in hparams:
        try:
            method = mtl.MtlMethods[obj.name].value
        except KeyError as err:
            # Chain the lookup failure so the traceback shows its origin.
            raise ValueError(f"Invalid method name: {obj.name}") from err

        if obj.name == "nsgd":
            modules.append(method(num_tasks=num_tasks, update_at=obj.update_at))
        elif obj.name == "gradnorm":
            modules.append(
                method(
                    num_tasks=num_tasks,
                    alpha=obj.alpha,
                    update_at=obj.update_at,
                )
            )
        elif obj.name == "cagrad":
            modules.append(method(alpha=obj.alpha))
        elif obj.name == "graddrop":
            modules.append(method(leakage=[0.2] * num_tasks))
        else:
            modules.append(method())

    # `hparams` is non-empty here and every iteration appends exactly one
    # module (or raises), so `modules` can never be empty.
    return mtl.Compose(*modules)

`mtl` ¶

This script includes code adapted from the 'impartial-vaes' repository with minor modifications. The original code can be found at: https://github.com/adrianjav/impartial-vaes

Credit to the original authors: Adrian Javaloy, Maryam Meghdadi, and Isabel Valera for their valuable work.

`CAGrad` ¶

Bases: MOOMethod

CAGrad method for multiple objective optimization.

Source code in vambn/modelling/mtl/mtl.py

class CAGrad(MOOMethod):
    """CAGrad method for multiple objective optimization."""

    requires_input: bool = False

    def __init__(self, alpha: float):
        """
        Initialize CAGrad method.

        Args:
            alpha: Alpha parameter for CAGrad; it scales the norm of the
                average gradient to form the constant used in ``forward``.
        """
        super(CAGrad, self).__init__()
        self.alpha = alpha

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using CAGrad method.

        Task weights are found by a constrained minimisation over the
        probability simplex (``scipy``'s ``minimize``); every task gradient
        is then shifted along the weighted gradient and divided by the
        number of tasks.

        Args:
            grads: Gradients tensor; the first dimension indexes the tasks.
            inputs: Input tensor (unused).
            outputs: Output tensor (unused).

        Returns:
            New gradients tensor with the same shape as ``grads``.
        """
        original_shape = grads.size()
        num_tasks = len(grads)
        # One column per task after flattening and transposing.
        flat = grads.flatten(start_dim=1).t()

        # Gram matrix of the task gradients, moved to CPU for the solver.
        gram = flat.t().mm(flat).cpu()
        g0_norm = (gram.mean() + 1e-8).sqrt()

        x_start = np.ones(num_tasks) / num_tasks
        bounds = tuple((0, 1) for _ in x_start)
        simplex = {"type": "eq", "fun": lambda x: 1 - sum(x)}

        A = gram.numpy()
        b = x_start.copy()
        c = (self.alpha * g0_norm + 1e-8).item()

        def objfn(x):
            linear = x.reshape(1, num_tasks).dot(A).dot(b.reshape(num_tasks, 1))
            quadratic = x.reshape(1, num_tasks).dot(A).dot(
                x.reshape(num_tasks, 1)
            )
            return (linear + c * np.sqrt(quadratic + 1e-8)).sum()

        solution = minimize(objfn, x_start, bounds=bounds, constraints=simplex)

        weights = torch.Tensor(solution.x).to(flat.device)
        weighted_grad = (flat * weights.view(1, -1)).sum(1)
        lmbda = c / (weighted_grad.norm() + 1e-8)
        combined = (flat + lmbda * weighted_grad.unsqueeze(1)) / num_tasks

        return combined.t().reshape(original_shape)

`__init__(alpha)` ¶

Initialize CAGrad method.

Parameters:

Name	Type	Description	Default
`alpha`	`float`	Alpha parameter for CAGrad.	required

Source code in vambn/modelling/mtl/mtl.py

def __init__(self, alpha: float):
    """
    Initialize CAGrad method.

    Args:
        alpha: Alpha parameter for CAGrad. It is stored unchanged and used
            in ``forward`` to scale the norm of the average gradient.
    """
    super(CAGrad, self).__init__()
    self.alpha = alpha

`forward(grads, inputs, outputs)` ¶

Compute new gradients using CAGrad method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using CAGrad method.

    Task weights are found by a constrained minimisation over the
    probability simplex (``scipy``'s ``minimize``); every task gradient is
    then shifted along the weighted gradient and divided by the number of
    tasks.

    Args:
        grads: Gradients tensor; the first dimension indexes the tasks.
        inputs: Input tensor (unused).
        outputs: Output tensor (unused).

    Returns:
        New gradients tensor with the same shape as ``grads``.
    """
    original_shape = grads.size()
    num_tasks = len(grads)
    # One column per task after flattening and transposing.
    flat = grads.flatten(start_dim=1).t()

    # Gram matrix of the task gradients, moved to CPU for the solver.
    gram = flat.t().mm(flat).cpu()
    g0_norm = (gram.mean() + 1e-8).sqrt()

    x_start = np.ones(num_tasks) / num_tasks
    bounds = tuple((0, 1) for _ in x_start)
    simplex = {"type": "eq", "fun": lambda x: 1 - sum(x)}

    A = gram.numpy()
    b = x_start.copy()
    c = (self.alpha * g0_norm + 1e-8).item()

    def objfn(x):
        linear = x.reshape(1, num_tasks).dot(A).dot(b.reshape(num_tasks, 1))
        quadratic = x.reshape(1, num_tasks).dot(A).dot(
            x.reshape(num_tasks, 1)
        )
        return (linear + c * np.sqrt(quadratic + 1e-8)).sum()

    solution = minimize(objfn, x_start, bounds=bounds, constraints=simplex)

    weights = torch.Tensor(solution.x).to(flat.device)
    weighted_grad = (flat * weights.view(1, -1)).sum(1)
    lmbda = c / (weighted_grad.norm() + 1e-8)
    combined = (flat + lmbda * weighted_grad.unsqueeze(1)) / num_tasks

    return combined.t().reshape(original_shape)

`Compose` ¶

Bases: MOOMethod

Compose multiple MOO methods.

Parameters:

Name	Type	Description	Default
`modules`	`MOOMethod`	List of MOO methods to compose.	`()`

Attributes:

Name	Type	Description
`methods`	`ModuleList`	List of MOO methods.
`requires_input`	`bool`	Flag indicating if input is required.

Source code in vambn/modelling/mtl/mtl.py

class Compose(MOOMethod):
    """
    Chain several MOO methods into a single one.

    Args:
        modules (MOOMethod): MOO methods to apply, in order.

    Attributes:
        methods (nn.ModuleList): The wrapped MOO methods.
        requires_input (bool): True when at least one wrapped method
            requires the input tensor.

    """

    def __init__(self, *modules: MOOMethod):
        super().__init__()
        self.methods = nn.ModuleList(modules)
        self.requires_input = any(m.requires_input for m in modules)

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Run the wrapped methods one after another.

        Each method receives the gradients produced by the previous one,
        together with the same ``inputs`` and ``outputs``.

        Args:
            grads (torch.Tensor): Gradients tensor.
            inputs (torch.Tensor): Input tensor.
            outputs (torch.Tensor): Output tensor.

        Returns:
            torch.Tensor: Gradients after the last method has been applied.
        """
        current = grads
        for method in self.methods:
            current = method(current, inputs, outputs)
        return current

`forward(grads, inputs, outputs)` ¶

Apply composed MOO methods sequentially.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: Modified gradients.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Run the wrapped methods one after another.

    Each method receives the gradients produced by the previous one,
    together with the same ``inputs`` and ``outputs``.

    Args:
        grads (torch.Tensor): Gradients tensor.
        inputs (torch.Tensor): Input tensor.
        outputs (torch.Tensor): Output tensor.

    Returns:
        torch.Tensor: Gradients after the last method has been applied.
    """
    current = grads
    for method in self.methods:
        current = method(current, inputs, outputs)
    return current

`GradDrop` ¶

Bases: MOOMethod

Gradient Dropout (GradDrop) method for MOO.

Parameters:

Name	Type	Description	Default
`leakage`	`List[float]`	List of leakage rates for each task.	required

Attributes:

Name	Type	Description
`leakage`	`List[float]`	List of leakage rates for each task.

Source code in vambn/modelling/mtl/mtl.py

class GradDrop(MOOMethod):
    """Gradient Dropout (GradDrop) method for MOO.

    Args:
        leakage (List[float]): List of leakage rates for each task.

    Attributes:
        leakage (List[float]): List of leakage rates for each task.

    """

    requires_input: bool = True

    def __init__(self, leakage: List[float]):
        """
        Initialize GradDrop method.

        Args:
            leakage (List[float]): List of leakage rates for each task. A
                rate of 0 applies the sampled mask as is; a rate of 1 lets
                the task gradient through untouched.

        Raises:
            AssertionError: If any leakage rate is not in the range [0, 1].

        """
        super(GradDrop, self).__init__()
        assert all(
            [0 <= x <= 1 for x in leakage]
        ), "All leakages should be in the range [0, 1]"
        self.leakage = leakage

    def forward(
        self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute new gradients using GradDrop method.

        For every position a "positive sign" probability is derived from the
        sign-adjusted task gradients; one uniform sample per position then
        decides which sign survives, and task gradients of the other sign
        are masked out (up to the per-task leakage).

        Args:
            grads (torch.Tensor): Gradients tensor; the first dimension
                indexes the tasks.
            inputs (torch.Tensor): Input tensor whose sign is multiplied
                into each task gradient.
            outputs (torch.Tensor): Output tensor (unused).

        Returns:
            torch.Tensor: New gradients tensor.

        Raises:
            AssertionError: If the number of leakage parameters does not match the number of task gradients.

        """
        assert len(self.leakage) == len(
            grads
        ), "Leakage parameters should match the number of task gradients"

        batch_separated = len(grads[0].size()) > 1
        input_sign = inputs.sign()
        sign_grads = []
        for grad in grads:
            signed = input_sign * grad
            if batch_separated:
                # Sum the *sign-adjusted* gradient over the batch dimension.
                # The previous code summed the raw gradient here, which
                # silently dropped the multiplication by the input sign.
                signed = signed.sum(dim=0, keepdim=True)
            sign_grads.append(signed)

        signed_total = sum(sign_grads)
        abs_total = sum(map(torch.abs, sign_grads))
        # Clamp the whole expression: `0.5 * (...).clamp(0, 1)` clamps the
        # parenthesised term first, which capped the odds at 0.5.
        odds = (0.5 * (1 + signed_total / (abs_total + 1e-15))).clamp(0, 1)
        assert odds.size() == sign_grads[0].size()  # pytype: disable=attribute-error

        new_grads = []
        samples = torch.rand(odds.size(), device=grads[0].device)
        for i, grad in enumerate(grads):
            keep = ((odds > samples) & (sign_grads[i] > 0)) | (
                (odds < samples) & (sign_grads[i] < 0)
            )
            mask_i = torch.where(
                keep, torch.ones_like(odds), torch.zeros_like(odds)
            )
            # Blend the hard mask towards 1 by the task's leakage rate.
            mask_i = torch.lerp(
                mask_i, torch.ones_like(mask_i), self.leakage[i]
            )
            assert mask_i.size() == odds.size()
            new_grads.append(mask_i * grad)

        return torch.stack(new_grads, dim=0)

`__init__(leakage)` ¶

Initialize GradDrop method.

Parameters:

Name	Type	Description	Default
`leakage`	`List[float]`	List of leakage rates for each task.	required

Raises:

Type	Description
`AssertionError`	If any leakage rate is not in the range [0, 1].

Source code in vambn/modelling/mtl/mtl.py

def __init__(self, leakage: List[float]):
    """
    Initialize GradDrop method.

    Args:
        leakage (List[float]): List of leakage rates for each task. In
            ``forward`` the sampled 0/1 mask of task ``i`` is interpolated
            towards 1 by ``leakage[i]``.

    Raises:
        AssertionError: If any leakage rate is not in the range [0, 1].

    """
    super(GradDrop, self).__init__()
    # Validation relies on `assert`, so it is skipped under `python -O`.
    assert all(
        [0 <= x <= 1 for x in leakage]
    ), "All leakages should be in the range [0, 1]"
    self.leakage = leakage

`forward(grads, inputs, outputs)` ¶

Compute new gradients using GradDrop method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Raises:

Type	Description
`AssertionError`	If the number of leakage parameters does not match the number of task gradients.

Source code in vambn/modelling/mtl/mtl.py

def forward(
    self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor
) -> torch.Tensor:
    """
    Compute new gradients using GradDrop method.

    For every position a "positive sign" probability is derived from the
    sign-adjusted task gradients; one uniform sample per position then
    decides which sign survives, and task gradients of the other sign are
    masked out (up to the per-task leakage).

    Args:
        grads (torch.Tensor): Gradients tensor; the first dimension indexes
            the tasks.
        inputs (torch.Tensor): Input tensor whose sign is multiplied into
            each task gradient.
        outputs (torch.Tensor): Output tensor (unused).

    Returns:
        torch.Tensor: New gradients tensor.

    Raises:
        AssertionError: If the number of leakage parameters does not match the number of task gradients.

    """
    assert len(self.leakage) == len(
        grads
    ), "Leakage parameters should match the number of task gradients"

    batch_separated = len(grads[0].size()) > 1
    input_sign = inputs.sign()
    sign_grads = []
    for grad in grads:
        signed = input_sign * grad
        if batch_separated:
            # Sum the *sign-adjusted* gradient over the batch dimension.
            # The previous code summed the raw gradient here, which
            # silently dropped the multiplication by the input sign.
            signed = signed.sum(dim=0, keepdim=True)
        sign_grads.append(signed)

    signed_total = sum(sign_grads)
    abs_total = sum(map(torch.abs, sign_grads))
    # Clamp the whole expression: `0.5 * (...).clamp(0, 1)` clamps the
    # parenthesised term first, which capped the odds at 0.5.
    odds = (0.5 * (1 + signed_total / (abs_total + 1e-15))).clamp(0, 1)
    assert odds.size() == sign_grads[0].size()  # pytype: disable=attribute-error

    new_grads = []
    samples = torch.rand(odds.size(), device=grads[0].device)
    for i, grad in enumerate(grads):
        keep = ((odds > samples) & (sign_grads[i] > 0)) | (
            (odds < samples) & (sign_grads[i] < 0)
        )
        mask_i = torch.where(
            keep, torch.ones_like(odds), torch.zeros_like(odds)
        )
        # Blend the hard mask towards 1 by the task's leakage rate.
        mask_i = torch.lerp(
            mask_i, torch.ones_like(mask_i), self.leakage[i]
        )
        assert mask_i.size() == odds.size()
        new_grads.append(mask_i * grad)

    return torch.stack(new_grads, dim=0)

`GradNorm` ¶

Bases: GradNormBase

Gradient Normalization (GradNorm) method for MOO.

Parameters:

Name	Type	Description	Default
`num_tasks`	`int`	Number of tasks.	required
`alpha`	`float`	Alpha parameter for GradNorm.	required
`update_at`	`int`	Update interval.	`20`

Attributes:

Name	Type	Description
`requires_input`	`bool`	Flag indicating whether input is required.

Methods:

Name	Description
`forward`	Compute new gradients using GradNorm method.

Source code in vambn/modelling/mtl/mtl.py

class GradNorm(GradNormBase):
    """Gradient Normalization (GradNorm) method for MOO.

    Thin subclass of ``GradNormBase``: the constructor arguments
    (``num_tasks``, ``alpha``, ``update_at``) are those of the base class,
    and ``forward`` passes ``outputs`` to ``_forward`` as the per-task
    values.

    Attributes:
        requires_input (bool): Flag indicating whether input is required.

    Methods:
        forward: Compute new gradients using GradNorm method.

    """

    requires_input: bool = False

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using GradNorm method.

        Args:
            grads (torch.Tensor): Gradients tensor.
            inputs (torch.Tensor): Input tensor (unused).
            outputs (torch.Tensor): Output tensor, forwarded to
                ``_forward`` as the per-task values.

        Returns:
            torch.Tensor: New gradients tensor.
        """
        return self._forward(grads, outputs)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using GradNorm method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using GradNorm method.

    Args:
        grads (torch.Tensor): Gradients tensor.
        inputs (torch.Tensor): Input tensor (unused).
        outputs (torch.Tensor): Output tensor, used as the per-task values.

    Returns:
        torch.Tensor: New gradients tensor.
    """
    # The task outputs play the role of the per-task values.
    task_values = outputs
    return self._forward(grads, task_values)

`GradNormBase` ¶

Bases: MOOMethod

Base class for Gradient Normalization (GradNorm) method.

Source code in vambn/modelling/mtl/mtl.py

class GradNormBase(MOOMethod):
    """Base class for Gradient Normalization (GradNorm) method.

    Keeps one learnable parameter per task (``weight_``) and exposes the
    normalized task weights through the ``weight`` property. Subclasses
    decide which per-task values are passed to ``_forward``.
    """

    # Buffer with the reference value of each task (registered in __init__).
    initial_values: torch.Tensor
    # Buffer counting how many times `_forward` has been called.
    counter: torch.Tensor

    def __init__(self, num_tasks: int, alpha: float, update_at: int = 20):
        """
        Initialize GradNormBase method.

        Args:
            num_tasks (int): Number of tasks.
            alpha (float): Alpha parameter for GradNorm.
            update_at (int): Update interval.
        """
        super(GradNormBase, self).__init__()
        self.epsilon = 1e-5
        self.num_tasks = num_tasks
        # Raw (pre-exponential) task weights; see the `weight` property.
        self.weight_ = nn.Parameter(torch.ones([num_tasks]), requires_grad=True)
        self.alpha = alpha
        self.update_at = update_at
        self.register_buffer("initial_values", torch.ones(self.num_tasks))
        self.register_buffer("counter", torch.zeros([]))

    @property
    def weight(self) -> torch.Tensor:
        """
        Compute normalized weights.

        The raw parameters are exponentiated, clamped from below at
        ``epsilon`` and rescaled so that they sum to ``num_tasks``.

        Returns:
            torch.Tensor: Normalized weights.
        """
        ws = self.weight_.exp().clamp(self.epsilon, float("inf"))
        norm_coef = self.num_tasks / ws.sum()
        return ws * norm_coef

    def _forward(self, grads: torch.Tensor, values: List[float]) -> torch.Tensor:
        """
        Compute new gradients using GradNorm method.

        Builds the GradNorm loss from the per-task gradient norms and the
        relative task values, calls ``backward()`` on it, and returns the
        gradients scaled by the (detached) task weights.

        Args:
            grads (torch.Tensor): Gradients tensor.
            values (List[float]): Values for each task.
                NOTE(review): the elements are divided by tensors and then
                passed to ``torch.stack``, so they are presumably tensors
                rather than plain floats -- confirm against the callers.

        Returns:
            torch.Tensor: New gradients tensor.
        """
        # `initial_values` is registered as a tensor of ones in __init__, so
        # the `is None` test only matters if something resets it elsewhere.
        # The reference values are replaced on the call where the counter
        # (incremented once per call, starting at 0) equals `update_at`.
        if self.initial_values is None or self.counter == self.update_at:
            self.initial_values = torch.tensor(values)
        self.counter += 1

        # Gradients are re-enabled explicitly for the weight loss below.
        with torch.enable_grad():
            grads_norm = grads.flatten(start_dim=1).norm(p=2, dim=1)
            # Mean weighted gradient norm, detached so it acts as a constant
            # target in the loss.
            mean_grad_norm = (
                torch.mean(batch_product(grads_norm, self.weight), dim=0)
                .detach()
                .clone()
            )

            # Task values relative to their reference values.
            values = [
                x / y.clamp_min(self.epsilon)
                for x, y in zip(values, self.initial_values)
            ]
            average_value = torch.mean(torch.stack(values))

            loss = grads.new_zeros([])
            for i, [grad, value] in enumerate(zip(grads_norm, values)):
                r_i = value / average_value.clamp_min(self.epsilon)
                loss += torch.abs(
                    grad * self.weight[i]
                    - mean_grad_norm * torch.pow(r_i, self.alpha)
                )
            # Populates `weight_.grad`. NOTE(review): if `grads` or `values`
            # are still attached to a graph, gradients flow there as well --
            # confirm they are detached by the caller.
            loss.backward()

        with torch.no_grad():
            new_grads = batch_product(grads, self.weight.detach())
        return new_grads

`weight` `property` ¶

Compute normalized weights.

Returns:

Type	Description
`Tensor`	torch.Tensor: Normalized weights.

`__init__(num_tasks, alpha, update_at=20)` ¶

Initialize GradNormBase method.

Parameters:

Name	Type	Description	Default
`num_tasks`	`int`	Number of tasks.	required
`alpha`	`float`	Alpha parameter for GradNorm.	required
`update_at`	`int`	Update interval.	`20`

Source code in vambn/modelling/mtl/mtl.py

def __init__(self, num_tasks: int, alpha: float, update_at: int = 20):
    """
    Initialize GradNormBase method.

    Args:
        num_tasks (int): Number of tasks.
        alpha (float): Alpha parameter for GradNorm.
        update_at (int): Update interval.
    """
    super(GradNormBase, self).__init__()
    self.epsilon = 1e-5
    self.num_tasks = num_tasks
    # Raw (pre-exponential) task weights, one learnable entry per task.
    self.weight_ = nn.Parameter(torch.ones([num_tasks]), requires_grad=True)
    self.alpha = alpha
    self.update_at = update_at
    # Buffers: per-task reference values (start at 1) and a call counter
    # (starts at 0).
    self.register_buffer("initial_values", torch.ones(self.num_tasks))
    self.register_buffer("counter", torch.zeros([]))

`GradNormModified` ¶

Bases: GradNormBase

Modified Gradient Normalization (GradNorm) method for MOO.

Uses task-gradient convergence instead of task loss convergence.

Attributes:

Name	Type	Description
`requires_input`	`bool`	Indicates whether the method requires input tensor.

Methods:

Name	Description
`forward`	Compute new gradients using modified GradNorm method.

Source code in vambn/modelling/mtl/mtl.py

class GradNormModified(GradNormBase):
    """
    Modified Gradient Normalization (GradNorm) method for MOO.

    Uses task-gradient convergence instead of task loss convergence: the
    per-task values handed to ``_forward`` are the L2 norms of the task
    gradients.

    Attributes:
        requires_input (bool): Indicates whether the method requires input tensor.

    Methods:
        forward(grads, inputs, outputs): Compute new gradients using modified GradNorm method.

    """

    requires_input: bool = False

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using modified GradNorm method.

        Args:
            grads (torch.Tensor): Gradients tensor.
            inputs (torch.Tensor): Input tensor (unused).
            outputs (torch.Tensor): Output tensor (unused).

        Returns:
            torch.Tensor: New gradients tensor.
        """
        # One L2 norm per task, taken over the flattened gradient.
        task_grad_norms = grads.flatten(start_dim=1).norm(p=2, dim=1)
        return self._forward(grads, task_grad_norms)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using modified GradNorm method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using modified GradNorm method.

    Args:
        grads (torch.Tensor): Gradients tensor.
        inputs (torch.Tensor): Input tensor (unused).
        outputs (torch.Tensor): Output tensor (unused).

    Returns:
        torch.Tensor: New gradients tensor.
    """
    # One L2 norm per task, taken over the flattened gradient.
    task_grad_norms = grads.flatten(start_dim=1).norm(p=2, dim=1)
    return self._forward(grads, task_grad_norms)

`GradVac` ¶

Bases: MOOMethod

Gradient Vaccination (GradVac) method for MOO.

Source code in vambn/modelling/mtl/mtl.py

class GradVac(MOOMethod):
    """Gradient Vaccination (GradVac) method for MOO."""

    requires_input: bool = False

    def __init__(self, decay: float):
        """
        Initialize GradVac method.

        Args:
            decay: Decay rate for EMA.
        """
        super(GradVac, self).__init__()
        self.decay = decay

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using GradVac method.

        Each task gradient is compared, in random order, with the gradients
        of the other tasks; whenever their cosine similarity is below the
        current EMA target, a multiple of the other gradient is added.

        Args:
            grads: Gradients tensor; the first dimension indexes the tasks.
            inputs: Input tensor (unused).
            outputs: Output tensor (unused).

        Returns:
            New gradients tensor.
        """

        def vac_projection(u: torch.Tensor, v: torch.Tensor, pre_ema: float, post_ema: float) -> torch.Tensor:
            norm_u = torch.dot(u, u).sqrt()
            norm_v = torch.dot(v, v).sqrt()

            # Cosine similarities can land marginally outside [-1, 1] due to
            # rounding; clamp the radicand so math.sqrt cannot raise
            # "math domain error".
            pre_sin = math.sqrt(max(0.0, 1.0 - float(pre_ema) ** 2))
            post_sin = math.sqrt(max(0.0, 1.0 - float(post_ema) ** 2))

            numer = norm_u * (pre_ema * post_sin - post_ema * pre_sin)
            denom = norm_v * pre_sin

            return numer / denom.clamp_min(1e-15) * v

        size = grads.size()[1:]
        num_tasks = grads.size(0)

        grads_list = [g.flatten() for g in grads]
        # NOTE(review): the EMA table is rebuilt on every call and each
        # (i, j) pair is compared before its entry is updated, so the target
        # is always 0 at comparison time and `decay` has no lasting effect.
        # Confirm whether the table was meant to persist across calls.
        ema = [[0 for _ in range(num_tasks)] for _ in range(num_tasks)]

        new_grads = []
        for i in range(num_tasks):
            grad_i = grads_list[i]
            for j in np.random.permutation(num_tasks):
                if i == j:
                    continue
                grad_j = grads_list[j]
                cos_sim = torch.cosine_similarity(grad_i, grad_j, dim=0)
                if cos_sim < ema[i][j]:
                    grad_i = grad_i + vac_projection(
                        grad_i, grad_j, ema[i][j], cos_sim
                    )
                    assert id(grads_list[i]) != id(grad_i), "Aliasing!"
                ema[i][j] = (1 - self.decay) * ema[i][j] + self.decay * cos_sim
            new_grads.append(grad_i.reshape(size))

        return torch.stack(new_grads, dim=0)

`__init__(decay)` ¶

Initialize GradVac method.

Parameters:

Name	Type	Description	Default
`decay`	`float`	Decay rate for EMA.	required

Source code in vambn/modelling/mtl/mtl.py

def __init__(self, decay: float):
    """
    Initialize GradVac method.

    Args:
        decay: Decay rate for EMA. In ``forward`` each EMA entry is updated
            as ``(1 - decay) * ema + decay * cos_sim``.
    """
    super(GradVac, self).__init__()
    self.decay = decay

`forward(grads, inputs, outputs)` ¶

Compute new gradients using GradVac method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using GradVac method.

    Each task gradient is compared, in random order, with the gradients of
    the other tasks; whenever their cosine similarity is below the current
    EMA target, a multiple of the other gradient is added.

    Args:
        grads: Gradients tensor; the first dimension indexes the tasks.
        inputs: Input tensor (unused).
        outputs: Output tensor (unused).

    Returns:
        New gradients tensor.
    """

    def vac_projection(u: torch.Tensor, v: torch.Tensor, pre_ema: float, post_ema: float) -> torch.Tensor:
        norm_u = torch.dot(u, u).sqrt()
        norm_v = torch.dot(v, v).sqrt()

        # Cosine similarities can land marginally outside [-1, 1] due to
        # rounding; clamp the radicand so math.sqrt cannot raise
        # "math domain error".
        pre_sin = math.sqrt(max(0.0, 1.0 - float(pre_ema) ** 2))
        post_sin = math.sqrt(max(0.0, 1.0 - float(post_ema) ** 2))

        numer = norm_u * (pre_ema * post_sin - post_ema * pre_sin)
        denom = norm_v * pre_sin

        return numer / denom.clamp_min(1e-15) * v

    size = grads.size()[1:]
    num_tasks = grads.size(0)

    grads_list = [g.flatten() for g in grads]
    # NOTE(review): the EMA table is rebuilt on every call and each (i, j)
    # pair is compared before its entry is updated, so the target is always
    # 0 at comparison time and `decay` has no lasting effect. Confirm
    # whether the table was meant to persist across calls.
    ema = [[0 for _ in range(num_tasks)] for _ in range(num_tasks)]

    new_grads = []
    for i in range(num_tasks):
        grad_i = grads_list[i]
        for j in np.random.permutation(num_tasks):
            if i == j:
                continue
            grad_j = grads_list[j]
            cos_sim = torch.cosine_similarity(grad_i, grad_j, dim=0)
            if cos_sim < ema[i][j]:
                grad_i = grad_i + vac_projection(
                    grad_i, grad_j, ema[i][j], cos_sim
                )
                assert id(grads_list[i]) != id(grad_i), "Aliasing!"
            ema[i][j] = (1 - self.decay) * ema[i][j] + self.decay * cos_sim
        new_grads.append(grad_i.reshape(size))

    return torch.stack(new_grads, dim=0)

`IMTLG` ¶

Bases: MOOMethod

IMTLG method for multiple objective optimization.

Source code in vambn/modelling/mtl/mtl.py

class IMTLG(MOOMethod):
    """IMTLG method for multiple objective optimization."""

    requires_input: bool = False

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using IMTLG method.

        Task coefficients are obtained from the differences between the
        first task's gradient and every other task's gradient (raw and
        passed through ``unitary``); the gradients are then scaled with
        ``batch_product``. With a single task the input is returned as is.

        Args:
            grads (torch.Tensor): Gradients tensor; the first dimension
                indexes the tasks.
            inputs (torch.Tensor): Input tensor (unused).
            outputs (torch.Tensor): Output tensor (unused).

        Returns:
            torch.Tensor: New gradients tensor.
        """
        flat = grads.flatten(start_dim=1)
        num_tasks = len(grads)
        if num_tasks == 1:
            return grads

        other_tasks = range(1, num_tasks)
        grad_diffs = torch.stack(
            [flat[0] - flat[i] for i in other_tasks], dim=0
        )
        unit_diffs = torch.stack(
            [unitary(flat[0]) - unitary(flat[i]) for i in other_tasks], dim=0
        )

        # Pairwise inner products between gradient and unit differences.
        DU_T = torch.einsum("ik,jk->ij", grad_diffs, unit_diffs)
        DU_T_inv = torch.pinverse(DU_T)

        rest = torch.einsum(
            "i,ki,kj->j", grads[0].flatten(), unit_diffs, DU_T_inv
        )
        # The first coefficient is chosen so that all of them sum to one.
        first = 1 - rest.sum(dim=0).unsqueeze(dim=0)
        alphas = torch.cat((first, rest), dim=0)

        return batch_product(grads, alphas)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using IMTLG method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using IMTLG method.

    Task coefficients are obtained from the differences between the first
    task's gradient and every other task's gradient (raw and passed through
    ``unitary``); the gradients are then scaled with ``batch_product``.
    With a single task the input is returned as is.

    Args:
        grads (torch.Tensor): Gradients tensor; the first dimension indexes
            the tasks.
        inputs (torch.Tensor): Input tensor (unused).
        outputs (torch.Tensor): Output tensor (unused).

    Returns:
        torch.Tensor: New gradients tensor.
    """
    flat = grads.flatten(start_dim=1)
    num_tasks = len(grads)
    if num_tasks == 1:
        return grads

    other_tasks = range(1, num_tasks)
    grad_diffs = torch.stack(
        [flat[0] - flat[i] for i in other_tasks], dim=0
    )
    unit_diffs = torch.stack(
        [unitary(flat[0]) - unitary(flat[i]) for i in other_tasks], dim=0
    )

    # Pairwise inner products between gradient and unit differences.
    DU_T = torch.einsum("ik,jk->ij", grad_diffs, unit_diffs)
    DU_T_inv = torch.pinverse(DU_T)

    rest = torch.einsum(
        "i,ki,kj->j", grads[0].flatten(), unit_diffs, DU_T_inv
    )
    # The first coefficient is chosen so that all of them sum to one.
    first = 1 - rest.sum(dim=0).unsqueeze(dim=0)
    alphas = torch.cat((first, rest), dim=0)

    return batch_product(grads, alphas)

`Identity` ¶

Bases: MOOMethod

Identity MOO method that returns the input gradients unchanged.

Source code in vambn/modelling/mtl/mtl.py

class Identity(MOOMethod):
    """Identity MOO method that returns the input gradients unchanged.

    ``setup_moo`` returns an instance of this class when no MTL method is
    configured.
    """

    def forward(
        self,
        grads: torch.Tensor,
        inputs: Optional[torch.Tensor],
        outputs: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """
        Return the input gradients unchanged.

        Args:
            grads (torch.Tensor): Input gradients.
            inputs (torch.Tensor, optional): Input tensor (ignored).
            outputs (torch.Tensor, optional): Output tensor (ignored).

        Returns:
            torch.Tensor: The very same ``grads`` object that was passed in.
        """
        return grads

`forward(grads, inputs, outputs)` ¶

Return the input gradients unchanged.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Input gradients.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: Unchanged input gradients.

Source code in vambn/modelling/mtl/mtl.py

def forward(
    self,
    grads: torch.Tensor,
    inputs: Optional[torch.Tensor],
    outputs: Optional[torch.Tensor],
) -> torch.Tensor:
    """
    Return the input gradients unchanged.

    Args:
        grads (torch.Tensor): Input gradients.
        inputs (torch.Tensor, optional): Input tensor (ignored).
        outputs (torch.Tensor, optional): Output tensor (ignored).

    Returns:
        torch.Tensor: The very same ``grads`` object that was passed in.
    """
    return grads

`MGDAUB` ¶

Bases: MOOMethod

MGDA-UB method for multiple objective optimization.

Source code in vambn/modelling/mtl/mtl.py

class MGDAUB(MOOMethod):
    """MGDA-UB method for multiple objective optimization."""

    requires_input: bool = False

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using MGDA-UB method.

        The task weights come from ``MinNormSolver.find_min_norm_element``
        and each task gradient is multiplied by its weight.

        Args:
            grads (torch.Tensor): Gradients tensor; the first dimension
                indexes the tasks.
            inputs (torch.Tensor): Input tensor (unused).
            outputs (torch.Tensor): Output tensor (unused).

        Returns:
            torch.Tensor: New gradients tensor.
        """
        epsilon = 1e-3
        task_shape = grads.size()[1:]
        flat = grads.flatten(start_dim=1).unsqueeze(dim=1)

        # Only the weights are needed; the minimal norm is discarded.
        weights, _ = MinNormSolver.find_min_norm_element(flat.unbind(dim=0))
        # NOTE(review): `min` caps every weight at `epsilon` (1e-3) from
        # above. A lower bound (`max`) may have been intended -- confirm
        # before changing, since it rescales all gradients.
        capped = [min(w, epsilon) for w in weights]

        scaled = [g.reshape(task_shape) * w for g, w in zip(flat, capped)]
        return torch.stack(scaled, dim=0)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using MGDA-UB method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using MGDA-UB method.

    The task weights come from ``MinNormSolver.find_min_norm_element`` and
    each task gradient is multiplied by its weight.

    Args:
        grads (torch.Tensor): Gradients tensor; the first dimension indexes
            the tasks.
        inputs (torch.Tensor): Input tensor (unused).
        outputs (torch.Tensor): Output tensor (unused).

    Returns:
        torch.Tensor: New gradients tensor.
    """
    epsilon = 1e-3
    task_shape = grads.size()[1:]
    flat = grads.flatten(start_dim=1).unsqueeze(dim=1)

    # Only the weights are needed; the minimal norm is discarded.
    weights, _ = MinNormSolver.find_min_norm_element(flat.unbind(dim=0))
    # NOTE(review): `min` caps every weight at `epsilon` (1e-3) from above.
    # A lower bound (`max`) may have been intended -- confirm before
    # changing, since it rescales all gradients.
    capped = [min(w, epsilon) for w in weights]

    scaled = [g.reshape(task_shape) * w for g, w in zip(flat, capped)]
    return torch.stack(scaled, dim=0)

`MOOMethod` ¶

Bases: Module

Base class for multiple objective optimization (MOO) methods.

Source code in vambn/modelling/mtl/mtl.py

class MOOMethod(nn.Module, metaclass=ABCMeta):
    """Abstract parent of every multiple objective optimization (MOO) method."""

    # Set to True by subclasses whose `forward` needs the forward-pass input.
    requires_input: bool = False

    def __init__(self):
        super().__init__()

    @abstractmethod
    def forward(self, grads: torch.Tensor, inputs: Optional[torch.Tensor], outputs: Optional[torch.Tensor]) -> torch.Tensor:
        """
        Map the original task gradients to the gradients used for backpropagation.

        Receives K gradients of size D and must return K gradients of size D,
        chosen according to the criterion of the concrete method.

        Args:
            grads (torch.Tensor): K x D tensor holding one gradient per task.
            inputs (torch.Tensor, optional): Input of the forward pass (only when requires_input is True).
            outputs (torch.Tensor, optional): The K outputs of the module (not used currently).

        Returns:
            torch.Tensor: Tensor of the same size as `grads` with the new gradients.

        Raises:
            NotImplementedError: Always; concrete subclasses override this method.
        """
        raise NotImplementedError("You need to implement the forward pass.")

`forward(grads, inputs, outputs)` `abstractmethod` ¶

Computes the new task gradients based on the original ones.

Given K gradients of size D, returns a new set of K gradients of size D based on some criterion.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Tensor of size K x D with the different gradients.	required
`inputs`	`Tensor`	Tensor with the input of the forward pass (if requires_input is set to True).	required
`outputs`	`Tensor`	Tensor with the K outputs of the module (not used currently).	required

Returns:

Type	Description
`Tensor`	torch.Tensor: A tensor of the same size as `grads` with the new gradients to use during backpropagation.

Source code in vambn/modelling/mtl/mtl.py

@abstractmethod
def forward(self, grads: torch.Tensor, inputs: Optional[torch.Tensor], outputs: Optional[torch.Tensor]) -> torch.Tensor:
    """
    Map the original task gradients to the gradients used for backpropagation.

    Receives K gradients of size D and must return K gradients of size D,
    chosen according to the criterion of the concrete method.

    Args:
        grads (torch.Tensor): K x D tensor holding one gradient per task.
        inputs (torch.Tensor, optional): Input of the forward pass (only when requires_input is True).
        outputs (torch.Tensor, optional): The K outputs of the module (not used currently).

    Returns:
        torch.Tensor: Tensor of the same size as `grads` with the new gradients.

    Raises:
        NotImplementedError: Always; concrete subclasses override this method.
    """
    raise NotImplementedError("You need to implement the forward pass.")

`MinNormSolver` ¶

Solver for finding the minimum norm solution in the convex hull of vectors.

Source code in vambn/modelling/mtl/mtl.py

class MinNormSolver:
    """Solver for finding the minimum norm solution in the convex hull of vectors."""

    # Upper bound on the number of refinement iterations in find_min_norm_element.
    MAX_ITER = 250
    # The refinement stops once the L1 change of the coefficients drops below this.
    STOP_CRIT = 1e-5

    @staticmethod
    def _dot(vec_a, vec_b) -> float:
        """
        Dot product of two vectors stored as sequences of 1-D tensors.

        Args:
            vec_a: First vector (iterable of 1-D tensors).
            vec_b: Second vector, chunked like `vec_a`.

        Returns:
            Sum of the chunk-wise dot products.
        """
        return sum(torch.dot(a, b).item() for a, b in zip(vec_a, vec_b))

    @staticmethod
    def _min_norm_element_from2(v1v1: float, v1v2: float, v2v2: float) -> tuple:
        """
        Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2.

        Args:
            v1v1: <x1, x1>.
            v1v2: <x1, x2>.
            v2v2: <x2, x2>.

        Returns:
            tuple: `(gamma, cost)` where `gamma` is the coefficient of x1 and
            `cost` the squared norm at the selected point. At the boundaries
            gamma is 0.999 / 0.001 rather than 1 / 0.
        """
        if v1v2 >= v1v1:
            return 0.999, v1v1
        if v1v2 >= v2v2:
            return 0.001, v2v2
        # Both guards failed, so v1v1 + v2v2 - 2 * v1v2 is strictly positive.
        gamma = -1.0 * ((v1v2 - v2v2) / (v1v1 + v2v2 - 2 * v1v2))
        cost = v2v2 + gamma * (v1v2 - v2v2)
        return gamma, cost

    @staticmethod
    def _min_norm_2d(vecs: list, dps: dict) -> tuple:
        """
        Find the best min-norm combination over all pairs of vectors.

        Args:
            vecs: List of vectors (each a sequence of 1-D tensors).
            dps: Cache of dot products keyed by index pairs; filled in place.

        Returns:
            tuple: `([(i, j), gamma, cost], dps)` for the pair with the lowest cost.

        Raises:
            ValueError: If no pair yields a finite cost (fewer than two
                vectors, or non-finite dot products).
        """
        n = len(vecs)
        for i in range(n):
            if (i, i) not in dps:
                dps[(i, i)] = MinNormSolver._dot(vecs[i], vecs[i])

        sol = None
        dmin = float("inf")
        for i in range(n):
            for j in range(i + 1, n):
                if (i, j) not in dps:
                    dps[(i, j)] = MinNormSolver._dot(vecs[i], vecs[j])
                    dps[(j, i)] = dps[(i, j)]
                c, d = MinNormSolver._min_norm_element_from2(
                    dps[(i, i)], dps[(i, j)], dps[(j, j)]
                )
                if d < dmin:
                    dmin = d
                    sol = [(i, j), c, d]
        if sol is None:
            raise ValueError(
                "No valid pair found: need at least two vectors with finite dot products."
            )
        return sol, dps

    @staticmethod
    def _projection2simplex(y: np.ndarray) -> np.ndarray:
        """
        Project y onto the simplex.

        Args:
            y: Input array.

        Returns:
            Projected array (non-negative entries).
        """
        m = len(y)
        sorted_y = np.flip(np.sort(y), axis=0)
        tmpsum = 0.0
        # Fallback threshold, used when the scan below never breaks.
        tmax_f = (np.sum(y) - 1.0) / m
        for i in range(m - 1):
            tmpsum += sorted_y[i]
            tmax = (tmpsum - 1) / (i + 1.0)
            if tmax > sorted_y[i + 1]:
                tmax_f = tmax
                break
        return np.maximum(y - tmax_f, np.zeros(y.shape))

    @staticmethod
    def _next_point(cur_val: np.ndarray, grad: np.ndarray, n: int) -> np.ndarray:
        """
        Compute the next point for the projected gradient descent.

        Args:
            cur_val: Current value.
            grad: Gradient.
            n: Dimension of the problem.

        Returns:
            Next point, projected onto the simplex.
        """
        proj_grad = grad - (np.sum(grad) / n)
        # Step lengths at which a coordinate would reach 0 (tm1) or 1 (tm2).
        tm1 = -1.0 * cur_val[proj_grad < 0] / proj_grad[proj_grad < 0]
        tm2 = (1.0 - cur_val[proj_grad > 0]) / proj_grad[proj_grad > 0]

        t = 1
        if len(tm1[tm1 > 1e-7]) > 0:
            t = np.min(tm1[tm1 > 1e-7])
        if len(tm2[tm2 > 1e-7]) > 0:
            t = min(t, np.min(tm2[tm2 > 1e-7]))

        next_point = proj_grad * t + cur_val
        return MinNormSolver._projection2simplex(next_point)

    @staticmethod
    def find_min_norm_element(vecs: List) -> Tuple[np.ndarray, float]:
        """
        Find the minimum norm element in the convex hull of vectors.

        Args:
            vecs: List of vectors (each a sequence of 1-D tensors).

        Returns:
            Tuple `(coefficients, cost)`: the convex-combination weights and
            the squared norm of the corresponding element. If the stopping
            criterion is not met within MAX_ITER iterations, the latest
            iterate is returned.

        Raises:
            ValueError: If `vecs` is empty.
        """
        n = len(vecs)
        if n == 0:
            raise ValueError("find_min_norm_element requires at least one vector.")
        if n == 1:
            # A single vector is its own convex hull; there is no pair to search.
            return np.ones(1), MinNormSolver._dot(vecs[0], vecs[0])

        dps = {}
        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)

        sol_vec = np.zeros(n)
        sol_vec[init_sol[0][0]] = init_sol[1]
        sol_vec[init_sol[0][1]] = 1 - init_sol[1]

        if n < 3:
            return sol_vec, init_sol[2]

        grad_mat = np.array(
            [[dps[(i, j)] for j in range(n)] for i in range(n)], dtype=float
        )

        cost = init_sol[2]
        # A bounded `for` enforces MAX_ITER, so the search always terminates.
        for _ in range(MinNormSolver.MAX_ITER):
            gram_sol = np.dot(grad_mat, sol_vec)
            new_point = MinNormSolver._next_point(sol_vec, -1.0 * gram_sol, n)
            gram_new = np.dot(grad_mat, new_point)
            # Quadratic forms of the current point and the candidate point.
            v1v1 = np.dot(sol_vec, gram_sol)
            v1v2 = np.dot(sol_vec, gram_new)
            v2v2 = np.dot(new_point, gram_new)
            nc, cost = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
            new_sol_vec = nc * sol_vec + (1 - nc) * new_point
            if np.sum(np.abs(new_sol_vec - sol_vec)) < MinNormSolver.STOP_CRIT:
                return sol_vec, cost
            sol_vec = new_sol_vec
        # Iteration budget exhausted: return the best iterate instead of None.
        return sol_vec, cost

`find_min_norm_element(vecs)` `staticmethod` ¶

Find the minimum norm element in the convex hull of vectors.

Parameters:

Name	Type	Description	Default
`vecs`	`List`	List of vectors.	required

Returns:

Type	Description
`Tuple \| None`	Minimum norm element and its cost.

Source code in vambn/modelling/mtl/mtl.py

@staticmethod
def find_min_norm_element(vecs: List) -> Tuple | None:
    """
    Find the minimum norm element in the convex hull of vectors.

    Args:
        vecs: List of vectors.

    Returns:
        Minimum norm element and its cost.
    """
    dps = {}
    init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)

    n = len(vecs)
    sol_vec = np.zeros(n)
    sol_vec[init_sol[0][0]] = init_sol[1]
    sol_vec[init_sol[0][1]] = 1 - init_sol[1]

    if n < 3:
        return sol_vec, init_sol[2]

    iter_count = 0

    grad_mat = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            grad_mat[i, j] = dps[(i, j)]

    while iter_count < MinNormSolver.MAX_ITER:
        grad_dir = -1.0 * np.dot(grad_mat, sol_vec)
        new_point = MinNormSolver._next_point(sol_vec, grad_dir, n)
        v1v1 = sum(
            sol_vec[i] * sol_vec[j] * dps[(i, j)]
            for i in range(n)
            for j in range(n)
        )
        v1v2 = sum(
            sol_vec[i] * new_point[j] * dps[(i, j)]
            for i in range(n)
            for j in range(n)
        )
        v2v2 = sum(
            new_point[i] * new_point[j] * dps[(i, j)]
            for i in range(n)
            for j in range(n)
        )
        nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
        new_sol_vec = nc * sol_vec + (1 - nc) * new_point
        change = new_sol_vec - sol_vec
        if np.sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
            return sol_vec, nd
        sol_vec = new_sol_vec

`MtlMethods` ¶

Bases: Enum

Enumeration of available multi-task learning methods.

Source code in vambn/modelling/mtl/mtl.py

class MtlMethods(Enum):
    """Enumeration of available multi-task learning methods."""

    # Each member maps a short method name to the object that implements it;
    # the value is the implementation itself, not an instance of it.
    imtlg = IMTLG
    nsgd = NSGD
    # Note: "gradnorm" resolves to GradNormModified, not GradNorm.
    gradnorm = GradNormModified
    pcgrad = PCGrad
    mgda_ub = MGDAUB
    identity = Identity
    cagrad = CAGrad
    graddrop = GradDrop

`NSGD` ¶

Bases: MOOMethod

Normalized Stochastic Gradient Descent (NSGD) method for MOO.

Source code in vambn/modelling/mtl/mtl.py

class NSGD(MOOMethod):
    """Normalized Stochastic Gradient Descent (NSGD) method for MOO."""

    # Reference per-task gradient norms; registered as a buffer in __init__.
    initial_grads: torch.Tensor
    requires_input: bool = False

    def __init__(self, num_tasks: int, update_at: int = 20):
        """
        Initialize NSGD method.

        Args:
            num_tasks (int): Number of tasks.
            update_at (int): Value of the call counter at which the reference
                gradient norms are replaced by the current ones.
        """
        super().__init__()
        self.num_tasks = num_tasks
        self.update_at = update_at
        self.register_buffer("initial_grads", torch.ones(num_tasks))
        self.counter = 0

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute new gradients using NSGD method.

        Every task gradient is rescaled to a common norm: the average of the
        task norms, weighted by each task's norm relative to its reference.

        Args:
            grads (torch.Tensor): Gradients tensor; the first dimension indexes the tasks.
            inputs (torch.Tensor): Not read by this method.
            outputs (torch.Tensor): Not read by this method.

        Returns:
            torch.Tensor: New gradients tensor.
        """
        grad_norms = grads.flatten(start_dim=1).norm(dim=1)

        # The reference norms are replaced on the call where the counter
        # equals `update_at`.
        if self.initial_grads is None or self.counter == self.update_at:
            self.initial_grads = grad_norms

        self.counter += 1

        conv_ratios = grad_norms / self.initial_grads.clamp_min(1e-15)
        alphas = conv_ratios / conv_ratios.sum().clamp_min(1e-15)
        # When every gradient is zero the alphas sum to 0; clamping avoids the
        # 0/0 that would otherwise turn all returned gradients into NaN.
        alphas = alphas / alphas.sum().clamp_min(1e-15)

        weighted_sum_norms = (alphas * grad_norms).sum()
        grads = batch_product(
            grads, weighted_sum_norms / grad_norms.clamp_min(1e-15)
        )
        return grads

`__init__(num_tasks, update_at=20)` ¶

Initialize NSGD method.

Parameters:

Name	Type	Description	Default
`num_tasks`	`int`	Number of tasks.	required
`update_at`	`int`	Update interval.	`20`

Source code in vambn/modelling/mtl/mtl.py

def __init__(self, num_tasks: int, update_at: int = 20):
    """
    Set up the NSGD state.

    Args:
        num_tasks (int): Number of tasks.
        update_at (int): Update interval.
    """
    super().__init__()
    # Call counter, compared against `update_at` in forward.
    self.counter = 0
    self.update_at = update_at
    self.num_tasks = num_tasks
    # Reference gradient norms start at one per task.
    reference_norms = torch.ones(num_tasks)
    self.register_buffer("initial_grads", reference_norms)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using NSGD method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Compute new gradients using NSGD method.

    Every task gradient is rescaled to a common norm: the average of the task
    norms, weighted by each task's norm relative to its reference.

    Args:
        grads (torch.Tensor): Gradients tensor; the first dimension indexes the tasks.
        inputs (torch.Tensor): Not read by this method.
        outputs (torch.Tensor): Not read by this method.

    Returns:
        torch.Tensor: New gradients tensor.
    """
    grad_norms = grads.flatten(start_dim=1).norm(dim=1)

    # The reference norms are replaced on the call where the counter equals
    # `update_at`.
    if self.initial_grads is None or self.counter == self.update_at:
        self.initial_grads = grad_norms

    self.counter += 1

    conv_ratios = grad_norms / self.initial_grads.clamp_min(1e-15)
    alphas = conv_ratios / conv_ratios.sum().clamp_min(1e-15)
    # When every gradient is zero the alphas sum to 0; clamping avoids the
    # 0/0 that would otherwise turn all returned gradients into NaN.
    alphas = alphas / alphas.sum().clamp_min(1e-15)

    weighted_sum_norms = (alphas * grad_norms).sum()
    grads = batch_product(
        grads, weighted_sum_norms / grad_norms.clamp_min(1e-15)
    )
    return grads

`PCGrad` ¶

Bases: MOOMethod

Projected Conflicting Gradient (PCGrad) method for MOO.

Attributes:

Name	Type	Description
`requires_input`	`bool`	Indicates whether the method requires input tensor.

Source code in vambn/modelling/mtl/mtl.py

class PCGrad(MOOMethod):
    """Projected Conflicting Gradient (PCGrad) method for MOO.

    Attributes:
        requires_input (bool): Indicates whether the method requires input tensor.
    """

    requires_input: bool = False

    def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
        """
        Remove from each task gradient its components that conflict with other tasks.

        Tasks are visited in random order. Whenever the running gradient of a
        task has negative cosine similarity with another task's original
        gradient, its projection onto that gradient is subtracted.

        Args:
            grads (torch.Tensor): Gradients tensor; the first dimension indexes the tasks.
            inputs (torch.Tensor): Not read by this method.
            outputs (torch.Tensor): Not read by this method.

        Returns:
            torch.Tensor: New gradients tensor, shaped like `grads`.
        """
        task_shape = grads.size()[1:]
        num_tasks = grads.size(0)
        flat_grads = [task_grad.flatten() for task_grad in grads]

        projected = [None] * num_tasks
        for i in np.random.permutation(num_tasks):
            current = flat_grads[i]
            for j in np.random.permutation(num_tasks):
                if i != j and torch.cosine_similarity(current, flat_grads[j], dim=0) < 0:
                    current = current - projection(current, flat_grads[j])
                    # The subtraction must yield a new tensor, never the original.
                    assert id(flat_grads[i]) != id(current), "Aliasing!"
            projected[i] = current.reshape(task_shape)

        return torch.stack(projected, dim=0)

`forward(grads, inputs, outputs)` ¶

Compute new gradients using PCGrad method.

Parameters:

Name	Type	Description	Default
`grads`	`Tensor`	Gradients tensor.	required
`inputs`	`Tensor`	Input tensor.	required
`outputs`	`Tensor`	Output tensor.	required

Returns:

Type	Description
`Tensor`	torch.Tensor: New gradients tensor.

Source code in vambn/modelling/mtl/mtl.py

def forward(self, grads: torch.Tensor, inputs: torch.Tensor, outputs: torch.Tensor) -> torch.Tensor:
    """
    Remove from each task gradient its components that conflict with other tasks.

    Tasks are visited in random order. Whenever the running gradient of a task
    has negative cosine similarity with another task's original gradient, its
    projection onto that gradient is subtracted.

    Args:
        grads (torch.Tensor): Gradients tensor; the first dimension indexes the tasks.
        inputs (torch.Tensor): Not read by this method.
        outputs (torch.Tensor): Not read by this method.

    Returns:
        torch.Tensor: New gradients tensor, shaped like `grads`.
    """
    task_shape = grads.size()[1:]
    num_tasks = grads.size(0)
    flat_grads = [task_grad.flatten() for task_grad in grads]

    projected = [None] * num_tasks
    for i in np.random.permutation(num_tasks):
        current = flat_grads[i]
        for j in np.random.permutation(num_tasks):
            if i != j and torch.cosine_similarity(current, flat_grads[j], dim=0) < 0:
                current = current - projection(current, flat_grads[j])
                # The subtraction must yield a new tensor, never the original.
                assert id(flat_grads[i]) != id(current), "Aliasing!"
        projected[i] = current.reshape(task_shape)

    return torch.stack(projected, dim=0)

`divide(numer, denom)` ¶

Numerically stable division.

Parameters:

Name	Type	Description	Default
`numer`	`Tensor`	Numerator tensor.	required
`denom`	`Tensor`	Denominator tensor.	required

Returns:

Type	Description
	torch.Tensor: Result of numerically stable division.

Source code in vambn/modelling/mtl/mtl.py

def divide(numer, denom):
    """
    Divide two tensors in log space, restoring the sign afterwards.

    Args:
        numer (torch.Tensor): Numerator tensor.
        denom (torch.Tensor): Denominator tensor.

    Returns:
        torch.Tensor: Result of numerically stable division.
    """
    epsilon = 1e-15
    # Sign of the quotient; zero whenever either operand is zero.
    sign = torch.sign(numer) * torch.sign(denom)
    # log|numer| - log|denom|, with epsilon keeping both logs finite.
    log_ratio = torch.log(numer.abs() + epsilon) - torch.log(denom.abs() + epsilon)
    return sign * torch.exp(log_ratio)

`gradient_normalizers(grads, losses, normalization_type)` ¶

Compute gradient normalizers based on the specified normalization type.

Parameters:

Name	Type	Description	Default
`grads`	`dict`	A dictionary of gradients.	required
`losses`	`dict`	A dictionary of losses.	required
`normalization_type`	`str`	The type of normalization ('l2', 'loss', 'loss+', 'none').	required

Returns:

Type	Description
`dict`	A dictionary of gradient normalizers.

Source code in vambn/modelling/mtl/mtl.py

def gradient_normalizers(grads: dict, losses: dict, normalization_type: str) -> dict:
    """
    Compute gradient normalizers based on the specified normalization type.

    Args:
        grads: A dictionary mapping each task to an iterable of gradient tensors.
        losses: A dictionary of losses, keyed like `grads`.
        normalization_type: The type of normalization ('l2', 'loss', 'loss+', 'none').

    Returns:
        A dictionary of gradient normalizers, one entry per key of `grads`.

    Raises:
        ValueError: If `normalization_type` is not one of the supported values.
    """

    def _l2_norm(task_grads):
        # L2 norm over all gradient tensors of a single task.
        return np.sqrt(np.sum([gr.pow(2).sum().item() for gr in task_grads]))

    if normalization_type == "l2":
        return {t: _l2_norm(grads[t]) for t in grads}
    if normalization_type == "loss":
        return {t: losses[t] for t in grads}
    if normalization_type == "loss+":
        return {t: losses[t] * _l2_norm(grads[t]) for t in grads}
    if normalization_type == "none":
        return {t: 1.0 for t in grads}
    # An unknown type used to be printed and answered with an empty dict,
    # which only fails later and far from the cause.
    raise ValueError(
        f"Invalid normalization type {normalization_type!r}; "
        "expected one of 'l2', 'loss', 'loss+', 'none'."
    )

`norm(tensor)` ¶

Compute the L2 norm of a tensor along the last dimension.

Parameters:

Name	Type	Description	Default
`tensor`	`Tensor`	Input tensor.	required

Returns:

Type	Description
	torch.Tensor: L2 norm of the input tensor.

Source code in vambn/modelling/mtl/mtl.py

def norm(tensor):
    """
    Return the Euclidean length of every vector along the last dimension.

    Args:
        tensor (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: L2 norm of the input tensor; the reduced dimension is kept with size 1.
    """
    l2_norm = torch.norm(tensor, p=2, dim=-1, keepdim=True)
    return l2_norm

`projection(u, v)` ¶

Project vector u onto vector v.

Parameters:

Name	Type	Description	Default
`u`	`Tensor`	Vector to be projected.	required
`v`	`Tensor`	Vector onto which u is projected.	required

Returns:

Type	Description
	torch.Tensor: Projection of u onto v.

Source code in vambn/modelling/mtl/mtl.py

def projection(u, v):
    """
    Compute the component of u that lies along v.

    Args:
        u (torch.Tensor): Vector to be projected.
        v (torch.Tensor): Vector onto which u is projected.

    Returns:
        torch.Tensor: Projection of u onto v.
    """
    # Clamping <v, v> keeps the scale finite when v is the zero vector.
    scale = torch.dot(u, v) / torch.dot(v, v).clamp_min(1e-15)
    return scale * v

`unitary(tensor)` ¶

Normalize the tensor to unit norm.

Parameters:

Name	Type	Description	Default
`tensor`	`Tensor`	Input tensor.	required

Returns:

Type	Description
	torch.Tensor: Unitary (normalized) tensor.

Source code in vambn/modelling/mtl/mtl.py

def unitary(tensor):
    """
    Scale the tensor so that its vectors along the last dimension have unit norm.

    Args:
        tensor (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Unitary (normalized) tensor.
    """
    # The small offset keeps the denominator positive for all-zero vectors.
    length = norm(tensor) + 1e-15
    return divide(tensor, length)

`parameters` ¶

`MtlMethodParams` `dataclass` ¶

Params and method description for multi-task learning.

Attributes:

Name	Type	Description
`name`	`str`	Name of the MTL method.
`update_at`	`Optional[int]`	Update interval, specific to certain methods.
`alpha`	`Optional[float]`	Alpha parameter, specific to certain methods.

Source code in vambn/modelling/mtl/parameters.py

@dataclass
class MtlMethodParams:
    """
    Name of a multi-task learning method together with its optional settings.

    Attributes:
        name (str): Name of the MTL method.
        update_at (Optional[int]): Update interval, specific to certain methods.
        alpha (Optional[float]): Alpha parameter, specific to certain methods.
    """

    name: str
    update_at: Optional[int] = None
    alpha: Optional[float] = None

    def __post_init__(self):
        """
        Fill in method-specific defaults for settings that were left as None.
        """
        # nsgd, gradnorm and pcgrad default to an update interval of 1.
        if self.update_at is None and self.name in ("nsgd", "gradnorm", "pcgrad"):
            self.update_at = 1
        # Only gradnorm and cagrad have a default alpha.
        if self.alpha is None:
            if self.name == "gradnorm":
                self.alpha = 1.0
            elif self.name == "cagrad":
                self.alpha = 10

`__post_init__()` ¶

Post-initialization to set default values for specific methods.

Source code in vambn/modelling/mtl/parameters.py

def __post_init__(self):
    """
    Fill in method-specific defaults for settings that were left as None.
    """
    # nsgd, gradnorm and pcgrad default to an update interval of 1.
    if self.update_at is None and self.name in ("nsgd", "gradnorm", "pcgrad"):
        self.update_at = 1
    # Only gradnorm and cagrad have a default alpha.
    if self.alpha is None:
        if self.name == "gradnorm":
            self.alpha = 1.0
        elif self.name == "cagrad":
            self.alpha = 10

`utils` ¶

This script includes code adapted from the 'impartial-vaes' repository with minor modifications. The original code can be found at: https://github.com/adrianjav/impartial-vaes

Credit to the original authors: Adrian Javaloy, Maryam Meghdadi, and Isabel Valera for their valuable work.

`batch_product(batch, weight)` ¶

Multiplies each slice of the first dimension of batch by the corresponding scalar in the weight vector.

Parameters:

Name	Type	Description	Default
`batch`	`Tensor`	Tensor of size [B, ...].	required
`weight`	`Tensor`	Tensor of size [B].	required

Returns:

Type	Description
	torch.Tensor: A tensor such that `result[i] = batch[i] * weight[i]`.

Source code in vambn/modelling/mtl/utils.py

def batch_product(batch: torch.Tensor, weight: torch.Tensor):
    r"""
    Multiplies each slice of the first dimension of batch by the corresponding scalar in the weight vector.

    Args:
        batch (torch.Tensor): Tensor of size [B, ...].
        weight (torch.Tensor): Tensor of size [B].

    Returns:
        torch.Tensor: A tensor such that `result[i] = batch[i] * weight[i]`.
    """
    assert batch.size(0) == weight.size(0)
    # Align the operands on their leading dimensions by padding the one with
    # fewer dimensions with trailing singleton axes. This gives the same
    # values as `(batch.T * weight.T).T` without calling `.T` on tensors
    # whose ndim != 2, which PyTorch has deprecated.
    ndim = max(batch.dim(), weight.dim())
    batch = batch.reshape(tuple(batch.shape) + (1,) * (ndim - batch.dim()))
    weight = weight.reshape(tuple(weight.shape) + (1,) * (ndim - weight.dim()))
    return batch * weight

MTL¶

minnormsolver ¶

MinNormLinearSolver ¶

forward(v1v1, v1v2, v2v2) ¶

MinNormPlanarSolver ¶

__init__(n_tasks) ¶

forward(grammian) ¶

line_solver_vectorized(v1v1, v1v2, v2v2) ¶

MinNormSolver ¶

__init__(n_tasks, max_iter=250, stop_crit=1e-06) ¶

forward(vecs) ¶

next_point(cur_val, grad) ¶

projection_to_simplex(gamma) ¶

moo ¶

MOOForLoop ¶

moo_method property ¶

__init__(num_heads, moo_method=None) ¶

forward(z) ¶

MooMulti ¶

moo_method property ¶

__init__(num_modules, moo_method=None) ¶

forward(z) ¶

MultiMOOForLoop ¶

__init__(num_heads, moo_methods) ¶

forward(*args) ¶

setup_moo(hparams, num_tasks) ¶

mtl ¶

CAGrad ¶

__init__(alpha) ¶

forward(grads, inputs, outputs) ¶

Compose ¶

forward(grads, inputs, outputs) ¶

GradDrop ¶

__init__(leakage) ¶

forward(grads, inputs, outputs) ¶

GradNorm ¶

forward(grads, inputs, outputs) ¶

GradNormBase ¶

weight property ¶

__init__(num_tasks, alpha, update_at=20) ¶

GradNormModified ¶

forward(grads, inputs, outputs) ¶

GradVac ¶

__init__(decay) ¶

forward(grads, inputs, outputs) ¶

IMTLG ¶

forward(grads, inputs, outputs) ¶

Identity ¶

forward(grads, inputs, outputs) ¶

MGDAUB ¶

forward(grads, inputs, outputs) ¶

MOOMethod ¶

forward(grads, inputs, outputs) abstractmethod ¶

MinNormSolver ¶

find_min_norm_element(vecs) staticmethod ¶

MtlMethods ¶

NSGD ¶

__init__(num_tasks, update_at=20) ¶

forward(grads, inputs, outputs) ¶

PCGrad ¶

forward(grads, inputs, outputs) ¶

divide(numer, denom) ¶

gradient_normalizers(grads, losses, normalization_type) ¶

norm(tensor) ¶

projection(u, v) ¶

unitary(tensor) ¶

parameters ¶

MtlMethodParams dataclass ¶

__post_init__() ¶

utils ¶

batch_product(batch, weight) ¶

`minnormsolver` ¶

`MinNormLinearSolver` ¶

`forward(v1v1, v1v2, v2v2)` ¶

`MinNormPlanarSolver` ¶

`__init__(n_tasks)` ¶

`forward(grammian)` ¶

`line_solver_vectorized(v1v1, v1v2, v2v2)` ¶

`MinNormSolver` ¶

`__init__(n_tasks, max_iter=250, stop_crit=1e-06)` ¶

`forward(vecs)` ¶

`next_point(cur_val, grad)` ¶

`projection_to_simplex(gamma)` ¶

`moo` ¶

`MOOForLoop` ¶

`moo_method` `property` ¶

`__init__(num_heads, moo_method=None)` ¶

`forward(z)` ¶

`MooMulti` ¶

`moo_method` `property` ¶

`__init__(num_modules, moo_method=None)` ¶

`forward(z)` ¶

`MultiMOOForLoop` ¶

`__init__(num_heads, moo_methods)` ¶

`forward(*args)` ¶

`setup_moo(hparams, num_tasks)` ¶

`mtl` ¶

`CAGrad` ¶

`__init__(alpha)` ¶

`forward(grads, inputs, outputs)` ¶

`Compose` ¶

`forward(grads, inputs, outputs)` ¶

`GradDrop` ¶

`__init__(leakage)` ¶

`forward(grads, inputs, outputs)` ¶

`GradNorm` ¶

`forward(grads, inputs, outputs)` ¶

`GradNormBase` ¶

`weight` `property` ¶

`__init__(num_tasks, alpha, update_at=20)` ¶

`GradNormModified` ¶

`forward(grads, inputs, outputs)` ¶

`GradVac` ¶

`__init__(decay)` ¶

`forward(grads, inputs, outputs)` ¶

`IMTLG` ¶

`forward(grads, inputs, outputs)` ¶

`Identity` ¶

`forward(grads, inputs, outputs)` ¶

`MGDAUB` ¶

`forward(grads, inputs, outputs)` ¶

`MOOMethod` ¶

`forward(grads, inputs, outputs)` `abstractmethod` ¶

`MinNormSolver` ¶

`find_min_norm_element(vecs)` `staticmethod` ¶

`MtlMethods` ¶

`NSGD` ¶

`__init__(num_tasks, update_at=20)` ¶

`forward(grads, inputs, outputs)` ¶

`PCGrad` ¶

`forward(grads, inputs, outputs)` ¶

`divide(numer, denom)` ¶

`gradient_normalizers(grads, losses, normalization_type)` ¶

`norm(tensor)` ¶

`projection(u, v)` ¶

`unitary(tensor)` ¶

`parameters` ¶

`MtlMethodParams` `dataclass` ¶

`__post_init__()` ¶

`utils` ¶

`batch_product(batch, weight)` ¶