Source code for dexpy.alias

"""Functions for detecting and listing aliases."""

import numpy as np
import scipy.linalg
from patsy import dmatrix
import math
import logging


[docs]def alias_list(model, design):
    """Returns a human-readable list of dependent model columns.

    This is done by solving AX=B, where X is the full rank model matrix,
    and B is all of the columns of the model matrix. The result is a matrix
    of coefficients which represent to what degree a given column is
    collinear with another column.

    Usage:
      >>> design = dexpy.factorial.build_factorial(4, 8)
      >>> aliases, alias_coefs = dexpy.alias.alias_list("(A+B+C+D)**2)", design)
      >>> print(aliases)
      ['A:B = C:D', 'A:C = B:D', 'A:D = B:C']
    """
    # use the square root of machine precision for testing for 0
    epsilon = math.sqrt(np.finfo(float).eps)

    logging.debug("model:\n%s", model)
    model_matrix = dmatrix(model, design, return_type="dataframe")
    logging.debug("model matrix (rhs):\n%s", model_matrix)

    # there is no requirement that the model matrix is full rank
    # so first remove linearly dependent columns using LU decomp
    _, _, upper_matrix = scipy.linalg.lu(model_matrix)

    row = 0
    col = 0
    # this array will be used to select columns for the full rank X matrix
    column_unaliased = [False] * model_matrix.shape[1]
    # iterate over diagonals of U, skipping columns with 0 in the diagonal
    while row < model_matrix.shape[0] and col < model_matrix.shape[1]:
        if abs(upper_matrix[row, col]) > epsilon:
            column_unaliased[col] = True
            row += 1
        col += 1
    logging.debug("estimating columns:\n%s", column_unaliased)

    unaliased = model_matrix.loc[:, column_unaliased]
    logging.debug("full rank matrix (lhs):\n%s", unaliased)

    alias_coefs, _, _, _ = np.linalg.lstsq(unaliased, model_matrix)
    logging.debug("alias matrix:\n%s", alias_coefs)
    alias_list = []
    for r in range(alias_coefs.shape[0]):
        alias_strings = []
        for c in range(alias_coefs.shape[1]):
            # all columns are "aliased" with themselves, so don't show
            if unaliased.columns[r] == model_matrix.columns[c]:
                continue
            # 0 means there is no correlation
            if abs(alias_coefs[r, c]) < epsilon:
                continue
            if abs(alias_coefs[r, c] - 1.0) > epsilon:
                alias_strings.append("{}*{}".format(alias_coefs[r, c],
                                     model_matrix.columns[c]))
            else:
                alias_strings.append(model_matrix.columns[c])
        if alias_strings:
            alias_list.append("{} = {}".format(unaliased.columns[r],
                              " + ".join(alias_strings)))
    return alias_list, alias_coefs