Source code for dexpy.alias

"""Functions for detecting and listing aliases."""

import numpy as np
import scipy.linalg
from patsy import dmatrix
import math
import logging


[docs]def alias_list(model, design): """Returns a human-readable list of dependent model columns. This is done by solving AX=B, where X is the full rank model matrix, and B is all of the columns of the model matrix. The result is a matrix of coefficients which represent to what degree a given column is collinear with another column. Usage: >>> design = dexpy.factorial.build_factorial(4, 8) >>> aliases, alias_coefs = dexpy.alias.alias_list("(A+B+C+D)**2)", design) >>> print(aliases) ['A:B = C:D', 'A:C = B:D', 'A:D = B:C'] """ # use the square root of machine precision for testing for 0 epsilon = math.sqrt(np.finfo(float).eps) logging.debug("model:\n%s", model) model_matrix = dmatrix(model, design, return_type="dataframe") logging.debug("model matrix (rhs):\n%s", model_matrix) # there is no requirement that the model matrix is full rank # so first remove linearly dependent columns using LU decomp _, _, upper_matrix = scipy.linalg.lu(model_matrix) row = 0 col = 0 # this array will be used to select columns for the full rank X matrix column_unaliased = [False] * model_matrix.shape[1] # iterate over diagonals of U, skipping columns with 0 in the diagonal while row < model_matrix.shape[0] and col < model_matrix.shape[1]: if abs(upper_matrix[row, col]) > epsilon: column_unaliased[col] = True row += 1 col += 1 logging.debug("estimating columns:\n%s", column_unaliased) unaliased = model_matrix.loc[:, column_unaliased] logging.debug("full rank matrix (lhs):\n%s", unaliased) alias_coefs, _, _, _ = np.linalg.lstsq(unaliased, model_matrix) logging.debug("alias matrix:\n%s", alias_coefs) alias_list = [] for r in range(alias_coefs.shape[0]): alias_strings = [] for c in range(alias_coefs.shape[1]): # all columns are "aliased" with themselves, so don't show if unaliased.columns[r] == model_matrix.columns[c]: continue # 0 means there is no correlation if abs(alias_coefs[r, c]) < epsilon: continue if abs(alias_coefs[r, c] - 1.0) > epsilon: alias_strings.append("{}*{}".format(alias_coefs[r, c], model_matrix.columns[c])) else: alias_strings.append(model_matrix.columns[c]) if alias_strings: alias_list.append("{} = {}".format(unaliased.columns[r], " + ".join(alias_strings))) return alias_list, alias_coefs