Source code for spglm.glm

# Main GLM classes

__author__ = "Taylor Oshan tayoshan@gmail.com"

import numpy as np
import numpy.linalg as la
from spreg.utils import RegressionPropsY, spdot
from spreg import user_output as USER
from .utils import cache_readonly
from .base import LikelihoodModelResults
from .iwls import iwls
from . import family

__all__ = ['GLM']


[docs]class GLM(RegressionPropsY): """ Generalised linear models. Can currently estimate Guassian, Poisson and Logisitc regression coefficients. GLM object prepares model input and fit method performs estimation which then returns a GLMResults object. Parameters ---------- y : array n*1, dependent variable. X : array n*k, independent variable, exlcuding the constant. family : string Model type: 'Gaussian', 'Poisson', 'Binomial' offset : array n*1, the offset variable at the ith location. For Poisson model this term is often the size of the population at risk or the expected size of the outcome in spatial epidemiology. Default is None where Ni becomes 1.0 for all locations. y_fix : array n*1, the fix intercept value of y Attributes ---------- y : array n*1, dependent variable. X : array n*k, independent variable, including constant. family : string Model type: 'Gaussian', 'Poisson', 'logistic' n : integer Number of observations k : integer Number of independent variables df_model : float k-1, where k is the number of variables (including intercept) df_residual : float observations minus variables (n-k) mean_y : float Mean of y std_y : float Standard deviation of y fit_params : dict Parameters passed into fit method to define estimation routine. Examples -------- >>> import libpysal >>> from spglm.glm import GLM >>> from spglm.family import Gaussian >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') >>> y = np.array(db.by_col("HOVAL")) >>> y = np.reshape(y, (49,1)) >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("CRIME")) >>> X = np.array(X).T >>> model = GLM(y, X, family=Gaussian()) >>> results = model.fit() >>> results.params array([46.42818268, 0.62898397, -0.48488854]) """
[docs] def __init__(self, y, X, family=family.Gaussian(), offset=None, y_fix=None, constant=True): """ Initialize class """ self.n = USER.check_arrays(y, X) USER.check_y(y, self.n) self.y = y if constant: self.X = USER.check_constant(X) else: self.X = X self.family = family self.k = self.X.shape[1] if offset is None: self.offset = np.ones(shape=(self.n, 1)) else: self.offset = offset * 1.0 if y_fix is None: self.y_fix = np.zeros(shape=(self.n, 1)) else: self.y_fix = y_fix self.fit_params = {}
def fit(self, ini_betas=None, tol=1.0e-6, max_iter=200, solve='iwls'): """ Method that fits a model with a particular estimation routine. Parameters ---------- ini_betas : array k*1, initial coefficient values, including constant. Default is None, which calculates initial values during estimation. tol: float Tolerence for estimation convergence. max_iter : integer Maximum number of iterations if convergence not achieved. solve :string Technique to solve MLE equations. 'iwls' = iteratively (re)weighted least squares (default) """ self.fit_params['ini_betas'] = ini_betas self.fit_params['tol'] = tol self.fit_params['max_iter'] = max_iter self.fit_params['solve'] = solve if solve.lower() == 'iwls': params, predy, w, n_iter = iwls( self.y, self.X, self.family, self.offset, self.y_fix, ini_betas, tol, max_iter) self.fit_params['n_iter'] = n_iter return GLMResults(self, params.flatten(), predy, w) @cache_readonly def df_model(self): return self.X.shape[1] - 1 @cache_readonly def df_resid(self): return self.n - self.df_model - 1
[docs]class GLMResults(LikelihoodModelResults): """ Results of estimated GLM and diagnostics. Parameters ---------- model : GLM object Pointer to GLM object with estimation parameters. params : array k*1, estimared coefficients mu : array n*1, predicted y values. w : array n*1, final weight used for iwls Attributes ---------- model : GLM Object Points to GLM object for which parameters have been estimated. y : array n*1, dependent variable. x : array n*k, independent variable, including constant. family : string Model type: 'Gaussian', 'Poisson', 'Logistic' n : integer Number of observations k : integer Number of independent variables df_model : float k-1, where k is the number of variables (including intercept) df_residual : float observations minus variables (n-k) fit_params : dict parameters passed into fit method to define estimation routine. scale : float sigma squared used for subsequent computations. params : array n*k, estimared beta coefficients w : array n*1, final weight values of x mu : array n*1, predicted value of y (i.e., fittedvalues) cov_params : array Variance covariance matrix (kxk) of betas bse : array k*1, standard errors of betas pvalues : array k*1, two-tailed pvalues of parameters tvalues : array k*1, the tvalues of the standard errors null : array n*1, predicted values of y for null model deviance : float value of the deviance function evalued at params; see family.py for distribution-specific deviance null_deviance : float value of the deviance function for the model fit with a constant as the only regressor llf : float value of the loglikelihood function evalued at params; see family.py for distribution-specific loglikelihoods llnull : float value of log-likelihood function evaluated at null aic : float AIC bic : float BIC D2 : float percent deviance explained adj_D2 : float adjusted percent deviance explained pseudo_R2 : float McFadden's pseudo R2 (coefficient of determination) adj_pseudoR2 : float adjusted McFadden's pseudo R2 tr_S : trace of the hat matrix S resid_response : array response residuals; defined as y-mu resid_pearson : array Pearson residuals; defined as (y-mu)/sqrt(VAR(mu)) where VAR is the distribution specific variance function; see family.py and varfuncs.py for more information. resid_working : array Working residuals; the working residuals are defined as resid_response/link'(mu); see links.py for the derivatives of the link functions. resid_anscombe : array Anscombe residuals; see family.py for distribution-specific Anscombe residuals. resid_deviance : array deviance residuals; see family.py for distribution-specific deviance residuals. pearson_chi2 : float chi-Squared statistic is defined as the sum of the squares of the Pearson residuals normalized_cov_params : array k*k, approximates [X.T*X]-1 Examples -------- >>> import libpysal >>> from spglm.glm import GLM, GLMResults >>> from spglm.family import Gaussian >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') >>> y = np.array(db.by_col("HOVAL")) >>> y = np.reshape(y, (49,1)) >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("CRIME")) >>> X = np.array(X).T >>> model = GLM(y, X, family=Gaussian()) >>> results1 = model.fit() >>> results1.aic 408.73548964604873 >>> model = results1.model >>> params = results1.params.flatten() >>> predy = results1.mu >>> w = results1.w >>> results2 = GLMResults(model, params, predy, w) >>> results2.aic 408.73548964604873 """
[docs] def __init__(self, model, params, mu, w): self.model = model self.n = model.n self.y = model.y.T.flatten() self.X = model.X self.k = model.k self.offset = model.offset self.family = model.family self.fit_params = model.fit_params self.params = params self.w = w self.mu = mu.flatten() self._cache = {}
@cache_readonly def df_model(self): return self.model.df_model @cache_readonly def df_resid(self): return self.model.df_resid @cache_readonly def normalized_cov_params(self): return la.inv(spdot(self.w.T, self.w)) @cache_readonly def resid_response(self): return (self.y - self.mu) @cache_readonly def resid_pearson(self): return ((self.y - self.mu) / np.sqrt(self.family.variance(self.mu))) @cache_readonly def resid_working(self): return (self.resid_response / self.family.link.deriv(self.mu)) @cache_readonly def resid_anscombe(self): return (self.family.resid_anscombe(self.y, self.mu)) @cache_readonly def resid_deviance(self): return (self.family.resid_dev(self.y, self.mu)) @cache_readonly def pearson_chi2(self): chisq = (self.y - self.mu)**2 / self.family.variance(self.mu) chisqsum = np.sum(chisq) return chisqsum @cache_readonly def null(self): y = np.reshape(self.y, (-1, 1)) model = self.model X = np.ones((len(y), 1)) null_mod = GLM( y, X, family=self.family, offset=self.offset, constant=False) return null_mod.fit().mu @cache_readonly def scale(self): if isinstance(self.family, (family.Binomial, family.Poisson)): return 1. else: return (((np.power(self.resid_response, 2) / self.family.variance(self.mu))).sum() / (self.df_resid)) @cache_readonly def deviance(self): return self.family.deviance(self.y, self.mu) @cache_readonly def null_deviance(self): return self.family.deviance(self.y, self.null) @cache_readonly def llnull(self): return self.family.loglike(self.y, self.null, scale=self.scale) @cache_readonly def llf(self): return self.family.loglike(self.y, self.mu, scale=self.scale) @cache_readonly def aic(self): if isinstance(self.family, family.QuasiPoisson): return np.nan else: return -2 * self.llf + 2 * (self.df_model + 1) @cache_readonly def bic(self): return (self.deviance - (self.model.n - self.df_model - 1) * np.log(self.model.n)) @cache_readonly def D2(self): return 1 - (self.deviance / self.null_deviance) @cache_readonly def adj_D2(self): return 1.0 - (float(self.n) - 1.0) / \ (float(self.n) - float(self.k)) * (1.0 - self.D2) @cache_readonly def pseudoR2(self): return 1 - (self.llf / self.llnull) @cache_readonly def adj_pseudoR2(self): return 1 - ((self.llf - self.k) / self.llnull) @cache_readonly def tr_S(self): xtx_inv = np.linalg.inv(np.dot(self.X.T, self.X)) xtx_inv_xt = np.dot(xtx_inv, self.X.T) S = np.dot(self.X, xtx_inv_xt) return np.trace(S)