Goodness-of-fit tests and the bootstrap

The three nested statistics

For a candidate family fitted by maximum likelihood, the package computes a Hotelling-type quadratic form in the discrepancy between the sample and fitted log-cumulants. Three nested choices of cumulant orders give three statistics:

\(T^2_{(2,3)}\): log-scale shape (dispersion, skewness);
\(T^2_{(1,2,3)}\): adds the first log-cumulant (entropy/scale);
\(T^2_{(1,\ldots,6)}\): adds higher-order cumulants for tail discrimination.

x <- reliability_datasets$Yarn
T2_all(x, "Weibull")
#> $T2_23
#> $T2_23$T2
#> [1] 3.205122
#> 
#> $T2_23$df
#> [1] 1
#> 
#> $T2_23$p_chisq
#> [1] 0.07340803
#> 
#> $T2_23$p_F
#> [1] 0.07646484
#> 
#> $T2_23$theta
#> [1]   1.605465 248.033500
#> 
#> $T2_23$d
#> [1] -0.04889879  0.16479398
#> 
#> $T2_23$Kd
#>           [,1]      [,2]
#> [1,] 0.0732021 0.2102838
#> [2,] 0.2102838 0.3415546
#> 
#> $T2_23$eigmin
#> [1] -0.04206613
#> 
#> $T2_23$conv
#> [1] TRUE
#> 
#> 
#> $T2_123
#> $T2_123$T2
#> [1] 4.984082
#> 
#> $T2_123$df
#> [1] 2
#> 
#> $T2_123$p_chisq
#> [1] 0.08274091
#> 
#> $T2_123$p_F
#> [1] 0.09010487
#> 
#> $T2_123$theta
#> [1]   1.605465 248.033500
#> 
#> $T2_123$d
#> [1]  0.009550301 -0.048898794  0.164793978
#> 
#> $T2_123$Kd
#>             [,1]       [,2]       [,3]
#> [1,] -0.02944059 0.04955572 -0.3173502
#> [2,]  0.04955572 0.07320210  0.2102838
#> [3,] -0.31735019 0.21028381  0.3415546
#> 
#> $T2_123$eigmin
#> [1] -0.2793186
#> 
#> $T2_123$conv
#> [1] TRUE
#> 
#> 
#> $T2_123456
#> $T2_123456$T2
#> [1] 13.61917
#> 
#> $T2_123456$df
#> [1] 4
#> 
#> $T2_123456$p_chisq
#> [1] 0.008615166
#> 
#> $T2_123456$p_F
#> [1] 0.01399289
#> 
#> $T2_123456$theta
#> [1]   1.605465 248.033500
#> 
#> $T2_123456$d
#> [1]  0.009550301 -0.048898794  0.164793978 -0.658861256  2.731493015
#> [6] -9.511474432
#> 
#> $T2_123456$Kd
#>             [,1]         [,2]        [,3]        [,4]       [,5]       [,6]
#> [1,] -0.02944059   0.04955572  -0.3173502    1.824940  -6.639026   20.13131
#> [2,]  0.04955572   0.07320210   0.2102838   -2.720260  12.809123  -45.83800
#> [3,] -0.31735019   0.21028381   0.3415546    2.090514 -15.927277   68.35576
#> [4,]  1.82494003  -2.72025951   2.0905143   -4.777864  24.301953 -117.77434
#> [5,] -6.63902601  12.80912265 -15.9272766   24.301953 -55.470330  231.31724
#> [6,] 20.13131432 -45.83800063  68.3557640 -117.774338 231.317244 -736.64961
#> 
#> $T2_123456$eigmin
#> [1] -834.2953
#> 
#> $T2_123456$conv
#> [1] TRUE
#> 
#> 
#> $fit
#> $fit$theta
#> [1]   1.605465 248.033500
#> 
#> $fit$Sigma
#>           [,1]        [,2]
#> [1,]  1.487107    62.69933
#> [2,] 62.699329 26511.04699
#> 
#> $fit$loglik
#> [1] -625.2069
#> 
#> $fit$conv
#> [1] TRUE

Why the bootstrap?

The discrepancy covariance \(\mathbf{K}_d\) is typically ill-conditioned: one eigenvalue is much smaller than the others and is estimated with large relative error. As a result the asymptotic chi-squared reference over-rejects, and the distortion does not vanish as the sample grows. The parametric bootstrap reproduces the ill-conditioning in each replicate, so it cancels in the bootstrap p-value.

T2_bootstrap(x, "Weibull", B = 299, seed = 1)
#> $p_boot
#>     T2_23    T2_123 T2_123456 
#> 0.4180602 0.2541806 0.3779264 
#> 
#> $T2_obs
#>     T2_23    T2_123 T2_123456 
#>  3.205122  4.984082 13.619165 
#> 
#> $B
#> [1] 299
#> 
#> $valid
#> [1] 299 299 299
#> 
#> $obs
#> $obs$T2_23
#> $obs$T2_23$T2
#> [1] 3.205122
#> 
#> $obs$T2_23$df
#> [1] 1
#> 
#> $obs$T2_23$p_chisq
#> [1] 0.07340803
#> 
#> $obs$T2_23$p_F
#> [1] 0.07646484
#> 
#> $obs$T2_23$theta
#> [1]   1.605465 248.033500
#> 
#> $obs$T2_23$d
#> [1] -0.04889879  0.16479398
#> 
#> $obs$T2_23$Kd
#>           [,1]      [,2]
#> [1,] 0.0732021 0.2102838
#> [2,] 0.2102838 0.3415546
#> 
#> $obs$T2_23$eigmin
#> [1] -0.04206613
#> 
#> $obs$T2_23$conv
#> [1] TRUE
#> 
#> 
#> $obs$T2_123
#> $obs$T2_123$T2
#> [1] 4.984082
#> 
#> $obs$T2_123$df
#> [1] 2
#> 
#> $obs$T2_123$p_chisq
#> [1] 0.08274091
#> 
#> $obs$T2_123$p_F
#> [1] 0.09010487
#> 
#> $obs$T2_123$theta
#> [1]   1.605465 248.033500
#> 
#> $obs$T2_123$d
#> [1]  0.009550301 -0.048898794  0.164793978
#> 
#> $obs$T2_123$Kd
#>             [,1]       [,2]       [,3]
#> [1,] -0.02944059 0.04955572 -0.3173502
#> [2,]  0.04955572 0.07320210  0.2102838
#> [3,] -0.31735019 0.21028381  0.3415546
#> 
#> $obs$T2_123$eigmin
#> [1] -0.2793186
#> 
#> $obs$T2_123$conv
#> [1] TRUE
#> 
#> 
#> $obs$T2_123456
#> $obs$T2_123456$T2
#> [1] 13.61917
#> 
#> $obs$T2_123456$df
#> [1] 4
#> 
#> $obs$T2_123456$p_chisq
#> [1] 0.008615166
#> 
#> $obs$T2_123456$p_F
#> [1] 0.01399289
#> 
#> $obs$T2_123456$theta
#> [1]   1.605465 248.033500
#> 
#> $obs$T2_123456$d
#> [1]  0.009550301 -0.048898794  0.164793978 -0.658861256  2.731493015
#> [6] -9.511474432
#> 
#> $obs$T2_123456$Kd
#>             [,1]         [,2]        [,3]        [,4]       [,5]       [,6]
#> [1,] -0.02944059   0.04955572  -0.3173502    1.824940  -6.639026   20.13131
#> [2,]  0.04955572   0.07320210   0.2102838   -2.720260  12.809123  -45.83800
#> [3,] -0.31735019   0.21028381   0.3415546    2.090514 -15.927277   68.35576
#> [4,]  1.82494003  -2.72025951   2.0905143   -4.777864  24.301953 -117.77434
#> [5,] -6.63902601  12.80912265 -15.9272766   24.301953 -55.470330  231.31724
#> [6,] 20.13131432 -45.83800063  68.3557640 -117.774338 231.317244 -736.64961
#> 
#> $obs$T2_123456$eigmin
#> [1] -834.2953
#> 
#> $obs$T2_123456$conv
#> [1] TRUE
#> 
#> 
#> $obs$fit
#> $obs$fit$theta
#> [1]   1.605465 248.033500
#> 
#> $obs$fit$Sigma
#>           [,1]        [,2]
#> [1,]  1.487107    62.69933
#> [2,] 62.699329 26511.04699
#> 
#> $obs$fit$loglik
#> [1] -625.2069
#> 
#> $obs$fit$conv
#> [1] TRUE

In practice, prefer the bootstrap p-values for all sample sizes.

Full model comparison

gof_compare_all() runs the three \(T^2\) tests, the Anderson–Darling and Cramer–von Mises tests, and the AIC across all six families:

gof_compare_all(x, use_bootstrap = TRUE, B = 199, seed = 1)
#>              Dist    T2_23        p_23    T2_123        p_123     T2_full
#> stat      Weibull 3.205122 0.073408033  4.984082 8.274091e-02   13.619165
#> stat1     Frechet 9.283366 0.002312441  7.596404 5.848483e-03 4590.357543
#> stat2       Gamma 1.129038 0.568633637  1.103158 5.760397e-01    6.820438
#> stat3    InvGamma 8.310963 0.003940649  7.003544 8.134853e-03  913.428776
#> stat4   LogNormal 5.455520 0.019506603 29.914648 3.192395e-07   92.386879
#> stat5 LogLogistic 5.963457 0.014605378  8.832689 1.207830e-02   85.779331
#>              p_full        AD        AD_p       CvM       CvM_p      AIC
#> stat   8.615166e-03 0.5410004 0.705401889 0.0947568 0.611194678 1254.414
#> stat1  0.000000e+00 5.5787380 0.001522060 0.9905471 0.002589291 1308.897
#> stat2  1.456870e-01 0.6813406 0.574619957 0.1259204 0.472054154 1254.526
#> stat3 2.051427e-196 5.2662380 0.002139914 1.0097728 0.002334086 1301.964
#> stat4  4.095678e-19 2.0232404 0.089180840 0.3729128 0.085283955 1267.620
#> stat5  1.036615e-17 1.3077253 0.229857896 0.1636898 0.350430282 1263.243
#>            pb_23     pb_123     pb_full
#> stat  0.42713568 0.26633166 0.346733668
#> stat1 0.15577889 0.24120603 0.000000000
#> stat2 0.51758794 0.47738693 0.356783920
#> stat3 0.04020101 0.05025126 0.000000000
#> stat4 0.02512563 0.00000000 0.010050251
#> stat5 0.02010050 0.04522613 0.005025126

The \(T^2\) tests and the EDF tests are complementary: the former are most sensitive to subtle log-shape departures, the latter to tail departures.