Last updated: 2018-07-06
Code version: eae2de7
Loading required package: ashr
Package 'mclust' version 5.4
Type 'citation("mclust")' for citing this R package in publications.
Attaching package: 'mclust'
The following object is masked from 'package:ashr':
dens
UKBioBank Strong data
data = readRDS('../data/UKBioBank/StrongData.rds')
Estimate se based on p values
# Adjust p value == 0
data$p[data$p == 0] = 1e-323
Fit EZ model directly to the Z scores, with standard errors of the non-missing Z scores set to 1 and the missing ones set to 10^6.
mash.data = mash_set_data(Bhat = data$beta, pval = data$p, alpha = 1)
Zhat = mash.data$Bhat; Shat = mash.data$Shat
missing = is.na(Zhat)
Shat[missing] = 10^6
Zhat[missing] = 0
data.EZ = mash_set_data(Zhat,Shat)
# column center
Z.center = apply(Zhat, 2, function(x) x - mean(x))
\[ Z = LF' + E \] Z is an \(n \times p\) observed centered data, F is a \(p\times k\) matrix of factors, L is an \(n \times k\) matrix of loadings.
Results from Flash_UKBio
fmodel = readRDS('../output/Flash_UKBio_strong.rds')
Suppose the rows of L come from a mixture of multivariate normals, with covariances \(\Sigma_1,\dots,\Sigma_M\) (each one a K by K matrix). \[ l_{i \cdot} \sim \sum_{j=1}^{M} N(\mu_{j}, \Sigma_{j}) \] Then the rows of \(LF'\) come from a mixture of multivariate normals \[ Fl_{i\cdot} \sim \sum_{j=1}^{M} N(F\mu_{j}, F\Sigma_{j}F') \] We estimate the covariance matrix as \(F(\Sigma_{j}+\mu_{j}\mu_{j}')F′\).
Cluster loadings:
loading = fmodel$EL[,1:18]
colnames(loading) = paste0('Factor',seq(1,18))
mod = Mclust(loading)
summary(mod$BIC)
Best BIC values:
VVV,9 VVV,8 VVI,9
BIC -1836091 -1847525.77 -1852520.94
BIC diff 0 -11434.81 -16429.98
U_list = alply(mod$parameters$variance$sigma,3)
mu_list = alply(mod$parameters$mean,2)
ll = list()
for (i in 1:length(U_list)){
ll[[i]] = U_list[[i]] + mu_list[[i]] %*% t(mu_list[[i]])
}
Factors = fmodel$EF[,1:18]
U.loading = lapply(ll, function(U){Factors %*% (U %*% t(Factors))})
names(U.loading) = paste0('Load', "_", (1:length(U.loading)))
# rank 1
Flash_res = flash_get_lf(fmodel)
U.Flash = c(mashr::cov_from_factors(t(as.matrix(Factors)), "Flash"),
list("tFlash" = t(Flash_res) %*% Flash_res / nrow(Z.center)))
U.pca = cov_pca(data.EZ, 5)
svd currently performed on Bhat; maybe should be Bhat/Shat?
U.dd = c(U.pca, U.loading, U.Flash, list('XX' = t(data.EZ$Bhat) %*% data.EZ$Bhat / nrow(data.EZ$Bhat)))
U.ed = cov_ed(data.EZ, U.dd)
saveRDS(U.ed, '../output/CovED_UKBio_strong_Z.rds')
U.ed = readRDS('../output/CovED_UKBio_strong_Z.rds')
U.c = cov_canonical(data.EZ)
Read random data
data.rand = readRDS('../data/UKBioBank/RandomData.rds')
# Estimate se based on p values
# Adjust p value == 0
data.rand$p[data.rand$p == 0] = 1e-323
mash.data.rand = mash_set_data(Bhat = data.rand$beta, pval = data.rand$p, alpha = 1)
Zhat = mash.data.rand$Bhat; Shat = mash.data.rand$Shat
missing = is.na(Zhat)
Shat[missing] = 10^6
Zhat[missing] = 0
data.rand.EZ = mash_set_data(Zhat,Shat)
Vhat = estimate_null_correlation(data.rand.EZ)
data.rand.EZ.V = mash_set_data(data.rand.EZ$Bhat, data.rand.EZ$Shat, V = Vhat)
mash.model = mash(data.rand.EZ.V, c(U.c, U.ed), outputlevel = 1)
saveRDS(mash.model, '../output/UKBio_mash_model.rds')
mash.model = readRDS('../output/UKBio_mash_model.rds')
The log-likelihood of fit is
get_loglik(mash.model)
[1] -5357356
Here is a plot of weights learned:
options(repr.plot.width=12, repr.plot.height=4)
barplot(get_estimated_pi(mash.model), las = 2, cex.names = 0.7)
The founded correlations:
There must be some strong environmental factors that are affecting these phenotypes together. The associations from the mash result here may not the pure genetic associations.
From the load_4
covariance matrix, neuroticism and smoking are highly correlated here (0.71). Neurotic people may smoke more and finish their school life earlier. (Factor 15)
ED_XX
:
x <- cov2cor(mash.model$fitted_g$Ulist[["ED_XX"]])
colnames(x) <- colnames(get_lfsr(mash.model))
rownames(x) <- colnames(x)
corrplot::corrplot(x, method='color', cl.lim=c(-1,1), type='upper', addCoef.col = "black", tl.col="black", tl.srt=45, col=colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF", "#E0F3F8","#91BFDB","#4575B4")))(128))
The top eigenvalues:
svd.out = svd(mash.model$fitted_g$Ulist[["ED_XX"]])
v = svd.out$v
colnames(v) = colnames(get_lfsr(mash.model))
rownames(v) = colnames(v)
options(repr.plot.width=10, repr.plot.height=5)
for (j in 1:6)
barplot(v[,j]/v[,j][which.max(abs(v[,j]))], cex.names = 0.6,
las = 2, main = paste0("EigenVector ", j, " for empirical covariance matrix"))
ED_load_4
x <- cov2cor(mash.model$fitted_g$Ulist[["ED_Load_4"]])
colnames(x) <- colnames(get_lfsr(mash.model))
rownames(x) <- colnames(x)
corrplot::corrplot(x, method='color', cl.lim=c(-1,1), type='upper', addCoef.col = "black", tl.col="black", tl.srt=45, col=colorRampPalette(rev(c("#D73027","#FC8D59","#FEE090","#FFFFBF", "#E0F3F8","#91BFDB","#4575B4")))(128))
The top eigenvalues:
svd.out = svd(mash.model$fitted_g$Ulist[["ED_Load_4"]])
v = svd.out$v
colnames(v) = colnames(get_lfsr(mash.model))
rownames(v) = colnames(v)
options(repr.plot.width=10, repr.plot.height=5)
for (j in 1:3)
barplot(v[,j]/v[,j][which.max(abs(v[,j]))], cex.names = 0.6,
las = 2, main = paste0("EigenVector ", j, " for Load 4 covariance matrix"))
data.strong = mash_set_data(data.EZ$Bhat, data.EZ$Shat, V = Vhat)
mash.model$result = mash_compute_posterior_matrices(mash.model, data.strong)
There are 60069 significant snps.
Pairwise sharing (Shared by magnitude when sign is ignored):
x <- get_pairwise_sharing(mash.model, FUN = abs)
colnames(x) <- colnames(get_lfsr(mash.model))
rownames(x) <- colnames(x)
clrs=colorRampPalette(rev(c('darkred', 'red','orange','yellow','cadetblue1', 'cyan', 'dodgerblue4', 'blue','darkorchid1','lightgreen','green', 'forestgreen','darkolivegreen')))(200)
corrplot::corrplot(x, method='color', type='upper', addCoef.col = "black", tl.col="black", tl.srt=45, diag = FALSE, col=clrs, cl.lim = c(0,1))
Pairwise sharing (Shared by magnitude and sign):
x <- get_pairwise_sharing(mash.model)
colnames(x) <- colnames(get_lfsr(mash.model))
rownames(x) <- colnames(x)
clrs=colorRampPalette(rev(c('darkred', 'red','orange','yellow','cadetblue1', 'cyan', 'dodgerblue4', 'blue','darkorchid1','lightgreen','green', 'forestgreen','darkolivegreen')))(200)
corrplot::corrplot(x, method='color', type='upper', addCoef.col = "black", tl.col="black", diag=FALSE,tl.srt=45, col=clrs, cl.lim = c(0,1))
sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] colorRamps_2.3 ggplot2_2.2.1 lattice_0.20-35 plyr_1.8.4
[5] mclust_5.4 mashr_0.2-10 ashr_2.2-7 flashr_0.5-11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.17 compiler_3.4.4 pillar_1.2.2
[4] git2r_0.21.0 iterators_1.0.9 tools_3.4.4
[7] corrplot_0.84 digest_0.6.15 evaluate_0.10.1
[10] tibble_1.4.2 gtable_0.2.0 rlang_0.2.0
[13] Matrix_1.2-14 foreach_1.4.4 yaml_2.1.19
[16] parallel_3.4.4 mvtnorm_1.0-7 ebnm_0.1-11
[19] stringr_1.3.0 knitr_1.20 rprojroot_1.3-2
[22] grid_3.4.4 rmarkdown_1.9 rmeta_3.0
[25] magrittr_1.5 backports_1.1.2 scales_0.5.0
[28] codetools_0.2-15 htmltools_0.3.6 MASS_7.3-50
[31] assertthat_0.2.0 softImpute_1.4 colorspace_1.3-2
[34] stringi_1.2.2 lazyeval_0.2.1 pscl_1.5.2
[37] doParallel_1.0.11 munsell_0.4.3 truncnorm_1.0-8
[40] SQUAREM_2017.10-1
This R Markdown site was created with workflowr