Yesterday I posted a minimal phylogenetic PCA using contrasts (Felsenstein 1985).
The main advantage of this is that it does phylogenetic PCA (following Revell 2009), but scales better to larger datasets because it does not involve N × N (for N taxa) matrix inversion. (Indeed, this is kind of the point of contrasts!)
After a bit of fiddling, today I wrote an extension of this to Pagel’s (1999) λ.
This is what that function looks like.
phyl_pca<-function(tree,X,method="BM"){
lik<-function(lambda,tree,X){
tt<-phytools:::lambdaTree(tree,lambda)
pics<-lapply(X,pic,tt,scaled=FALSE,var.contrasts=TRUE)
pX<-sapply(pics,function(x) x[,1]/sqrt(x[,2]))
vcv<-t(pX)%*%pX/(Ntip(tt)-1)
vars<-pics[[1]][,2]
logL<-0
for(i in 1:nrow(pX)){
x<-sapply(pics,function(x,i) x[i,1],i=i)
logL<-logL+mnormt::dmnorm(x,varcov=vars[i]*vcv,log=TRUE)
}
logL
}
X<-X[tree$tip.label,]
if(method=="lambda"){
reml_fit<-optimize(lik,c(0,1),tree=tree,
X=as.data.frame(X),maximum=TRUE)
logL.lambda<-reml_fit$objective
lambda<-reml_fit$maximum
tree<-phytools:::lambdaTree(tree,lambda)
} else {
logL.lambda<-lik(1,tree,X)
lambda<-1
}
pX<-apply(X,2,pic,phy=tree)
vcv<-t(pX)%*%pX/(Ntip(tree)-1)
a<-apply(X,2,function(x,tree) rep(ace(x,tree,
method="pic")$ace[1],Ntip(tree)),tree=tree)
eig<-eigen(vcv)
Eval<-diag(eig$values)
colnames(Eval)<-rownames(Eval)<-paste("PC",1:nrow(Eval),sep="")
Evec<-eig$vectors
rownames(Evec)<-colnames(X)
colnames(Evec)<-colnames(Eval)
S<-(X-a)%*%Evec
pic_corr<-function(x,y,tree){
px<-pic(x,tree)
py<-pic(y,tree)
mean(px*py)/sqrt(mean(px^2)*mean(py^2))
}
L<-apply(S,2,function(x,y,tree) apply(y,2,pic_corr,
x=x,tree=tree),y=X,tree=tree)
dimnames(L)<-dimnames(Evec)
object<-list(Eval=Eval,Evec=Evec,
S=S,L=L,lambda=lambda,logL.lambda=logL.lambda,
V=vcv,a=a[1,,drop=FALSE],
mode="cov",call=match.call())
class(object)<-"phyl.pca"
object
}
Since we are computing contrasts instead of the original data, technically we are using restricted maximum likelihood (REML) to optimize λ. We still get a huge computational advantage by doing this.
Let’s see.
library(phytools)
## simulate data with lambda = 0.6
lambda<-0.6
tree<-pbtree(n=200)
vcv<-clusterGeneration::genPositiveDefMat(10,
"unifcorrmat")$Sigma
X<-sim.corrs(phytools:::lambdaTree(tree,lambda),
vcv)
Starting with phyl.pca
in phytools.
system.time(ppca_ml<-phyl.pca(tree,X,method="lambda"))
## user system elapsed
## 6.43 0.08 21.33
ppca_ml
## Phylogenetic pca
## Standard deviations:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 4.31616 4.01712 3.39111 3.00109 2.04504 1.71176 1.15564 0.98006 0.87278 0.20861
## Loads:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## [1,] 4.7462e-01 -0.4806239 0.4191735 -0.294062 0.217829 0.015974 -0.298758 0.331756 -0.185259 -0.01454068
## [2,] -4.3743e-06 0.2936455 -0.6604620 -0.655605 0.168443 -0.059635 0.017417 -0.057903 -0.110264 -0.00013738
## [3,] 5.5296e-01 -0.4308829 0.2202151 0.080344 0.048030 0.620567 0.082670 -0.158031 -0.183204 0.02899774
## [4,] -5.0057e-01 0.6849344 0.4546268 -0.136166 -0.025438 0.148241 0.153659 0.060954 -0.064597 -0.03071714
## [5,] -7.2998e-01 -0.3819135 -0.0101332 -0.422701 -0.348102 0.070734 0.046053 0.111808 0.018559 0.03671028
## [6,] -6.7155e-02 -0.0029304 0.7891214 -0.551515 0.084691 -0.091512 -0.132212 -0.177900 0.062350 0.00606643
## [7,] 7.3958e-01 0.1327498 0.0082785 -0.134077 -0.633622 -0.087927 -0.034122 -0.047259 -0.066079 -0.01955133
## [8,] 3.7372e-01 0.5992872 -0.3374310 -0.284524 -0.036591 0.463110 -0.178435 0.070785 0.231623 -0.00803616
## [9,] -8.8691e-01 -0.0154607 -0.2158111 0.266327 -0.116294 0.094412 -0.240161 -0.096858 -0.076831 -0.01581511
## [10,] -9.6482e-02 -0.9481978 -0.1229003 -0.233683 0.023089 0.080072 0.088926 -0.030077 0.063777 -0.04555520
## lambda:
## [1] 0.55526
Now using our new REML function.
system.time(ppca_reml<-phyl_pca(tree,X,method="lambda"))
## user system elapsed
## 0.08 0.00 0.21
ppca_reml
## Phylogenetic pca
## Standard deviations:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 4.36961 4.05840 3.42795 3.03251 2.06458 1.73144 1.16706 0.99029 0.88266 0.21057
## Loads:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## [1,] 0.47576690 -0.4820428 0.4175804 0.294172 0.216966 0.016302 -0.297265 0.331830 -0.185316 -0.01451415
## [2,] -0.00064244 0.2908121 -0.6627109 0.654459 0.168711 -0.060274 0.017718 -0.057759 -0.110360 -0.00012949
## [3,] 0.55404753 -0.4295254 0.2187942 -0.080685 0.048170 0.620962 0.082678 -0.158043 -0.183285 0.02894295
## [4,] -0.50217067 0.6828500 0.4548453 0.139658 -0.026081 0.148351 0.153826 0.060778 -0.064546 -0.03067857
## [5,] -0.73033553 -0.3843107 -0.0130407 0.420391 -0.347547 0.070898 0.045860 0.111405 0.018631 0.03660921
## [6,] -0.06703325 -0.0072461 0.7872969 0.554199 0.083915 -0.091371 -0.132512 -0.177850 0.062163 0.00604124
## [7,] 0.74009624 0.1328999 0.0055181 0.133973 -0.633083 -0.087369 -0.034629 -0.047321 -0.066095 -0.01954688
## [8,] 0.37303002 0.5984050 -0.3387614 0.285701 -0.035148 0.463308 -0.178128 0.070892 0.231660 -0.00802226
## [9,] -0.88732186 -0.0140251 -0.2152158 -0.266731 -0.114582 0.094526 -0.239765 -0.096338 -0.076930 -0.01578684
## [10,] -0.09581160 -0.9488756 -0.1249671 0.230110 0.023075 0.080023 0.088829 -0.030221 0.063862 -0.04548989
## lambda:
## [1] 0.57096
There are very small numerical differences between our estimates – which comes down to our different estimation procedure (ML vs REML); however, we see the promised computational advantage.
OK, now let’s test it with a much bigger tree and dataset – let’s say 2,000 taxa and 20 characters.
lambda<-0.4
tree<-pbtree(n=2000)
vcv<-clusterGeneration::genPositiveDefMat(20,
"unifcorrmat")$Sigma
X<-sim.corrs(phytools:::lambdaTree(tree,lambda),
vcv)
system.time(ppca_reml<-phyl_pca(tree,X,method="lambda"))
## user system elapsed
## 1.26 0.03 2.21
options(digits=3)
ppca_reml
## Phylogenetic pca
## Standard deviations:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
## 4.1151 3.8530 3.6877 3.4204 3.2111 3.0166 2.6411 2.5707 2.1083 2.0304 1.6608 1.5033 1.3006 1.1877 0.9327 0.7166 0.6101 0.4224
## PC19 PC20
## 0.2058 0.0975
## Loads:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14
## [1,] 0.7838 -0.0915 0.14421 0.2598 0.0559 0.2983 -0.2090 0.2093 -0.07945 0.24283 0.06969 0.17236 0.00748 0.08946
## [2,] -0.2068 -0.0479 -0.28917 0.2673 -0.3631 -0.4640 -0.2636 -0.2138 0.40960 0.20230 -0.11334 0.06707 -0.04351 0.25505
## [3,] -0.1799 0.0325 -0.05317 -0.0761 -0.3135 0.1391 -0.8200 0.1728 0.04045 0.00599 0.15459 -0.03767 -0.15049 -0.02309
## [4,] -0.7435 -0.3764 -0.00453 0.0886 0.0325 0.3050 -0.1819 -0.0920 -0.35711 0.00948 0.00974 -0.01019 -0.12462 0.09304
## [5,] 0.3117 -0.4107 -0.03763 -0.4323 0.4573 0.3411 0.0770 -0.2132 0.00512 -0.26676 -0.22306 0.08991 -0.04733 0.15256
## [6,] 0.3552 -0.3587 -0.43823 -0.0260 0.1497 -0.4012 0.1038 -0.2902 0.07427 0.13749 -0.09299 0.12165 -0.18267 -0.35242
## [7,] -0.0394 0.3358 -0.04730 -0.5041 -0.2472 -0.4282 0.1939 0.5000 -0.23833 0.00063 -0.01945 0.10336 -0.05609 0.15505
## [8,] 0.3132 0.4029 -0.28411 0.2489 0.1393 -0.3594 -0.0182 -0.3468 -0.36775 0.25405 -0.16750 -0.09826 -0.27829 -0.01490
## [9,] -0.2459 -0.5470 -0.53433 0.3518 0.2374 -0.1308 0.0806 0.3208 0.16898 0.03578 0.09419 -0.04007 -0.03586 0.02586
## [10,] 0.1258 -0.1179 -0.59183 0.2492 -0.6134 0.2466 0.1837 -0.1021 -0.21282 -0.02924 -0.05460 0.07971 0.12965 -0.04665
## [11,] 0.1956 0.2398 -0.81430 -0.3155 0.1241 0.0579 -0.2279 0.0304 0.01679 -0.25123 0.01239 0.00229 0.00955 0.03585
## [12,] 0.4057 -0.1461 -0.33615 -0.4173 -0.0946 -0.3012 -0.3200 -0.4261 0.07716 0.08500 0.21419 0.01622 0.02163 0.06000
## [13,] 0.3268 0.1584 -0.35252 0.0616 0.6602 0.0542 0.0548 0.3820 -0.21463 -0.01301 0.13077 -0.03307 -0.08943 -0.19023
## [14,] -0.5208 0.5992 -0.26381 -0.1475 0.2806 0.2778 0.0541 -0.0827 0.06490 0.28294 0.05655 0.12879 0.08154 0.00297
## [15,] -0.1630 -0.3547 0.09112 0.1440 0.3996 -0.6069 -0.0849 -0.1849 -0.33418 -0.04340 0.13431 0.20297 0.25235 0.03384
## [16,] -0.1561 -0.0652 0.08676 -0.1559 0.2163 -0.1704 -0.7427 0.0296 0.06599 0.05702 -0.39814 0.17704 0.17366 -0.22682
## [17,] -0.0518 0.5592 0.01531 0.7185 0.1524 -0.0587 -0.0815 0.0465 0.04607 -0.29378 -0.15994 0.09159 -0.00716 0.06893
## [18,] 0.1500 -0.3851 -0.25640 -0.0581 0.2966 0.0452 0.3269 0.0243 0.12637 0.50185 -0.40871 -0.14069 0.07556 0.15727
## [19,] 0.1811 0.1724 0.06869 0.2133 0.0817 0.0708 0.4466 -0.4975 0.19182 -0.15027 0.47046 0.22021 -0.21463 0.09358
## [20,] 0.3789 0.4003 -0.06552 0.2112 0.2515 0.0384 -0.1349 -0.1412 -0.18267 0.00923 0.17280 -0.60107 0.27322 0.17230
## PC15 PC16 PC17 PC18 PC19 PC20
## [1,] -0.01955 0.01401 -0.00580 0.02905 -0.00226 0.003073
## [2,] 0.06838 -0.01636 -0.19869 0.00214 0.00650 -0.004601
## [3,] 0.27313 -0.00784 0.09887 -0.05848 -0.00231 -0.010417
## [4,] -0.06665 -0.06360 -0.02158 0.03151 -0.00595 0.000475
## [5,] 0.08263 0.06909 -0.02169 -0.04470 -0.02134 -0.002047
## [6,] 0.17954 -0.08491 -0.00447 0.15145 -0.07845 -0.005843
## [7,] 0.01390 -0.03584 0.00777 -0.00840 -0.02231 0.003904
## [8,] -0.00114 0.10367 0.02533 -0.03049 0.01420 0.007917
## [9,] -0.02795 0.05832 0.03307 -0.02004 -0.01085 0.006836
## [10,] 0.03044 -0.00655 -0.00764 -0.03431 -0.00571 -0.000635
## [11,] -0.00656 -0.01489 0.00872 0.06227 0.02805 0.005411
## [12,] -0.26810 -0.03461 0.06371 -0.04401 -0.02595 -0.015577
## [13,] -0.00992 -0.11731 -0.15473 -0.07431 0.01054 -0.017798
## [14,] 0.01778 0.04036 0.00393 -0.00382 -0.01070 -0.001279
## [15,] 0.10248 0.02015 0.01842 0.00606 0.01639 -0.004290
## [16,] -0.08423 -0.12916 -0.02531 -0.07865 -0.01458 0.036651
## [17,] -0.04299 -0.03410 0.04715 0.00143 -0.01510 -0.008011
## [18,] 0.07101 -0.22636 0.14179 -0.00506 0.03091 -0.010734
## [19,] 0.09055 -0.18123 0.02922 -0.04560 0.00699 0.022998
## [20,] 0.09792 -0.04654 -0.04018 0.00524 -0.04428 0.012082
## lambda:
## [1] 0.4
biplot(ppca_reml,cex=c(0.3,0.8))
Not bad.
No comments:
Post a Comment
Note: due to the very large amount of spam, all comments are now automatically submitted for moderation.