Saturday, September 16, 2017

Visualizing all the non-homoplastic SNPs in a set on a phylogeny

The student of a colleague recently asked me about 'mapping' SNPs (single- nucleotide-polymorphisms') onto the tree. Basically, what he seemed to want was for a set of m SNPs, a list of m edge IDs or taxon bipartitions corresponding to the edge with which that SNP was perfectly consistent.

I asked him to send me an example (so I could be sure I understood what he was working from), and I'm still waiting, but I thought I'd nonetheless post a partial solution that might work for him so that we can go from there.

First, I'll conjure a tree & some data.

library(phytools)
tree
## 
## Phylogenetic tree with 26 tips and 25 internal nodes.
## 
## Tip labels:
##  A, B, C, D, E, F, ...
## 
## Rooted; includes branch lengths.
plotTree(tree)

plot of chunk unnamed-chunk-1

X
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## A "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "a"   "t"   "g"  
## B "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "a"   "t"   "g"  
## C "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## D "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## E "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## F "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## G "c"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## H "c"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## I "g"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## J "g"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## K "g"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## L "g"  "g"  "c"  "c"  "a"  "a"  "a"  "t"  "c"  "g"   "t"   "t"   "c"  
## M "c"  "g"  "c"  "c"  "a"  "c"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## N "c"  "g"  "c"  "c"  "a"  "c"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## O "c"  "g"  "c"  "c"  "a"  "c"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## P "c"  "g"  "c"  "c"  "c"  "c"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## Q "c"  "g"  "c"  "g"  "a"  "a"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## R "c"  "g"  "c"  "g"  "a"  "a"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## S "c"  "g"  "a"  "c"  "a"  "c"  "a"  "a"  "g"  "g"   "t"   "a"   "c"  
## T "c"  "t"  "a"  "g"  "a"  "c"  "a"  "t"  "g"  "g"   "t"   "a"   "c"  
## U "c"  "t"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## V "c"  "t"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## W "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## X "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## Y "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "g"   "t"   "t"   "g"  
## Z "c"  "g"  "c"  "c"  "a"  "a"  "c"  "a"  "g"  "t"   "t"   "t"   "g"  
##   [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## A "t"   "t"   "a"   "t"   "a"   "a"   "c"   "g"   "c"   "t"   "c"   "c"  
## B "t"   "t"   "a"   "t"   "c"   "t"   "c"   "a"   "c"   "t"   "t"   "c"  
## C "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## D "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## E "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "a"   "t"   "t"   "c"  
## F "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## G "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## H "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## I "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## J "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## K "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## L "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "a"   "t"   "c"  
## M "t"   "t"   "c"   "t"   "c"   "a"   "a"   "a"   "c"   "t"   "t"   "c"  
## N "t"   "t"   "c"   "t"   "c"   "t"   "c"   "a"   "c"   "t"   "t"   "c"  
## O "t"   "t"   "c"   "t"   "c"   "t"   "c"   "a"   "c"   "t"   "t"   "c"  
## P "t"   "t"   "c"   "t"   "c"   "t"   "c"   "a"   "c"   "t"   "t"   "c"  
## Q "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## R "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## S "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## T "t"   "t"   "c"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## U "t"   "g"   "a"   "g"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "a"  
## V "t"   "g"   "a"   "g"   "c"   "a"   "c"   "a"   "a"   "t"   "t"   "a"  
## W "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## X "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## Y "t"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
## Z "c"   "t"   "a"   "t"   "c"   "a"   "c"   "a"   "c"   "t"   "t"   "c"  
##   [,26] [,27] [,28]
## A "t"   "t"   "c"  
## B "t"   "t"   "c"  
## C "t"   "t"   "c"  
## D "t"   "t"   "c"  
## E "t"   "t"   "c"  
## F "t"   "a"   "c"  
## G "t"   "t"   "c"  
## H "t"   "t"   "c"  
## I "t"   "t"   "c"  
## J "t"   "t"   "c"  
## K "t"   "t"   "c"  
## L "t"   "t"   "c"  
## M "g"   "t"   "c"  
## N "g"   "t"   "c"  
## O "g"   "t"   "c"  
## P "g"   "t"   "c"  
## Q "t"   "t"   "c"  
## R "t"   "t"   "c"  
## S "t"   "t"   "c"  
## T "t"   "t"   "c"  
## U "t"   "t"   "g"  
## V "t"   "t"   "g"  
## W "t"   "t"   "g"  
## X "t"   "t"   "g"  
## Y "t"   "t"   "g"  
## Z "t"   "t"   "g"

Now, I'm going to find all the splits in the tree using the function phangorn::as.splits:

library(phangorn)
obj<-as.splits(tree)
obj
##       A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
##  [1,] | . . . . . . . . . . . . . . . . . . . . . . . . .
##  [2,] . | . . . . . . . . . . . . . . . . . . . . . . . .
##  [3,] . . | . . . . . . . . . . . . . . . . . . . . . . .
##  [4,] . . . | . . . . . . . . . . . . . . . . . . . . . .
##  [5,] . . . . | . . . . . . . . . . . . . . . . . . . . .
##  [6,] . . . . . | . . . . . . . . . . . . . . . . . . . .
##  [7,] . . . . . . | . . . . . . . . . . . . . . . . . . .
##  [8,] . . . . . . . | . . . . . . . . . . . . . . . . . .
##  [9,] . . . . . . . . | . . . . . . . . . . . . . . . . .
## [10,] . . . . . . . . . | . . . . . . . . . . . . . . . .
## [11,] . . . . . . . . . . | . . . . . . . . . . . . . . .
## [12,] . . . . . . . . . . . | . . . . . . . . . . . . . .
## [13,] . . . . . . . . . . . . | . . . . . . . . . . . . .
## [14,] . . . . . . . . . . . . . | . . . . . . . . . . . .
## [15,] . . . . . . . . . . . . . . | . . . . . . . . . . .
## [16,] . . . . . . . . . . . . . . . | . . . . . . . . . .
## [17,] . . . . . . . . . . . . . . . . | . . . . . . . . .
## [18,] . . . . . . . . . . . . . . . . . | . . . . . . . .
## [19,] . . . . . . . . . . . . . . . . . . | . . . . . . .
## [20,] . . . . . . . . . . . . . . . . . . . | . . . . . .
## [21,] . . . . . . . . . . . . . . . . . . . . | . . . . .
## [22,] . . . . . . . . . . . . . . . . . . . . . | . . . .
## [23,] . . . . . . . . . . . . . . . . . . . . . . | . . .
## [24,] . . . . . . . . . . . . . . . . . . . . . . . | . .
## [25,] . . . . . . . . . . . . . . . . . . . . . . . . | .
## [26,] . . . . . . . . . . . . . . . . . . . . . . . . . |
## [27,] | | | | | | | | | | | | | | | | | | | | | | | | | |
## [28,] | | | | | | . . . . . . . . . . . . . . . . . . . .
## [29,] | | . . . . . . . . . . . . . . . . . . . . . . . .
## [30,] . . | | | | . . . . . . . . . . . . . . . . . . . .
## [31,] . . | | | . . . . . . . . . . . . . . . . . . . . .
## [32,] . . | | . . . . . . . . . . . . . . . . . . . . . .
## [33,] . . . . . . | | | | | | | | | | | | | | | | | | | |
## [34,] . . . . . . | | | | | | | | | | | | | | . . . . . .
## [35,] . . . . . . | | | | | | . . . . . . . . . . . . . .
## [36,] . . . . . . | | . . . . . . . . . . . . . . . . . .
## [37,] . . . . . . . . | | | | . . . . . . . . . . . . . .
## [38,] . . . . . . . . | | | . . . . . . . . . . . . . . .
## [39,] . . . . . . . . . | | . . . . . . . . . . . . . . .
## [40,] . . . . . . . . . . . . | | | | | | | | . . . . . .
## [41,] . . . . . . . . . . . . | | | | | | . . . . . . . .
## [42,] . . . . . . . . . . . . | | | | . . . . . . . . . .
## [43,] . . . . . . . . . . . . . | | | . . . . . . . . . .
## [44,] . . . . . . . . . . . . . | | . . . . . . . . . . .
## [45,] . . . . . . . . . . . . . . . . | | . . . . . . . .
## [46,] . . . . . . . . . . . . . . . . . . | | . . . . . .
## [47,] . . . . . . . . . . . . . . . . . . . . | | | | | |
## [48,] . . . . . . . . . . . . . . . . . . . . | | . . . .
## [49,] . . . . . . . . . . . . . . . . . . . . . . | | | |
## [50,] . . . . . . . . . . . . . . . . . . . . . . | | | .
## [51,] . . . . . . . . . . . . . . . . . . . . . . | | . .

What this tells us is which branch number (in rows) split which set of taxa (in columns). We can see this by overlaying the splits as edge labels on our tree:

plotTree(tree,offset=1)
edgelabels(c(1:(Ntip(tree)+tree$Nnode))[-Ntip(tree)-1],
    sapply(c(1:(Ntip(tree)+tree$Nnode))[-Ntip(tree)-1],function(x,y) 
    which(y==x),y=tree$edge[,2]),frame="circle",cex=0.7,bg="white")

plot of chunk unnamed-chunk-3

OK, now let's figure out which split matches each of the columns of X. Let's take column 1 as an example:

X[,1]
##   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q   R 
## "c" "c" "c" "c" "c" "c" "c" "c" "g" "g" "g" "g" "c" "c" "c" "c" "c" "c" 
##   S   T   U   V   W   X   Y   Z 
## "c" "c" "c" "c" "c" "c" "c" "c"

It's very easy to see that this matches split #37, and we can verify this as follows:

split<-function(x,tree){
    x<-as.factor(x)
    levels<-levels(x)
    tax1<-names(x)[which(x==levels[1])]
    tax2<-names(x)[which(x==levels[2])]
    obj<-list(left=sapply(tax1,function(x,y) which(y==x),y=tree$tip.label),
        right=sapply(tax2,function(x,y) which(y==x),y=tree$tip.label))
    lapply(obj,function(x){ names(x)<-NULL; x })
}
s1<-split(X[,1],tree)
which(sapply(obj,function(x,y) identical(sort(x),sort(y)),y=s1$left)+
    sapply(obj,function(x,y) identical(sort(x),sort(y)),y=s1$right)>0)
## [1] 37

This matches with what we figured out already.

Now let's do this for all SNPs:

splits<-apply(X,2,split,tree=tree)
splits
## [[1]]
## [[1]]$left
##  [1]  1  2  3  4  5  6  7  8 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 
## [[1]]$right
## [1]  9 10 11 12
## 
## 
## [[2]]
## [[2]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 23 24 25 26
## 
## [[2]]$right
## [1] 20 21 22
## 
## 
## [[3]]
## [[3]]$left
## [1] 19 20
## 
## [[3]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 21 22 23 24 25
## [24] 26
## 
## 
## [[4]]
## [[4]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 19 21 22 23 24 25 26
## 
## [[4]]$right
## [1] 17 18 20
## 
## 
## [[5]]
## [[5]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## [[5]]$right
## [1] 16
## 
## 
## [[6]]
## [[6]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 17 18 21 22 23 24 25 26
## 
## [[6]]$right
## [1] 13 14 15 16 19 20
## 
## 
## [[7]]
## [[7]]$left
##  [1]  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## [[7]]$right
##  [1]  1  2  3  4  5  6 21 22 23 24 25 26
## 
## 
## [[8]]
## [[8]]$left
##  [1]  1  2  3  4  5  6 13 14 15 16 17 18 19 21 22 23 24 25 26
## 
## [[8]]$right
## [1]  7  8  9 10 11 12 20
## 
## 
## [[9]]
## [[9]]$left
## [1]  7  8  9 10 11 12
## 
## [[9]]$right
##  [1]  1  2  3  4  5  6 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 
## 
## [[10]]
## [[10]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25
## 
## [[10]]$right
## [1] 26
## 
## 
## [[11]]
## [[11]]$left
## [1] 1 2
## 
## [[11]]$right
##  [1]  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [24] 26
## 
## 
## [[12]]
## [[12]]$left
## [1] 13 14 15 16 17 18 19 20
## 
## [[12]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 21 22 23 24 25 26
## 
## 
## [[13]]
## [[13]]$left
##  [1]  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## [[13]]$right
##  [1]  1  2  3  4  5  6 21 22 23 24 25 26
## 
## 
## [[14]]
## [[14]]$left
## [1] 26
## 
## [[14]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25
## 
## 
## [[15]]
## [[15]]$left
## [1] 21 22
## 
## [[15]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 23 24 25
## [24] 26
## 
## 
## [[16]]
## [[16]]$left
##  [1]  1  2  3  4  5  6 21 22 23 24 25 26
## 
## [[16]]$right
##  [1]  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## 
## [[17]]
## [[17]]$left
## [1] 21 22
## 
## [[17]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 23 24 25
## [24] 26
## 
## 
## [[18]]
## [[18]]$left
## [1] 1
## 
## [[18]]$right
##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## 
## [[19]]
## [[19]]$left
##  [1]  1  3  4  5  6  7  8  9 10 11 12 13 17 18 19 20 21 22 23 24 25 26
## 
## [[19]]$right
## [1]  2 14 15 16
## 
## 
## [[20]]
## [[20]]$left
## [1] 13
## 
## [[20]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## 
## [[21]]
## [[21]]$left
##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## [[21]]$right
## [1] 1
## 
## 
## [[22]]
## [[22]]$left
## [1]  5 22
## 
## [[22]]$right
##  [1]  1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 25
## [24] 26
## 
## 
## [[23]]
## [[23]]$left
## [1]  7  8  9 10 11 12
## 
## [[23]]$right
##  [1]  1  2  3  4  5  6 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 
## 
## [[24]]
## [[24]]$left
## [1] 1
## 
## [[24]]$right
##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## 
## [[25]]
## [[25]]$left
## [1] 21 22
## 
## [[25]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 23 24 25
## [24] 26
## 
## 
## [[26]]
## [[26]]$left
## [1] 13 14 15 16
## 
## [[26]]$right
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 17 18 19 20 21 22 23 24 25 26
## 
## 
## [[27]]
## [[27]]$left
## [1] 6
## 
## [[27]]$right
##  [1]  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26
## 
## 
## [[28]]
## [[28]]$left
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## [[28]]$right
## [1] 21 22 23 24 25 26

These are the splits in our data, now let's make a function to match these splits to the edges of our tree (if a unique match exists):

which.edge<-function(obj,split){
    tmp<-which(sapply(obj,function(x,y) identical(sort(x),sort(y)),
        y=split$left)+sapply(obj,function(x,y) identical(sort(x),
        sort(y)),y=split$right)>0)
    if(length(tmp)==0) NA else tmp
}
EE<-sapply(splits,which.edge,obj=obj)
EE
##  [1] 37 NA 46 NA 16 NA 34 NA 35 26 29 40 34 26 48 34 48  1 NA 13  1 NA 35
## [24]  1 48 42  6 47

This tells us that certain of our SNPs don't match the tree - that is, they are homoplastic (indicating by NA. If we look at any one of these, say SNP #4, we can easily see that this is true. For instance:

X[,4]
##   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q   R 
## "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "c" "g" "g" 
##   S   T   U   V   W   X   Y   Z 
## "c" "g" "c" "c" "c" "c" "c" "c"

requires at least two changes on the tree.

Now, perhaps we'd like to overlay all the SNPs that are perfectly consistent (that is, they are non-homoplastic) on the tree? That can be done as follows:

plotTree(tree)
ee<-EE[!is.na(EE)]
changes<-summary(as.factor(ee))
lastPP<-get("last_plot.phylo",envir=.PlotPhyloEnv)
for(i in 1:length(changes)){
    n<-changes[i]
    e<-which(tree$edge[,2]==as.numeric(names(changes))[i])
    s<-lastPP$xx[tree$edge[e,1]]
    f<-lastPP$xx[tree$edge[e,2]]
    pos.y<-lastPP$yy[tree$edge[e,2]]+c(-0.25,0.25)
    for(j in 1:n){
        pos.x<-rep(j*(f-s)/(n+1)+s,2)
        lines(pos.x,pos.y,lwd=4,col=make.transparent("blue",0.5),
            lend=2)
        text(x=pos.x[1],y=pos.y[2]-0.05,
            which(EE==as.numeric(names(changes))[i])[j],pos=3,
            cex=0.7)

    }
}

plot of chunk unnamed-chunk-9

You get the idea.

Of course, there is much more that we can do two - for instance, if a SNP has more than 2 states or if we want to represent more than one change on the tree for a given SNP - but I will leave it here for now.

FYI, the tree & data were simulated using phytools as follows:

tree<-pbtree(n=26,tip.label=LETTERS,scale=1)
t<-sum(tree$edge.length)
Q<-matrix(c(-1/t,1/(3*t),1/(3*t),1/(3*t),
    1/(3*t),-1/t,1/(3*t),1/(3*t),
    1/(3*t),1/(3*t),-1/t,1/(3*t),
    1/(3*t),1/(3*t),1/(3*t),-1/t),4,4,
    dimnames=list(c("a","c","g","t"),
    c("a","c","g","t")))
X<-genSeq(tree,l=50,Q=Q,format="matrix")
## remove invariant sites & sites with more than 2 states
ii<-which(apply(X,2,function(x) length(unique(x))==2))
X<-X[,ii]

No comments:

Post a Comment

Note: due to the very large amount of spam, all comments are now automatically submitted for moderation.