Dec 10, 2020
Philippe Veber
added new note on multinomial method and phylogeny
# Multinomial and phylogeny
The multinomial method for detecting convergent substitutions
completely ignores phylogenetic inertia. A rather immediate
consequence is that it doesn't assess significance properly: its
p-values are far from calibrated. However, in our tests it behaves
rather well in terms of average precision. This note shows a family of tree
topologies on which the multinomial method significantly underperforms
in comparison to a phylogeny-aware method (tdg09).
```ocaml
open
Core_kernel
;;
open
Phylogenetics
;;
open
Codepitk
;;
(* Toplevel evaluator obtained by applying the generative functor
   [Bistro_utils.Toplevel_eval.Make] to a small configuration module.
   NOTE(review): [np] and [mem] are presumably the number of processors and
   the amount of memory granted to the evaluator; confirm meaning and units
   against the Bistro documentation. *)
module Top = Bistro_utils.Toplevel_eval.Make (struct
  let np = 4
  let mem = 4
end) ()
```
```ocaml
(** [tree_dfs_map t ~node ~leaf ~branch ~init] rebuilds [t] while threading an
    accumulator through a depth-first traversal.

    Each of [node], [leaf] and [branch] receives the current accumulator and
    the payload found at that position, and returns the updated accumulator
    paired with the replacement payload. For an internal node, the node
    payload is handled first, then its branches are folded over with
    [List1.fold] (presumably left to right); for a branch, the branch payload
    is handled before the subtree at its tip.

    Only the rebuilt tree is returned: the final accumulator is dropped. *)
let tree_dfs_map t ~node ~leaf ~branch ~init =
  let rec visit_node state = function
    | Tree.Leaf payload ->
      let state, payload' = leaf state payload in
      (Tree.Leaf payload', state)
    | Tree.Node n ->
      let state, data' = node state n.data in
      (* Branches are consed onto the front as they are visited, so the list
         is reversed before the node is rebuilt. *)
      let rev_branches, state =
        List1.fold n.branches ~init:([], state) ~f:(fun (visited, state) b ->
            let b', state = visit_branch state b in
            (b' :: visited, state))
      in
      let branches = List1.of_list_exn (List.rev rev_branches) in
      (Tree.node data' branches, state)
  and visit_branch state (Tree.Branch b) =
    let state, data' = branch state b.data in
    let tip', state = visit_node state b.tip in
    (Tree.branch data' tip', state)
  in
  let mapped, _final_state = visit_node init t in
  mapped
(** [tree_renumber_leaves t] replaces the first component of every leaf pair
    by a counter that starts at 0 and is incremented at each leaf, in the
    order in which [tree_dfs_map] visits the leaves. The second component of
    each leaf, and all node and branch payloads, are kept unchanged. *)
let tree_renumber_leaves t =
  (* Nodes and branches do not consume a number: pass both values through. *)
  let passthrough counter data = (counter, data) in
  (* A leaf takes the current counter as its new index and bumps it. *)
  let relabel counter (_, data) = (counter + 1, (counter, data)) in
  tree_dfs_map t ~init:0 ~node:passthrough ~leaf:relabel ~branch:passthrough
```
```ocaml
(** [bad_tree_for_multinomial ?lambda n] builds the tree topology used in this
    note, parameterised by a branch length [lambda] (default [1.]) and a size
    [n].

    The root has two branches of length [2.], both tagged [`Ancestral]:
    - one leads to a "dependent" subtree: a binary node whose two branches
      have length [0.99 *. lambda], one tagged [`Ancestral] and the other
      [`Convergent]; each ends in a node carrying [n] leaves of the same
      condition, attached by branches of length [0.01 *. lambda];
    - the other leads to the tree returned by [Codepitk.Tdg09.Pack.pair_tree]
      called with both branch lengths set to [lambda] and [~npairs:n].

    Leaves are finally renumbered with [tree_renumber_leaves]. *)
let bad_tree_for_multinomial ?(lambda = 1.) n =
  let branch length condition tip =
    Tree.branch { Convergence_tree.length; condition } tip
  in
  (* A single node carrying [n] leaves that all share the same condition. *)
  let pitchfork length condition =
    let tine _ = branch length condition (Tree.leaf (0, condition)) in
    Tree.node () (List1.init n ~f:tine)
  in
  let indep_subtree =
    Codepitk.Tdg09.Pack.pair_tree
      ~branch_length1:lambda
      ~branch_length2:lambda
      ~npairs:n
  in
  let dep_subtree =
    (* Almost all of the length sits on the stem leading to each pitchfork. *)
    let stem_length = 0.99 *. lambda in
    let tine_length = 0.01 *. lambda in
    let clade condition =
      branch stem_length condition (pitchfork tine_length condition)
    in
    Tree.binary_node () (clade `Ancestral) (clade `Convergent)
  in
  let root =
    Tree.binary_node ()
      (branch 2. `Ancestral dep_subtree)
      (branch 2. `Ancestral indep_subtree)
  in
  tree_renumber_leaves root
```
```ocaml
(** [string_of_tree t] renders [t] as text via [Tree.to_printbox] and
    [PrintBox_text.to_string]. Each leaf [(i, cond)] is displayed as its
    index followed by [(C)] for a [`Convergent] leaf or [(A)] for an
    [`Ancestral] one. *)
let string_of_tree t =
  let condition_tag = function
    | `Convergent -> 'C'
    | `Ancestral -> 'A'
  in
  let render_leaf (i, cond) = sprintf "%d (%c)" i (condition_tag cond) in
  Tree.to_printbox t ~leaf:render_leaf |> PrintBox_text.to_string
;;

(* Show the topology for n = 4. *)
bad_tree_for_multinomial 4 |> string_of_tree;;
```
```ocaml
(* WAG data: the file is described by a [Bistro_unix.wget] step on the EBI
   URL, turned into a local path by [Top.path], then parsed with
   [Wag.parse]. *)
let wag =
  let wag_url = "https://www.ebi.ac.uk/goldmansrv/WAG/wag.dat" in
  let local_path = Top.path (Bistro_unix.wget wag_url) in
  Wag.parse local_path
(** [tdg09_site_simulation ?alpha ?scale ~convergent_site tree] simulates one
    site on [tree] with [Tdg09.Pack.Model3.simulate_site], using the rate
    matrix of [wag] as exchangeability matrix.

    A first profile is obtained from [simulate_profile alpha] and used as
    [stationary_distribution0]. When [convergent_site] is [true] a second
    call to [simulate_profile alpha] provides [stationary_distribution1];
    otherwise the first profile is reused for both.

    [alpha] and [scale] both default to [1.]. *)
let tdg09_site_simulation ?(alpha = 1.) ?(scale = 1.) ~convergent_site tree =
  let open Tdg09.Pack in
  let first_profile = simulate_profile alpha in
  let second_profile =
    match convergent_site with
    | true -> simulate_profile alpha
    | false -> first_profile
  in
  Model3.simulate_site
    ~exchangeability_matrix:wag.rate_matrix
    ~stationary_distribution0:first_profile
    ~stationary_distribution1:second_profile
    ~scale
    tree
(** [tdg09_simulation ?alpha tree ~n_h0 ~n_ha] returns an array of
    [n_h0 + n_ha] sites, each simulated on [tree] by
    [tdg09_site_simulation] ([alpha] is forwarded as is).

    NOTE(review): the sites simulated with [~convergent_site:true] are those
    of index [i < n_h0], i.e. the first [n_h0] ones. This reads inverted
    relative to the parameter names if [h0] denotes the non-convergent
    hypothesis; confirm the intent against the code that labels the sites. *)
let tdg09_simulation ?alpha tree ~n_h0 ~n_ha =
  let n_sites = n_h0 + n_ha in
  let simulate_nth i =
    let convergent_site = i < n_h0 in
    tdg09_site_simulation tree ?alpha ~convergent_site
  in
  Array.init n_sites ~f:simulate_nth
(** [tdg09_on_tdg09_simulation tree sim] collects the leaves of the simulated
    site [sim] into an array, runs [Tdg09.Pack.Model3.lrt] in [`sparse] mode
    on them with [wag] and [tree], and returns the p-value of the resulting
    test. The first two components returned by [Model3.lrt] are ignored. *)
let tdg09_on_tdg09_simulation tree sim =
  let open Tdg09.Pack in
  let observed = Array.of_list (Tree.leaves sim) in
  let _, _, test = Model3.lrt ~mode:`sparse wag tree observed in
  test.pvalue
(** [multinomial_on_tdg09_simulation site] splits the leaves of [site] by
    condition ([`Ancestral] versus [`Convergent]), counts amino acids in each
    group with [Amino_acid.counts], and returns the p-value of
    [Multinomial_test.Permutation.test] run on the two count vectors with the
    [LRT.likelihood_log_ratio] statistic. The tree topology plays no part in
    this computation. *)
let multinomial_on_tdg09_simulation site =
  let by_condition = function
    | aa, `Ancestral -> Either.first aa
    | aa, `Convergent -> Either.second aa
  in
  let ancestral, convergent =
    List.partition_map (Convergence_tree.leaves site) ~f:by_condition
  in
  (* The result of [Amino_acid.counts] is coerced to a plain [int array]. *)
  let aa_counts residues =
    (Amino_acid.counts (Sequence.of_list residues) :> int array)
  in
  let x1 = aa_counts ancestral in
  let x2 = aa_counts convergent in
  let d = Multinomial_test.data ~x1 ~x2 in
  let outcome =
    Multinomial_test.(
      Permutation.test ~statistic:LRT.likelihood_log_ratio d)
  in
  outcome.pvalue
(* Relative path of an NHX tree file (the Besnard 2009 data set, judging by
   the file name). Nothing in the visible part of this note reads it. *)
let
tree_path
=
"data/besnard2009/besnard2009.nhx"
;;
(** [load_tree fn] reads a convergence tree from file [fn] with
    [Convergence_tree.from_file], unwraps the result with
    [Rresult.R.failwith_error_msg], and converts the tree with
    [Tdg09.Pack.convergence_tree].

    @raise Failure when [Convergence_tree.from_file] returns an error
    message (this is how [Rresult.R.failwith_error_msg] reports it). *)
let load_tree fn =
  let parsed = Rresult.R.failwith_error_msg (Convergence_tree.from_file fn) in
  Tdg09.Pack.convergence_tree parsed
```
```ocaml
(** [run ?alpha ~n_h0 ~n_ha ()] compares the two methods on simulated data.

    It builds [bad_tree_for_multinomial ~lambda:1. 20], simulates
    [n_h0 + n_ha] sites on it with [tdg09_simulation], scores every site with
    [tdg09_on_tdg09_simulation] and then with
    [multinomial_on_tdg09_simulation], and returns an association list giving
    [Prc.Precision_recall.auc_average_precision] for each method under the
    keys ["multinomial"] and ["tdg09"].

    NOTE(review): each site is given the boolean label [i >= n_h0] and the
    score is the p-value returned by the method; confirm that this polarity
    is the one intended for [Prc.Dataset] and that it matches the way the
    sites are simulated. *)
let run ?alpha ~n_h0 ~n_ha () =
  let tree = bad_tree_for_multinomial ~lambda:1. 20 in
  let sites = tdg09_simulation ?alpha tree ~n_h0 ~n_ha in
  (* Pair the score given by [method_] to each site with the site label. *)
  let score_all method_ =
    let n_sites = Array.length sites in
    let scored i = (method_ sites.(i), i >= n_h0) in
    Prc.Dataset (List.init n_sites ~f:scored)
  in
  let scores_tdg09 = score_all (tdg09_on_tdg09_simulation tree) in
  let scores_multinomial = score_all multinomial_on_tdg09_simulation in
  let average_precision = Prc.Precision_recall.auc_average_precision in
  [
    ("multinomial", average_precision scores_multinomial);
    ("tdg09", average_precision scores_tdg09);
  ]
(* Run the comparison with alpha = 0.1, n_h0 = 100 and n_ha = 100. The result
   is bound to [_], presumably so that the toplevel displays the returned
   list when the note is evaluated. *)
let
_
=
run
~
alpha
:
0
.
1
~
n_h0
:
100
~
n_ha
:
100
()
```
