Commit 61dcd710 authored by Philippe Veber's avatar Philippe Veber
Browse files

reorganized notes

parent 611029a9
(executable
(name codepi_multinomial_note)
(name multinomial_null)
(libraries codepi scirep)
(modules Codepi_multinomial_note)
(modules Multinomial_null)
(modes native)
(preprocess (pps bistro.ppx)))
(rule
(target codepi_multinomial_note.ml)
(deps codepi_multinomial_note.md)
(target multinomial_null.ml)
(deps multinomial_null.md)
(action (with-stdout-to %{target} (run ocaml-mdx pp %{deps}))))
# Benchmark of methods for convergent substitution detection
```ocaml
open Core;;
open Phylogenetics;;
open Codepi;;
open Codepitk;;
module Tk = Codepitk;;
module Pipeline = Simulation_pipeline.Mutsel
module Top = Bistro_utils.Toplevel_eval.Make(
struct
let np = 8
let mem = 10
end)()
;;
module Show = struct
include Scirep.Show
let png_rplot f = png (fun fn ->
OCamlR_grDevices.png fn ;
f () ;
OCamlR_grDevices.dev_off ()
)
let cached_png_rplot f =
let f = fun%workflow dest ->
OCamlR_grDevices.png dest ;
[%eval f] () ;
OCamlR_grDevices.dev_off ()
in
Bistro.Workflow.path_plugin ~descr:"cached_png_plot" f
|> Top.path
|> png_file
end
;;
open Top;;
```
For that purpose we use simulations under the mutsel model.
```ocaml
let n_h0 = 900
let n_ha = 100
let tree_path = "data/besnard2009/besnard2009.nhx"
let tree = Simulation_pipeline.NHX (Bistro.Workflow.input tree_path)
let profiles = "example/aa_fitness/263SelectedProfiles.tsv"
let sim =
Pipeline.query ~tree ~profiles ~n_h0 ~n_ha ~ne_s:(2., 2.) ()
```
```ocaml
let benchmark1 =
Pipeline.(benchmark sim [multinomial_asymptotic_lrt ; tdg09 ; inhouse_tdg09 ; diffsel])
;;
let b = Top.eval benchmark1 in
List.zip_exn b.method_labels b.average_precision;;
```
# Multinomial method for detecting convergence
# Multinomial method under null hypothesis
A simple although theoretically wrong method for detecting a
difference in amino-acid composition in relation to a binary feature
of species is to gather amino-acid counts in each group of extant
species, and apply a test against the null hypothesis that the amino
acid composition (or profile) is the same in the two groups.
We investivate here how this simple approach behaves compared to more
sophisticated models that take the species phylogeny into account.
This note checks the behaviour of the multinomial method for
convergent profile detection, under its own null hypothesis, then
under a null hypothesis with phylogenetic inertia.
```ocaml
open Core;;
open Phylogenetics;;
open Codepi;;
open Codepitk;;
module Tk = Codepitk;;
module Pipeline = Simulation_pipeline.Mutsel
module Top = Bistro_utils.Toplevel_eval.Make(
struct
......@@ -27,12 +19,6 @@ module Top = Bistro_utils.Toplevel_eval.Make(
module Show = struct
include Scirep.Show
let png_rplot f = png (fun fn ->
OCamlR_grDevices.png fn ;
f () ;
OCamlR_grDevices.dev_off ()
)
let cached_png_rplot f =
let f = fun%workflow dest ->
OCamlR_grDevices.png dest ;
......@@ -44,74 +30,15 @@ module Show = struct
|> png_file
end
;;
open Top;;
```
For that purpose we use simulations under the mutsel model.
```ocaml
let n_h0 = 900
let n_ha = 100
let tree_path = "data/besnard2009/besnard2009.nhx"
let tree = Simulation_pipeline.NHX (Bistro.Workflow.input tree_path)
let profiles = "example/aa_fitness/263SelectedProfiles.tsv"
let sim =
Pipeline.query ~tree ~profiles ~n_h0 ~n_ha ~ne_s:(2., 2.) ()
```
```ocaml
let benchmark1 =
Pipeline.(benchmark sim [ multinomial_asymptotic_lrt ; tdg09 ])
;;
let b = Top.eval benchmark1 in
List.zip_exn b.method_labels b.average_precision;;
```
This result suggests that multinomial performs equally well as tdg09
while not taking into account the phylogeny. Surprising.
## p value distribution under non convergent sites
Let's look at those results from a different angle. Since we perform a
test, it is expected to find a uniform distribution for pvalues under
the null hypothesis.
Let's check it:
```ocaml
let benchmark_output (b : Pipeline.benchmark) key =
let assoc = List.zip_exn b.method_labels b.method_outputs in
List.Assoc.find_exn assoc key ~equal:String.equal
;;
Show.png_rplot (fun () ->
let benchmark = eval benchmark1 in
let pvals =
benchmark_output benchmark "Multinomial_1mp"
|> Array.sub ~pos:0 ~len:benchmark.n_h0
|> Array.filter_opt
|> Array.map ~f:(fun x -> 1. -. x)
in
let main = "Multinomial LRT pvalue distribution on non convergent sites" in
ignore (OCamlR_graphics.hist ~xlab:"pvalue" ~main pvals : OCamlR_graphics.hist)
)
;;
```
This is far from uniform, and to be honest no surprise here:
uniformity is expected when the data is generated under the null *of
the test*. Here we are working under a different model. So maybe, just
to be sure, we can check that our test pvalues are uniform under the
test's null.
## p value distribution under the test's null hypothesis
LRT's null when simulating under dense profiles looks indeed uniform:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test ~alpha:1. ~k:5 ~n1:200 ~n2:200 LRT.asymptotic_test
)
;;
......@@ -120,7 +47,7 @@ Show.cached_png_rplot (fun%workflow () ->
But it deteriorates for a smaller number of observations:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test ~alpha:1. ~k:5 ~n1:20 ~n2:20 LRT.asymptotic_test
)
;;
......@@ -129,7 +56,7 @@ Show.cached_png_rplot (fun%workflow () ->
Or when increasing the number of categories:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test ~alpha:1. ~k:20 ~n1:200 ~n2:200 LRT.asymptotic_test
)
;;
......@@ -138,7 +65,7 @@ Show.cached_png_rplot (fun%workflow () ->
Or for sparse probability distribution:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test ~alpha:0.1 ~k:5 ~n1:200 ~n2:200 LRT.asymptotic_test
)
;;
......@@ -150,7 +77,7 @@ observed:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test ~alpha:0.1 ~k:5 ~n1:200 ~n2:200 (LRT.asymptotic_test ~df_correction:true)
)
;;
......@@ -162,7 +89,7 @@ obtain:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test
~filter_out_single_category:true ~alpha:0.1
~k:5 ~n1:200 ~n2:200
......@@ -177,7 +104,7 @@ trivial cases), we go from:
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test
~alpha:0.1
~k:20 ~n1:20 ~n2:20
......@@ -190,7 +117,7 @@ to
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test
~filter_out_single_category:true
~alpha:0.1 ~k:20
......@@ -206,7 +133,7 @@ costly):
```ocaml
Show.cached_png_rplot (fun%workflow () ->
let open Tk.Multinomial_test in
let open Multinomial_test in
uniformity_test
~filter_out_single_category:true ~alpha:0.1
~k:20 ~n1:20 ~n2:20
......@@ -217,6 +144,8 @@ Show.cached_png_rplot (fun%workflow () ->
## Simulating under tdg09's null distribution
Here's the setup to simulate alignment under tdg09 method.
```ocaml
let wag =
Bistro_unix.wget "https://www.ebi.ac.uk/goldman-srv/WAG/wag.dat"
......@@ -225,7 +154,7 @@ let wag =
;;
let tdg09_site_simulation ?(alpha = 1.) ?(scale = 1.) tree =
let open Tk.Tdg09.Pack in
let open Tdg09.Pack in
let exchangeability_matrix = wag.rate_matrix in
let stationary_distribution = simulate_profile alpha in
let param = scale in
......@@ -237,7 +166,36 @@ let tdg09_simulation ?alpha tree nsites =
tdg09_site_simulation tree ?alpha
)
;;
```
Then how to implement a uniformity test under tdg09's null :
```ocaml
let tdg09_uniformity_test ?alpha tree f =
Show.cached_png_rplot (fun%workflow () ->
let sites = tdg09_simulation ?alpha tree 1_000 in
Array.map sites ~f
|> OCamlR_graphics.hist ~breaks:(`n 60)
|> fun _ -> ()
)
;;
```
We load a "real" tree :
```ocaml
let tree_path = "data/besnard2009/besnard2009.nhx";;
let load_tree fn =
Convergence_tree.from_file fn
|> Rresult.R.failwith_error_msg
|> Tdg09.Pack.convergence_tree
;;
```
This is how to apply multinomial method on those alignments:
```ocaml
let multinomial_on_tdg09_simulation site =
let leaves1, leaves2 =
Convergence_tree.leaves site
......@@ -255,42 +213,32 @@ let multinomial_on_tdg09_simulation site =
) in
r.pvalue
;;
```
And we're ready to go :
```ocaml
let tree = load_tree tree_path in
tdg09_uniformity_test tree multinomial_on_tdg09_simulation;;
```
This is far from uniform, and to be honest no surprise here:
uniformity is expected when the data is generated under the null *of
the test*. Here we are working under a different model.
## Tdg09 under its null hypothesis
```ocaml
let tdg09_on_tdg09_simulation tree sim =
let open Tk.Tdg09.Pack in
let open Tdg09.Pack in
let site = Tree.leaves sim |> Array.of_list in
let _, _, lrt = Model3.lrt ~mode:`sparse wag tree site in
lrt.pvalue
;;
let tdg09_uniformity_test ?alpha tree f =
let sites = tdg09_simulation ?alpha tree 1_000 in
Array.map sites ~f
;;
let _ =
Show.cached_png_rplot (fun%workflow () ->
let tree =
Tk.Tdg09.Pack.pair_tree
~branch_length1:0.3 ~branch_length2:0.3
~npairs:10 in
tdg09_uniformity_test ~alpha:0.1 tree multinomial_on_tdg09_simulation
|> OCamlR_graphics.hist ~breaks:(`n 60)
|> fun _ -> ()
)
;;
```
```ocaml
let _ =
Show.cached_png_rplot (fun%workflow () ->
let tree =
Tk.Tdg09.Pack.pair_tree
~branch_length1:0.3 ~branch_length2:0.3
~npairs:10 in
tdg09_uniformity_test ~alpha:0.1 tree (tdg09_on_tdg09_simulation tree)
|> OCamlR_graphics.hist ~breaks:(`n 60)
|> fun _ -> ()
)
;;
let tree = load_tree tree_path in
tdg09_uniformity_test tree (tdg09_on_tdg09_simulation tree);;
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment