Commit 2bcb3e92 authored by Carine Rey's avatar Carine Rey
Browse files

add a new couple with indels

parent aed9c6ab
......@@ -23,3 +23,11 @@ let repo ~preview dataset_l =
|> Repo.shift "Results_per_hypothesis"
)
|> List.concat
let add_indels_to_dataset d =
let p = 0.33 in
let model_prefix = sprintf "%s_0.33_i" d.model_prefix in
let tree_prefix = d.tree_prefix in
let is_real = d.is_real in
let dataset = Ready_dataset.add_indels_to_ready_dataset ~p:0.33 d.dataset in
{model_prefix; tree_prefix; is_real; dataset}
......@@ -120,7 +120,7 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos} in
{ Dataset.model_prefix; is_real= false; tree_prefix; dataset = ready_dataset }
let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test =
let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test ~add_indels =
let tree_prefix = Filename.chop_extension tree in
let input_tree = input (Filename.concat tree_dir tree) in
let tree_dataset = Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree in
......@@ -160,17 +160,25 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~n
let dataset_per_hypo = List.map models ~f:(fun model ->
derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns
) in
let ready_dataset_H0 = (derive_from_model ~model:H0_NeG5 ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let dataset_H0 = derive_from_model ~model:H0_NeG5 ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns in
let ready_dataset_H0 = dataset_H0.dataset in
let indel_H0 = Dataset.add_indels_to_dataset dataset_H0 in
let dataset_HaPC = derive_from_model ~model:HaPC_NeG5 ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns in
let ready_dataset_HaPC = dataset_HaPC.dataset in
let indel_HaPC = Dataset.add_indels_to_dataset dataset_HaPC in
let ready_dataset_HaPCOC = (derive_from_model ~model:HaPCOC ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let ready_dataset_HaPC = (derive_from_model ~model:HaPC_NeG5 ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let _concat_H0HaPCOC = {Dataset.model_prefix="H0_NeG5+HaPCOC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPCOC} in
let concat_H0HaPC = {Dataset.model_prefix="H0_NeG5+HaPC_NeG5"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPC} in
let dataset_concat_hypos = if use_concat then [concat_H0HaPC;] else [] in
List.concat [ dataset_per_hypo ; dataset_concat_hypos ]
let dataset_with_indels = if add_indels then [indel_H0;indel_HaPC;] else [] in
List.concat [ dataset_per_hypo ; dataset_concat_hypos; dataset_with_indels ]
let derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test =
let derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test ~add_indels =
List.map trees ~f:(fun tree ->
derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test)
derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~ns ~no_Ne ~no_HaPC ~ne_test)
|> List.concat
......@@ -295,16 +303,16 @@ let derive_det ~dataset_l ~preview ~fast_mode =
List.map dataset_l ~f:(fun dataset ->
derive_from_dataset ~preview ~dataset ~fast_mode)
let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~only_simu ~seed () =
let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () =
let trees = Array.to_list @@ Sys.readdir tree_dir in
let simu_dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test in
let simu_dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~add_indels ~ne_test in
let post_analyses_simu = Post_analyses.post_analyses_simu_of_simu_dataset_l ~simu_dataset_l in
let repo_of_post_analyses_simu = Post_analyses.repo_of_post_analyses_simu ~post_analyses_simu in
let repo_and_post_analyses_per_tree_simu = List.map trees ~f:(fun tree -> (*to keep together all models per tree*)
let trees = [tree] in
let tree_prefix = Filename.chop_extension tree in
let dataset_l =
derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test in
derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test ~add_indels in
let dataset_results_l =
if only_simu then
[]
......@@ -359,21 +367,21 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode () =
let repo = repo_of_dataset_results_l ~dataset_results_l in
Repo.build ~outdir ~np ~mem:(`GB mem) ~logger repo
let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview ~use_concat ~no_Ne ~no_HaPC ~seed () =
let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview ~use_concat ~add_indels ~no_Ne ~no_HaPC ~seed () =
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:1 ~nb_sites profile_fn in
let trees = Array.to_list @@ Sys.readdir tree_dir in
let dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test:false in
let dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~add_indels ~ne_test:false in
let repo = Dataset.repo dataset_l ~preview in
Repo.build ~outdir ~np ~mem:(`GB mem) ~logger repo
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~only_simu ?(seed = Random.int Int.max_value) () =
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~add_indels ~only_simu ?(seed = Random.int Int.max_value) () =
printf "Global seed: %i\n" seed;
(* simulated trees *)
Random.init seed ;
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:3 ~nb_sites profile_fn in
let sim_repo_l = derive_profile ~indir ~ns ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~only_simu ~seed () in
let sim_repo_l = derive_profile ~indir ~ns ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () in
(* real trees *)
let indir_dataset_l = if indir = "" then [] else parse_input_data indir in
let dataset_l = indir_dataset_l in
......@@ -413,6 +421,8 @@ let simulation_command =
flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
and use_concat =
flag "--use-concat" no_arg ~doc:" Use concatenation H0+Ha_pcoc"
and add_indels =
flag "--add-indels" no_arg ~doc:" add indels in H*NeG5"
and tree_dir =
flag "--tree-dir" (required string) ~doc:"PATH Path to tree directory"
and profile_fn =
......@@ -420,7 +430,7 @@ let simulation_command =
and seed =
flag "--seed" (optional int) ~doc:"INT Global seed"
in
simulation_main ~outdir ?ns ?np ?mem ~no_Ne ~no_HaPC ~tree_dir ~profile_fn ~preview ~use_concat ~seed
simulation_main ~outdir ?ns ?np ?mem ~no_Ne ~no_HaPC ~tree_dir ~profile_fn ~preview ~use_concat ~add_indels ~seed
]
let detection_command =
......@@ -468,6 +478,8 @@ let validation_command =
flag "--only-simu" no_arg ~doc:" mode only simulation"
and use_concat =
flag "--use-concat" no_arg ~doc:" Use concatenation H0+Ha_pcoc"
and add_indels =
flag "--add-indels" no_arg ~doc:" add indels in H*NeG5"
and ns =
flag "--ns" (optional int) ~doc:"INT Number of sites to simulate (WARNING: will be multiplicated per 3)"
and np =
......@@ -481,5 +493,5 @@ let validation_command =
and seed =
flag "--seed" (optional int) ~doc:"INT Global seed"
in
validation_main ~outdir ?indir ?ns ?np ?mem ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~only_simu ?seed
validation_main ~outdir ?indir ?ns ?np ?mem ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~only_simu ~add_indels ?seed
]
......@@ -68,3 +68,24 @@ let paste d1 d2 =
let fna_infos = Some (paste_fna_infos ~fna_infos_l) in
let ready_dataset = of_raw {Raw_dataset.input_tree=r_d1.input_tree ; fna; fna_infos} in
ready_dataset
let add_indels_to_fna ~(p:float) (fna:nucleotide_fasta workflow) : nucleotide_fasta workflow =
let env = docker_image ~account:"carinerey" ~name:"python_basics" ~tag:"07252018" () in
workflow ~descr:("add_indels") [
cmd "python" ~env [
file_dump (string Scripts.add_indels) ;
opt "-p" float p;
opt "-a" dep fna;
opt "-o" ident dest;
string "-c";
]
]
let add_indels_to_ready_dataset ~p d =
let r_d = to_raw d in
let fna = add_indels_to_fna ~p r_d.fna in
let fna_infos = r_d.fna_infos in
let ready_dataset = of_raw {Raw_dataset.input_tree=r_d.input_tree ; fna; fna_infos} in
ready_dataset
......@@ -153,7 +153,7 @@ df_leaves_l = []
n_sites = ali.get_alignment_length()
Sites = [i +1 for i in range(n_sites)]
AA = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
AA = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V","-"]
#init_AA_dict = {aa:0 for aa in AA}
Conv_AA_per_site_l = [{aa:0 for aa in AA} for i in range(n_sites)]
NonConv_AA_per_site_l = [{aa:0 for aa in AA} for i in range(n_sites)]
......@@ -178,10 +178,10 @@ OnlyNonConv_ConvAA = [0] * n_sites #B = nb AA only in (NonConvLeaves AND ConvLea
for i in range(n_sites):
logger.info(i)
ConvAA = set([aa for aa, v in Conv_AA_per_site_l[i].items() if v > 0])
ConvAA = set([aa for aa, v in Conv_AA_per_site_l[i].items() if v > 0 and aa != "-"])
logger.debug( Conv_AA_per_site_l[i])
logger.debug(ConvAA)
NonConvAA = set([aa for aa, v in NonConv_AA_per_site_l[i].items() if v > 0])
NonConvAA = set([aa for aa, v in NonConv_AA_per_site_l[i].items() if v > 0 and aa != "-"])
logger.debug(NonConv_AA_per_site_l[i])
logger.debug(NonConvAA)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment