Commit 5fc3531f authored by Carine Rey's avatar Carine Rey
Browse files

use predefined pairs of profiles in function of their distance

parent f22a7cf1
......@@ -28,4 +28,5 @@ RUN pip install docopt
RUN pip install matplotlib
RUN apt-get update && \
apt-get install --no-install-recommends -qy \
python-tk
python-tk \
xvfb
......@@ -46,11 +46,11 @@ let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow =
)
] / selector ["seq.fa"]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_w ~ne_c ~ne_a ~config ~nb_sites_per_profile =
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_sites_per_profile =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
assign "PROFILE_F" (dep profile_w) ;
assign "PROFILE_F" (dep profile_f) ;
assign "number_of_sites" (int nb_sites_per_profile) ;
assign "NE_1" (float ne_a) ;
assign "NE_C" (float ne_c) ;
......@@ -59,31 +59,27 @@ let conf_file_bppseqgen_multi_profiles ~tree ~profile_w ~ne_c ~ne_a ~config ~nb_
@ config
)
let bppseqgen_multi_profiles_script ~config ~nb_combis ~out ~profile_w =
let bppseqgen_multi_profiles_script ~config ~out ~profile_c =
let vars = [
"FINAL_OUT", ident out ;
"PARAM", config ;
"PROFILE_F", dep profile_w ;
"NB_COMBI_PROFILES", int nb_combis ;
"PROFILE_C", dep profile_c ;
]
in
bash_script vars {|
start_i=1
end_i=$NB_COMBI_PROFILES
NB_COLS=`head -n 1 $PROFILE_F | awk '{print NF}'`
rm -f $FINAL_OUT.info
for ((i=start_i; i<=end_i; i++))
i=0
while read -r line
do
echo "i: $i"
shuf -i1-$NB_COLS -n2 -o random_num
COL_M1=`head -n 1 random_num`
COL_M2=`tail -n 1 random_num`
echo -e $COL_M1'\t'$COL_M2 >> $FINAL_OUT.info
bppseqgen param=$PARAM i=$i COL_M1=$COL_M1 COL_M2=$COL_M2 output.sequence.file=out_int_"$i".fa
done
echo "i: $i"
((i++))
name="$line"
COL_M1=`echo $line | cut -f 1 -d " "`
COL_M2=`echo $line | cut -f 2 -d " "`
bppseqgen param=$PARAM i=$i COL_M1=$COL_M1 COL_M2=$COL_M2 output.sequence.file=out_int_"$i".fa
done < "$PROFILE_C"
cp $PROFILE_C $FINAL_OUT.info
# horizontal concatenation of fasta
catfasta2phyml.pl -f out_int_* > $FINAL_OUT
......@@ -91,7 +87,7 @@ let bppseqgen_multi_profiles_script ~config ~nb_combis ~out ~profile_w =
|}
let bppseqgen_multi_profiles ?(descr="") ~profile_w ~nb_sites ~tree ~config ~ne_c ~ne_a : bppseqgen_multi_profiles directory workflow =
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~config ~ne_c ~ne_a : bppseqgen_multi_profiles directory workflow =
let nb_sites_per_profile = 1 in
let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in
let config_f = dest // "config.bpp" in
......@@ -102,8 +98,8 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_w ~nb_sites ~tree ~config ~ne_
mkdir_p dest;
mkdir_p tmp;
cd tmp;
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_w ~config ~ne_c ~ne_a ~nb_sites_per_profile))];
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~nb_combis ~out ~profile_w))];
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~config ~ne_c ~ne_a ~nb_sites_per_profile))];
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c))];
]
)
]
......@@ -138,7 +134,7 @@ let conf_file_bppseqman_fa2phy ~fna =
seq ~sep:"\n" [
assign "input.sequence.file" (dep fna) ;
assign "output.sequence.file" dest ;
assign "output.sequence.format" (string "Phylip") ;
assign "output.sequence.format" (string "Phylip(order=interleaved, type=extended)") ;
string {| input.alignment = true
input.sequence.remove_stop_codons = no
input.sequence.sites_to_use = all
......@@ -147,7 +143,7 @@ let conf_file_bppseqman_fa2phy ~fna =
]
let fa2phy ~(fna: nucleotide_fasta workflow) : nucleotide_phylip workflow =
workflow ~descr:"bppsuite.fa2phy" [
workflow ~descr:"bppsuite.fa2phy_interleaved" [
cmd "bppseqman" ~env [
assign "param" (file_dump (conf_file_bppseqman_fa2phy ~fna)) ;
]
......
......@@ -13,7 +13,8 @@ val bppseqgen :
val bppseqgen_multi_profiles :
?descr : string ->
profile_w: _ workflow ->
profile_f: text_file workflow ->
profile_c: text_file workflow ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
......
......@@ -85,7 +85,7 @@ let diffsel ~(phy_n:nucleotide_phylip workflow) ~(tree: _ workflow) ~(w_every:in
opt "-x" seq [ int w_every; string " "; int n_cycles];
ident chainname ;
];
cmd "bash" [(file_dump (diffsel_add_iterations_script ~chainname ))];
cmd "bash" [(file_dump (diffsel_add_iterations_script ~chainname ))];
]
)
]
......@@ -112,7 +112,7 @@ let check_conv run_diffsel : text_file directory workflow =
] ;
cmd "cp" [string "DiffselMCMCConvergenceAnalysis.html" ; ident out] ;
cmd "cp" [string "new_iterations.txt" ; ident nb_new_iterations]
]
]
)
]
......
......@@ -47,7 +47,7 @@ let parse_input_data indir =
let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns =
let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let nb_sites = ns in
let nodes = Tree_dataset.nodes tree_dataset model in
let tree = Tree_dataset.tree tree_dataset `Simulation in
let descr = "."^model_prefix^"."^tree_prefix in
......@@ -74,8 +74,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
| H0_SmallNeInBigNe -> 10.
| _ -> 1.
in
let profile_w = profile.profile_w in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~config:config_p ~profile_w ~ne_c ~ne_a in
let profile_f = profile.profile_f in
let profile_c = profile.profile_c in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~config:config_p ~profile_f ~profile_c ~ne_c ~ne_a in
let fna = Bppsuite.bppseqgen_multi_profiles_get_fa run_fna in
let fna_infos = Some (Bppsuite.bppseqgen_multi_profiles_get_info run_fna) in
......@@ -128,8 +129,10 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~n
) in
let ready_dataset_H0 = (derive_from_model ~model:H0 ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let ready_dataset_HaPCOC = (derive_from_model ~model:HaPCOC ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let concat_H0Ha = {Dataset.model_prefix="H0+HaPCOC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPCOC} in
let dataset_concat_hypos = if use_concat then [concat_H0Ha;] else [] in
let ready_dataset_HaPC = (derive_from_model ~model:HaPC ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns).dataset in
let concat_H0HaPCOC = {Dataset.model_prefix="H0+HaPCOC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPCOC} in
let concat_H0HaPC = {Dataset.model_prefix="H0+HaPC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPC} in
let dataset_concat_hypos = if use_concat then [concat_H0HaPC;] else [] in
List.concat [ dataset_per_hypo ; dataset_concat_hypos ]
let derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test =
......@@ -269,7 +272,7 @@ let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~
let dataset_l =
derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test in
let dataset_results_l =
if only_simu then
if only_simu then
[]
else
derive_det ~dataset_l ~preview ~fast_mode
......@@ -288,8 +291,13 @@ let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~
let all_post_analyses_per_tree = List.map repo_and_post_analyses_per_tree_simu ~f:(fun (r,p) -> p) in
let profile_prefix = profile.profile_n in
let repo_post_analyses_all_trees = Post_analyses.repo_post_analyses_all_trees_of_all_post_analyses_per_tree ~all_post_analyses_per_tree ~profile_prefix in
let repo = repo_of_post_analyses_simu @ all_repo_per_tree_simu @repo_post_analyses_all_trees in
Repo.shift profile_prefix repo
let repo_post_analyses_all_trees = if only_simu then
[]
else
repo_post_analyses_all_trees
in
let repo = repo_of_post_analyses_simu @ all_repo_per_tree_simu @ repo_post_analyses_all_trees in
Repo.shift profile_prefix repo
let logger =
......@@ -306,9 +314,8 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode () =
Repo.build ~outdir ~np ~mem:(`GB mem) ~logger repo
let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview ~use_concat ~no_Ne ~no_HaPC () =
let profile_w = input profile_fn in
let profile_n = Filename.chop_extension profile_fn in
let profile = {profile_w; profile_n} in
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:1 ~nb_sites profile_fn in
let trees = Array.to_list @@ Sys.readdir tree_dir in
let dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test:false in
let repo = Dataset.repo dataset_l ~preview in
......@@ -316,16 +323,14 @@ let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~only_simu () =
(* simulated trees *)
let profile_l = Profile.profile_l_of_splitted_profile (Profile.split_profile (input profile_fn)) in
let sim_repo_l = List.map profile_l ~f:(fun profile ->
derive_profile ~indir ~ns ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~only_simu ()
) |> List.concat in
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:3 ~nb_sites profile_fn in
let sim_repo_l = derive_profile ~indir ~ns ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~only_simu () in
(* real trees *)
let indir_dataset_l = if indir = "" then [] else parse_input_data indir in
let dataset_l = indir_dataset_l in
let dataset_results_l =
if only_simu then
if only_simu then
[]
else
derive_det ~dataset_l ~preview ~fast_mode
......@@ -412,7 +417,7 @@ let validation_command =
and use_concat =
flag "--use-concat" no_arg ~doc:" Use concatenation H0+Ha_pcoc"
and ns =
flag "--ns" (optional int) ~doc:"INT Number of sites to simulate"
flag "--ns" (optional int) ~doc:"INT Number of sites to simulate (WARNING: will be multiplicated per 3)"
and np =
flag "--np" (optional int) ~doc:"INT Number of available processors"
and mem =
......
......@@ -4,12 +4,13 @@ open Bistro.EDSL
open File_formats
type profile = {
profile_w : text_file workflow;
profile_n : string;
profile_f : text_file workflow;
profile_c : text_file workflow;
profile_n : string;
}
let split_profile profile_fn : text_file directory workflow =
let split_profile ~nb_sites ~dist_bins profile_f : text_file directory workflow =
let env = docker_image ~account:"carinerey" ~name:"python_basics" ~tag:"07252018" () in
let package = tmp // "diffsel_script_utils.py" in
let script = tmp // "generate_pairs.py" in
......@@ -20,6 +21,8 @@ let split_profile profile_fn : text_file directory workflow =
mkdir_p tmp ;
mkdir_p dest ;
cd tmp ;
cmd "(" [string "Xvfb :1 -screen 0 1024x768x16 & )"];
cmd "export" [string "DISPLAY=:1"];
cmd "cp" [ file_dump (string Scripts.diffsel_script_utils) ; package] ;
cmd "cp" [ file_dump (string Scripts.generate_pairs) ; script] ;
......@@ -28,18 +31,39 @@ let split_profile profile_fn : text_file directory workflow =
cmd "python" [
string "generate_pairs.py" ;
opt "-o" ident prefix ;
dep profile_fn ;
opt "-s" int nb_sites ;
opt "-b" string dist_bins ;
dep profile_f ;
]
]
)
]
let profile_l_of_splitted_profile splitted_profile =
let p0 = splitted_profile / selector ["profile_0.tsv"] in
let p1 = splitted_profile / selector ["profile_1.tsv"] in
let p2 = splitted_profile / selector ["profile_2.tsv"] in
[{profile_w=p0; profile_n="p0"};
{profile_w=p1; profile_n="p1"};
{profile_w=p2; profile_n="p2"};
let cat_file ~(f_l: text_file workflow list) : text_file workflow =
workflow ~descr:"cat.text_files" [
cmd "cat" ~stdout:dest (List.concat [
List.map f_l ~f:(fun f -> dep f) ;
])
]
let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn =
let profile_f = input profile_fn in
let prefix = Filename.chop_extension profile_fn in
let dist_bins = match nb_cat with
| 3 -> "[0.01,0.4],[0.4,0.6],[0.6,2]"
| 1 -> "[0.01,2]"
| _ -> failwith {| nbcat must be 1 or 3 |}
in
let splitted_profile = split_profile ~nb_sites ~dist_bins profile_f in
match nb_cat with
| 3 -> (
let p0 = splitted_profile / selector ["profile_0.tsv"] in
let p1 = splitted_profile / selector ["profile_1.tsv"] in
let p2 = splitted_profile / selector ["profile_2.tsv"] in
{profile_c=cat_file [p0;p1;p2] ; profile_n=prefix ^ "_3categories" ; profile_f};
)
| 1 -> (let p0 = splitted_profile / selector ["profile_0.tsv"] in
{profile_c=p0; profile_n=prefix ; profile_f})
| _ -> failwith {| nbcat must be 1 or 3 |}
......@@ -205,7 +205,7 @@ df_final = df_final[["Sites","NbConvAA","NbNonConvAA","NbOnlyConvAA","NbOnlyNonC
#===================================================================================================
if AliInfoFile:
try:
profil_df = pd.read_csv(AliInfoFile.name, sep="\t", names=["Pf_anc","Pf_conv"])
profil_df = pd.read_csv(AliInfoFile.name, sep="\t", names=["Pf_anc","Pf_conv","dist"])
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
......@@ -219,7 +219,7 @@ if AliInfoFile:
# reorder col names
profil_df = profil_df[["Sites","Pf_anc","Pf_conv"]]
profil_df = profil_df[["Sites","Pf_anc","Pf_conv","dist"]]
logger.info("AliInfoFile (%s) ok after checking", AliInfoFile.name)
df_final = pd.merge(df_final,profil_df, on="Sites")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment