Commit b8f2ef94 authored by Carine Rey's avatar Carine Rey
Browse files

Save the profiles used for simulation

parent 76b4074e
......@@ -4,7 +4,7 @@ open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
(* Abstract tag type with no visible constructors: it is used as the type
   parameter of the [directory workflow] returned by
   [bppseqgen_multi_profiles]. *)
type bppseqgen_multi_profiles
(* Docker image descriptor for the "bppsuite" image (account "carinerey",
   tag "07052018"). *)
let env =
  docker_image
    ~account:"carinerey"
    ~name:"bppsuite"
    ~tag:"07052018"
    ()
......@@ -79,12 +79,16 @@ let bppseqgen_multi_profiles_script ~config ~nb_combis ~out ~profile_f =
end_i=$NB_COMBI_PROFILES
# Number of whitespace-separated fields on the first line of the profile file.
NB_COLS=`head -n 1 $PROFILE_F | awk '{print NF}'`
# Single info file shared by the header and the per-iteration rows. The header
# used to be written to "$FINAL_OUT.info" while the rows went to
# "$FINAL_OUT.infos", so the exported ".infos" file had no header.
INFO_F=$FINAL_OUT.infos
echo '#'$PROFILE_F > $INFO_F
echo -e Sites'\t'ProfilC1'\t'ProfilC2 >> $INFO_F
for ((i=start_i; i<=end_i; i++))
do
  echo "i: $i"
  # Draw two profile column indices in [1, NB_COLS] without replacement.
  shuf -i1-$NB_COLS -n2 -o random_num
  COL_M1=`head -n 1 random_num`
  COL_M2=`tail -n 1 random_num`
  # Record which pair of profile columns was used for iteration $i.
  echo -e $i'\t'$COL_M1'\t'$COL_M2 >> $INFO_F
  bppseqgen param=$PARAM i=$i COL_M1=$COL_M1 COL_M2=$COL_M2 output.sequence.file=out_int_"$i".fa
done
......@@ -94,7 +98,7 @@ let bppseqgen_multi_profiles_script ~config ~nb_combis ~out ~profile_f =
|}
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~nb_sites ~tree ~config ~ne_c : nucleotide_fasta workflow =
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~nb_sites ~tree ~config ~ne_c : bppseqgen_multi_profiles directory workflow =
let nb_sites_per_profile = if nb_sites > 100 then 2 else 1 in
let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in
let config_f = dest // "config.bpp" in
......@@ -109,8 +113,13 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~nb_sites ~tree ~config ~ne_
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~nb_combis ~out ~profile_f))];
]
)
] / selector ["seq.fa"]
]
(* Picks the "seq.fa" file out of the directory produced by a
   [bppseqgen_multi_profiles] run. *)
let bppseqgen_multi_profiles_get_fa run_dir : nucleotide_fasta workflow =
  let fasta_in_dir = selector ["seq.fa"] in
  run_dir / fasta_in_dir
(* Picks the "seq.fa.infos" file out of the directory produced by a
   [bppseqgen_multi_profiles] run. *)
let bppseqgen_multi_profiles_get_info run_dir : text_file workflow =
  let infos_in_dir = selector ["seq.fa.infos"] in
  run_dir / infos_in_dir
let conf_file_bppseqman_fna2faa ~fna =
seq ~sep:"\n" [
......
......@@ -2,6 +2,8 @@ open Bistro.Std
open Bistro_bioinfo.Std
open File_formats
(** Abstract tag for the output directory of {!bppseqgen_multi_profiles};
    it only appears as the type parameter of [directory workflow]. *)
type bppseqgen_multi_profiles
val bppseqgen :
?descr : string ->
nb_sites:int ->
......@@ -16,8 +18,16 @@ val bppseqgen_multi_profiles :
tree:nhx workflow ->
config:Bistro.Template.t list ->
ne_c: float ->
bppseqgen_multi_profiles directory workflow
(** [bppseqgen_multi_profiles_get_fa run] selects the ["seq.fa"] file inside
    the output directory [run] of {!bppseqgen_multi_profiles}. *)
val bppseqgen_multi_profiles_get_fa :
bppseqgen_multi_profiles directory workflow ->
nucleotide_fasta workflow
(** [bppseqgen_multi_profiles_get_info run] selects the ["seq.fa.infos"] file
    inside the output directory [run] of {!bppseqgen_multi_profiles}. The
    generating script appends one tab-separated row per iteration (iteration
    index and the two profile column indices drawn for it). *)
val bppseqgen_multi_profiles_get_info :
bppseqgen_multi_profiles directory workflow ->
text_file workflow
(** [fna2faa ~fna] builds an amino-acid FASTA workflow from the nucleotide
    FASTA workflow [fna].
    NOTE(review): presumably a translation of the sequences; the
    implementation is not visible here, confirm. *)
val fna2faa :
fna:nucleotide_fasta workflow ->
aminoacid_fasta workflow
......
......@@ -4,6 +4,7 @@ open Bistro_utils
(* A named dataset: a ready dataset plus the labels identifying it. *)
type t = {
(* Label of the model (or input) the dataset comes from, e.g. "H0+HaPCOC"
   for the concatenated dataset. *)
model_prefix: string ;
(* Label of the tree the dataset is attached to. *)
tree_prefix : string ;
(* [true] for datasets built by [parse_input_data]; [false] for datasets
   derived by simulation ([derive_from_model]) or by pasting such datasets. *)
is_real: bool ;
(* The workflows making up the dataset itself. *)
dataset : Ready_dataset.t ;
}
......
......@@ -31,8 +31,10 @@ let parse_input_data indir =
let tree_prefix = Filename.chop_extension input_tree in
let input_tree = input (Filename.concat indir (Filename.concat dataset_prefix input_tree)) in
let fna = input (Filename.concat indir (Filename.concat dataset_prefix fna)) in
let raw_dataset = Raw_dataset.{input_tree; fna} in
let fna_infos = None in
let raw_dataset = Raw_dataset.{input_tree; fna; fna_infos} in
let dataset = {Dataset.model_prefix = tree_prefix;
is_real = true;
tree_prefix = dataset_prefix;
dataset = Ready_dataset.of_raw raw_dataset
} in
......@@ -63,11 +65,13 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile_f ~
| HaPCOC_NeBig -> 6.
| _ -> 1.
in
let fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~config:config_p ~profile_f ~ne_c in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~config:config_p ~profile_f ~ne_c in
let fna = Bppsuite.bppseqgen_multi_profiles_get_fa run_fna in
let fna_infos = Some (Bppsuite.bppseqgen_multi_profiles_get_info run_fna) in
let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa} in
{ Dataset.model_prefix; tree_prefix; dataset = ready_dataset }
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos} in
{ Dataset.model_prefix; is_real= false; tree_prefix; dataset = ready_dataset }
let derive_from_tree ~tree_dir ~tree ~profile_f ~preview =
let tree_prefix = Filename.chop_extension tree in
......@@ -89,7 +93,7 @@ let derive_from_tree ~tree_dir ~tree ~profile_f ~preview =
) in
let ready_dataset_H0 = (derive_from_model ~model:H0 ~input_tree ~tree_dataset ~tree_prefix ~profile_f ~preview).dataset in
let ready_dataset_HaPCOC = (derive_from_model ~model:HaPCOC ~input_tree ~tree_dataset ~tree_prefix ~profile_f ~preview).dataset in
let concat_H0Ha = {Dataset.model_prefix="H0+HaPCOC"; tree_prefix; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPCOC} in
let concat_H0Ha = {Dataset.model_prefix="H0+HaPCOC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste ready_dataset_H0 ready_dataset_HaPCOC} in
let dataset_concat_hypos = [concat_H0Ha;] in
List.concat [ dataset_per_hypo ; dataset_concat_hypos ]
......
......@@ -5,6 +5,7 @@ open Bistro_utils
(* Minimal description of a dataset before derived files are computed. *)
type t = {
(* Input tree, in NHX format. *)
input_tree: nhx workflow ;
(* Nucleotide sequences, in FASTA format. *)
fna: nucleotide_fasta workflow ;
(* Optional text file describing how [fna] was simulated; [None] when the
   dataset was parsed from input data. *)
fna_infos: text_file workflow option ;
}
let repo rd =
......
open Core
open Bistro.Std
open Bistro_utils
open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
(* A dataset with its derived workflows attached. *)
type t = {
(* Input tree, in NHX format. *)
input_tree: nhx workflow ;
(* Directory of tree-derived files; built by [Tree_dataset.prepare] in
   [of_raw]. *)
tree_dataset : [`tree_dataset] directory workflow ;
(* Nucleotide sequences, in FASTA format. *)
fna: nucleotide_fasta workflow ;
(* Optional text file describing how [fna] was simulated; [None] when no
   such information exists. *)
fna_infos: text_file workflow option ;
(* Amino-acid sequences; obtained from [fna] via [Bppsuite.fna2faa] in
   [of_raw]. *)
faa: aminoacid_fasta workflow ;
}
......@@ -13,12 +18,14 @@ type t = {
(* [of_raw raw_dataset] builds a ready dataset from a raw one: [input_tree],
   [fna] and [fna_infos] are carried over unchanged, [tree_dataset] is derived
   from the tree with [Tree_dataset.prepare] and [faa] from [fna] with
   [Bppsuite.fna2faa]. *)
let of_raw (raw_dataset : Raw_dataset.t) =
let input_tree = raw_dataset.input_tree in
let fna = raw_dataset.fna in
let fna_infos = raw_dataset.fna_infos in
let tree_dataset = Tree_dataset.prepare input_tree in
let faa = Bppsuite.fna2faa ~fna in
{ input_tree; tree_dataset ; fna; faa}
{ input_tree; tree_dataset ; fna; faa; fna_infos}
let repo rd =
Repo.[
[
item ["input_tree.nhx"] rd.input_tree ;
item ["tree.H0.node_ids" ] (Tree_dataset.nodes rd.tree_dataset H0) ;
item ["tree.Ha.node_ids" ] (Tree_dataset.nodes rd.tree_dataset HaPCOC) ;
......@@ -28,15 +35,34 @@ let repo rd =
item ["tree.convergent_topology" ] (Tree_dataset.topological_tree rd.tree_dataset) ;
item ["simulated_sequences.fna"] rd.fna ;
item ["simulated_sequences.faa"] rd.faa ;
] ;
match rd.fna_infos with
| Some w -> [item ["simulated_sequences.fna_infos"] w]
| None -> []
;
]
|> List.concat
|> Repo.shift "ready_dataset"
let to_raw { input_tree ; fna } =
{ Raw_dataset.input_tree ; fna }
(* Projects a ready dataset back onto the three fields of a raw dataset
   (tree, nucleotide sequences, optional simulation infos); the derived
   fields are dropped. *)
let to_raw (rd : t) =
  { Raw_dataset.input_tree = rd.input_tree ;
    fna = rd.fna ;
    fna_infos = rd.fna_infos }
(* Concatenates the given info files, in list order, into a single text file
   by running [cat] on them with stdout redirected to the workflow
   destination. *)
let paste_fna_infos ~(fna_infos_l: text_file workflow list) : text_file workflow =
  let inputs = List.map fna_infos_l ~f:dep in
  workflow ~descr:"cat" [
    cmd "cat" ~stdout:dest inputs ;
  ]
(* [paste d1 d2] combines two ready datasets into one: the nucleotide FASTA
   files are merged with [Bppsuite.paste_fna], the available info files are
   concatenated with [paste_fna_infos], and the result is rebuilt with
   [of_raw] using the input tree of [d1]. *)
let paste d1 d2 =
let r_d1 = to_raw d1 in
let r_d2 = to_raw d2 in
let fna = Bppsuite.paste_fna [r_d1.fna ; r_d2.fna ] in
let ready_dataset = of_raw {Raw_dataset.input_tree=r_d1.input_tree ; fna} in
(* Keep only the info files that are present ([None] entries are dropped). *)
let fna_infos_l = List.map [r_d1.fna_infos ; r_d2.fna_infos] ~f:(fun fna_infos ->
match fna_infos with
| Some i -> [i]
| None -> []
) |> List.concat in
(* NOTE(review): when both inputs have [fna_infos = None] the list above is
   empty, yet the result is still [Some] of a [cat] command with no input
   files; confirm whether [None] is intended in that case. *)
let fna_infos = Some (paste_fna_infos ~fna_infos_l) in
let ready_dataset = of_raw {Raw_dataset.input_tree=r_d1.input_tree ; fna; fna_infos} in
ready_dataset
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment