(* profile.ml *)
open Core_kernel
open Bistro.Std
open Bistro.EDSL
open File_formats

(** A profile file bundled with the per-category file derived from it.

    Values of this type are built by [profile_l_of_splitted_profile]. *)
type profile = {
  profile_f : text_file workflow;
  (** The original profile file, as given by the caller. *)
  profile_c : text_file workflow;
  (** The category file(s) produced by splitting [profile_f]: either the
      single category file, or the concatenation of all category files. *)
  profile_n : string;
  (** Name of the profile, derived from the base name of the input file. *)
}


13
(** [split_profile ~nb_sites ~dist_bins profile_f] builds a workflow that runs
    the embedded [generate_pairs.py] script on [profile_f] inside a Docker
    container. The script is given [dest // "profile"] as its output prefix,
    so its results land in the workflow's destination directory.

    @param nb_sites forwarded to the script as its [-s] option.
    @param dist_bins forwarded verbatim to the script as its [-b] option.
    @param profile_f the profiles file, passed as the positional argument. *)
let split_profile ~nb_sites ~dist_bins profile_f : text_file directory workflow =
  let env = docker_image ~account:"carinerey" ~name:"python_basics" ~tag:"07252018" () in
  (* Both Python sources are materialised in the temporary directory; the
     script is later invoked by its bare file name after [cd tmp]. *)
  let package = tmp // "diffsel_script_utils.py" in
  let script = tmp // "generate_pairs.py" in
  let prefix = dest // "profile" in
  workflow ~descr:"parse_profile.split_profile" [
    docker env (
      and_list [
        mkdir_p tmp ;
        mkdir_p dest ;
        cd tmp ;

        (* Launch a virtual X server on display :1 in a background subshell
           and point DISPLAY at it.
           NOTE(review): presumably the Python scripts need an X display --
           confirm. *)
        cmd "(" [string "Xvfb :1 -screen 0 1024x768x16 & )"];
        cmd "export" [string "DISPLAY=:1"];

        (* Dump the scripts embedded in [Scripts] and copy them into [tmp]. *)
        cmd "cp"  [ file_dump (string Scripts.diffsel_script_utils) ; package] ;
        cmd "cp" [ file_dump (string Scripts.generate_pairs) ; script] ;

        (* generate_pairs.py [options...] -o <output-prefix> <profiles-file> *)
        cmd "python" [
          string "generate_pairs.py" ;
          opt "-o" ident prefix ;
          opt "-s" int nb_sites ;
          opt "-b" string dist_bins ;
          dep profile_f ;
        ]
      ]
    )
  ]


43 44 45 46 47
(** [cat_file ~f_l] builds a workflow that runs [cat] on the files in [f_l],
    in list order, and redirects standard output to the workflow's
    destination.

    Note that with an empty [f_l] the generated command is a bare [cat] with
    no file argument. *)
let cat_file ~(f_l: text_file workflow list) : text_file workflow =
  workflow ~descr:"cat.text_files" [
    cmd "cat" ~stdout:dest (List.map f_l ~f:dep)
  ]
49 50 51 52


(** [profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn] splits the
    profile file at path [profile_fn] with [split_profile] and packages the
    result as a [profile] record.

    - [nb_cat = 3]: three distance bins are used; the three resulting files
      ([profile_0.tsv], [profile_1.tsv], [profile_2.tsv]) are concatenated
      and the profile name is suffixed with ["_3categories"].
    - [nb_cat = 1]: a single distance bin is used; [profile_0.tsv] is taken
      as is and the profile name is the bare prefix.

    The profile name prefix is the base name of [profile_fn] with its
    extension removed.

    @param nb_sites forwarded to [split_profile].
    @raise Failure if [nb_cat] is neither [1] nor [3]. *)
let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn =
  let profile_f = input profile_fn in
  let prefix = Filename.chop_extension (Filename.basename profile_fn) in
  (* Select the file written for category [i] inside the split directory. *)
  let category_file splitted i =
    splitted / selector ["profile_" ^ Int.to_string i ^ ".tsv"]
  in
  match nb_cat with
  | 3 ->
    let dist_bins = "[0.01,0.4],[0.4,0.6],[0.6,2]" in
    let splitted = split_profile ~nb_sites ~dist_bins profile_f in
    let p0 = category_file splitted 0 in
    let p1 = category_file splitted 1 in
    let p2 = category_file splitted 2 in
    { profile_f ;
      profile_c = cat_file ~f_l:[p0; p1; p2] ;
      profile_n = prefix ^ "_3categories" }
  | 1 ->
    let dist_bins = "[0.01,2]" in
    let splitted = split_profile ~nb_sites ~dist_bins profile_f in
    { profile_f ;
      profile_c = category_file splitted 0 ;
      profile_n = prefix }
  | _ -> failwith {| nbcat must be 1 or 3 |}