Commit cfae0635 authored by Carine Rey's avatar Carine Rey
Browse files

Merge branch 't_choices'

parents ecf9c14f 5a0589b1
FROM r-base
MAINTAINER Carine Rey carine.rey@ens-lyon.org
RUN echo 'install.packages(c("optparse", "ggplot2", "reshape2", "cowplot", "coda"), repos="https://mirror.ibcp.fr/pub/CRAN/", dependencies=TRUE)' > /tmp/packages.R \
&& Rscript /tmp/packages.R
#! /bin/bash
set -e
IMAGE_NAME=r
DOCKERFILE_DIR=.
TAG=07162018
REPO=carinerey/$IMAGE_NAME:$TAG
docker build -t $REPO -f ./Dockerfile $DOCKERFILE_DIR
if [[ $1 == "push_yes" ]]
then
docker push $REPO
fi
......@@ -17,7 +17,7 @@ let string_of_model m = match m with
| H0 -> "H0"
| HaPC -> "HaPC"
| HaPCOC -> "HaPCOC"
| H0_NeSmall -> "H0_NeSmall"
| H0_NeSmall -> "H0_NeSmall"
| HaPCOC_NeSmall -> "HaPCOC_NeSmall"
| HaPC_NeSmall -> "HaPC_NeSmall"
| H0_NeBig -> "H0_NeBig"
......
......@@ -244,15 +244,24 @@ let simulation_main ~outdir ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview
let validation_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~no_diffsel ~tree_dir ~profile_fn ~use_concat () =
let trees = Array.to_list @@ Sys.readdir tree_dir in
let repo = List.map trees ~f:(fun tree ->
let trees = [tree] in
let tree_prefix = Filename.chop_extension tree in
let dataset_l =
derive_sim ~tree_dir ~trees ~profile_fn ~preview ~use_concat
@ parse_input_data indir in
let dataset_results_l = derive_det ~dataset_l ~preview ~no_diffsel in
let repo = [
let post_analyses = Post_analyses.post_analyses_of_dataset_results_l ~dataset_results_l in
let repo_per_tree = [
Dataset.repo dataset_l ~preview ;
repo_of_dataset_results_l ~dataset_results_l ;
Repo.shift tree_prefix (Post_analyses.repo_of_post_analyses ~prefix:tree_prefix ~post_analyses);
] |> List.concat
in
repo_per_tree
)
|> List.concat
in
Repo.build ~outdir ~np ~mem:(`GB mem) ~logger repo
let simulation_command =
......
open Core
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
open Bistro_utils
open File_formats
open Convergence_detection
type post_analyses_dir
type post_analyses = {
t_choices_complete: text_file workflow ;
t_choices_max: text_file workflow ;
t_choices_plot: text_file workflow ;
}
let is_hyp ~hyp dataset_results =
let model_prefix = dataset_results.model_prefix in
let merged_results = dataset_results.merged_results in
model_prefix = hyp
let make_t_choices ~h0_merged_results ~ha_merged_results : post_analyses_dir directory workflow =
let env = docker_image ~account:"carinerey" ~name:"r" ~tag:"07162018" () in
let out = dest // "out" in
workflow ~descr:"post_analyses.t_choices" [
docker env (
and_list [
mkdir_p dest ;
cmd "Rscript" [
file_dump (string Scripts.calc_t_per_meth) ;
opt "--H0" dep h0_merged_results;
opt "--Ha" dep ha_merged_results;
opt "--out " ident out;
];
])
]
let post_analyses_of_dataset_results_l ~dataset_results_l =
let h0_res = List.find dataset_results_l (is_hyp ~hyp: "H0") in
let ha_res = List.find dataset_results_l (is_hyp ~hyp: "HaPCOC") in
match (h0_res, ha_res) with
| (Some h0, Some ha) ->
let h0_merged_results = h0.merged_results in
let ha_merged_results = ha.merged_results in
let t_choices_dir = make_t_choices ~h0_merged_results ~ha_merged_results in
let t_choices_max = t_choices_dir / selector ["out.max_per_meth.tsv"] in
let t_choices_complete = t_choices_dir / selector ["out.complete.tsv"] in
let t_choices_plot = t_choices_dir / selector ["out.pdf"] in
Some {t_choices_max; t_choices_complete ; t_choices_plot}
| _ -> failwith {| tata |}
let repo_of_post_analyses ~prefix ~post_analyses =
match post_analyses with
| None -> []
| Some w ->
Repo.[
item [prefix ^ ".t_choices.max_mcc_per_meth.tsv"] w.t_choices_max ;
item [prefix ^ ".t_choices.complete.tsv"] w.t_choices_complete ;
item [prefix ^ ".t_choices.pdf"] w.t_choices_plot ;
] |> Repo.shift "t_choices"
#!/usr/bin/env Rscript
#https://www.r-bloggers.com/passing-arguments-to-an-r-script-from-command-lines/
library("optparse")
library("reshape2")
library("ggplot2")
library("cowplot")
option_list = list(
make_option(c("--H0"), type="character", default=NULL,
help="merged_results H0", metavar="character"),
make_option(c("--Ha"), type="character", default=NULL,
help="merged_results Ha", metavar="character"),
make_option(c("-o","--out"), type="character", default="out",
help="output prefix [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$H0)){
print_help(opt_parser)
stop("At least one argument must be supplied (H0 input file)", call.=FALSE)
}
if (is.null(opt$Ha)){
print_help(opt_parser)
stop("At least one argument must be supplied (Ha input file)", call.=FALSE)
}
## fun...
calc_TN_FP = function(vals, t){
TN = 0
FP = 0
vals[is.na(vals)] = 0
if (length(vals) > 0) {
TN = sum(vals <= t)
FP = sum(vals > t)
}
return(data.frame(TN = TN, FP = FP))
}
calc_TP_FN = function(vals, t){
FN = 0
TP = 0
vals = na.omit(vals)
if (length(vals) > 0) {
FN = sum(vals <= t)
TP = sum(vals > t)
}
return(data.frame(TP = TP, FN = FN))
}
calc_TN_FP_TP_FN = function(t, df_H0_melt, df_Ha_melt){
df_TN_FP = do.call(rbind.data.frame,tapply(df_H0_melt$value, df_H0_melt$variable, calc_TN_FP, t=t))
df_TP_FN = do.call(rbind.data.frame,tapply(df_Ha_melt$value, df_H0_melt$variable, calc_TP_FN, t=t))
df_TN_FP_TP_FN = merge(df_TN_FP, df_TP_FN, by=0)
df_TN_FP_TP_FN$t = t
return(df_TN_FP_TP_FN)
}
## program...
df_H0 = read.table(opt$H0, header=TRUE, sep = '\t')
df_Ha = read.table(opt$Ha, header=TRUE, sep = '\t')
df_H0 = df_H0[, ! colnames(df_H0) %in% c("Indel_prop", "Indel_prop.ConvLeaves.")]
df_Ha = df_Ha[, ! colnames(df_Ha) %in% c("Indel_prop", "Indel_prop.ConvLeaves.")]
df_H0_melt = melt(df_H0, id.vars = c("Sites"))
df_Ha_melt = melt(df_Ha, id.vars = c("Sites"))
df = do.call(rbind.data.frame,lapply(seq(0,0.999,0.01), calc_TN_FP_TP_FN, df_H0_melt = df_H0_melt, df_Ha_melt = df_Ha_melt))
df$sens = df$TP / (df$TP+df$FN)
df$sens[is.na(df$sens)] = 0
df$spe = df$TN / (df$FP+df$TN)
df$spe[is.na(df$spe)] = 0
n_sites = sum(df[1,c("TP","FN")])
p = 140 / n_sites
n = (6000 -140)/ n_sites
df$FP_2 = df$FP * n
df$TP_2 = df$TP* p
df$FN_2 = df$FN* p
df$TN_2 = df$TN* n
df$mcc = (df$TP_2 * df$TN_2 - df$FP_2 * df$FN_2) / sqrt((df$TP_2+df$FP_2)*(df$TP_2+df$FN_2)*(df$TN_2+df$FP_2)*(df$TN_2+df$FN_2))
df$mcc[is.na(df$mcc)] = 0
df_out = df[,c("Row.names", "t", "sens", "spe", "mcc")]
colnames(df_out) = c("methode", "threshold", "sensitivity", "specificity", "MCC")
df_out = df_out[order(df_out$methode),]
df_out_melt = melt(df_out, id.vars = c("methode","threshold"))
df_max_mcc_per_method = do.call(rbind, lapply(split(df_out,df_out$methode),
function(x) {return(x[which.max(x$MCC),c("methode", "threshold", "MCC","sensitivity","specificity")])}))
print(df_max_mcc_per_method)
df_max_mcc_per_method_2 = df_max_mcc_per_method
df_max_mcc_per_method_2$variable="MCC"
alpha = 0.7
x_labs = "Threshold"
y_labs = ""
plot = ggplot(df_out_melt, aes(x=threshold, y=value)) + theme_bw() + guides(fill=FALSE) + labs(x=x_labs, y=y_labs) #+ ylim(y_lim)
plot = plot + geom_point()
plot = plot + geom_hline( data = df_max_mcc_per_method_2, aes(yintercept = MCC), col="red", alpha=alpha, show.legend = NA)
plot = plot + geom_vline( data = df_max_mcc_per_method, aes(xintercept = threshold), col="red", alpha=alpha, show.legend = NA,linetype="dotted")
plot = plot + facet_grid(variable ~ methode, scales="free")
save_plot(paste0(opt$out,".pdf"),
plot,
ncol = 0.4 * length(unique(df_out_melt$methode)),
nrow = 1.7,
base_aspect_ratio = 1.5,
limitsize = FALSE
)
write.table(df_out, file=paste0(opt$out,".complete.tsv"), row.names=FALSE, quote=F, sep = "\t")
write.table(df_max_mcc_per_method, file=paste0(opt$out,".max_per_meth.tsv"), row.names=FALSE, quote=F, sep = "\t")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment