open Core_kernel
open Rresult

(** A typed column. The [_opts] variants are used for columns in which at
    least one cell holds the missing-value marker ["NA"]; such cells are
    represented by [None]. *)
type column =
  | Ints of int array
  | Int_opts of int option array
  | Floats of float array
  | Float_opts of float option array
  | Strings of string array
  | String_opts of string option array
[@@deriving show]

(** A dataframe: its dimensions plus an association list from column label to
    column contents, in file order. *)
type t = {
  nrows : int ;
  ncols : int ;
  cols : (string * column) list ;
}

(** [columns df] is the labelled columns of [df]. *)
let columns df = df.cols

(** [nrows t] is the number of data rows of [t]. *)
let nrows t = t.nrows

(** [ncols t] is the number of columns of [t]. *)
let ncols t = t.ncols

(** [get_col_by_name t name] is the first column of [t] labelled [name], or
    [None] if there is no such column. *)
let get_col_by_name t name = List.Assoc.find t.cols ~equal:String.equal name

(** [get_col t i] is the [i]-th column of [t] (0-based), or [None] when
    [List.nth] finds no element at that position. *)
let get_col t i = List.nth t.cols i |> Option.map ~f:snd

(** [parse_header h] splits [h] on tab characters and returns the labels
    together with their count. *)
let parse_header h =
  let labels = String.split ~on:'\t' h in
  labels, List.length labels

(** [fold_lines xs ~init ~f] folds [f] over [xs] from left to right, passing
    the 0-based position of each element, and stops at the first [Error]. *)
let fold_lines xs ~init ~f =
  let rec loop i acc = function
    | [] -> Ok acc
    | x :: xs -> (
        match f i acc x with
        | Ok r -> loop (i + 1) r xs
        | Error _ as e -> e
      )
  in
  loop 0 init xs

(** [optionally f s] is [None] when [s] is the missing-value marker ["NA"],
    and [Some (f s)] otherwise. *)
let optionally f = function
  | "NA" -> None
  | s -> Some (f s)

(** [rev_convert_col col] builds a typed column from the raw cells [col],
    which are expected in {e reverse} row order (as accumulated by
    [parse_lines]); the resulting array is in row order.

    The most specific representation that parses every cell is chosen: ints,
    then floats, then strings. When a cell equals ["NA"] the corresponding
    [_opts] variant is used, with ["NA"] cells mapped to [None]. *)
let rev_convert_col col =
  (* [List.rev_map] restores row order while converting. *)
  let conv f = List.rev_map col ~f |> Array.of_list in
  let conv_opt f = conv (optionally f) in
  (* Any exception raised by a conversion is taken to mean "not all cells
     have this type", and the next, more general representation is tried. *)
  if List.mem col "NA" ~equal:String.equal then
    try Int_opts (conv_opt Int.of_string)
    with _ -> (
        try Float_opts (conv_opt Float.of_string)
        with _ ->
          (* Go through [conv_opt] like the other branches so that the rows
             are put back in order and "NA" cells become [None]. *)
          String_opts (conv_opt Fn.id)
      )
  else
    try Ints (conv Int.of_string)
    with _ -> (
        try Floats (conv Float.of_string)
        with _ -> Strings (Array.of_list_rev col)
      )

(** [parse_lines ncols lines] splits every line on tabs and distributes the
    fields over [ncols] columns.

    Returns [Ok (nrows, columns)] on success. If a line does not have exactly
    [ncols] fields, returns an [Error (`Msg _)] naming that line by its
    1-based position within [lines]. *)
let parse_lines ncols lines =
  let init = 0, List.init ncols ~f:(Fn.const []) in
  fold_lines lines ~init ~f:(fun i (nr, acc) l ->
      let fields = String.split l ~on:'\t' in
      (* Cons each field onto its column: columns are built in reverse. *)
      match List.map2 fields acc ~f:List.cons with
      | Ok r -> Ok (nr + 1, r)
      | Unequal_lengths ->
        Rresult.R.error_msgf
          "Line %d doesn't have the expected %d fields"
          (i + 1) ncols
    )
  |> Result.map ~f:(fun (nrows, rev_cols) ->
      nrows, List.map rev_cols ~f:rev_convert_col
    )

(** Result type of [parse_lines], named so that a printer can be derived for
    the expect test below. *)
type parse_result = (int * column list, [`Msg of string]) result
[@@deriving show]

(* The expectation string is kept verbatim. *)
let%expect_test "Dataframe.parse_line ex1" =
  let got = parse_lines 3 [
      "a\t1.2\tNA" ;
      "a\t1.2\t2" ;
      "c\t-1.2\tNA" ;
    ]
  in
  print_endline (show_parse_result got) ;
  [%expect {| (Ok (3, [(Dataframe.Strings [|"a"; 
"a"; "c"|]); (Dataframe.Floats [|1.2; 1.2; -1.2|]); (Dataframe.Int_opts [|None; (Some 2); None|])])) |}]

(** [check_header ~colnames header] is [Ok ()] when [header] is exactly
    [colnames], and an [Error (`Msg _)] when the two lists differ in length
    or in content. *)
let check_header ~colnames header =
  match List.for_all2 colnames header ~f:String.equal with
  | Ok true -> Ok ()
  | Ok false -> Error (`Msg "header is different from expected value")
  | Unequal_lengths -> Error (`Msg "incorrect number of columns")

(** [from_file ?header path] reads the tab-separated file [path] into a
    dataframe.

    [header] selects where the column labels come from:
    - [`Read_in_file] (default): the first line of the file is the header;
    - [`Expect colnames]: the first line is the header and must be equal to
      [colnames];
    - [`Use colnames]: the file has no header line and [colnames] is used.

    Returns [Error (`Msg _)] if a header is required but the file has no
    lines, if the header does not match the expected one, or if a data line
    has the wrong number of fields. Exceptions raised by
    [In_channel.read_lines] are not caught here. *)
let from_file ?(header = `Read_in_file) path =
  let open Let_syntax.Result in
  let lines = In_channel.read_lines path in
  let* labels, ncols, data_lines =
    match header, lines with
    | (`Read_in_file | `Expect _), [] ->
      Error (`Msg "empty file but expected header")
    | `Read_in_file, header :: lines ->
      let labels, ncols = parse_header header in
      Ok (labels, ncols, lines)
    | `Expect colnames, header :: data_lines ->
      let labels, ncols = parse_header header in
      let+ () = check_header ~colnames labels in
      labels, ncols, data_lines
    | `Use colnames, data_lines ->
      Ok (colnames, List.length colnames, data_lines)
  in
  let+ nrows, cols = parse_lines ncols data_lines in
  (* [labels] has [ncols] elements and [parse_lines ncols] yields one column
     per accumulator, i.e. [ncols] columns, so [zip_exn] cannot raise. *)
  { nrows ; ncols ; cols = List.zip_exn labels cols }