(* dataframe.ml *)
open Core_kernel
open Rresult

(** A single dataframe column, stored as a typed array with one cell per
    row. The [_opts] variants are used for columns with missing values:
    a missing cell is represented by [None]. *)
type column =
  | Ints of int array
  | Int_opts of int option array
  | Floats of float array
  | Float_opts of float option array
  | Strings of string array
  | String_opts of string option array
[@@deriving show]

(** A dataframe: a list of labelled columns together with its dimensions. *)
type t = {
  nrows : int ; (* number of data rows *)
  ncols : int ; (* number of columns *)
  cols : (string * column) list ; (* each column paired with its label *)
}

(** [columns df] is the list of [(label, column)] pairs held by [df]. *)
let columns { cols ; _ } = cols

(** [nrows t] is the number of rows recorded in [t]. *)
let nrows { nrows ; _ } = nrows

(** [ncols t] is the number of columns recorded in [t]. *)
let ncols { ncols ; _ } = ncols

(** [get_col_by_name t name] looks [name] up among the column labels of [t]
    and returns the associated column, or [None] if no label is equal to
    [name]. *)
let get_col_by_name t name =
  List.Assoc.find t.cols name ~equal:String.equal

(** [get_col t i] is the column at position [i] in [t] (counting from 0),
    or [None] if there is no such position. *)
let get_col t i =
  match List.nth t.cols i with
  | None -> None
  | Some (_, col) -> Some col

(** [parse_header h] splits the header line [h] on tab characters and
    returns the resulting labels together with how many there are. *)
let parse_header h =
  let fields = String.split h ~on:'\t' in
  let count = List.length fields in
  fields, count

(** [fold_lines xs ~init ~f] folds [f] over [xs] from left to right,
    passing it the 0-based position of each element, the current
    accumulator and the element itself.

    The fold stops at the first [Error _] returned by [f], which becomes
    the overall result; [f] is not called on the remaining elements.
    Otherwise the result is [Ok] of the final accumulator. *)
let fold_lines xs ~init ~f =
  let rec go idx state rest =
    match rest with
    | [] -> Ok state
    | item :: tail ->
      (match f idx state item with
       | Error _ as failure -> failure
       | Ok state' -> go (idx + 1) state' tail)
  in
  go 0 init xs

(** [optionally f s] is [None] when [s] is exactly the string ["NA"], and
    [Some (f s)] for any other string. *)
let optionally f s =
  match s with
  | "NA" -> None
  | _ -> Some (f s)

(** [rev_convert_col col] turns the raw string cells of one column into a
    typed {!column}.

    [col] is taken in reverse row order (last row first), which is how
    [parse_lines] accumulates cells; the array that is returned is in row
    order.

    When at least one cell is ["NA"], an optional variant is produced and
    every ["NA"] cell becomes [None]. In both the optional and the plain
    case the cells are first read as integers, then as floats, and are
    kept as strings if neither conversion succeeds for the whole column. *)
let rev_convert_col col =
  (* [List.rev_map] converts the cells and restores row order in one pass. *)
  let conv f =
    List.rev_map col ~f
    |> Array.of_list
  in
  let conv_opt f = conv (optionally f) in
  (* Any exception raised while converting a cell makes the whole attempt
     fail and moves on to the next candidate type. *)
  if List.mem col "NA" ~equal:String.equal then
    try Int_opts (conv_opt Int.of_string) with _ ->
    try Float_opts (conv_opt Float.of_string) with _ ->
      (* Go through [conv_opt] like the numeric cases, so that rows are put
         back in order and "NA" cells become [None] rather than
         [Some "NA"]. *)
      String_opts (conv_opt Fn.id)
  else
    try Ints (conv Int.of_string) with _ ->
    try Floats (conv Float.of_string) with _ ->
    Strings (Array.of_list_rev col)

(** [parse_lines ncols lines] splits every line on tab characters and
    distributes its fields over [ncols] columns.

    Returns [Ok (nrows, columns)] where [nrows] is the number of lines read
    and [columns] holds one typed column per field position. Returns an
    error message naming the first line (numbered from 1 within [lines])
    whose number of fields is not [ncols]. *)
let parse_lines ncols lines =
  let open Result.Monad_infix in
  (* One empty accumulator per column; cells are pushed in front, so each
     accumulator ends up in reverse row order. *)
  let empty_columns = List.init ncols ~f:(fun _ -> []) in
  let add_line i (count, columns) line =
    let fields = String.split line ~on:'\t' in
    match List.map2 fields columns ~f:(fun field column -> field :: column) with
    | Unequal_lengths -> Rresult.R.error_msgf "Line %d doesn't have the expected %d fields" (i + 1) ncols
    | Ok columns' -> Ok (count + 1, columns')
  in
  fold_lines lines ~init:(0, empty_columns) ~f:add_line
  >>| fun (nrows, raw_columns) ->
  nrows, List.map raw_columns ~f:rev_convert_col

(** Type of the value returned by [parse_lines], named so that a printer
    can be derived for it and used by the expect test below. *)
type parse_result = (int * column list, [`Msg of string]) result
[@@deriving show]

(* Three tab-separated lines with three fields each: a plain string column,
   a float column, and an integer column with missing ("NA") cells. *)
let%expect_test "Dataframe.parse_line ex1" =
  let got =
    parse_lines 3 [
      "a\t1.2\tNA" ;
      "a\t1.2\t2" ;
      "c\t-1.2\tNA" ;
    ]
  in
  print_endline (show_parse_result got) ;
  [%expect {|
    (Ok (3,
         [(Dataframe.Strings [|"a"; "a"; "c"|]);
           (Dataframe.Floats [|1.2; 1.2; -1.2|]);
           (Dataframe.Int_opts [|None; (Some 2); None|])])) |}]
(** [check_header ~colnames header] compares the labels in [header] with
    the expected [colnames], position by position.

    Returns [Ok ()] when both lists have the same length and equal labels,
    and an [Error (`Msg _)] otherwise, with a different message for a
    length mismatch and for a label mismatch. *)
let check_header ~colnames header =
  let fail text = Error (`Msg text) in
  match List.for_all2 colnames header ~f:String.equal with
  | Unequal_lengths -> fail "incorrect number of columns"
  | Ok same ->
    if same then Ok ()
    else fail "header is different from expected value"

(** [from_file ?header path] reads the tab-separated file at [path] into a
    dataframe.

    [header] selects how column labels are obtained:
    - [`Read_in_file] (default): the first line of the file is the header
      and provides the labels;
    - [`Expect colnames]: the first line is the header and must be equal to
      [colnames], otherwise an error is returned;
    - [`Use colnames]: every line of the file is data and [colnames] are
      used as labels.

    Returns an [Error (`Msg _)] when a header is required but the file has
    no line, when the header check fails, or when a data line does not have
    as many fields as there are labels. Exceptions raised while reading
    [path] are not caught here. *)
let from_file ?(header = `Read_in_file) path =
  let open Let_syntax.Result in
  let* labels, ncols, data_lines =
    match header, In_channel.read_lines path with
    | `Use colnames, data_lines ->
      Ok (colnames, List.length colnames, data_lines)
    | (`Read_in_file | `Expect _), [] ->
      Error (`Msg "empty file but expected header")
    | `Read_in_file, first :: data_lines ->
      let labels, ncols = parse_header first in
      Ok (labels, ncols, data_lines)
    | `Expect colnames, first :: data_lines ->
      let labels, ncols = parse_header first in
      let+ () = check_header ~colnames labels in
      labels, ncols, data_lines
  in
  let* nrows, parsed = parse_lines ncols data_lines in
  (* [parse_lines] returns exactly [ncols] columns, one per label. *)
  Ok { nrows ; ncols ; cols = List.zip_exn labels parsed }