(* dataframe.ml *)
open Core_kernel
open Rresult

(** Typed storage for a single dataframe column, one array cell per row.

    The [Maybe_*] constructors hold optional cells; the parser in this file
    selects them for columns in which the literal ["NA"] appears. *)
type column =
  | Int of int array
  | Maybe_int of int option array
  | Float of float array
  | Maybe_float of float option array
  | String of string array
  | Maybe_string of string option array

(** A dataframe as built by [from_file]: a list of labelled columns together
    with its dimensions. *)
type t = {
  nrows : int ; (** number of data lines (the header line is not counted) *)
  ncols : int ; (** number of tab-separated labels found in the header *)
  cols : (string * column) list ; (** header labels paired with their columns, in header order *)
}

(** [nrows t] is the number of rows recorded in [t]. *)
let nrows { nrows ; _ } = nrows
(** [ncols t] is the number of columns recorded in [t]. *)
let ncols { ncols ; _ } = ncols

(** [col t label] looks [label] up in the columns of [t], comparing labels
    with [String.equal]; the result is [None] when no column has that
    label. *)
let col t label = List.Assoc.find t.cols ~equal:String.equal label

(** [parse_header h] splits the header line [h] on tab characters and
    returns the resulting labels together with how many there are. *)
let parse_header h =
  let with_count fields = fields, List.length fields in
  with_count (String.split h ~on:'\t')

(** [fold_lines xs ~init ~f] folds [f] over [xs] from left to right, passing
    [f] the zero-based position of each element along with the current
    accumulator. The traversal stops at the first [Error], which is returned
    unchanged; otherwise the final accumulator is returned wrapped in
    [Ok]. *)
let fold_lines xs ~init ~f =
  let rec go index state remaining =
    match remaining with
    | [] -> Ok state
    | item :: rest ->
      (match f index state item with
       | Error _ as failure -> failure
       | Ok next -> go (index + 1) next rest)
  in
  go 0 init xs

(** [optionally f s] is [None] when [s] is exactly ["NA"], and [Some (f s)]
    for any other string. [f] is not called on ["NA"]. *)
let optionally f s =
  if String.equal s "NA" then None else Some (f s)

(** [rev_convert_col col] turns the raw cells of one column into a typed
    {!column}.

    [col] is expected in {e reverse} row order (last row first), which is how
    [parse_lines] accumulates it; the resulting array is in row order.

    The first representation that accepts every cell is chosen: integers,
    then floats, then strings. When at least one cell is the literal ["NA"],
    the matching [Maybe_*] constructor is used and every ["NA"] cell becomes
    [None]. *)
let rev_convert_col col =
  (* [List.rev_map] converts the cells and undoes the reversed accumulation
     in a single pass. *)
  let conv f =
    List.rev_map col ~f
    |> Array.of_list
  in
  let conv_opt f = conv (optionally f) in
  (* NOTE(review): the catch-all handlers below also swallow exceptions
     unrelated to parsing. Narrowing them requires confirming which
     exceptions [Int.of_string] and [Float.of_string] raise in the
     Core_kernel version in use. *)
  if List.mem col "NA" ~equal:String.equal then
    try Maybe_int (conv_opt Int.of_string) with _ ->
    try Maybe_float (conv_opt Float.of_string) with _ ->
      (* Go through [conv_opt] like the numeric cases so that the rows are
         put back in order and "NA" is stored as [None], not [Some "NA"]. *)
      Maybe_string (conv_opt Fn.id)
  else
    try Int (conv Int.of_string) with _ ->
    try Float (conv Float.of_string) with _ ->
      (* Go through [conv] so string columns are un-reversed exactly like
         numeric ones. *)
      String (conv Fn.id)

(** [parse_lines ncols lines] splits every element of [lines] on tab
    characters and distributes its fields over [ncols] columns.

    On success the result is [Ok (nrows, cols)], where [nrows] is the number
    of lines consumed and [cols] holds one converted column per field
    position. If a line does not split into exactly [ncols] fields, parsing
    stops and an [Error] built with [Rresult.R.error_msgf] is returned; the
    number in the message is the 1-based position of the line within
    [lines]. *)
let parse_lines ncols lines =
  let empty_columns = List.init ncols ~f:(fun _ -> []) in
  (* Each field is consed onto its column, so columns are built in reverse
     row order; [rev_convert_col] receives them in that form. *)
  let push_line i (count, columns) line =
    match List.map2 (String.split line ~on:'\t') columns ~f:List.cons with
    | Unequal_lengths ->
      Rresult.R.error_msgf "Line %d doesn't have the expected %d fields" (i + 1) ncols
    | Ok columns -> Ok (count + 1, columns)
  in
  Result.map
    (fold_lines lines ~init:(0, empty_columns) ~f:push_line)
    ~f:(fun (nrows, columns) -> nrows, List.map columns ~f:rev_convert_col)

(** [from_file path] reads the file at [path] as a tab-separated table whose
    first line holds the column labels.

    Returns [Error (`Msg "empty file")] when the file has no lines, the error
    produced by [parse_lines] when a data line has the wrong number of
    fields, and [Ok] of the dataframe otherwise. Exceptions raised by
    [In_channel.read_lines] are not caught here. *)
let from_file path =
  match In_channel.read_lines path with
  | [] -> Error (`Msg "empty file")
  | header :: data_lines ->
    let labels, ncols = parse_header header in
    let open Result.Monad_infix in
    parse_lines ncols data_lines >>| fun (nrows, columns) ->
    (* [parse_lines] yields exactly [ncols] columns, one per label. *)
    { nrows ; ncols ; cols = List.zip_exn labels columns }