biokepi icon indicating copy to clipboard operation
biokepi copied to clipboard

Save JSON/CSV manifests automatically

Open tavinathanson opened this issue 8 years ago • 1 comments

From @tavinathanson:

confused about the CSV in https://github.com/hammerlab/epidisco/issues/149 vs. the JSON output, and which of those (or both?) we'd want to have for arbitrary pipelines?

From @smondet:

@ihodes once we figure out exactly what we want we should bake saving+jsonoutput (+csv?) directly into the To_workflow compiler. my understanding is that JSON is a superset of the CSV information

Also relates to discohorts. From @tavinathanson:

discohorts will rely on more manual determination of file paths w/out a manifest, but will try to make it easy to switch over to using the manifest when available

tavinathanson avatar Mar 27 '17 17:03 tavinathanson

I've added this to pipelines in an ad-hoc manner (e.g. for @timodonnell below)

(** We want to extend the compiler to handle a new function,
    `write_csv_manifest`, so we define the new signature that the compiler must
    have. *)
module type Semantics = sig
  include Biokepi.EDSL.Semantics

  (** [write_csv_manifest ~normal ~tumor ~vcfs out] takes the normal and tumor
      BAM values, a list of [(name, vcf)] pairs, and a trailing string, and
      returns [unit].

      In the [To_workflow] implementation in this file, [out] is the path of
      the CSV file that gets written; the signature itself does not constrain
      how other compilers interpret it. *)
  val write_csv_manifest :
    normal:[ `Bam ] repr ->
    tumor:[ `Bam ] repr ->
    vcfs:(string * [ `Vcf ] repr) list ->
    string -> unit
end

(** Here we add the function itself for the `To_workflow` compiler (the compiler
    which handles the transformation from the eDSL to actual Ketrew workflow
    nodes). All we're adding is a function which outputs a CSV locally when the
    workflow is compiled. *)
module To_workflow
    (Config : Biokepi.EDSL.Compile.To_workflow.Compiler_configuration) =
struct
  include Biokepi.EDSL.Compile.To_workflow.Make(Config)

  (** [write_csv_manifest ~normal ~tumor ~vcfs out] writes a two-column CSV
      manifest to the local file [out].

      The rows are, in order: the header [name,path], one row for [normal],
      one row for [tumor], then one row per [(name, vcf)] pair in [vcfs]
      (in list order). Each path is the [#product#path] of the node obtained
      through [File_type_specification.get_bam] / [get_vcf].

      The output channel is always closed before returning, including when
      writing fails; in that case the original exception is re-raised.

      @raise Sys_error if [out] cannot be opened or written (raised by
      [open_out] / channel output). *)
  let write_csv_manifest ~normal ~tumor ~vcfs out =
    let module F = Biokepi.EDSL.Compile.To_workflow.File_type_specification in
    let bam_path bam = (F.get_bam bam)#product#path in
    let vcf_rows =
      List.map vcfs
        ~f:(fun (name, vcf) -> [name; (F.get_vcf vcf)#product#path])
    in
    let csv =
      ["name"; "path"]
      :: ["normal"; bam_path normal]
      :: ["tumor"; bam_path tumor]
      :: vcf_rows
    in
    (* Keep a handle on the raw channel so it can be released on the error
       path without risking a second exception masking the first one. *)
    let chan = open_out out in
    let outc = Csv.to_channel chan in
    begin
      try Csv.output_all outc csv
      with e -> close_out_noerr chan; raise e
    end;
    (* [Csv.close_out] flushes pending data and closes the underlying
       channel; without it the descriptor leaks and the manifest is only
       written out if the process later exits cleanly. *)
    Csv.close_out outc
end

ihodes avatar Mar 28 '17 06:03 ihodes