Save JSON/CSV manifests automatically
From @tavinathanson:
confused about the CSV in https://github.com/hammerlab/epidisco/issues/149 vs. the JSON output, and which of those (or both?) we'd want to have for arbitrary pipelines?
From @smondet:
@ihodes once we figure out exactly what we want we should bake saving + JSON output (+ CSV?) directly into the
`To_workflow` compiler. My understanding is that JSON is a superset of the CSV information.
Also relates to discohorts. From @tavinathanson:
discohorts will rely on more manual determination of file paths w/out a manifest, but will try to make it easy to switch over to using the manifest when available
I've added this to pipelines in an ad-hoc manner (e.g. for @timodonnell below)
(** Compiler signature extended with one extra operation: a compiler matching
    [Semantics] provides everything in [Biokepi.EDSL.Semantics] plus
    [write_csv_manifest]. *)
module type Semantics = sig
  include Biokepi.EDSL.Semantics

  (** [write_csv_manifest ~normal ~tumor ~vcfs out] emits a CSV manifest for a
      normal BAM, a tumor BAM and a list of VCFs, each VCF paired with the
      name it should be listed under. The final positional argument is a
      string; the [To_workflow] implementation in this file uses it as the
      path of the CSV file to write. *)
  val write_csv_manifest :
    normal:[ `Bam ] repr ->
    tumor:[ `Bam ] repr ->
    vcfs:(string * [ `Vcf ] repr) list ->
    string ->
    unit
end
(** Here we add the function itself for the [To_workflow] compiler (the compiler
    which handles the transformation from the eDSL to actual Ketrew workflow
    nodes). All we are adding is a function which outputs a CSV locally when the
    workflow is compiled. *)
module To_workflow
    (Config : Biokepi.EDSL.Compile.To_workflow.Compiler_configuration) =
struct
  include Biokepi.EDSL.Compile.To_workflow.Make(Config)

  (** [write_csv_manifest ~normal ~tumor ~vcfs out] writes a two-column CSV
      manifest (columns [name] and [path]) to the local file [out].

      The rows are, in order: the header, one row for [normal], one row for
      [tumor], then one row per element of [vcfs] using the name it is paired
      with. Each path is read from the [#product#path] of the corresponding
      compiled BAM/VCF node.

      The output channel is always closed before returning, including when
      writing fails, so the file is flushed and no descriptor is leaked.

      @raise Sys_error if [out] cannot be opened for writing. *)
  let write_csv_manifest ~normal ~tumor ~vcfs out =
    let module F = Biokepi.EDSL.Compile.To_workflow.File_type_specification in
    let csv =
      let header = ["name"; "path"] in
      let vcfs =
        List.map ~f:(fun (name, n) -> [name; (F.get_vcf n)#product#path]) vcfs
      in
      [
        header;
        ["normal"; (F.get_bam normal)#product#path];
        ["tumor"; (F.get_bam tumor)#product#path];
      ] @ vcfs
    in
    (* The rows are fully built before the file is opened, so a failure while
       extracting a path cannot leave an open or truncated file behind. *)
    let outc = Csv.to_channel (open_out out) in
    match Csv.output_all outc csv with
    | () -> Csv.close_out outc
    | exception e ->
      (* Best-effort close: a secondary failure here must not mask the
         original write error, which is re-raised below. *)
      (try Csv.close_out outc with _ -> ());
      raise e
end