cwl-utils
cwl-utils copied to clipboard
Proposal for a Function to Generate JSON Schema for Workflow Parameter Files
Hi CWL community,
I'm reaching out to share some thoughts that emerged from a recent discussion within the Japan community (@inutano, @tom-tan) regarding the development of JSON schemas for workflow parameter files.
Specifically, we're looking at creating JSON schemas that correspond to the YAML templates generated by `cwltool --make-template`.
While the templates created by `cwltool --make-template` are incredibly useful, I believe that a JSON schema would be more suitable for generating forms for expected workflow inputs and representing workflow parameters in Workflow Execution Services (WES). (Ref.: nf-core - rnaseq - schema_input.json)
To address this, I have drafted a preliminary Python function snippet:
from json import dumps
from typing import Any
from cwl_utils.parser import load_document_by_uri, save
def parse_inputs(cwl_url: str) -> Any:
    """
    Fetch a CWL document and return the value stored under its ``inputs`` key.

    The document is loaded with ``load_document_by_uri`` and then converted
    with ``save``; the lookup is performed on the result of ``save``.

    Args:
        cwl_url: Location of the CWL document, passed to ``load_document_by_uri``.

    Returns:
        Whatever the saved document holds under ``"inputs"``.

    Raises:
        ValueError: If the saved document has no ``"inputs"`` entry.
    """
    document = save(load_document_by_uri(cwl_url))
    if "inputs" in document:
        return document["inputs"]
    raise ValueError("Inputs are missing in the provided object.")
def inputs_to_jsonschema(inputs: Any) -> Any:
    """
    Build a draft-07 JSON Schema object describing a CWL ``inputs`` list.

    Every input becomes one entry under ``properties``, keyed by its ``id``.
    An input is listed under ``required`` when it has no ``default`` and
    ``"null"`` is not found in its ``type``.

    Args:
        inputs: Iterable of CWL input parameter mappings, each with at least
            ``id`` and ``type``.

    Returns:
        The schema as a plain dictionary; properties not declared by an input
        are disallowed (``additionalProperties`` is ``False``).

    Raises:
        ValueError: If an input lacks ``id`` or ``type``.
    """
    properties = {}
    required = []

    # Refer to https://www.commonwl.org/v1.2/Workflow.html#WorkflowInputParameter for more details
    for parameter in inputs:
        name = parameter.get("id")
        cwl_type = parameter.get("type")
        if name is None or cwl_type is None:
            raise ValueError(
                "Each item in the 'inputs' object must include 'id' and 'type' fields.")

        entry = _input_type_to_property_schema(cwl_type)

        # TODO: "secondaryFiles" is deliberately ignored for now; it does not
        # seem to affect `cwltool --make-template` either. For example, refer to
        # $ cwltool --make-template https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/stage-array.cwl

        has_default = "default" in parameter
        if has_default:
            entry["default"] = parameter["default"]
        properties[name] = entry

        # Inputs with a default, or whose type mentions "null", may be omitted.
        if not has_default and "null" not in cwl_type:
            required.append(name)

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": False,
    }
def _input_type_to_property_schema(input_type: Any) -> Any:
if isinstance(input_type, dict):
nested_type = input_type.get("type")
if nested_type is None:
raise ValueError("The 'inputs.[].type' nested type object must contain a 'type' field.")
if nested_type == "enum":
enum = input_type.get("symbols")
if enum is None:
raise ValueError("The 'inputs.[].type' nested type object must contain a 'symbols' field.")
return {
"type": "string",
"enum": enum,
}
elif nested_type == "record":
schema = {
"type": "object",
"properties": {},
"required": [],
"additionalProperties": False,
}
fields = input_type.get("fields")
if fields is None:
raise ValueError("The 'inputs.[].type' nested type object must contain a 'fields' field.")
for field in fields:
field_name = field.get("name")
field_type = field.get("type")
if field_name is None or field_type is None:
raise ValueError("The 'inputs.[].type.[].fields' object must contain 'name' and 'type' fields.")
field_id = field_name.split("#")[-1].split("/")[-1]
schema["properties"][field_id] = _input_type_to_property_schema(field_type) # type: ignore
if "default" not in field:
schema["required"].append(field_id)
return schema
elif nested_type == "array":
item_type = input_type.get("items")
if item_type is None:
raise ValueError("If 'inputs.[].type.type' is 'array', 'inputs.[].type' must contain an 'items' field.")
return {
"type": "array",
"items": _input_type_to_property_schema(item_type),
"additionalItems": False
}
else:
raise ValueError(f"Unexpected type encountered: {input_type}.")
elif isinstance(input_type, list):
if len(input_type) != 2 or "null" not in input_type:
raise ValueError(f"Unexpected type encountered: {input_type}.")
original_type = [t for t in input_type if t != "null"][0]
schema = _input_type_to_property_schema(original_type)
schema["nullable"] = True
return schema
else:
if input_type == "File":
return {
"type": "object",
"properties": {
"class": {"type": "string", "const": "File"},
"path": {"type": "string"},
"location": {"type": "string"}
},
"required": ["class"],
"oneOf": [
{"required": ["path"]},
{"required": ["location"]}
],
"additionalProperties": False,
}
elif input_type == "Directory":
return {
"type": "object",
"properties": {
"class": {"type": "string", "const": "Directory"},
"path": {"type": "string"},
"location": {"type": "string"}
},
"required": ["class"],
"oneOf": [
{"required": ["path"]},
{"required": ["location"]}
],
"additionalProperties": False,
}
elif input_type == "Any":
return {
"anyOf": [
{"type": "boolean"},
{"type": "integer"},
{"type": "number"},
{"type": "string"},
{"type": "array"},
{"type": "object"}
]
}
elif input_type == "null":
return {"type": "null"}
else:
if input_type in ["long", "float", "double"]:
return {"type": "number"}
elif input_type == "int":
return {"type": "integer"}
else:
return {"type": input_type}
def validate_jsonschema_itself(jsonschema: Any) -> None:
    """
    Check that the given schema object is itself a valid schema.

    The validator class is looked up with ``validator_for`` from the
    ``jsonschema`` package and its ``check_schema`` is called on the same
    object; any exception raised by ``check_schema`` propagates to the caller.

    Args:
        jsonschema: The schema dictionary to check.
    """
    # Imported lazily so the package is only needed when validation is requested.
    from jsonschema.validators import validator_for

    validator_for(jsonschema).check_schema(jsonschema)
def main() -> None:
    """
    Convert a fixed set of public CWL documents and print the results.

    For each URL the parsed inputs and the generated, self-validated JSON
    Schema are printed. A failure on one URL is reported with its traceback
    and does not stop the remaining URLs from being processed.
    """
    import traceback

    sample_urls = (
        # Sapporo example workflow.
        "https://raw.githubusercontent.com/sapporo-wes/sapporo-service/main/tests/resources/cwltool/trimming_and_qc.cwl",
        # When the definition itself is a nasty case.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/echo-tool-packed.cwl",
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/revsort-packed.cwl",
        # When the type is nasty.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/anon_enum_inside_array.cwl",
        # The number of parameters is a little large, and the definition itself is a straightforward case.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/bwa-mem-tool.cwl",
        # The case where CommandInputParameter is shortened (e.g., param: string)
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/env-tool1.cwl",
        # No input parameters
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/envvar3.cwl",
        # Any
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/params.cwl",
        # Dir
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/dir.cwl",
        # SecondaryFiles
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/secondaryfiles/rename-inputs.cwl",
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/stage-array.cwl",
    )

    def report(url: str) -> None:
        # Print the inputs and the derived schema for one document.
        print(f"{'-' * 3} Test URL: {url} {'-' * 10}")
        print("\n")

        cwl_inputs = parse_inputs(url)
        print("Inputs object: \n")
        print(dumps(cwl_inputs, indent=2))
        print("\n")

        print("JSON Schema: \n")
        schema = inputs_to_jsonschema(cwl_inputs)
        validate_jsonschema_itself(schema)
        print(dumps(schema, indent=2))
        print("\n")

    for url in sample_urls:
        try:
            report(url)
        except Exception as error:
            # Demo driver: report the failure and move on to the next URL.
            print(f"Failed to parse: {url}")
            print(error)
            traceback.print_exc()
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
This function is capable of generating a JSON schema like the following example (https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/bwa-mem-tool.cwl):
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"reference": {
"type": "object",
"properties": {
"class": {
"type": "string",
"const": "File"
},
"path": {
"type": "string"
},
"location": {
"type": "string"
}
},
"required": [
"class"
],
"oneOf": [
{
"required": [
"path"
]
},
{
"required": [
"location"
]
}
],
"additionalProperties": false
},
"reads": {
"type": "array",
"items": {
"type": "object",
"properties": {
"class": {
"type": "string",
"const": "File"
},
"path": {
"type": "string"
},
"location": {
"type": "string"
}
},
"required": [
"class"
],
"oneOf": [
{
"required": [
"path"
]
},
{
"required": [
"location"
]
}
],
"additionalProperties": false
},
"additionalItems": false
},
"minimum_seed_length": {
"type": "integer"
},
"min_std_max_min": {
"type": "array",
"items": {
"type": "integer"
},
"additionalItems": false
},
"args.py": {
"type": "object",
"properties": {
"class": {
"type": "string",
"const": "File"
},
"path": {
"type": "string"
},
"location": {
"type": "string"
}
},
"required": [
"class"
],
"oneOf": [
{
"required": [
"path"
]
},
{
"required": [
"location"
]
}
],
"additionalProperties": false,
"default": {
"class": "File",
"location": "args.py"
}
}
},
"required": [
"reference",
"reads",
"minimum_seed_length",
"min_std_max_min"
],
"additionalProperties": false
}
I am aware that there may be deficiencies, such as a lack of comprehensive test cases. Therefore, I am eager to receive feedback on this implementation approach and any other suggestions you may have.
A first comment is: why are you reverting a typed CWL object into an untyped Python dictionary using save? Wouldn't it be safer and more reliable to rely directly on CWL Python objects?
Thank you for your comment.
I somewhat understand that the inputs object is a cwl.InputParameter. However, considering the load_document_by_uri interface, which is defined as follows:
def load_document_by_uri(
path: Union[str, Path],
loadingOptions: Optional[LoadingOptions] = None,
load_all: bool = False,
) -> Any:
It returns Any, which makes me wonder if it's difficult to implement something using a safer and more reliable type. I know I could use casting, but that seems counterproductive.
If there's a better approach, I would appreciate your guidance.
Thank you @suecharo ! Yes, this would be useful. Could you please open a PR? Then we all can work on it together there.