QCEngine icon indicating copy to clipboard operation
QCEngine copied to clipboard

Bond orders for atomatic systems in OpenFF

Open awvwgk opened this issue 3 years ago • 1 comments

Describe the bug

Running the openff harness with fractional bond orders to indicate aromaticity yields the following error message:

QCEngine Execution Error:
Traceback (most recent call last):
  File "/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/util.py", line 114, in compute_wrapper
    yield metadata
  File "/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/compute.py", line 91, in compute
    output_data = executor.compute(input_data, config)
  File "/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/programs/openmm.py", line 266, in compute
    rdkit_mol = RDKitHarness._process_molecule_rdkit(input_model.molecule)
  File "/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/programs/rdkit.py", line 56, in _process_molecule_rdkit
    rw_mol.AddBond(atom1, atom2, bond_types[bo])
KeyError: 1.5

To Reproduce

JSON input and output provided by @benbaed

input.json
{
  "driver": "energy",
  "model": { "method": "openff-1.0.0", "basis": "smirnoff" },
  "molecule": {
    "schema_version": 2,
    "schema_name": "qcschema_molecule",
    "provenance": {
      "creator": "mctc-lib",
      "version": "0.3.0",
      "routine": "mctc_io_write_qcschema::write_qcschema"
    },
    "symbols": ["C", "C", "C", "C", "C", "C", "H", "H", "H", "H", "H", "H"],
    "atomic_numbers": [6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1],
    "geometry": [
      -1.9357787502774407e1, -2.5522641039123668, -9.9588566767497208e-2,
      -1.9522760593453768e1, -1.2147159529060192, 2.1501303845931368,
      -1.9534098950201493e1, 1.4078459628422282, 2.1164932595748933,
      -1.9379897298432464e1, 2.6887023301099626, -1.668628168039849e-1,
      -1.9199050508306289e1, 1.3509652064911528, -2.415447932489847,
      -1.9188468042008413e1, -1.2712187640321702, -2.3819997800840649,
      -1.9646915599841332e1, -2.2141921002177698, 3.9266619143488133,
      -1.9667324641987236e1, 2.4513527288576351, 3.8667575961983447,
      -1.9393125381304806e1, 4.7322521612745065, -1.9388590038605719e-1,
      -1.9078108036330583e1, 2.3508192990278278, -4.1929243253078328,
      -1.9060533583371608e1, -2.3147255300475775, -4.1335869249947512,
      -1.9353441132687781e1, -4.5960029076893738, -7.3888291472659226e-2
    ],
    "molecular_charge": 0,
    "connectivity": [
      [0, 11, 1],
      [0, 1, 1.5],
      [1, 6, 1],
      [2, 1, 1.5],
      [2, 7, 1],
      [3, 2, 1.5],
      [4, 5, 1.5],
      [4, 3, 1.5],
      [5, 0, 1.5],
      [8, 3, 1],
      [9, 4, 1],
      [10, 5, 1]
    ]
  }
}
error.json
{
  "id": null,
  "input_data": {
    "driver": "energy",
    "model": { "method": "openff-1.0.0", "basis": "smirnoff" },
    "molecule": {
      "schema_version": 2,
      "schema_name": "qcschema_molecule",
      "provenance": {
        "creator": "mctc-lib",
        "version": "0.3.0",
        "routine": "mctc_io_write_qcschema::write_qcschema"
      },
      "symbols": ["C", "C", "C", "C", "C", "C", "H", "H", "H", "H", "H", "H"],
      "atomic_numbers": [6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1],
      "geometry": [
        -19.357787502774407, -2.552264103912367, -0.09958856676749721,
        -19.522760593453768, -1.2147159529060192, 2.1501303845931368,
        -19.534098950201493, 1.4078459628422282, 2.1164932595748933,
        -19.379897298432464, 2.6887023301099626, -0.1668628168039849,
        -19.19905050830629, 1.3509652064911528, -2.415447932489847,
        -19.188468042008413, -1.2712187640321702, -2.381999780084065,
        -19.646915599841332, -2.21419210021777, 3.9266619143488133,
        -19.667324641987236, 2.451352728857635, 3.8667575961983447,
        -19.393125381304806, 4.7322521612745065, -0.1938859003860572,
        -19.078108036330583, 2.350819299027828, -4.192924325307833,
        -19.06053358337161, -2.3147255300475775, -4.133586924994751,
        -19.35344113268778, -4.596002907689374, -0.07388829147265923
      ],
      "molecular_charge": 0,
      "connectivity": [
        [0, 11, 1],
        [0, 1, 1.5],
        [1, 6, 1],
        [2, 1, 1.5],
        [2, 7, 1],
        [3, 2, 1.5],
        [4, 5, 1.5],
        [4, 3, 1.5],
        [5, 0, 1.5],
        [8, 3, 1],
        [9, 4, 1],
        [10, 5, 1]
      ]
    },
    "provenance": {
      "cpu": "Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz",
      "hostname": "kekule",
      "username": "baedorf",
      "qcengine_version": "v0.23.0",
      "wall_time": 1.3736984729766846,
      "creator": "QCEngine",
      "version": "v0.23.0"
    }
  },
  "success": false,
  "error": {
    "error_type": "unknown_error",
    "error_message": "QCEngine Execution Error:\nTraceback (most recent call last):\n  File \"/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/util.py\", line 114, in compute_wrapper\n    yield metadata\n  File \"/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/compute.py\", line 91, in compute\n    output_data = executor.compute(input_data, config)\n  File \"/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/programs/openmm.py\", line 266, in compute\n    rdkit_mol = RDKitHarness._process_molecule_rdkit(input_model.molecule)\n  File \"/home/baedorf/mambaforge/envs/openff/lib/python3.9/site-packages/qcengine/programs/rdkit.py\", line 56, in _process_molecule_rdkit\n    rw_mol.AddBond(atom1, atom2, bond_types[bo])\nKeyError: 1.5\n",
    "extras": null
  },
  "extras": null
}

Expected behavior

Allow specification of fractional bond orders for aromaticity with OpenFF.

Additional context

cc @mattwthompson, @j-wags, @dotsdl

awvwgk avatar May 24 '22 14:05 awvwgk

Hi @awvwgk,

TL;DR

The connectivity field isn't sufficient for OpenFF to understand the molecular graph. Use the canonical_isomeric_explicit_hydrogen_mapped_smiles key in the extras field instead.

Detailed description

OpenFF molecules are inherently a "cheminformatics graph" representation. That is, they are graphs first and foremost (atoms have elements, formal charge, and possibly stereo, and bonds have order and possibly stereo), and secondarily there may be coordinates attached. In cases where an OpenFF Molecule is being created from another representation, we need to be able to unambiguously map that representation into a chemical graph with the information content above.

There are two kinda-coupled problems in this workflow. In decreasing order of fundamental-ness:

  • The input representation doesn't have formal charges on the atoms. As far as I know there are cases where it's not trivial to deduce these (like, P, S, or N atoms that could have multiple valence states). I've heard that XYZ2MOL can attempt this but we haven't benchmarked its reliability.
  • There are multiple aromaticity models that disagree with each other. OpenFF force fields use a widely-supported aromaticity model with an open specification (the MDL aromaticity model). So a bond order of 1.5 isn't a safe input without knowing which aromaticity model it came from. This is the cause of the error message that you're getting - this code expects all bond orders to be integer. (actually this whole code path where QCEngine makes an OFFMol without knowing atomic formal charges smells funny to me, I'll be looking closer at this)

To get this workflow working: If the full "cheminformatics graph" representation of the molecule is available, it's possible to set the input's canonical_isomeric_explicit_hydrogen_mapped_smiles in the extras field. OpenFF provides a few pathways in the Toolkit and QCSubmit to handle this transition between cheminformatics-land and QC-land. For example, a MOL file of benzene can be safely converted to a QCSchema Molecule as follows:

from openff.toolkit.topology import Molecule
benzene = Molecule.from_file('benzene.mol')
print(benzene.to_qcschema().json())

yields

{"schema_name": "qcschema_molecule", "schema_version": 2, "validated": true, "symbols": ["C", "C", "C", "C", "C", "C", "H", "H", "H", "H", "H", "H"], "geometry": [3.56232272, -1.95832318, -0.21240522, 5.51989001, -3.08195434, 1.14914246, 5.4391987, -3.12182756, 3.78360965, 3.40094011, -2.0382586, 5.05634019, 1.44337281, -0.91481642, 3.69460355, 1.52425309, -0.8749432, 1.06013636, 3.62336087, -1.92808756, -2.26351395, 7.10782687, -3.92477219, 0.15892597, 6.96326282, -3.99677075, 4.84355703, 3.3382012, -2.07019497, 7.10801585, -0.14361919, -0.07124267, 4.68482004, 0.00075589, 0.00056692, -0.00056692], "name": "C6H6", "molecular_charge": 0.0, "molecular_multiplicity": 1, "connectivity": [[0, 5, 2.0], [0, 1, 1.0], [0, 6, 1.0], [1, 2, 2.0], [1, 7, 1.0], [2, 3, 1.0], [2, 8, 1.0], [3, 4, 2.0], [3, 9, 1.0], [4, 5, 1.0], [4, 10, 1.0], [5, 11, 1.0]], "fix_com": false, "fix_orientation": false, "provenance": {"creator": "QCElemental", "version": "v0.24.0", "routine": "qcelemental.molparse.from_schema"}, "extras": {"canonical_isomeric_explicit_hydrogen_mapped_smiles": "[H:9][c:3]1[c:2]([c:1]([c:6]([c:5]([c:4]1[H:10])[H:11])[H:12])[H:7])[H:8]"}}

j-wags avatar May 24 '22 19:05 j-wags