Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pysmiles/read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""

import enum
import re
import logging

import networkx as nx
Expand Down Expand Up @@ -244,10 +245,14 @@ def read_smiles(smiles, explicit_hydrogen=False, zero_order_bonds=True,
information.
Edges will have an 'order'.
"""
# sanitize invalid SMILES input that is accepted by RDKit
# see: https://github.com/gruenewald-lab/CGsmiles/issues/70#issuecomment-4750353505
pattern = r'(\(=[A-Z]\))(\d)'
mod_smiles = re.sub(pattern, r'\2\1', smiles)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose one could argue that this is the beginning of a slippery slope to shim around more and more pathological SMILES (RDKit can permute at least a dozen variants of the SMILES inputs used in this PR and still handles them just fine).

It might be useful to clarify why RDKit isn't used in this ecosystem. Is it just the "weight" of the dependency?

bond_to_order = {'-': 1, '=': 2, '#': 3, '$': 4, ':': 1.5, '.': 0}
default_bond = 1
default_aromatic_bond = 1.5
mol, ez_isomer_atoms, ring_bonds = base_smiles_parser(smiles, strict=strict,
mol, ez_isomer_atoms, ring_bonds = base_smiles_parser(mod_smiles, strict=strict,
node_attr='_atom_str', edge_attr='_bond_str')
for node in mol:
mol.nodes[node].update(parse_atom(mol.nodes[node]['_atom_str']))
Expand Down
22 changes: 22 additions & 0 deletions tests/test_read_smiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,3 +1022,25 @@ def test_aromatic_molecules(smiles):
"""These molecules are totally aromatic"""
mol = read_smiles(smiles, reinterpret_aromatic=True)
assert all(nx.get_node_attributes(mol, 'aromatic').values())


@pytest.mark.parametrize("smiles", [
# these are both interpreted as the same
# molecule in RDKit
"C(c1c2cccc3c2c(cc1)C(=C)C(=C)C(=C)3)",

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other pathological example from the cross-listed issue is far harder to shim around. My one hesitation is the danger of reinventing the wheel given the versatility RDKit has with SMILES canonicalization/permutations.

"C(c1c2cccc3c2c(cc1)C(=C)C(=C)C3(=C))"

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for reference:

from rdkit import Chem
smiles_1 = "C(c1c2cccc3c2c(cc1)C(=C)C(=C)C(=C)3)" 
smiles_2 = "C(c1c2cccc3c2c(cc1)C(=C)C(=C)C3(=C))" 
mol_1 = Chem.MolFromSmiles(smiles_1)
mol_2 = Chem.MolFromSmiles(smiles_2)
assert Chem.MolToSmiles(mol_1) == Chem.MolToSmiles(mol_2)

Note that neither of those original SMILES strings matches what RDKit considers "canonical." I'm assuming the OpenSMILES specification therefore differs from what RDKit considers standard/canonical.

])
def test_non_canonical_smiles_handling(smiles):
    """Parse a parametrized SMILES string and compare it to a fixed graph.

    Harmonizing SMILES input handling to better match RDKit can help
    accommodate SMILES strings provided by chemists, per:
    https://github.com/gruenewald-lab/CGsmiles/issues/70
    """
    parsed = read_smiles(smiles)
    # Reference node and edge lists are taken from the "good" string at:
    # https://github.com/gruenewald-lab/CGsmiles/issues/70#issuecomment-4750353505
    reference_nodes = list(range(17))
    reference_edges = [
        (0, 1), (1, 2), (1, 10), (2, 3), (2, 7), (3, 4), (4, 5),
        (5, 6), (6, 7), (6, 15), (7, 8), (8, 9), (8, 11), (9, 10),
        (11, 12), (11, 13), (13, 14), (13, 15), (15, 16),
    ]
    assert list(parsed.nodes) == reference_nodes
    assert list(parsed.edges) == reference_edges
Loading