"""
Provides functions and classes related to processing and generating conformers.
"""
import os
from typing import Dict, Tuple, Optional, Any
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, rdDistGeom, rdFMCS, rdMolAlign
from rdkit.ML.Cluster import Butina
from neurosnap.log import logger
[docs]
def find_LCS(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
"""Find the largest common substructure (LCS) between a
set of conformers and aligns all conformers to the LCS.
Parameters:
mol: Input RDkit molecule object, must already have conformers present
Returns:
Resultant molecule object with all conformers aligned to the LCS
Raises:
Exception: if no LCS is detected
"""
logger.info("Finding largest common substructure (LCS) for clustering")
# Convert the molecule to a list of molecules (each conformer treated as a separate molecule)
conformer_mols = []
for conf in mol.GetConformers():
new_mol = Chem.Mol(mol)
new_mol.RemoveAllConformers()
new_mol.AddConformer(conf, assignId=True)
conformer_mols.append(new_mol)
# Perform MCS (Maximum Common Substructure) search
mcs_result = rdFMCS.FindMCS(conformer_mols, completeRingsOnly=True, ringMatchesRingOnly=True)
core = Chem.MolFromSmarts(mcs_result.smartsString)
# Check if a core substructure was found
assert core is not None and core.GetNumAtoms(), "No core substructure detected. Aligning using first conformer."
logger.debug("Core substructure detected. Aligning using core.")
AllChem.AlignMolConformers(mol, mol.GetSubstructMatch(core))
return mol
[docs]
def minimize(mol: Chem.rdchem.Mol, method: str = "MMFF94", percentile: float = 100.0) -> Tuple[float, Dict[int, float]]:
"""Minimize conformer energy (kcal/mol) using RDkit
and filter out conformers based on energy percentile.
Parameters:
mol: RDkit mol object containing the conformers you want to minimize. (rdkit.Chem.rdchem.Mol)
method: Can be either UFF, MMFF94, or MMFF94s (str)
percentile: Filters out conformers above a given energy percentile (0 to 100). For example, 10.0 will retain conformers within the lowest 10% energy. (float)
Returns:
A tuple of the form ``(mol_filtered, energies)``
- ``mol_filtered``: Molecule object with filtered conformers.
- ``energies``: Dictionary where keys are conformer IDs and values are calculated energies in kcal/mol.
"""
logger.info(f"Minimizing energy of all conformers using {method}")
energies = {} # keys are conformer IDs, values are calculated energies.
for i in range(mol.GetNumConformers()):
if method == "UFF":
AllChem.UFFOptimizeMolecule(mol, confId=i)
ff = AllChem.UFFGetMoleculeForceField(mol, confId=i)
elif method in ["MMFF94", "MMFF94s"]:
AllChem.MMFFOptimizeMolecule(mol, confId=i, mmffVariant=method)
props = AllChem.MMFFGetMoleculeProperties(mol)
if props is None:
logger.warning(f"MMFF properties could not be initialized for molecule: {Chem.MolToSmiles(mol)}")
energies[i] = 0
continue
ff = AllChem.MMFFGetMoleculeForceField(mol, props, confId=i)
else:
raise ValueError(f"Invalid minimization method: {method}")
if ff is None:
logger.warning(f"Force field could not be initialized for conformer {i}, setting value to 0.")
energies[i] = 0
continue
energies[i] = ff.CalcEnergy()
# Determine the energy threshold based on the specified percentile
energy_values = list(energies.values())
energy_threshold = np.percentile(energy_values, percentile)
# Filter out conformers above the energy threshold
mol_filtered = Chem.Mol(mol) # Copy the molecule structure
mol_filtered.RemoveAllConformers() # Remove all conformers from the copy
energies_filtered = {}
num_filtered = 0
for conf_id, energy in energies.items():
if energy <= energy_threshold:
num_filtered += 1
conformer = mol.GetConformer(conf_id)
new_conf_id = mol_filtered.AddConformer(conformer, assignId=True) # Add the filtered conformer to the new molecule
energies_filtered[new_conf_id] = energy
logger.info(f"Filtered {num_filtered:,}/{len(energy_values):,} conformers within the lowest {percentile}% of energies.")
logger.info(f"min: {min(energy_values):.2f}, max: {max(energy_values):.2f}, avg: {sum(energy_values)/len(energy_values):.2f}")
return mol_filtered, energies_filtered
[docs]
def generate(
input_mol: Any,
output_name: str = "unique_conformers",
write_multi: bool = False,
num_confs: int = 1000,
min_method: Optional[str] = "auto",
max_atoms: int = 500,
) -> pd.DataFrame:
"""Generate conformers for an input molecule.
Performs the following actions in order:
1. Generate conformers using ETKDG method
2. Minimize energy of all conformers and remove those below a dynamic threshold
3. Align & create RMSD matrix of all conformers
4. Clusters using Butina method to remove structurally redundant conformers
5. Return most energetically favorable conformers in each cluster
Parameters:
input_mol: Input molecule can be a path to a molecule file, a SMILES string, or an instance of rdkit.Chem.rdchem.Mol
output_name: Output to write SDF files of passing conformers
write_multi: If True will write all unique conformers to a single SDF file, if False will write all unique conformers in separate SDF files in output_name
num_confs: Number of conformers to generate
min_method: Method for minimization, can be either "auto", "UFF", "MMFF94", "MMFF94s", or None for no minimization
max_atoms: Maximum number of atoms allowed for the input molecule
Returns:
A dataframe with all conformer statistics. Note if energy minimization is disabled or fails then energy column will consist of None values.
"""
### parse input and construct corresponding RDkit mol object
my_mol = None
if isinstance(input_mol, rdkit.Chem.rdchem.Mol):
my_mol = input_mol
if isinstance(input_mol, str):
if input_mol.endswith(".pdb"):
my_mol = Chem.MolFromPDBFile(input_mol)
elif input_mol.endswith(".sdf"):
for mol in Chem.SDMolSupplier(input_mol): # get first valid molecule if exists
if mol is not None:
my_mol = mol
break
assert my_mol is not None, "Unable to find a valid molecule in the provided SDF file."
else: # assume smiles
my_mol = Chem.MolFromSmiles(input_mol)
assert my_mol is not None, f"Invalid smiles string provided {input_mol}"
else:
raise ValueError(
"Invalid input type for input_mol. Input molecule can be a path to a molecule file, a SMILES string, or an instance of rdkit.Chem.rdchem.Mol"
)
# Add hydrogens to molecule to generate a more accurate 3D structure
my_mol = Chem.AddHs(my_mol)
# Check if the number of atoms exceeds max_atoms
if my_mol.GetNumAtoms() > max_atoms:
raise ValueError(
f"Input molecule exceeds the maximum allowed atoms ({max_atoms}). It has {my_mol.GetNumAtoms()} atoms. For proteins try using a different service such as the AFcluster and AlphaFlow tools on Neurosnap."
)
### Generate 3D conformers
logger.debug(f"Generating {num_confs:,} 3D conformers.")
params = AllChem.ETKDGv3()
params.numThreads = 0 # Will use all available CPU cores
params.forceTol = 0.0001 # Indirectly increases embedding attempts
params.useRandomCoords = True # Fallback to random coordinates if necessary
params.pruneRmsThresh = 0.1 # Allow some deviation to increase conformer count
rdDistGeom.EmbedMultipleConfs(my_mol, numConfs=num_confs, params=params) # NOTE: maxAttempts=100 was removed as does not work with ETKDGv3
logger.info(f"Generated {my_mol.GetNumConformers():,} 3D conformers.")
### Minimize energy of each conformer
energies = {}
if min_method:
# if min_method is set to auto then try a variety of methods until one succeeds
if min_method == "auto":
for method in ["MMFF94", "UFF", "MMFF94s"]:
try:
my_mol, energies = minimize(my_mol, method=method)
except:
pass
else:
my_mol, energies = minimize(my_mol, method=min_method)
### Clustering
## Calculate the RMSD matrix between all pairs of conformers
dists = []
num_conformers = my_mol.GetNumConformers()
my_mol_nh = Chem.RemoveHs(my_mol) # remove hydrogens as it interferes with following steps
for i in range(num_conformers):
for j in range(i):
dists.append(rdMolAlign.GetBestRMS(my_mol_nh, my_mol_nh, i, j))
## Perform clustering using Butina approach
# Calculate the dynamic threshold based on molecule size (distance threshold in Å for Butina)
num_heavy_atoms = my_mol.GetNumHeavyAtoms()
c = 0.2 # Constant, can be adjusted based on needs
threshold = c * np.sqrt(num_heavy_atoms)
logger.debug(f"Dynamic RMSD threshold: {threshold:.2f} Å")
# perform clustering
logger.debug(f"Clustering {num_conformers:,} conformers.")
clusters = Butina.ClusterData(dists, num_conformers, threshold, isDistData=True, reordering=True)
# get most favorable representatives for each cluster using calculated energy
output = {
"conformer_id": [],
"cluster_id": [],
"energy": [],
"cluster_size": [],
}
for i, cluster in enumerate(clusters, start=1):
best_energy = float("inf")
best_cid = None
if energies:
for cid in cluster:
if energies[cid] < best_energy:
best_cid = cid
best_energy = energies[cid]
else: # if no min_method / energies available just take the first element in the cluster and set energy to None
best_cid = cluster[0]
best_energy = None
logger.debug(f"Cluster ID: {i}, Best Conformer: {best_cid} ({best_energy:.2f}), Conformers {cluster}")
output["conformer_id"].append(best_cid)
output["cluster_id"].append(i)
output["energy"].append(best_energy)
output["cluster_size"].append(len(cluster))
df = pd.DataFrame(output)
logger.info(f"Selected {len(df)} unique conformers.")
print(df)
## Write output
if write_multi: # write to single SDF file
with Chem.SDWriter(f"{output_name}.sdf") as w:
for conf_id in output["conformer_id"]:
w.write(my_mol, confId=conf_id)
else: # write to single separate SDF files
os.makedirs(output_name, exist_ok=True)
for conf_id in output["conformer_id"]:
with Chem.SDWriter(os.path.join(output_name, f"conformer_{conf_id}.sdf")) as w:
w.write(my_mol, confId=conf_id)
return df