Source code for neurosnap.conformers

"""
Provides functions and classes related to processing and generating conformers.
"""

import os
from typing import Dict, Tuple, Optional, Any

import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, rdDistGeom, rdFMCS, rdMolAlign
from rdkit.ML.Cluster import Butina

from neurosnap.log import logger


[docs] def find_LCS(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol: """Find the largest common substructure (LCS) between a set of conformers and aligns all conformers to the LCS. Parameters: mol: Input RDkit molecule object, must already have conformers present Returns: Resultant molecule object with all conformers aligned to the LCS Raises: Exception: if no LCS is detected """ logger.info("Finding largest common substructure (LCS) for clustering") # Convert the molecule to a list of molecules (each conformer treated as a separate molecule) conformer_mols = [] for conf in mol.GetConformers(): new_mol = Chem.Mol(mol) new_mol.RemoveAllConformers() new_mol.AddConformer(conf, assignId=True) conformer_mols.append(new_mol) # Perform MCS (Maximum Common Substructure) search mcs_result = rdFMCS.FindMCS(conformer_mols, completeRingsOnly=True, ringMatchesRingOnly=True) core = Chem.MolFromSmarts(mcs_result.smartsString) # Check if a core substructure was found assert core is not None and core.GetNumAtoms(), "No core substructure detected. Aligning using first conformer." logger.debug("Core substructure detected. Aligning using core.") AllChem.AlignMolConformers(mol, mol.GetSubstructMatch(core)) return mol
[docs] def minimize(mol: Chem.rdchem.Mol, method: str = "MMFF94", percentile: float = 100.0) -> Tuple[float, Dict[int, float]]: """Minimize conformer energy (kcal/mol) using RDkit and filter out conformers based on energy percentile. Parameters: mol: RDkit mol object containing the conformers you want to minimize. (rdkit.Chem.rdchem.Mol) method: Can be either UFF, MMFF94, or MMFF94s (str) percentile: Filters out conformers above a given energy percentile (0 to 100). For example, 10.0 will retain conformers within the lowest 10% energy. (float) Returns: A tuple of the form ``(mol_filtered, energies)`` - ``mol_filtered``: Molecule object with filtered conformers. - ``energies``: Dictionary where keys are conformer IDs and values are calculated energies in kcal/mol. """ logger.info(f"Minimizing energy of all conformers using {method}") energies = {} # keys are conformer IDs, values are calculated energies. for i in range(mol.GetNumConformers()): if method == "UFF": AllChem.UFFOptimizeMolecule(mol, confId=i) ff = AllChem.UFFGetMoleculeForceField(mol, confId=i) elif method in ["MMFF94", "MMFF94s"]: AllChem.MMFFOptimizeMolecule(mol, confId=i, mmffVariant=method) props = AllChem.MMFFGetMoleculeProperties(mol) if props is None: logger.warning(f"MMFF properties could not be initialized for molecule: {Chem.MolToSmiles(mol)}") energies[i] = 0 continue ff = AllChem.MMFFGetMoleculeForceField(mol, props, confId=i) else: raise ValueError(f"Invalid minimization method: {method}") if ff is None: logger.warning(f"Force field could not be initialized for conformer {i}, setting value to 0.") energies[i] = 0 continue energies[i] = ff.CalcEnergy() # Determine the energy threshold based on the specified percentile energy_values = list(energies.values()) energy_threshold = np.percentile(energy_values, percentile) # Filter out conformers above the energy threshold mol_filtered = Chem.Mol(mol) # Copy the molecule structure mol_filtered.RemoveAllConformers() # Remove all conformers from the copy energies_filtered = {} num_filtered = 0 for conf_id, energy in energies.items(): if energy <= energy_threshold: num_filtered += 1 conformer = mol.GetConformer(conf_id) new_conf_id = mol_filtered.AddConformer(conformer, assignId=True) # Add the filtered conformer to the new molecule energies_filtered[new_conf_id] = energy logger.info(f"Filtered {num_filtered:,}/{len(energy_values):,} conformers within the lowest {percentile}% of energies.") logger.info(f"min: {min(energy_values):.2f}, max: {max(energy_values):.2f}, avg: {sum(energy_values)/len(energy_values):.2f}") return mol_filtered, energies_filtered
[docs] def generate( input_mol: Any, output_name: str = "unique_conformers", write_multi: bool = False, num_confs: int = 1000, min_method: Optional[str] = "auto", max_atoms: int = 500, ) -> pd.DataFrame: """Generate conformers for an input molecule. Performs the following actions in order: 1. Generate conformers using ETKDG method 2. Minimize energy of all conformers and remove those below a dynamic threshold 3. Align & create RMSD matrix of all conformers 4. Clusters using Butina method to remove structurally redundant conformers 5. Return most energetically favorable conformers in each cluster Parameters: input_mol: Input molecule can be a path to a molecule file, a SMILES string, or an instance of rdkit.Chem.rdchem.Mol output_name: Output to write SDF files of passing conformers write_multi: If True will write all unique conformers to a single SDF file, if False will write all unique conformers in separate SDF files in output_name num_confs: Number of conformers to generate min_method: Method for minimization, can be either "auto", "UFF", "MMFF94", "MMFF94s", or None for no minimization max_atoms: Maximum number of atoms allowed for the input molecule Returns: A dataframe with all conformer statistics. Note if energy minimization is disabled or fails then energy column will consist of None values. """ ### parse input and construct corresponding RDkit mol object my_mol = None if isinstance(input_mol, rdkit.Chem.rdchem.Mol): my_mol = input_mol if isinstance(input_mol, str): if input_mol.endswith(".pdb"): my_mol = Chem.MolFromPDBFile(input_mol) elif input_mol.endswith(".sdf"): for mol in Chem.SDMolSupplier(input_mol): # get first valid molecule if exists if mol is not None: my_mol = mol break assert my_mol is not None, "Unable to find a valid molecule in the provided SDF file." else: # assume smiles my_mol = Chem.MolFromSmiles(input_mol) assert my_mol is not None, f"Invalid smiles string provided {input_mol}" else: raise ValueError( "Invalid input type for input_mol. Input molecule can be a path to a molecule file, a SMILES string, or an instance of rdkit.Chem.rdchem.Mol" ) # Add hydrogens to molecule to generate a more accurate 3D structure my_mol = Chem.AddHs(my_mol) # Check if the number of atoms exceeds max_atoms if my_mol.GetNumAtoms() > max_atoms: raise ValueError( f"Input molecule exceeds the maximum allowed atoms ({max_atoms}). It has {my_mol.GetNumAtoms()} atoms. For proteins try using a different service such as the AFcluster and AlphaFlow tools on Neurosnap." ) ### Generate 3D conformers logger.debug(f"Generating {num_confs:,} 3D conformers.") params = AllChem.ETKDGv3() params.numThreads = 0 # Will use all available CPU cores params.forceTol = 0.0001 # Indirectly increases embedding attempts params.useRandomCoords = True # Fallback to random coordinates if necessary params.pruneRmsThresh = 0.1 # Allow some deviation to increase conformer count rdDistGeom.EmbedMultipleConfs(my_mol, numConfs=num_confs, params=params) # NOTE: maxAttempts=100 was removed as does not work with ETKDGv3 logger.info(f"Generated {my_mol.GetNumConformers():,} 3D conformers.") ### Minimize energy of each conformer energies = {} if min_method: # if min_method is set to auto then try a variety of methods until one succeeds if min_method == "auto": for method in ["MMFF94", "UFF", "MMFF94s"]: try: my_mol, energies = minimize(my_mol, method=method) except: pass else: my_mol, energies = minimize(my_mol, method=min_method) ### Clustering ## Calculate the RMSD matrix between all pairs of conformers dists = [] num_conformers = my_mol.GetNumConformers() my_mol_nh = Chem.RemoveHs(my_mol) # remove hydrogens as it interferes with following steps for i in range(num_conformers): for j in range(i): dists.append(rdMolAlign.GetBestRMS(my_mol_nh, my_mol_nh, i, j)) ## Perform clustering using Butina approach # Calculate the dynamic threshold based on molecule size (distance threshold in Å for Butina) num_heavy_atoms = my_mol.GetNumHeavyAtoms() c = 0.2 # Constant, can be adjusted based on needs threshold = c * np.sqrt(num_heavy_atoms) logger.debug(f"Dynamic RMSD threshold: {threshold:.2f} Å") # perform clustering logger.debug(f"Clustering {num_conformers:,} conformers.") clusters = Butina.ClusterData(dists, num_conformers, threshold, isDistData=True, reordering=True) # get most favorable representatives for each cluster using calculated energy output = { "conformer_id": [], "cluster_id": [], "energy": [], "cluster_size": [], } for i, cluster in enumerate(clusters, start=1): best_energy = float("inf") best_cid = None if energies: for cid in cluster: if energies[cid] < best_energy: best_cid = cid best_energy = energies[cid] else: # if no min_method / energies available just take the first element in the cluster and set energy to None best_cid = cluster[0] best_energy = None logger.debug(f"Cluster ID: {i}, Best Conformer: {best_cid} ({best_energy:.2f}), Conformers {cluster}") output["conformer_id"].append(best_cid) output["cluster_id"].append(i) output["energy"].append(best_energy) output["cluster_size"].append(len(cluster)) df = pd.DataFrame(output) logger.info(f"Selected {len(df)} unique conformers.") print(df) ## Write output if write_multi: # write to single SDF file with Chem.SDWriter(f"{output_name}.sdf") as w: for conf_id in output["conformer_id"]: w.write(my_mol, confId=conf_id) else: # write to single separate SDF files os.makedirs(output_name, exist_ok=True) for conf_id in output["conformer_id"]: with Chem.SDWriter(os.path.join(output_name, f"conformer_{conf_id}.sdf")) as w: w.write(my_mol, confId=conf_id) return df