Source code for alphabase.constants.atom

import os
import re
import typing
from collections import defaultdict

import numba
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
from alphabase.yaml_utils import load_yaml

MASS_PROTON: float = common_const_dict["MASS_PROTON"]
MASS_ISOTOPE: float = common_const_dict["MASS_ISOTOPE"]

MAX_ISOTOPE_LEN: int = common_const_dict["MAX_ISOTOPE_LEN"]
EMPTY_DIST: np.ndarray = np.zeros(MAX_ISOTOPE_LEN)
EMPTY_DIST[0] = 1


[docs] @numba.njit def truncate_isotope(isotopes: np.ndarray, mono_idx: int) -> tuple: """ For a given isotope distribution (intensity patterns), this function truncates the distribution by top `MAX_ISOTOPE_LEN` neighbors those contain the monoisotopic peak pointed by `mono_idx`. Parameters ---------- isotopes : np.ndarray Isotope patterns with size > `MAX_ISOTOPE_LEN`. mono_idx : int Monoisotopic peak position (index) in the isotope patterns Returns ------- int the new position of `mono_idx` int the start position of the truncated isotopes int the end position of the truncated isotopes """ trunc_start = mono_idx - 1 trunc_end = mono_idx + 1 while ( trunc_start >= 0 and trunc_end < len(isotopes) and (trunc_end - trunc_start - 1) < MAX_ISOTOPE_LEN ): if isotopes[trunc_end] >= isotopes[trunc_start]: trunc_end += 1 else: trunc_start -= 1 if trunc_end - trunc_start - 1 < MAX_ISOTOPE_LEN: if trunc_start == -1: trunc_end = MAX_ISOTOPE_LEN elif trunc_end == len(isotopes): trunc_start = len(isotopes) - MAX_ISOTOPE_LEN - 1 return mono_idx - trunc_start - 1, trunc_start + 1, trunc_end
#: chemical element information in dict defined by `nist_element.yaml` CHEM_INFO_DICT = {} #: {element: mass} CHEM_MONO_MASS = {} #: {element: np.ndarray of abundance distribution} CHEM_ISOTOPE_DIST: numba.typed.Dict = numba.typed.Dict.empty( key_type=numba.types.unicode_type, value_type=numba.types.float64[:] ) #: {element: int (mono position)} CHEM_MONO_IDX: numba.typed.Dict = numba.typed.Dict.empty( key_type=numba.types.unicode_type, value_type=numba.types.int64 ) MASS_H: int = None MASS_C: int = None MASS_O: int = None MASS_N: int = None MASS_H2O: int = None # raise errors if the value is not reset MASS_NH3: int = None
[docs] def update_atom_infos(new_atom_info: typing.Dict): """Update the atom information in CHEM_INFO_DICT. Args: new_atom_info (Dict): A dictionary specifying new isotopic abundances and masses. Examples: Replacing N with 15N .. code-block:: python {"N": {"abundance": [0.01, 0.99]}, {"mass": [14.00307400443, 15.00010889888]}, } """ for atom, info in new_atom_info.items(): CHEM_INFO_DICT[atom] = info reset_elements()
[docs] def reset_elements(): global MASS_C, MASS_H, MASS_O, MASS_N global MASS_H2O, MASS_NH3 for elem, items in CHEM_INFO_DICT.items(): isotopes = np.array(items["abundance"]) masses = np.array(items["mass"]) _sort_idx = np.argsort(masses) masses = masses[_sort_idx] isotopes = isotopes[_sort_idx] _mass_pos = np.round(masses).astype(int) _mass_pos = _mass_pos - _mass_pos[0] if _mass_pos[-1] - _mass_pos[0] + 1 <= MAX_ISOTOPE_LEN: _isos = np.zeros(MAX_ISOTOPE_LEN) _isos[_mass_pos] = isotopes _masses = np.zeros(MAX_ISOTOPE_LEN) _masses[_mass_pos] = masses mono_idx = np.argmax(_isos) CHEM_MONO_MASS[elem] = _masses[mono_idx] CHEM_ISOTOPE_DIST[elem] = _isos CHEM_MONO_IDX[elem] = mono_idx else: _isos = np.zeros(_mass_pos[-1] - _mass_pos[0] + 1) _isos[_mass_pos] = isotopes _masses = np.zeros(_mass_pos[-1] - _mass_pos[0] + 1) _masses[_mass_pos] = masses mono_idx = np.argmax(_isos) CHEM_MONO_MASS[elem] = _masses[mono_idx] _mono_idx, start, end = truncate_isotope(_isos, mono_idx) CHEM_ISOTOPE_DIST[elem] = _isos[start:end] CHEM_MONO_IDX[elem] = _mono_idx MASS_C = CHEM_MONO_MASS["C"] MASS_H = CHEM_MONO_MASS["H"] MASS_N = CHEM_MONO_MASS["N"] MASS_O = CHEM_MONO_MASS["O"] MASS_H2O = CHEM_MONO_MASS["H"] * 2 + CHEM_MONO_MASS["O"] MASS_NH3 = CHEM_MONO_MASS["H"] * 3 + CHEM_MONO_MASS["N"]
[docs] def load_elem_yaml(yaml_file: str): """Load built-in or user-defined element yaml file. Default yaml is: os.path.join(_base_dir, 'nist_element.yaml') """ global CHEM_INFO_DICT global CHEM_MONO_MASS global CHEM_ISOTOPE_DIST global CHEM_MONO_IDX CHEM_INFO_DICT = load_yaml(yaml_file) CHEM_MONO_MASS = {} CHEM_ISOTOPE_DIST = numba.typed.Dict.empty( key_type=numba.types.unicode_type, value_type=numba.types.float64[:] ) CHEM_MONO_IDX = numba.typed.Dict.empty( key_type=numba.types.unicode_type, value_type=numba.types.int64 ) reset_elements()
load_elem_yaml(os.path.join(CONST_FILE_FOLDER, "nist_element.yaml"))
[docs] def parse_formula(formula: str) -> list: """ Given a formula (str, e.g. `H(1)C(2)O(3)`), it generates `[('H', 2), ('C', 2), ('O', 1)]` """ if not formula: return [] items = [item.split("(") for item in formula.strip(")").split(")")] return [(elem, int(n)) for elem, n in items]
[docs] def calc_mass_from_formula(formula: str): """ Calculates the mass of the formula` Parameters ---------- formula : str e.g. `H(1)C(2)O(3)` Returns ------- float mass of the formula """ return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])
[docs] class ChemicalCompositonFormula: """ Initialize the ChemicalCompositonFormula with a given formula. Parameters ---------- formula : str The chemical formula as a string. Returns ------- None """
[docs] def __init__(self, formula=None): self.elements = ( defaultdict(int) if formula is None else self._parse_formula(formula) )
[docs] @classmethod def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula": """ Create a ChemicalCompositonFormula instance from a SMILES string. Parameters ---------- smiles : str The SMILES representation of the molecule. Returns ------- ChemicalCompositonFormula An instance of the class based on the SMILES string. Raises ------ ValueError If the SMILES string is invalid and can't be converted to an RDKit molecule. """ mol = Chem.MolFromSmiles(smiles) if not mol: raise ValueError(f"Invalid RDKit molecule: {smiles}") formula = rdMolDescriptors.CalcMolFormula( mol, separateIsotopes=True, abbreviateHIsotopes=False ) formula = formula.replace("[1H]", "H") return cls._from_rdkit_formula(formula)
@classmethod def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula": """ Create a ChemicalCompositonFormula instance from an RDKit formula. Parameters ---------- formula : str The chemical formula as generated by RDKit. Returns ------- ChemicalCompositonFormula An instance of the class based on the RDKit formula. """ instance = cls.__new__(cls) instance.elements = instance._parse_rdkit_formula(formula) return instance def _parse_formula(self, formula) -> dict: """ Parse a chemical formula string into a dictionary of elements and their counts. Parameters ---------- formula : str The chemical formula to parse. Returns ------- dict A dictionary with elements as keys and their counts as values. """ # Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses # Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')] pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?" matches = re.findall(pattern, formula) element_counts = defaultdict(int) for isotope, element, count in matches: if isotope: element = f"{isotope}{element}" count = int(count) if count else 1 element_counts[element] += count self._validate_atoms(element_counts) return element_counts def _parse_rdkit_formula(self, formula: str) -> dict: """ Parse an RDKit-generated formula string into a dictionary of elements and their counts. Parameters ---------- formula : str The RDKit-generated chemical formula to parse. Returns ------- dict A dictionary with elements as keys and their counts as values. """ # Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count # Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')] pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)" matches = re.findall(pattern, formula) element_counts = defaultdict(int) for match in matches: count = int(match[4]) if match[4] else 1 if match[1]: # noqa: SIM108 # Isotope, see 0th element in the example above element = f"{match[1]}{match[2]}" else: # Regular element, see the rest in the example above element = match[3] element_counts[element] += count self._validate_atoms(element_counts) return element_counts def _validate_atoms(self, element_counts): """ Validate the elements in the formula. Parameters ---------- element_counts : dict The elements and their counts in the formula. Raises ------ ValueError If the formula contains an unknown element. """ for element in element_counts: if element not in CHEM_MONO_MASS: raise ValueError(f"Unknown element: {element}") def __str__(self): """ Return a string representation of the chemical formula. Returns ------- str The chemical formula as a string. """ return "".join( f"{element}({count})" for element, count in sorted(self.elements.items()) if count != 0 ) def __repr__(self): """ Return a string representation of the ChemicalCompositonFormula instance. Returns ------- str A string representation of the instance. """ return f"ChemicalCompositonFormula('{self.__str__()}')" def __add__(self, other): """ Add two ChemicalCompositonFormula instances. Parameters ---------- other : ChemicalCompositonFormula The other instance to add. Returns ------- ChemicalCompositonFormula A new instance representing the sum of the two formulas. """ result = ChemicalCompositonFormula() for element in set(self.elements.keys()) | set(other.elements.keys()): result.elements[element] = self.elements[element] + other.elements[element] return result def __sub__(self, other): """ Subtract one ChemicalCompositonFormula instance from another. Parameters ---------- other : ChemicalCompositonFormula The instance to subtract. Returns ------- ChemicalCompositonFormula A new instance representing the difference of the two formulas. """ result = ChemicalCompositonFormula() for element in set(self.elements.keys()) | set(other.elements.keys()): result.elements[element] = self.elements[element] - other.elements[element] return result