Source code for alphabase.constants.atom
import os
import re
import typing
from collections import defaultdict
import numba
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
from alphabase.yaml_utils import load_yaml
MASS_PROTON: float = common_const_dict["MASS_PROTON"]
MASS_ISOTOPE: float = common_const_dict["MASS_ISOTOPE"]
MAX_ISOTOPE_LEN: int = common_const_dict["MAX_ISOTOPE_LEN"]
EMPTY_DIST: np.ndarray = np.zeros(MAX_ISOTOPE_LEN)
EMPTY_DIST[0] = 1
[docs]
@numba.njit
def truncate_isotope(isotopes: np.ndarray, mono_idx: int) -> tuple:
"""
For a given isotope distribution (intensity patterns),
this function truncates the distribution by top
`MAX_ISOTOPE_LEN` neighbors those contain the monoisotopic
peak pointed by `mono_idx`.
Parameters
----------
isotopes : np.ndarray
Isotope patterns with size > `MAX_ISOTOPE_LEN`.
mono_idx : int
Monoisotopic peak position (index) in the isotope patterns
Returns
-------
int
the new position of `mono_idx`
int
the start position of the truncated isotopes
int
the end position of the truncated isotopes
"""
trunc_start = mono_idx - 1
trunc_end = mono_idx + 1
while (
trunc_start >= 0
and trunc_end < len(isotopes)
and (trunc_end - trunc_start - 1) < MAX_ISOTOPE_LEN
):
if isotopes[trunc_end] >= isotopes[trunc_start]:
trunc_end += 1
else:
trunc_start -= 1
if trunc_end - trunc_start - 1 < MAX_ISOTOPE_LEN:
if trunc_start == -1:
trunc_end = MAX_ISOTOPE_LEN
elif trunc_end == len(isotopes):
trunc_start = len(isotopes) - MAX_ISOTOPE_LEN - 1
return mono_idx - trunc_start - 1, trunc_start + 1, trunc_end
#: chemical element information in dict defined by `nist_element.yaml`
CHEM_INFO_DICT = {}
#: {element: mass}
CHEM_MONO_MASS = {}
#: {element: np.ndarray of abundance distribution}
CHEM_ISOTOPE_DIST: numba.typed.Dict = numba.typed.Dict.empty(
key_type=numba.types.unicode_type, value_type=numba.types.float64[:]
)
#: {element: int (mono position)}
CHEM_MONO_IDX: numba.typed.Dict = numba.typed.Dict.empty(
key_type=numba.types.unicode_type, value_type=numba.types.int64
)
MASS_H: int = None
MASS_C: int = None
MASS_O: int = None
MASS_N: int = None
MASS_H2O: int = None # raise errors if the value is not reset
MASS_NH3: int = None
[docs]
def update_atom_infos(new_atom_info: typing.Dict):
"""Update the atom information in CHEM_INFO_DICT.
Args:
new_atom_info (Dict): A dictionary specifying new isotopic abundances and masses.
Examples:
Replacing N with 15N
.. code-block:: python
{"N":
{"abundance": [0.01, 0.99]},
{"mass": [14.00307400443, 15.00010889888]},
}
"""
for atom, info in new_atom_info.items():
CHEM_INFO_DICT[atom] = info
reset_elements()
[docs]
def reset_elements():
global MASS_C, MASS_H, MASS_O, MASS_N
global MASS_H2O, MASS_NH3
for elem, items in CHEM_INFO_DICT.items():
isotopes = np.array(items["abundance"])
masses = np.array(items["mass"])
_sort_idx = np.argsort(masses)
masses = masses[_sort_idx]
isotopes = isotopes[_sort_idx]
_mass_pos = np.round(masses).astype(int)
_mass_pos = _mass_pos - _mass_pos[0]
if _mass_pos[-1] - _mass_pos[0] + 1 <= MAX_ISOTOPE_LEN:
_isos = np.zeros(MAX_ISOTOPE_LEN)
_isos[_mass_pos] = isotopes
_masses = np.zeros(MAX_ISOTOPE_LEN)
_masses[_mass_pos] = masses
mono_idx = np.argmax(_isos)
CHEM_MONO_MASS[elem] = _masses[mono_idx]
CHEM_ISOTOPE_DIST[elem] = _isos
CHEM_MONO_IDX[elem] = mono_idx
else:
_isos = np.zeros(_mass_pos[-1] - _mass_pos[0] + 1)
_isos[_mass_pos] = isotopes
_masses = np.zeros(_mass_pos[-1] - _mass_pos[0] + 1)
_masses[_mass_pos] = masses
mono_idx = np.argmax(_isos)
CHEM_MONO_MASS[elem] = _masses[mono_idx]
_mono_idx, start, end = truncate_isotope(_isos, mono_idx)
CHEM_ISOTOPE_DIST[elem] = _isos[start:end]
CHEM_MONO_IDX[elem] = _mono_idx
MASS_C = CHEM_MONO_MASS["C"]
MASS_H = CHEM_MONO_MASS["H"]
MASS_N = CHEM_MONO_MASS["N"]
MASS_O = CHEM_MONO_MASS["O"]
MASS_H2O = CHEM_MONO_MASS["H"] * 2 + CHEM_MONO_MASS["O"]
MASS_NH3 = CHEM_MONO_MASS["H"] * 3 + CHEM_MONO_MASS["N"]
[docs]
def load_elem_yaml(yaml_file: str):
"""Load built-in or user-defined element yaml file. Default yaml is:
os.path.join(_base_dir, 'nist_element.yaml')
"""
global CHEM_INFO_DICT
global CHEM_MONO_MASS
global CHEM_ISOTOPE_DIST
global CHEM_MONO_IDX
CHEM_INFO_DICT = load_yaml(yaml_file)
CHEM_MONO_MASS = {}
CHEM_ISOTOPE_DIST = numba.typed.Dict.empty(
key_type=numba.types.unicode_type, value_type=numba.types.float64[:]
)
CHEM_MONO_IDX = numba.typed.Dict.empty(
key_type=numba.types.unicode_type, value_type=numba.types.int64
)
reset_elements()
load_elem_yaml(os.path.join(CONST_FILE_FOLDER, "nist_element.yaml"))
[docs]
def parse_formula(formula: str) -> list:
"""
Given a formula (str, e.g. `H(1)C(2)O(3)`),
it generates `[('H', 2), ('C', 2), ('O', 1)]`
"""
if not formula:
return []
items = [item.split("(") for item in formula.strip(")").split(")")]
return [(elem, int(n)) for elem, n in items]
[docs]
def calc_mass_from_formula(formula: str):
"""
Calculates the mass of the formula`
Parameters
----------
formula : str
e.g. `H(1)C(2)O(3)`
Returns
-------
float
mass of the formula
"""
return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])
[docs]
class ChemicalCompositonFormula:
"""
Initialize the ChemicalCompositonFormula with a given formula.
Parameters
----------
formula : str
The chemical formula as a string.
Returns
-------
None
"""
[docs]
def __init__(self, formula=None):
self.elements = (
defaultdict(int) if formula is None else self._parse_formula(formula)
)
[docs]
@classmethod
def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from a SMILES string.
Parameters
----------
smiles : str
The SMILES representation of the molecule.
Returns
-------
ChemicalCompositonFormula
An instance of the class based on the SMILES string.
Raises
------
ValueError
If the SMILES string is invalid and can't be converted to an RDKit molecule.
"""
mol = Chem.MolFromSmiles(smiles)
if not mol:
raise ValueError(f"Invalid RDKit molecule: {smiles}")
formula = rdMolDescriptors.CalcMolFormula(
mol, separateIsotopes=True, abbreviateHIsotopes=False
)
formula = formula.replace("[1H]", "H")
return cls._from_rdkit_formula(formula)
@classmethod
def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from an RDKit formula.
Parameters
----------
formula : str
The chemical formula as generated by RDKit.
Returns
-------
ChemicalCompositonFormula
An instance of the class based on the RDKit formula.
"""
instance = cls.__new__(cls)
instance.elements = instance._parse_rdkit_formula(formula)
return instance
def _parse_formula(self, formula) -> dict:
"""
Parse a chemical formula string into a dictionary of elements and their counts.
Parameters
----------
formula : str
The chemical formula to parse.
Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses
# Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')]
pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?"
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)
for isotope, element, count in matches:
if isotope:
element = f"{isotope}{element}"
count = int(count) if count else 1
element_counts[element] += count
self._validate_atoms(element_counts)
return element_counts
def _parse_rdkit_formula(self, formula: str) -> dict:
"""
Parse an RDKit-generated formula string into a dictionary of elements and their counts.
Parameters
----------
formula : str
The RDKit-generated chemical formula to parse.
Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count
# Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')]
pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)"
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)
for match in matches:
count = int(match[4]) if match[4] else 1
if match[1]: # noqa: SIM108
# Isotope, see 0th element in the example above
element = f"{match[1]}{match[2]}"
else:
# Regular element, see the rest in the example above
element = match[3]
element_counts[element] += count
self._validate_atoms(element_counts)
return element_counts
def _validate_atoms(self, element_counts):
"""
Validate the elements in the formula.
Parameters
----------
element_counts : dict
The elements and their counts in the formula.
Raises
------
ValueError
If the formula contains an unknown element.
"""
for element in element_counts:
if element not in CHEM_MONO_MASS:
raise ValueError(f"Unknown element: {element}")
def __str__(self):
"""
Return a string representation of the chemical formula.
Returns
-------
str
The chemical formula as a string.
"""
return "".join(
f"{element}({count})"
for element, count in sorted(self.elements.items())
if count != 0
)
def __repr__(self):
"""
Return a string representation of the ChemicalCompositonFormula instance.
Returns
-------
str
A string representation of the instance.
"""
return f"ChemicalCompositonFormula('{self.__str__()}')"
def __add__(self, other):
"""
Add two ChemicalCompositonFormula instances.
Parameters
----------
other : ChemicalCompositonFormula
The other instance to add.
Returns
-------
ChemicalCompositonFormula
A new instance representing the sum of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] + other.elements[element]
return result
def __sub__(self, other):
"""
Subtract one ChemicalCompositonFormula instance from another.
Parameters
----------
other : ChemicalCompositonFormula
The instance to subtract.
Returns
-------
ChemicalCompositonFormula
A new instance representing the difference of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] - other.elements[element]
return result