Source code for alphabase.constants.isotope

import typing

import numba
import numpy as np

from alphabase.constants.atom import (
    CHEM_ISOTOPE_DIST,
    CHEM_MONO_IDX,
    EMPTY_DIST,
    MAX_ISOTOPE_LEN,
    parse_formula,
    truncate_isotope,
)


[docs] @numba.njit def abundance_convolution( d1: np.ndarray, mono1: int, d2: np.ndarray, mono2: int, ) -> typing.Tuple[np.ndarray, int]: """ If we have two isotope distributions, we can convolute them into one distribution. Parameters ---------- d1 : np.ndarray isotope distribution to convolute mono1 : int mono position of d1. d2 : np.ndarray isotope distribution to convolute mono2 : int mono position of d2 Returns ------- tuple[np.ndarray,int] np.ndarray, convoluted isotope distribution int, new mono position. """ mono_idx = mono1 + mono2 ret = np.zeros(MAX_ISOTOPE_LEN * 2 - 1) for i in range(len(d1)): for j in range(len(d2)): ret[i + j] += d1[i] * d2[j] mono_idx, start, end = truncate_isotope(ret, mono_idx) return ret[start:end], mono_idx
[docs] @numba.njit def one_element_dist( elem: str, n: int, chem_isotope_dist: numba.typed.Dict, chem_mono_idx: numba.typed.Dict, ) -> typing.Tuple[np.ndarray, int]: """ Calculate the isotope distribution for an element and its numbers. Parameters ---------- elem : str element. n : int element number. chem_isotope_dist : numba.typed.Dict use `CHEM_ISOTOPE_DIST` as parameter. chem_mono_idx : numba.typed.Dict use `CHEM_MONO_IDX` as parameter. Returns ------- tuple[np.ndarray, int] np.ndarray, isotope distribution of the element. int, mono position in the distribution """ if n == 0: return EMPTY_DIST.copy(), 0 elif n == 1: return chem_isotope_dist[elem], chem_mono_idx[elem] tmp_dist, mono_idx = one_element_dist( elem, n // 2, chem_isotope_dist, chem_mono_idx ) tmp_dist, mono_idx = abundance_convolution(tmp_dist, mono_idx, tmp_dist, mono_idx) if n % 2 == 0: return tmp_dist, mono_idx else: return abundance_convolution( tmp_dist, mono_idx, chem_isotope_dist[elem], chem_mono_idx[elem] )
[docs] def formula_dist(formula: typing.Union[list, str]) -> typing.Tuple[np.ndarray, int]: """ Generate the isotope distribution and the mono index for a given formula (as a list, e.g. `[('H', 2), ('C', 2), ('O', 1)]`). Parameters ---------- formula : Union[list, str] chemical formula, could be str or list. If str: "H(1)N(2)O(3)". If list: "[('H',1),('H',2),('H',3)]". Returns ------- tuple[np.ndarray,int] np.ndarray, isotope distribution int, mono position """ if isinstance(formula, str): formula = parse_formula(formula) calc_dist = EMPTY_DIST.copy() mono_idx = 0 for elem, n in formula: _dist, _mono = one_element_dist(elem, n, CHEM_ISOTOPE_DIST, CHEM_MONO_IDX) calc_dist, mono_idx = abundance_convolution(calc_dist, mono_idx, _dist, _mono) return calc_dist, mono_idx
def _calc_one_elem_cum_dist(element_cum_dist: np.ndarray, element_cum_mono: np.ndarray): """Pre-build isotope abundance distribution for an element for fast calculation. Internel function. Added information inplace into element_cum_dist and element_cum_mono Parameters ---------- element_cum_dist : np.ndarray Cumulated element abundance distribution element_cum_mono : np.ndarray Cumulated element mono position in the distribution """ for n in range(2, len(element_cum_dist)): (element_cum_dist[n], element_cum_mono[n]) = abundance_convolution( element_cum_dist[n - 1], element_cum_mono[n - 1], element_cum_dist[1], element_cum_mono[1], )
[docs] class IsotopeDistribution:
[docs] def __init__( self, max_elem_num_dict: dict = { "C": 2000, "H": 5000, "N": 1000, "O": 1000, "S": 200, "P": 200, }, ): """Faster calculation of isotope abundance distribution by pre-building isotope distribution tables for most common elements. We have considered large enough number of elements for shotgun proteomics. We can increase `max_elem_num_dict` to support larger peptide or top-down in the future. However, current `MAX_ISOTOPE_LEN` is not suitable for top-down, it must be extended to a larger number (100?). Note that non-standard amino acids have 1000000 C elements in AlphaBase, We clip 1000000 C to the maximal number of C in `max_elem_num_dict`. As they have very large masses thus impossible to identify, their isotope distributions do not matter. Parameters ---------- max_elem_num_dict : dict, optional Define the maximal number of the elements. Defaults to { 'C': 2000, 'H': 5000, 'N': 1000, 'O': 1000, 'S': 200, 'P': 200, } Attributes ---------- element_to_cum_dist_dict : dict {element: cumulated isotope distribution array}, and the cumulated isotope distribution array is a 2-D float np.ndarray with shape (element_max_number, MAX_ISOTOPE_LEN). element_to_cum_mono_idx : dict {element: mono position array of cumulated isotope distribution}, and mono position array is a 1-D int np.ndarray. """ self.element_to_cum_dist_dict = {} self.element_to_cum_mono_idx = {} for elem, n in max_elem_num_dict.items(): if n < 2: n = 2 self.element_to_cum_dist_dict[elem] = np.zeros((n, MAX_ISOTOPE_LEN)) self.element_to_cum_mono_idx[elem] = -np.ones(n, dtype=np.int64) self.element_to_cum_dist_dict[elem][0, :] = EMPTY_DIST self.element_to_cum_mono_idx[elem][0] = 0 self.element_to_cum_dist_dict[elem][1, :] = CHEM_ISOTOPE_DIST[elem] self.element_to_cum_mono_idx[elem][1] = CHEM_MONO_IDX[elem] _calc_one_elem_cum_dist( self.element_to_cum_dist_dict[elem], self.element_to_cum_mono_idx[elem] )
[docs] def calc_formula_distribution( self, formula: typing.List[typing.Tuple[str, int]], ) -> typing.Tuple[np.ndarray, int]: """Calculate isotope abundance distribution for a given formula Parameters ---------- formula : List[tuple(str,int)] chemical formula: "[('H',1),('C',2),('O',3)]". Returns ------- tuple[np.ndarray, int] np.ndarray, isotope abundance distribution int, monoisotope position in the distribution array Examples -------- >>> from alphabase.constants import IsotopeDistribution, parse_formula >>> iso = IsotopeDistribution() >>> formula = 'C(100)H(100)O(10)Na(1)Fe(1)' >>> formula = parse_formula(formula) >>> dist, mono = iso.calc_formula_distribution(formula) >>> dist array([1.92320044e-02, 2.10952666e-02, 3.13753566e-01, 3.42663681e-01, 1.95962632e-01, 7.69157517e-02, 2.31993814e-02, 5.71948249e-03, 1.19790438e-03, 2.18815385e-04]) >>> # Fe's mono position is 2 Da larger than its smallest mass, >>> # so the mono position of this formula shifts by +2 (Da). >>> mono 2 >>> formula = 'C(100)H(100)O(10)13C(1)Na(1)' >>> formula = parse_formula(formula) >>> dist, mono = iso.calc_formula_distribution(formula) >>> dist array([3.29033438e-03, 3.29352217e-01, 3.59329960e-01, 2.01524592e-01, 7.71395498e-02, 2.26229845e-02, 5.41229894e-03, 1.09842389e-03, 1.94206388e-04, 3.04911585e-05]) >>> # 13C's mono position is +1 Da shifted >>> mono 1 >>> formula = 'C(100)H(100)O(10)Na(1)' >>> formula = parse_formula(formula) >>> dist, mono = iso.calc_formula_distribution(formula) >>> dist array([3.29033438e-01, 3.60911319e-01, 2.02775462e-01, 7.76884706e-02, 2.27963906e-02, 5.45578135e-03, 1.10754072e-03, 1.95857410e-04, 3.07552058e-05, 4.35047710e-06]) >>> # mono position is normal (=0) for regular formulas >>> mono 0 """ mono = 0 dist = EMPTY_DIST.copy() for elem, n in formula: if elem in self.element_to_cum_dist_dict: if n >= len(self.element_to_cum_mono_idx[elem]): n = len(self.element_to_cum_mono_idx[elem]) - 1 dist, mono = abundance_convolution( dist, mono, self.element_to_cum_dist_dict[elem][n], self.element_to_cum_mono_idx[elem][n], ) else: dist, mono = abundance_convolution( dist, mono, *one_element_dist(elem, n, CHEM_ISOTOPE_DIST, CHEM_MONO_IDX), ) return dist, mono