from typing import List, Tuple
import numpy as np
from alphabase.constants.aa import (
calc_AA_masses,
calc_AA_masses_for_same_len_seqs,
calc_sequence_masses_for_same_len_seqs,
)
from alphabase.constants.atom import MASS_H2O
from alphabase.constants.modification import (
ModificationKeys,
calc_mod_masses_for_same_len_seqs,
calc_modification_mass,
calc_modification_mass_sum,
)
[docs]
def calc_diff_modification_mass(
    pep_len: int, mass_diffs: List[float], mass_diff_sites: List[int]
) -> np.ndarray:
    """
    Spread open-search mass diffs over the residues of one peptide.

    In open-search, modifications may be reported as raw mass diffs
    instead of modification names. This function places each mass diff
    on the residue position it was localized to.

    Parameters
    ----------
    pep_len : int
        Peptide length (nAA).

    mass_diffs : List[float]
        Mass diffs on the peptide.

    mass_diff_sites : List[int]
        Localized sites of the corresponding mass diffs.

        * `site=0` is added to the first position
        * `site=-1` is added to the last position
        * any other `site` is added to position `site-1`

    Returns
    -------
    np.ndarray
        1-D array with length=`pep_len`.
        Summed mass diffs at each position,
        `0` where no mass diff was placed.
    """
    site_masses = np.zeros(pep_len)
    for position, delta in zip(mass_diff_sites, mass_diffs):
        # 0 and -1 are already valid array indices (first / last element);
        # every other site is 1-based and must be shifted to 0-based.
        index = position if position in (0, -1) else position - 1
        site_masses[index] += delta
    return site_masses
[docs]
def calc_mod_diff_masses_for_same_len_seqs(
    nAA: int, aa_mass_diffs_list: List[List[float]], mod_sites_list: List[List[int]]
) -> np.ndarray:
    """
    Spread open-search mass diffs over the residues of many peptides
    that all share the same length (`nAA`).

    In open-search, modifications may be reported as raw mass diffs
    instead of modification names. This function places each mass diff
    on the residue position it was localized to, one row per peptide.

    Parameters
    ----------
    nAA : int
        Peptide length.

    aa_mass_diffs_list : List[List[float]]
        One list of mass diffs per peptide.

    mod_sites_list : List[List[int]]
        One list of sites per peptide, corresponding
        to `aa_mass_diffs_list`.

        * `site=0` is added to the first position
        * `site=-1` is added to the last position
        * any other `site` is added to position `site-1`

    Returns
    -------
    np.ndarray
        2-D array with shape=`(len(aa_mass_diffs_list), nAA)`.
        Summed mass diffs at each position of each peptide,
        `0` where no mass diff was placed.
    """
    mass_table = np.zeros((len(aa_mass_diffs_list), nAA))
    # Iterating the 2-D array yields row views, so the in-place updates
    # below write straight into `mass_table`.
    for row_masses, diffs, sites in zip(
        mass_table, aa_mass_diffs_list, mod_sites_list
    ):
        for delta, position in zip(diffs, sites):
            # 0 and -1 are already valid indices (first / last column);
            # every other site is 1-based and must be shifted to 0-based.
            column = position if position in (0, -1) else position - 1
            row_masses[column] += delta
    return mass_table
[docs]
def calc_b_y_and_peptide_mass(
    sequence: str,
    mod_names: List[str],
    mod_sites: List[int],
    aa_mass_diffs: List[float] = None,
    aa_mass_diff_sites: List[int] = None,
) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Calculate b/y fragment masses and the peptide mass of one peptide.

    It is highly recommended to use
    `calc_b_y_and_peptide_masses_for_same_len_seqs`
    instead, as it is much faster.

    Parameters
    ----------
    sequence : str
        Peptide sequence.

    mod_names : List[str]
        Modification names on the peptide.

    mod_sites : List[int]
        Modification sites corresponding to `mod_names`.

    aa_mass_diffs : List[float], optional
        Open-search mass diffs on the peptide.
        Skipped when `None` (the default).

    aa_mass_diff_sites : List[int], optional
        Sites corresponding to `aa_mass_diffs`.
        Only used when `aa_mass_diffs` is not `None`.

    Returns
    -------
    np.ndarray
        b masses: cumulative sums of the modified residue masses,
        without the last (full-length) entry.

    np.ndarray
        y masses: peptide mass minus each b mass.

    float
        Peptide mass: sum of all modified residue masses plus `MASS_H2O`.
    """
    per_residue = calc_AA_masses(sequence)
    n_residues = len(sequence)
    per_residue += calc_modification_mass(n_residues, mod_names, mod_sites)
    if aa_mass_diffs is not None:
        per_residue += calc_diff_modification_mass(
            n_residues, aa_mass_diffs, aa_mass_diff_sites
        )

    prefix_sums = np.cumsum(per_residue)
    # The last prefix sum covers the whole peptide, so it is the peptide
    # mass (after adding water) rather than a b fragment.
    b_ions = prefix_sums[:-1]
    peptide_mass = prefix_sums[-1] + MASS_H2O
    y_ions = peptide_mass - b_ions
    return b_ions, y_ions, peptide_mass
[docs]
def calc_peptide_masses_for_same_len_seqs(
    sequences: np.ndarray, mod_list: List[str], mod_diff_list: List[str] = None
) -> np.ndarray:
    """
    Calculate peptide masses for peptide sequences with same lengths.

    We need 'same_len' here because numpy can process AA sequences
    with same length very fast.
    See `alphabase.aa.calc_sequence_masses_for_same_len_seqs`

    Parameters
    ----------
    sequences : np.ndarray
        Peptide sequences with the same length, passed on to
        `calc_sequence_masses_for_same_len_seqs`.

    mod_list : List[str]
        One modification string per peptide; the modifications are
        joined by `ModificationKeys.SEPARATOR`,
        e.g. `['Oxidation@M;Phospho@S','Phospho@S;Deamidated@N']`.
        An empty string means no modification.

    mod_diff_list : List[str], optional
        One mass-diff string per peptide; the mass diffs are
        joined by `ModificationKeys.SEPARATOR`,
        e.g. `['15.9xx;79.9xxx','79.9xx;0.98xx']`.
        An empty string means no mass diff.
        Skipped when `None` (the default).

    Returns
    -------
    np.ndarray
        Peptide masses: sequence masses plus summed modification masses
        and mass diffs. H2O is not added in this function, so it is
        expected to be included by
        `calc_sequence_masses_for_same_len_seqs` already.
    """
    backbone_masses = calc_sequence_masses_for_same_len_seqs(sequences)
    extra_masses = np.zeros_like(backbone_masses)
    separator = ModificationKeys.SEPARATOR

    for row, mod_str in enumerate(mod_list):
        if len(mod_str) == 0:
            continue
        extra_masses[row] = calc_modification_mass_sum(mod_str.split(separator))

    if mod_diff_list is not None:
        for row, diff_str in enumerate(mod_diff_list):
            if len(diff_str) == 0:
                continue
            # Mass diffs are added on top of any named-modification mass.
            extra_masses[row] += np.sum(
                [float(token) for token in diff_str.split(separator)]
            )

    return backbone_masses + extra_masses
[docs]
def calc_b_y_and_peptide_masses_for_same_len_seqs(
    sequences: np.ndarray,
    mod_list: List[List[str]],
    site_list: List[List[int]],
    mod_diff_list: List[List[float]] = None,
    mod_diff_site_list: List[List[int]] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Calculate b/y fragment masses and peptide masses
    for peptide sequences with same lengths.

    We need 'same_len' here because numpy can process AA sequences
    with same length very fast.

    Parameters
    ----------
    sequences : np.ndarray of str
        np.ndarray of peptide sequences with same length.
        The length is taken from the first sequence.

    mod_list : List[List[str]]
        list of modifications,
        e.g. `[['Oxidation@M','Phospho@S'],['Phospho@S','Deamidated@N']]`

    site_list : List[List[int]]
        list of modification sites
        corresponding to `mod_list`, e.g. `[[3,6],[4,17]]`

    mod_diff_list : List[List[float]], optional
        list of modification mass diffs,
        e.g. `[[15.994915,79.966331],[79.966331,0.984016]]`.
        Skipped when `None` (the default).

    mod_diff_site_list : List[List[int]], optional
        list of modification mass diff sites
        corresponding to `mod_diff_list`, e.g. `[[3,6],[4,17]]`.
        Only used when `mod_diff_list` is not `None`.

    Returns
    -------
    np.ndarray
        neutral b fragment masses (2-D array): cumulative sums of the
        modified residue masses along axis 1, without the last column

    np.ndarray
        neutral y fragment masses (2-D array): peptide mass minus
        each b mass

    np.ndarray
        neutral peptide masses (1-D array): last cumulative sum of
        each row plus `MASS_H2O`
    """
    residue_masses = calc_AA_masses_for_same_len_seqs(sequences)
    peptide_len = len(sequences[0])

    mod_contrib = calc_mod_masses_for_same_len_seqs(
        peptide_len, mod_list, site_list
    )
    if mod_diff_list is not None:
        mod_contrib += calc_mod_diff_masses_for_same_len_seqs(
            peptide_len, mod_diff_list, mod_diff_site_list
        )
    residue_masses += mod_contrib

    prefix_sums = np.cumsum(residue_masses, axis=1)
    b_ions = prefix_sums[:, :-1]
    # Keep the last column 2-D (`-1:` rather than `-1`) so it broadcasts
    # against the b-ion matrix when computing the y ions.
    peptide_col = prefix_sums[:, -1:]
    peptide_col += MASS_H2O
    y_ions = peptide_col - b_ions
    return b_ions, y_ions, peptide_col.flatten()