import os
from typing import List, Union
import numba
import numpy as np
import pandas as pd
from alphabase.constants._const import CONST_FILE_FOLDER
from alphabase.constants.atom import (
calc_mass_from_formula,
parse_formula,
)
class _ConstantsClass(type):
"""A metaclass for classes that should only contain string constants."""
def __setattr__(cls, name, value):
raise TypeError("Constants class cannot be modified")
[docs]
class ModificationKeys(metaclass=_ConstantsClass):
"""Constants for modification format strings and terminal sites.
Modification names in AlphaBase follow the pattern: `ModName@Site`
For example: `Phospho@S`, `Oxidation@M`, `Acetyl@Protein_N-term`
Multiple modifications are joined with SEPARATOR:
`Phospho@S;Oxidation@M` for mods, `5;7` for sites
Terminal modifications use special site identifiers:
- `Any_N-term`: Any peptide N-terminus (e.g., `Acetyl@Any_N-term`)
- `Protein_N-term`: Protein N-terminus only
- AA-specific terminals: `Q^Any_N-term` for Gln-specific N-term mod
Examples
--------
>>> mods = "Phospho@S;Oxidation@M"
>>> mod_list = mods.split(ModificationKeys.SEPARATOR) # ['Phospho@S', 'Oxidation@M']
>>> mod_name, site = mod_list[0].split(ModificationKeys.SITE_SEPARATOR) # 'Phospho', 'S'
"""
#: Separator for multiple values: "Phospho@S;Oxidation@M" or "5;7" for sites
SEPARATOR: str = ";"
#: Separator between mod name and site: "Phospho@S" -> "Phospho" + "@" + "S"
SITE_SEPARATOR: str = "@"
#: Separator for AA-specific terminal mods: "Q^Any_N-term" -> "Q" + "^" + "Any_N-term"
TERM_SEPARATOR: str = "^"
#: Any peptide N-terminus: "Acetyl@Any_N-term"
ANY_N_TERM: str = "Any_N-term"
#: Any peptide C-terminus: "Amidated@Any_C-term"
ANY_C_TERM: str = "Any_C-term"
#: Protein N-terminus only: "Acetyl@Protein_N-term"
PROTEIN_N_TERM: str = "Protein_N-term"
#: Protein C-terminus only
PROTEIN_C_TERM: str = "Protein_C-term"
#: AA-specific N-term suffix: "Gln->pyro-Glu@Q^Any_N-term" (Q at any N-term)
ANY_N_TERM_SPECIFIC: str = "^Any_N-term"
#: AA-specific C-term suffix
ANY_C_TERM_SPECIFIC: str = "^Any_C-term"
#: AA-specific Protein N-term suffix: "Met-loss@M^Protein_N-term"
PROTEIN_N_TERM_SPECIFIC: str = "^Protein_N-term"
#: AA-specific Protein C-term suffix
PROTEIN_C_TERM_SPECIFIC: str = "^Protein_C-term"
#: Main entry of modification infomation (DataFrame fotmat).
MOD_DF: pd.DataFrame = pd.DataFrame()
MOD_INFO_DICT: dict = {}
#: Modification to formula str dict. {mod_name: formula str ('H(1)C(2)O(3)')}
MOD_CHEM: dict = {}
#: Modification to mass dict.
MOD_MASS: dict = {}
#: Modification to modification neutral loss dict.
MOD_LOSS_MASS: dict = {}
# Modification to formula dict of dict, i.e. {modname: {'C': n, 'H': m, ...}}
MOD_Composition: dict = {}
#: Modification loss importance
MOD_LOSS_IMPORTANCE: dict = {}
_MOD_CLASSIFICATION_USER_ADDED = "User-added"
[docs]
def update_all_by_MOD_DF():
"""
As DataFrame is more conveneint in data operation,
we can also process MOD_DF and then update all global
modification variables from MOD_DF
"""
MOD_INFO_DICT.clear()
MOD_INFO_DICT.update(MOD_DF.to_dict(orient="index"))
MOD_CHEM.clear()
MOD_CHEM.update(MOD_DF["composition"].to_dict())
MOD_MASS.clear()
MOD_MASS.update(MOD_DF["mass"].to_dict())
MOD_LOSS_MASS.clear()
MOD_LOSS_MASS.update(MOD_DF["modloss"].to_dict())
MOD_LOSS_IMPORTANCE.clear()
MOD_LOSS_IMPORTANCE.update(MOD_DF["modloss_importance"].to_dict())
MOD_Composition.clear()
for mod, chem in MOD_CHEM.items():
MOD_Composition[mod] = dict(parse_formula(chem))
[docs]
def add_modifications_for_lower_case_AA():
"""Add modifications for lower-case AAs for advanced usages"""
global MOD_DF
lower_case_df = MOD_DF.copy()
def _mod_lower_case(modname):
modname, site = modname.split(ModificationKeys.SITE_SEPARATOR)
if len(site) == 1:
return modname + ModificationKeys.SITE_SEPARATOR + site.lower()
elif ModificationKeys.TERM_SEPARATOR in site:
site = site[0].lower() + site[1:]
return modname + ModificationKeys.SITE_SEPARATOR + site
else:
return ""
lower_case_df["mod_name"] = lower_case_df["mod_name"].apply(_mod_lower_case)
lower_case_df = lower_case_df[lower_case_df["mod_name"] != ""]
lower_case_df.set_index("mod_name", drop=False, inplace=True)
lower_case_df["lower_case_AA"] = True
MOD_DF["lower_case_AA"] = False
MOD_DF = pd.concat([MOD_DF, lower_case_df])
update_all_by_MOD_DF()
[docs]
def keep_modloss_by_importance(modloss_importance_level: float = 1.0):
MOD_DF["modloss"] = MOD_DF["modloss_original"]
MOD_DF.loc[MOD_DF.modloss_importance < modloss_importance_level, "modloss"] = 0
MOD_LOSS_MASS.clear()
MOD_LOSS_MASS.update(MOD_DF["modloss"].to_dict())
[docs]
def load_mod_df(
tsv: str = os.path.join(CONST_FILE_FOLDER, "modification.tsv"),
*,
modloss_importance_level=1,
):
global MOD_DF
MOD_DF = pd.read_table(tsv, keep_default_na=False)
if any(mask := MOD_DF["mod_name"].str.contains(" ", regex=False)):
raise ValueError(
f"Modification names must not contain spaces: {MOD_DF[mask]['mod_name'].values}"
)
MOD_DF.drop_duplicates("mod_name", inplace=True)
MOD_DF.fillna("", inplace=True)
MOD_DF["unimod_id"] = MOD_DF["unimod_id"].astype(np.int32)
MOD_DF.set_index("mod_name", drop=False, inplace=True)
MOD_DF["mass"] = MOD_DF["composition"].apply(calc_mass_from_formula)
MOD_DF["modloss_original"] = MOD_DF["modloss_composition"].apply(
calc_mass_from_formula
)
MOD_DF["modloss"] = MOD_DF["modloss_original"]
keep_modloss_by_importance(modloss_importance_level)
update_all_by_MOD_DF()
load_mod_df()
[docs]
def calc_modification_mass(
nAA: int, mod_names: List[str], mod_sites: List[int]
) -> np.ndarray:
"""
Calculate modification masses for the given peptide length (`nAA`),
and modified site list.
Parameters
----------
nAA : int
Peptide length
mod_names : list
List[str]. Modification name list
mod_sites : list
List[int]. Modification site list corresponding to `mod_names`.
* `site=0` refers to an N-term modification
* `site=-1` refers to a C-term modification
* `1<=site<=peplen` refers to a normal modification
Returns
-------
np.ndarray
1-D array with length=`nAA`.
Masses of modifications through the peptide,
`0` if sites has no modifications
"""
masses = np.zeros(nAA)
for site, mod in zip(mod_sites, mod_names):
if site == 0 or site == -1:
masses[site] += MOD_MASS[mod]
else:
masses[site - 1] += MOD_MASS[mod]
return masses
[docs]
def calc_mod_masses_for_same_len_seqs(
nAA: int, mod_names_list: List[List[str]], mod_sites_list: List[List[int]]
) -> np.ndarray:
"""
Calculate modification masses for the given peptides with same peptide length (`nAA`).
Parameters
----------
nAA : int
Peptide length
mod_names_list : List[List[str]]
List (pep_count) of modification list (n_mod on each peptide)
mod_sites_list : List[List[int]]
List of modification site list corresponding to `mod_names_list`.
* `site=0` refers to an N-term modification
* `site=-1` refers to a C-term modification
* `1<=site<=peplen` refers to a normal modification
Returns
-------
np.ndarray
2-D array with shape=`(nAA, pep_count or len(mod_names_list)))`.
Masses of modifications through all the peptides,
`0` if sites without modifications.
"""
masses = np.zeros((len(mod_names_list), nAA))
for i, (mod_names, mod_sites) in enumerate(zip(mod_names_list, mod_sites_list)):
for mod, site in zip(mod_names, mod_sites):
if site == 0 or site == -1:
masses[i, site] += MOD_MASS[mod]
else:
masses[i, site - 1] += MOD_MASS[mod]
return masses
[docs]
def calc_modification_mass_sum(mod_names: List[str]) -> float:
"""
Calculate summed mass of the given modification
without knowing the sites and peptide length.
It is useful to calculate peptide mass.
Parameters
----------
mod_names : List[str]
Modification name list
Returns
-------
float
Total mass
"""
return np.sum([MOD_MASS[mod] for mod in mod_names])
@numba.jit(nopython=True, nogil=True)
def _calc_modloss_with_importance(
mod_losses: np.ndarray, _loss_importance: np.ndarray
) -> np.ndarray:
"""
Calculate modification loss masses (e.g. -98 Da for Phospho@S/T).
Modification with higher `_loss_importance` has higher priorities.
For example, `AM(Oxidation@M)S(Phospho@S)...`,
importance of Phospho@S > importance of Oxidation@M, so the modloss of
b3 ion will be -98 Da, not -64 Da.
Parameters
----------
mod_losses : np.ndarray
Mod loss masses of each AA position
_loss_importance : np.ndarray
Mod loss importance of each AA position
Returns
-------
np.ndarray
New mod_loss masses selected by `_loss_importance`
"""
prev_importance = _loss_importance[0]
prev_most = 0
for i, _curr_imp in enumerate(_loss_importance[1:], 1):
if _curr_imp > prev_importance:
prev_most = i
prev_importance = _curr_imp
else:
mod_losses[i] = mod_losses[prev_most]
return mod_losses
[docs]
def calc_modloss_mass_with_importance(
nAA: int,
mod_names: List,
mod_sites: List,
for_nterm_frag: bool,
) -> np.ndarray:
"""
Calculate modification loss masses (e.g. -98 Da for Phospho@S/T,
-64 Da for Oxidation@M). Modifications with higher `MOD_LOSS_IMPORTANCE`
have higher priorities. For example, `AS(Phospho@S)M(Oxidation@M)...`,
importance of Phospho@S > importance of Oxidation@M, so the modloss of
b3 ion will be -98 Da, not -64 Da.
Parameters
----------
nAA : int
Peptide length
mod_names : List[str]
Modification name list
mod_sites : List[int]
Modification site list
for_nterm_frag : bool
If `True`, the loss will be on the
N-term fragments (mainly `b` ions);
If `False`, the loss will be on the
C-term fragments (mainly `y` ions)
Returns
-------
np.ndarray
mod_loss masses
"""
if not mod_names:
return np.zeros(nAA - 1)
mod_losses = np.zeros(nAA + 2)
mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names]
_loss_importance = np.zeros(nAA + 2)
_loss_importance[mod_sites] = [MOD_LOSS_IMPORTANCE.get(mod, 0) for mod in mod_names]
# Will not consider the modloss if the corresponding modloss_importance is 0
mod_losses[_loss_importance == 0] = 0
if for_nterm_frag:
return _calc_modloss_with_importance(mod_losses, _loss_importance)[1:-2]
else:
return _calc_modloss_with_importance(mod_losses[::-1], _loss_importance[::-1])[
-3:0:-1
]
@numba.njit
def _calc_modloss(mod_losses: np.ndarray) -> np.ndarray:
"""
Calculate modification loss masses (e.g. -98 Da for Phospho@S/T).
Parameters
----------
mod_losses : np.ndarray
Mod loss masses of each AA position
Returns
-------
np.ndarray
New mod_loss masses
"""
for i, _curr_loss in enumerate(mod_losses[1:], 1):
if _curr_loss == 0:
mod_losses[i] = mod_losses[i - 1]
else:
mod_losses[i] = _curr_loss
return mod_losses
[docs]
def calc_modloss_mass(
nAA: int,
mod_names: List,
mod_sites: List,
for_nterm_frag: bool,
) -> np.ndarray:
"""
Calculate modification loss masses (e.g. -98 Da for Phospho@S/T,
-64 Da for Oxidation@M). The mod loss mass is calculated by the
modification closer to the fragment sites. For example,
the modloss of the b3 ion for `AS(Phospho@S)M(Oxidation@M)...`
will be -64 Da.
Parameters
----------
nAA : int
Peptide length
mod_names : List[str]
Modification name list
mod_sites : List[int]
Modification site list corresponding
for_nterm_frag : bool
If `True`, the loss will be on the
N-term fragments (mainly `b` ions);
If `False`, the loss will be on the
C-term fragments (mainly `y` ions)
Returns
-------
np.ndarray
mod_loss masses
"""
if len(mod_names) == 0:
return np.zeros(nAA - 1)
mod_losses = np.zeros(nAA + 2)
mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names]
if for_nterm_frag:
return _calc_modloss(mod_losses)[1:-2]
else:
return _calc_modloss(mod_losses[::-1])[-3:0:-1]
def _check_mass_sanity(
mod_name: str,
composition: str,
smiles: str,
):
"""
Check if the mass of the modification is consistent with the formula.
Parameters
----------
mod_name : str
Modification name (e.g. Mod@S)
composition : str
Composition formula (e.g. "H(4)O(2)"), used to calculate the mass of the modification
smiles : str
SMILES string of the modification, used to calculate and compare the mass of the modification
Raises
------
ValueError
If the mass of the modification is inconsistent with the formula
"""
if not smiles or mod_name not in MOD_MASS:
return
composition_mass = calc_mass_from_formula(composition)
if not np.allclose(composition_mass, MOD_MASS[mod_name], atol=1e-5):
raise ValueError(
f"Modification mass of {mod_name} is inconsistent with the composition formula: {composition}, df version {MOD_DF.loc[mod_name,['composition']]}"
f" calculated_mass={composition_mass}, mod_mass={MOD_MASS[mod_name]}"
)
def _add_a_new_modification(
mod_name: str,
composition: str,
modloss_composition: str = "",
smiles: str = "",
):
"""
Add a new modification into :data:`MOD_DF` or update SMILES if modification already exists.
Parameters
----------
mod_name : str
Modification name (e.g. Mod@S)
composition : str
Composition formula (e.g. "H(4)O(2)")
modloss_composition : str
Composition formula of the modification loss (e.g. "H(2)O(1)")
smiles : str
SMILES string of the modification
"""
_check_mass_sanity(mod_name, composition, smiles)
if mod_name in MOD_DF.index:
# If the modification already exists, only update the SMILES
MOD_DF.loc[mod_name, "smiles"] = smiles
return
MOD_DF.loc[
mod_name,
[
"mod_name",
"composition",
"modloss_composition",
"classification",
"unimod_id",
"smiles",
],
] = [
mod_name,
composition,
modloss_composition,
_MOD_CLASSIFICATION_USER_ADDED,
0,
smiles,
]
composition_mass = calc_mass_from_formula(composition)
modloss_mass = calc_mass_from_formula(modloss_composition)
MOD_DF.loc[mod_name, ["mass", "modloss"]] = (
composition_mass,
modloss_mass,
)
if MOD_DF.loc[mod_name, "modloss"] > 0:
MOD_DF.loc[mod_name, "modloss_importance"] = 1e10
MOD_DF.fillna(0, inplace=True)
# update_all_by_MOD_DF()
[docs]
def has_custom_mods():
"""Returns whether `MOD_DF` has user-defined modifications or not."""
return len(MOD_DF[MOD_DF["classification"] == _MOD_CLASSIFICATION_USER_ADDED]) > 0
[docs]
def add_new_modifications(new_mods: Union[list, dict]):
"""Add new modifications into :data:`MOD_DF`.
Parameters
----------
new_mods : list or dict
list of tuples example:
```
[(
mod@site:str (e.g. Mod@S),
composition:str (e.g. "H(4)O(2)"),
[optional] modloss composition:str (e.g. "H(2)O(1)"),
), ...]
```,
dict example:
```
{
"mod@site": {
"composition":"H(4)O(2)",
"modloss_composition":"H(2)O(1)"
}, ...
}
```
"""
if isinstance(new_mods, list):
for items in new_mods:
_add_a_new_modification(*items)
else:
for mod_name, mod_info in new_mods.items():
_add_a_new_modification(mod_name, **mod_info)
update_all_by_MOD_DF()