Source code for alphabase.constants.modification

import os
from typing import List, Union

import numba
import numpy as np
import pandas as pd

from alphabase.constants._const import CONST_FILE_FOLDER
from alphabase.constants.atom import (
    calc_mass_from_formula,
    parse_formula,
)


class _ConstantsClass(type):
    """A metaclass for classes that should only contain string constants."""

    def __setattr__(cls, name, value):
        raise TypeError("Constants class cannot be modified")


[docs] class ModificationKeys(metaclass=_ConstantsClass): """Constants for modification format strings and terminal sites. Modification names in AlphaBase follow the pattern: `ModName@Site` For example: `Phospho@S`, `Oxidation@M`, `Acetyl@Protein_N-term` Multiple modifications are joined with SEPARATOR: `Phospho@S;Oxidation@M` for mods, `5;7` for sites Terminal modifications use special site identifiers: - `Any_N-term`: Any peptide N-terminus (e.g., `Acetyl@Any_N-term`) - `Protein_N-term`: Protein N-terminus only - AA-specific terminals: `Q^Any_N-term` for Gln-specific N-term mod Examples -------- >>> mods = "Phospho@S;Oxidation@M" >>> mod_list = mods.split(ModificationKeys.SEPARATOR) # ['Phospho@S', 'Oxidation@M'] >>> mod_name, site = mod_list[0].split(ModificationKeys.SITE_SEPARATOR) # 'Phospho', 'S' """ #: Separator for multiple values: "Phospho@S;Oxidation@M" or "5;7" for sites SEPARATOR: str = ";" #: Separator between mod name and site: "Phospho@S" -> "Phospho" + "@" + "S" SITE_SEPARATOR: str = "@" #: Separator for AA-specific terminal mods: "Q^Any_N-term" -> "Q" + "^" + "Any_N-term" TERM_SEPARATOR: str = "^" #: Any peptide N-terminus: "Acetyl@Any_N-term" ANY_N_TERM: str = "Any_N-term" #: Any peptide C-terminus: "Amidated@Any_C-term" ANY_C_TERM: str = "Any_C-term" #: Protein N-terminus only: "Acetyl@Protein_N-term" PROTEIN_N_TERM: str = "Protein_N-term" #: Protein C-terminus only PROTEIN_C_TERM: str = "Protein_C-term" #: AA-specific N-term suffix: "Gln->pyro-Glu@Q^Any_N-term" (Q at any N-term) ANY_N_TERM_SPECIFIC: str = "^Any_N-term" #: AA-specific C-term suffix ANY_C_TERM_SPECIFIC: str = "^Any_C-term" #: AA-specific Protein N-term suffix: "Met-loss@M^Protein_N-term" PROTEIN_N_TERM_SPECIFIC: str = "^Protein_N-term" #: AA-specific Protein C-term suffix PROTEIN_C_TERM_SPECIFIC: str = "^Protein_C-term"
#: Main entry of modification infomation (DataFrame fotmat). MOD_DF: pd.DataFrame = pd.DataFrame() MOD_INFO_DICT: dict = {} #: Modification to formula str dict. {mod_name: formula str ('H(1)C(2)O(3)')} MOD_CHEM: dict = {} #: Modification to mass dict. MOD_MASS: dict = {} #: Modification to modification neutral loss dict. MOD_LOSS_MASS: dict = {} # Modification to formula dict of dict, i.e. {modname: {'C': n, 'H': m, ...}} MOD_Composition: dict = {} #: Modification loss importance MOD_LOSS_IMPORTANCE: dict = {} _MOD_CLASSIFICATION_USER_ADDED = "User-added"
[docs] def update_all_by_MOD_DF(): """ As DataFrame is more conveneint in data operation, we can also process MOD_DF and then update all global modification variables from MOD_DF """ MOD_INFO_DICT.clear() MOD_INFO_DICT.update(MOD_DF.to_dict(orient="index")) MOD_CHEM.clear() MOD_CHEM.update(MOD_DF["composition"].to_dict()) MOD_MASS.clear() MOD_MASS.update(MOD_DF["mass"].to_dict()) MOD_LOSS_MASS.clear() MOD_LOSS_MASS.update(MOD_DF["modloss"].to_dict()) MOD_LOSS_IMPORTANCE.clear() MOD_LOSS_IMPORTANCE.update(MOD_DF["modloss_importance"].to_dict()) MOD_Composition.clear() for mod, chem in MOD_CHEM.items(): MOD_Composition[mod] = dict(parse_formula(chem))
[docs] def add_modifications_for_lower_case_AA(): """Add modifications for lower-case AAs for advanced usages""" global MOD_DF lower_case_df = MOD_DF.copy() def _mod_lower_case(modname): modname, site = modname.split(ModificationKeys.SITE_SEPARATOR) if len(site) == 1: return modname + ModificationKeys.SITE_SEPARATOR + site.lower() elif ModificationKeys.TERM_SEPARATOR in site: site = site[0].lower() + site[1:] return modname + ModificationKeys.SITE_SEPARATOR + site else: return "" lower_case_df["mod_name"] = lower_case_df["mod_name"].apply(_mod_lower_case) lower_case_df = lower_case_df[lower_case_df["mod_name"] != ""] lower_case_df.set_index("mod_name", drop=False, inplace=True) lower_case_df["lower_case_AA"] = True MOD_DF["lower_case_AA"] = False MOD_DF = pd.concat([MOD_DF, lower_case_df]) update_all_by_MOD_DF()
[docs] def keep_modloss_by_importance(modloss_importance_level: float = 1.0): MOD_DF["modloss"] = MOD_DF["modloss_original"] MOD_DF.loc[MOD_DF.modloss_importance < modloss_importance_level, "modloss"] = 0 MOD_LOSS_MASS.clear() MOD_LOSS_MASS.update(MOD_DF["modloss"].to_dict())
[docs] def load_mod_df( tsv: str = os.path.join(CONST_FILE_FOLDER, "modification.tsv"), *, modloss_importance_level=1, ): global MOD_DF MOD_DF = pd.read_table(tsv, keep_default_na=False) if any(mask := MOD_DF["mod_name"].str.contains(" ", regex=False)): raise ValueError( f"Modification names must not contain spaces: {MOD_DF[mask]['mod_name'].values}" ) MOD_DF.drop_duplicates("mod_name", inplace=True) MOD_DF.fillna("", inplace=True) MOD_DF["unimod_id"] = MOD_DF["unimod_id"].astype(np.int32) MOD_DF.set_index("mod_name", drop=False, inplace=True) MOD_DF["mass"] = MOD_DF["composition"].apply(calc_mass_from_formula) MOD_DF["modloss_original"] = MOD_DF["modloss_composition"].apply( calc_mass_from_formula ) MOD_DF["modloss"] = MOD_DF["modloss_original"] keep_modloss_by_importance(modloss_importance_level) update_all_by_MOD_DF()
load_mod_df()
[docs] def calc_modification_mass( nAA: int, mod_names: List[str], mod_sites: List[int] ) -> np.ndarray: """ Calculate modification masses for the given peptide length (`nAA`), and modified site list. Parameters ---------- nAA : int Peptide length mod_names : list List[str]. Modification name list mod_sites : list List[int]. Modification site list corresponding to `mod_names`. * `site=0` refers to an N-term modification * `site=-1` refers to a C-term modification * `1<=site<=peplen` refers to a normal modification Returns ------- np.ndarray 1-D array with length=`nAA`. Masses of modifications through the peptide, `0` if sites has no modifications """ masses = np.zeros(nAA) for site, mod in zip(mod_sites, mod_names): if site == 0 or site == -1: masses[site] += MOD_MASS[mod] else: masses[site - 1] += MOD_MASS[mod] return masses
[docs] def calc_mod_masses_for_same_len_seqs( nAA: int, mod_names_list: List[List[str]], mod_sites_list: List[List[int]] ) -> np.ndarray: """ Calculate modification masses for the given peptides with same peptide length (`nAA`). Parameters ---------- nAA : int Peptide length mod_names_list : List[List[str]] List (pep_count) of modification list (n_mod on each peptide) mod_sites_list : List[List[int]] List of modification site list corresponding to `mod_names_list`. * `site=0` refers to an N-term modification * `site=-1` refers to a C-term modification * `1<=site<=peplen` refers to a normal modification Returns ------- np.ndarray 2-D array with shape=`(nAA, pep_count or len(mod_names_list)))`. Masses of modifications through all the peptides, `0` if sites without modifications. """ masses = np.zeros((len(mod_names_list), nAA)) for i, (mod_names, mod_sites) in enumerate(zip(mod_names_list, mod_sites_list)): for mod, site in zip(mod_names, mod_sites): if site == 0 or site == -1: masses[i, site] += MOD_MASS[mod] else: masses[i, site - 1] += MOD_MASS[mod] return masses
[docs] def calc_modification_mass_sum(mod_names: List[str]) -> float: """ Calculate summed mass of the given modification without knowing the sites and peptide length. It is useful to calculate peptide mass. Parameters ---------- mod_names : List[str] Modification name list Returns ------- float Total mass """ return np.sum([MOD_MASS[mod] for mod in mod_names])
@numba.jit(nopython=True, nogil=True) def _calc_modloss_with_importance( mod_losses: np.ndarray, _loss_importance: np.ndarray ) -> np.ndarray: """ Calculate modification loss masses (e.g. -98 Da for Phospho@S/T). Modification with higher `_loss_importance` has higher priorities. For example, `AM(Oxidation@M)S(Phospho@S)...`, importance of Phospho@S > importance of Oxidation@M, so the modloss of b3 ion will be -98 Da, not -64 Da. Parameters ---------- mod_losses : np.ndarray Mod loss masses of each AA position _loss_importance : np.ndarray Mod loss importance of each AA position Returns ------- np.ndarray New mod_loss masses selected by `_loss_importance` """ prev_importance = _loss_importance[0] prev_most = 0 for i, _curr_imp in enumerate(_loss_importance[1:], 1): if _curr_imp > prev_importance: prev_most = i prev_importance = _curr_imp else: mod_losses[i] = mod_losses[prev_most] return mod_losses
[docs] def calc_modloss_mass_with_importance( nAA: int, mod_names: List, mod_sites: List, for_nterm_frag: bool, ) -> np.ndarray: """ Calculate modification loss masses (e.g. -98 Da for Phospho@S/T, -64 Da for Oxidation@M). Modifications with higher `MOD_LOSS_IMPORTANCE` have higher priorities. For example, `AS(Phospho@S)M(Oxidation@M)...`, importance of Phospho@S > importance of Oxidation@M, so the modloss of b3 ion will be -98 Da, not -64 Da. Parameters ---------- nAA : int Peptide length mod_names : List[str] Modification name list mod_sites : List[int] Modification site list for_nterm_frag : bool If `True`, the loss will be on the N-term fragments (mainly `b` ions); If `False`, the loss will be on the C-term fragments (mainly `y` ions) Returns ------- np.ndarray mod_loss masses """ if not mod_names: return np.zeros(nAA - 1) mod_losses = np.zeros(nAA + 2) mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names] _loss_importance = np.zeros(nAA + 2) _loss_importance[mod_sites] = [MOD_LOSS_IMPORTANCE.get(mod, 0) for mod in mod_names] # Will not consider the modloss if the corresponding modloss_importance is 0 mod_losses[_loss_importance == 0] = 0 if for_nterm_frag: return _calc_modloss_with_importance(mod_losses, _loss_importance)[1:-2] else: return _calc_modloss_with_importance(mod_losses[::-1], _loss_importance[::-1])[ -3:0:-1 ]
@numba.njit def _calc_modloss(mod_losses: np.ndarray) -> np.ndarray: """ Calculate modification loss masses (e.g. -98 Da for Phospho@S/T). Parameters ---------- mod_losses : np.ndarray Mod loss masses of each AA position Returns ------- np.ndarray New mod_loss masses """ for i, _curr_loss in enumerate(mod_losses[1:], 1): if _curr_loss == 0: mod_losses[i] = mod_losses[i - 1] else: mod_losses[i] = _curr_loss return mod_losses
[docs] def calc_modloss_mass( nAA: int, mod_names: List, mod_sites: List, for_nterm_frag: bool, ) -> np.ndarray: """ Calculate modification loss masses (e.g. -98 Da for Phospho@S/T, -64 Da for Oxidation@M). The mod loss mass is calculated by the modification closer to the fragment sites. For example, the modloss of the b3 ion for `AS(Phospho@S)M(Oxidation@M)...` will be -64 Da. Parameters ---------- nAA : int Peptide length mod_names : List[str] Modification name list mod_sites : List[int] Modification site list corresponding for_nterm_frag : bool If `True`, the loss will be on the N-term fragments (mainly `b` ions); If `False`, the loss will be on the C-term fragments (mainly `y` ions) Returns ------- np.ndarray mod_loss masses """ if len(mod_names) == 0: return np.zeros(nAA - 1) mod_losses = np.zeros(nAA + 2) mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names] if for_nterm_frag: return _calc_modloss(mod_losses)[1:-2] else: return _calc_modloss(mod_losses[::-1])[-3:0:-1]
def _check_mass_sanity( mod_name: str, composition: str, smiles: str, ): """ Check if the mass of the modification is consistent with the formula. Parameters ---------- mod_name : str Modification name (e.g. Mod@S) composition : str Composition formula (e.g. "H(4)O(2)"), used to calculate the mass of the modification smiles : str SMILES string of the modification, used to calculate and compare the mass of the modification Raises ------ ValueError If the mass of the modification is inconsistent with the formula """ if not smiles or mod_name not in MOD_MASS: return composition_mass = calc_mass_from_formula(composition) if not np.allclose(composition_mass, MOD_MASS[mod_name], atol=1e-5): raise ValueError( f"Modification mass of {mod_name} is inconsistent with the composition formula: {composition}, df version {MOD_DF.loc[mod_name,['composition']]}" f" calculated_mass={composition_mass}, mod_mass={MOD_MASS[mod_name]}" ) def _add_a_new_modification( mod_name: str, composition: str, modloss_composition: str = "", smiles: str = "", ): """ Add a new modification into :data:`MOD_DF` or update SMILES if modification already exists. Parameters ---------- mod_name : str Modification name (e.g. Mod@S) composition : str Composition formula (e.g. "H(4)O(2)") modloss_composition : str Composition formula of the modification loss (e.g. "H(2)O(1)") smiles : str SMILES string of the modification """ _check_mass_sanity(mod_name, composition, smiles) if mod_name in MOD_DF.index: # If the modification already exists, only update the SMILES MOD_DF.loc[mod_name, "smiles"] = smiles return MOD_DF.loc[ mod_name, [ "mod_name", "composition", "modloss_composition", "classification", "unimod_id", "smiles", ], ] = [ mod_name, composition, modloss_composition, _MOD_CLASSIFICATION_USER_ADDED, 0, smiles, ] composition_mass = calc_mass_from_formula(composition) modloss_mass = calc_mass_from_formula(modloss_composition) MOD_DF.loc[mod_name, ["mass", "modloss"]] = ( composition_mass, modloss_mass, ) if MOD_DF.loc[mod_name, "modloss"] > 0: MOD_DF.loc[mod_name, "modloss_importance"] = 1e10 MOD_DF.fillna(0, inplace=True) # update_all_by_MOD_DF()
[docs] def has_custom_mods(): """Returns whether `MOD_DF` has user-defined modifications or not.""" return len(MOD_DF[MOD_DF["classification"] == _MOD_CLASSIFICATION_USER_ADDED]) > 0
[docs] def add_new_modifications(new_mods: Union[list, dict]): """Add new modifications into :data:`MOD_DF`. Parameters ---------- new_mods : list or dict list of tuples example: ``` [( mod@site:str (e.g. Mod@S), composition:str (e.g. "H(4)O(2)"), [optional] modloss composition:str (e.g. "H(2)O(1)"), ), ...] ```, dict example: ``` { "mod@site": { "composition":"H(4)O(2)", "modloss_composition":"H(2)O(1)" }, ... } ``` """ if isinstance(new_mods, list): for items in new_mods: _add_a_new_modification(*items) else: for mod_name, mod_info in new_mods.items(): _add_a_new_modification(mod_name, **mod_info) update_all_by_MOD_DF()