Source code for alphabase.constants.aa
import os
import typing
import numpy as np
import pandas as pd
from alphabase.constants._const import CONST_FILE_FOLDER
from alphabase.constants.atom import (
MASS_H2O,
calc_mass_from_formula,
parse_formula,
reset_elements,
)
# We use all 128 ASCII codes to represent amino acids, which leaves room for flexible extensions in the future.
# The amino acid masses are stored in the 128-length array :py:data:`AA_ASCII_MASS`.
# If an ASCII code is not in `aa_formula`, its mass is set to a large value to disable MS search.
# Amino acid table read from the packaged TSV file; its first column becomes
# the index (the code below calls `ord()` on each index label).
aa_formula: pd.DataFrame = pd.read_csv(
    os.path.join(CONST_FILE_FOLDER, "amino_acid.tsv"),
    index_col=0,
    sep="\t",
)

#: AA mass array indexed by ASCII code, e.g. the mass of 'A' is AA_ASCII_MASS[ord('A')].
#: Every slot starts at 1e8 until a real mass is written into it.
AA_ASCII_MASS: np.ndarray = np.full(128, 1e8)

#: 128-len AA dataframe (filled by `reset_AA_df`)
AA_DF: pd.DataFrame = pd.DataFrame()

# Per-AA atom composition as a dict of dicts. For example: {'K': {'C': n, 'O': m, ...}}
AA_Composition: dict = {}
[docs]
def replace_atoms(atom_replace_dict: typing.Dict):
    """Replace atoms in every formula stored in `aa_formula`, in place.

    Each formula is parsed with `parse_formula`, the requested atoms are
    renamed, and the formula is written back as concatenated ``atom(n)``
    terms.

    Parameters
    ----------
    atom_replace_dict : typing.Dict
        Mapping ``{atom_from: atom_to}``. Entries are applied in dict order,
        so a later entry also acts on the result of an earlier one.
    """
    for aa, row in aa_formula.iterrows():
        atom_comp = dict(parse_formula(row["formula"]))
        for atom_from, atom_to in atom_replace_dict.items():
            # Mapping an atom onto itself must be a no-op; moving the count
            # and then deleting the source key would drop the atom entirely.
            if atom_from == atom_to or atom_from not in atom_comp:
                continue
            # If the target atom is already part of the formula, add to its
            # count instead of overwriting it, so no atoms are lost.
            atom_comp[atom_to] = atom_comp.get(atom_to, 0) + atom_comp.pop(atom_from)
        aa_formula.loc[aa, "formula"] = "".join(
            f"{atom}({n})" for atom, n in atom_comp.items()
        )
[docs]
def reset_AA_mass() -> np.ndarray:
    """Recompute the mass of every amino acid listed in `aa_formula`.

    `AA_ASCII_MASS` is updated in place at position ``ord(aa)`` for each
    index label of `aa_formula`; all other positions are left as they are.

    Returns
    -------
    np.ndarray
        `AA_ASCII_MASS`, the mass array with shape (128,).
    """
    global AA_ASCII_MASS
    for letter, formula in aa_formula["formula"].items():
        AA_ASCII_MASS[ord(letter)] = calc_mass_from_formula(formula)
    return AA_ASCII_MASS


# Populate the mass array once when the module is imported.
reset_AA_mass()
[docs]
def reset_AA_df():
    """Rebuild the module-level `AA_DF` from `AA_ASCII_MASS` and `aa_formula`.

    The new frame has one row per position of `AA_ASCII_MASS` and the columns
    ``aa``, ``formula``, ``smiles`` and ``mass``. ``formula`` and ``smiles``
    stay empty strings for characters that are not in `aa_formula`.

    Returns
    -------
    pd.DataFrame
        The freshly built `AA_DF`.
    """
    global AA_DF
    AA_DF = pd.DataFrame()
    n_codes = len(AA_ASCII_MASS)
    AA_DF["aa"] = list(map(chr, range(n_codes)))
    AA_DF["formula"] = [""] * n_codes
    AA_DF["smiles"] = [""] * n_codes
    AA_DF["mass"] = AA_ASCII_MASS

    # Copy formula and smiles of the known amino acids into the row whose
    # label is the character's ASCII code.
    known = zip(aa_formula.index, aa_formula["formula"], aa_formula["smiles"])
    for letter, formula, smiles in known:
        row_label = ord(letter)
        AA_DF.loc[row_label, "formula"] = formula
        AA_DF.loc[row_label, "smiles"] = smiles
    return AA_DF


# Build the table once when the module is imported.
reset_AA_df()
[docs]
def reset_AA_Composition():
    """Rebuild the module-level `AA_Composition` mapping from `aa_formula`.

    Returns
    -------
    dict
        The new `AA_Composition`: each index label of `aa_formula` mapped to
        ``dict(parse_formula(formula))`` of its formula.
    """
    global AA_Composition
    AA_Composition = {
        letter: dict(parse_formula(formula))
        for letter, formula in aa_formula["formula"].items()
    }
    return AA_Composition


# Build the composition mapping once when the module is imported.
reset_AA_Composition()
[docs]
def reset_AA_atoms(atom_replace_dict: typing.Optional[typing.Dict] = None):
    """Reset elements, apply atom replacements, and rebuild all AA tables.

    Calls, in order: `reset_elements`, `replace_atoms`, `reset_AA_mass`,
    `reset_AA_df` and `reset_AA_Composition`.

    Parameters
    ----------
    atom_replace_dict : typing.Dict, optional
        Mapping ``{atom_from: atom_to}`` forwarded to `replace_atoms`.
        ``None`` (the default) means no replacement, same as an empty dict.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if atom_replace_dict is None:
        atom_replace_dict = {}
    # NOTE(review): `aa_formula` is not reloaded here, so replacements made by
    # an earlier call appear to persist in the formulas -- confirm intended.
    reset_elements()
    replace_atoms(atom_replace_dict)
    reset_AA_mass()
    reset_AA_df()
    reset_AA_Composition()
[docs]
def update_an_AA(aa: str, formula: str, smiles: str = ""):
    """Set the formula (and SMILES) of one amino acid in all module tables.

    Updates `aa_formula`, `AA_DF`, `AA_ASCII_MASS` and `AA_Composition`.

    Parameters
    ----------
    aa : str
        Character identifying the amino acid; ``ord(aa)`` is used as the
        position in `AA_ASCII_MASS` and as the row label in `AA_DF`.
    formula : str
        Formula string, passed to `calc_mass_from_formula` and `parse_formula`.
    smiles : str, optional
        SMILES string stored next to the formula, by default "".
    """
    ascii_code = ord(aa)
    text_fields = (("formula", formula), ("smiles", smiles))

    # Source table, keyed by the character itself.
    for column, value in text_fields:
        aa_formula.loc[aa, column] = value

    # Lookup table, keyed by the ASCII code.
    for column, value in text_fields:
        AA_DF.loc[ascii_code, column] = value

    # Write the mass array first, then mirror the stored value into AA_DF.
    AA_ASCII_MASS[ascii_code] = calc_mass_from_formula(formula)
    AA_DF.loc[ascii_code, "mass"] = AA_ASCII_MASS[ascii_code]

    AA_Composition[aa] = dict(parse_formula(formula))
[docs]
def calc_AA_masses(sequence: str) -> np.ndarray:
    """
    Look up the mass of every residue of a single sequence.

    Parameters
    ----------
    sequence : str
        Unmodified peptide sequence

    Returns
    -------
    np.ndarray
        Masses of each amino acid.
    """
    # dtype "c" splits the string into single-byte characters; viewing those
    # bytes as int8 yields the codes used to index the mass array.
    ascii_codes = np.array(sequence, "c").view(np.int8)
    return AA_ASCII_MASS[ascii_codes]
[docs]
def calc_AA_masses_for_same_len_seqs(sequence_array: np.ndarray) -> np.ndarray:
    """
    Calculate AA masses for the array of same-len AA sequences.

    Parameters
    ----------
    sequence_array : np.ndarray or list
        unmodified sequences with the same length.

    Returns
    -------
    np.ndarray
        2-D (array_size, sequence_len) array of masses. An empty input gives
        an array of shape (0, 0).

    Notes
    -----
    Sequences of different lengths do not raise: NumPy pads the shorter
    strings with null characters, so the padded positions are looked up at
    ``AA_ASCII_MASS[0]``.
    """
    n_seqs = len(sequence_array)
    if n_seqs == 0:
        # reshape(0, -1) cannot infer the second dimension of an empty result.
        return np.empty((0, 0), dtype=AA_ASCII_MASS.dtype)
    # Force a fixed-width unicode dtype so that object-dtype inputs (e.g.
    # values taken from a pandas column) can be reinterpreted as well.
    chars = np.array(sequence_array, dtype="U")
    # we use np.int32 here because unicode str
    # uses 4 bytes for a char.
    return AA_ASCII_MASS[chars.view(np.int32)].reshape(n_seqs, -1)
[docs]
def calc_sequence_masses_for_same_len_seqs(sequence_array: np.ndarray) -> np.ndarray:
    """
    Calculate sequence masses for the array of same-len AA sequences.

    Parameters
    ----------
    sequence_array : np.ndarray or list
        unmodified sequences with the same length.

    Returns
    -------
    np.ndarray
        1-D (array_size,) array: for each sequence, the sum of its residue
        masses from `calc_AA_masses_for_same_len_seqs` plus `MASS_H2O`.
    """
    residue_masses = calc_AA_masses_for_same_len_seqs(sequence_array)
    return residue_masses.sum(axis=1) + MASS_H2O
[docs]
def calc_AA_masses_for_var_len_seqs(sequence_array: np.ndarray) -> np.ndarray:
    """
    Calculate AA masses for sequences of variable lengths.

    The lookup is the same one `calc_AA_masses_for_same_len_seqs` performs,
    so this function simply delegates to it.

    Parameters
    ----------
    sequence_array : np.ndarray
        Sequences with variable lengths.

    Returns
    -------
    np.ndarray
        2-D array of masses with one row per sequence. Positions past the end
        of a shorter sequence hold ``AA_ASCII_MASS[0]`` (initialised to 1e8).
    """
    # TODO change the fill value for padded positions to 0
    return calc_AA_masses_for_same_len_seqs(sequence_array)