Source code for alphabase.spectral_library.reader

"""Module for reading spectral libraries."""

import warnings
from typing import List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

from alphabase.constants._const import PEAK_INTENSITY_DTYPE
from alphabase.peptide.mobility import mobility_to_ccs_for_df
from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols
from alphabase.psm_reader.maxquant_reader import ModifiedSequenceReader
from alphabase.spectral_library.base import SpecLibBase
from alphabase.utils import _get_delimiter


class LibraryReaderBase(ModifiedSequenceReader, SpecLibBase):
    """Base class for reading spectral libraries."""

    _reader_type = "library_reader_base"
    _add_unimod_to_mod_mapping = True

    # Default fragment types. Kept as an immutable tuple and copied into a fresh
    # list per instance so that instances never share one mutable default.
    _DEFAULT_CHARGED_FRAG_TYPES = (
        "b_z1",
        "b_z2",
        "y_z1",
        "y_z2",
        "b_modloss_z1",
        "b_modloss_z2",
        "y_modloss_z1",
        "y_modloss_z2",
    )

    # Ion types whose series number is counted from the start of the sequence ...
    _PREFIX_ION_TYPES = frozenset("abc")
    # ... and ion types whose series number is counted from the end.
    _SUFFIX_ION_TYPES = frozenset("xyz")

    # Maps the loss type found in the library to the infix used in the
    # charged fragment type name (e.g. "b" + "_modloss" + "_z1").
    _LOSS_TYPE_TO_INFIX = {
        "": "",
        "H3PO4": "_modloss",
        "H2O": "_H2O",
        "NH3": "_NH3",
        "unknown": "",  # DiaNN+fragger
    }

    def __init__(  # noqa: PLR0913 many arguments in function definition
        self,
        charged_frag_types: Optional[List[str]] = None,
        column_mapping: Optional[dict] = None,
        modification_mapping: Optional[dict] = None,
        fdr: float = 0.01,
        fixed_C57: bool = False,  # noqa: FBT001, FBT002, N803 TODO: make this *,fixed_c57 (breaking)
        mod_seq_columns: Optional[List[str]] = None,
        rt_unit: Optional[str] = None,
        # library reader-specific:
        precursor_mz_min: float = 400,
        precursor_mz_max: float = 2000,
        decoy: Optional[str] = None,
        **kwargs,
    ):
        """Base class for reading spectral libraries from long format csv files.

        Parameters
        ----------
        charged_frag_types: list of str
            List of fragment types to be used in the spectral library.
            If None (default), ['b_z1','b_z2','y_z1', 'y_z2', 'b_modloss_z1',
            'b_modloss_z2','y_modloss_z1', 'y_modloss_z2'] is used.

        column_mapping: dict
            Dictionary mapping the column names in the csv file to the column names in the spectral library.
            The default is None, which uses the `library_reader_base` column mapping in `psm_reader.yaml`

        modification_mapping: dict
            Dictionary mapping the modification names in the csv file to the modification names in the spectral library.

        fdr: float
            False discovery rate threshold for filtering the spectral library.
            default is 0.01

        fixed_C57: bool
            If true, the search engine will not show `Carbamidomethyl` in the modified sequences.
            By default False

        mod_seq_columns: list of str
            List of column names in the csv file containing the modified sequence.
            By default the mapping is taken from `psm_reader.yaml`

        rt_unit: str
            Unit of the retention time column in the csv file.
            The default is 'irt'

        precursor_mz_min: float
            Minimum precursor m/z value for filtering the spectral library.

        precursor_mz_max: float
            Maximum precursor m/z value for filtering the spectral library.

        decoy: str
            Decoy type for the spectral library.
            Can be either `pseudo_reverse` or `diann`

        **kwargs: dict
            deprecated

        """
        if charged_frag_types is None:
            # fresh list per instance, see _DEFAULT_CHARGED_FRAG_TYPES
            charged_frag_types = list(self._DEFAULT_CHARGED_FRAG_TYPES)

        SpecLibBase.__init__(
            self,
            charged_frag_types=charged_frag_types,
            precursor_mz_min=precursor_mz_min,
            precursor_mz_max=precursor_mz_max,
            decoy=decoy,
        )

        ModifiedSequenceReader.__init__(
            self,
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            fdr=fdr,
            keep_decoy=False,
            fixed_C57=fixed_C57,
            mod_seq_columns=mod_seq_columns,
            rt_unit=rt_unit,
            **kwargs,
        )

    def _find_key_columns(self, lib_df: pd.DataFrame) -> None:
        """Find and create the key columns for the spectral library.

        The dataframe is modified in place: a missing fragment loss type column
        is created, missing loss types and the value "noloss" are normalized to
        the empty string, and missing `mods` / `mod_sites` columns are created
        with empty strings.

        Parameters
        ----------
        lib_df: pd.DataFrame
            Dataframe containing the spectral library.

        """
        if LibPsmDfCols.FRAGMENT_LOSS_TYPE not in lib_df.columns:
            lib_df[LibPsmDfCols.FRAGMENT_LOSS_TYPE] = ""

        lib_df.fillna({LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, inplace=True)
        lib_df.replace(
            {LibPsmDfCols.FRAGMENT_LOSS_TYPE: "noloss"},
            {LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""},
            inplace=True,
        )

        if PsmDfCols.MODS not in lib_df.columns:
            lib_df[PsmDfCols.MODS] = ""

        if PsmDfCols.MOD_SITES not in lib_df.columns:
            lib_df[PsmDfCols.MOD_SITES] = ""

    def _get_fragment_intensity(self, lib_df: pd.DataFrame) -> pd.DataFrame:  # noqa: PLR0912, C901 too many branches, too complex TODO: refactor
        """Create the self._fragment_intensity_df dataframe from a given spectral library.

        In the process, the input dataframe is converted from long format
        (one row per fragment) to a precursor dataframe (one row per precursor)
        and returned. Intensities are normalized to the most intense fragment of
        each precursor; precursors without any positive intensity are skipped.

        Parameters
        ----------
        lib_df: pd.DataFrame
            Dataframe containing the spectral library. Key columns are added to
            it in place by `_find_key_columns`.

        Returns
        -------
        precursor_df: pd.DataFrame
            Dataframe with one row per precursor, including the start and stop
            indices of its rows in `self._fragment_intensity_df`.

        """
        frag_col_dict = {
            frag_type: col_idx
            for col_idx, frag_type in enumerate(self.charged_frag_types)
        }
        n_frag_cols = len(self.charged_frag_types)

        self._find_key_columns(lib_df)

        # drop all columns which are all NaN as they prohibit grouping
        lib_df = lib_df.dropna(axis=1, how="all")

        precursor_df_list = []
        frag_intens_list = []
        n_aa_list = []

        fragment_columns = [
            LibPsmDfCols.FRAGMENT_MZ,
            LibPsmDfCols.FRAGMENT_TYPE,
            LibPsmDfCols.FRAGMENT_CHARGE,
            LibPsmDfCols.FRAGMENT_SERIES,
            LibPsmDfCols.FRAGMENT_LOSS_TYPE,
            LibPsmDfCols.FRAGMENT_INTENSITY,
        ]

        # by default, all non-fragment columns are used to group the library
        # NOTE: groupby's default (dropna=True) skips rows that have NaN in any
        # of the grouping columns.
        non_fragment_columns = sorted(set(lib_df.columns) - set(fragment_columns))

        for keys, df_group in tqdm(lib_df.groupby(non_fragment_columns)):
            precursor_columns = dict(zip(non_fragment_columns, keys))

            n_aa = len(precursor_columns[PsmDfCols.SEQUENCE])
            n_frag_rows = n_aa - 1

            intensities = np.zeros(
                (n_frag_rows, n_frag_cols),
                dtype=PEAK_INTENSITY_DTYPE,
            )
            if intensities.size == 0:
                # nothing can be stored (e.g. one-residue sequence); np.max
                # would raise on a zero-size array
                continue

            for frag_type_, frag_num_, loss_type, frag_charge, intensity in df_group[
                [
                    LibPsmDfCols.FRAGMENT_TYPE,
                    LibPsmDfCols.FRAGMENT_SERIES,
                    LibPsmDfCols.FRAGMENT_LOSS_TYPE,
                    LibPsmDfCols.FRAGMENT_CHARGE,
                    LibPsmDfCols.FRAGMENT_INTENSITY,
                ]
            ].to_numpy():
                # exact membership test: a substring test against "abc" would
                # also match "" or "ab" and raise on non-string values
                if frag_type_ in self._PREFIX_ION_TYPES:
                    frag_num = frag_num_ - 1
                elif frag_type_ in self._SUFFIX_ION_TYPES:
                    frag_num = n_aa - frag_num_ - 1
                else:
                    continue

                infix = self._LOSS_TYPE_TO_INFIX.get(loss_type)
                if infix is None:
                    # unsupported loss type
                    continue
                frag_type = f"{frag_type_}{infix}_z{frag_charge}"

                if frag_type not in frag_col_dict:
                    continue

                # Skip series numbers outside the peptide: a negative index would
                # silently wrap around and overwrite another fragment's intensity.
                if not 0 <= frag_num < n_frag_rows:
                    continue

                intensities[frag_num, frag_col_dict[frag_type]] = intensity

            max_intensity = np.max(intensities)
            if max_intensity <= 0:
                continue

            precursor_df_list.append(precursor_columns)
            frag_intens_list.append(intensities / max_intensity)
            n_aa_list.append(n_aa)

        # Passing the columns keeps the precursor schema even if no precursor was kept.
        df = pd.DataFrame(precursor_df_list, columns=non_fragment_columns)

        if frag_intens_list:
            all_intensities = np.concatenate(frag_intens_list)
        else:
            # np.concatenate raises on an empty sequence
            all_intensities = np.zeros((0, n_frag_cols), dtype=PEAK_INTENSITY_DTYPE)

        self._fragment_intensity_df = pd.DataFrame(
            all_intensities, columns=self.charged_frag_types
        )

        # each precursor occupies (n_aa - 1) consecutive rows of the fragment dataframe
        indices = np.zeros(len(n_aa_list) + 1, dtype=np.int64)
        indices[1:] = np.array(n_aa_list, dtype=np.int64) - 1
        indices = np.cumsum(indices)

        df[LibPsmDfCols.FRAG_START_IDX] = indices[:-1]
        df[LibPsmDfCols.FRAG_STOP_IDX] = indices[1:]

        return df

    def _load_file(self, filename: str) -> pd.DataFrame:
        """Load the spectral library from a csv file.

        The delimiter is detected from the file. The default NaN markers of
        pandas are disabled and replaced by an explicit list that does not
        contain the empty string, so empty fields are read as "" instead of NaN.
        """
        csv_sep = _get_delimiter(filename)

        return pd.read_csv(
            filename,
            sep=csv_sep,
            keep_default_na=False,
            na_values=[
                "#N/A",
                "#N/A N/A",
                "#NA",
                "-1.#IND",
                "-1.#QNAN",
                "-NaN",
                "-nan",
                "1.#IND",
                "1.#QNAN",
                "<NA>",
                "N/A",
                "NA",
                "NULL",
                "NaN",
                "None",
                "n/a",
                "nan",
                "null",
            ],
        )

    def _post_process(self, origin_df: pd.DataFrame) -> None:
        """Process the spectral library and create the `fragment_intensity`, `fragment_mz` dataframe.

        Entries without a `mods` value are removed (a warning reports how many),
        the long-format library is converted to precursor and fragment intensity
        dataframes, retention times are normalized, CCS values are derived from
        the mobility column if present, and the fragment m/z dataframe is calculated.
        """
        del origin_df  # unused, only here for backwards compatibility in alphapeptdeep

        # identify unknown modifications
        has_mods = ~self._psm_df[PsmDfCols.MODS].isna()
        n_removed = len(self._psm_df) - int(has_mods.sum())
        self._psm_df = self._psm_df[has_mods]

        if n_removed > 0:
            # make the silent row removal visible to the user
            warnings.warn(
                f"{n_removed} entries with unknown modifications were removed "
                "from the spectral library.",
                stacklevel=2,
            )

        if PsmDfCols.NAA not in self._psm_df.columns:
            self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len()

        self._psm_df = self._get_fragment_intensity(self._psm_df)

        self.normalize_rt_by_raw_name()

        if PsmDfCols.MOBILITY in self._psm_df.columns:
            self._psm_df[PsmDfCols.CCS] = mobility_to_ccs_for_df(
                self._psm_df, PsmDfCols.MOBILITY
            )

        self._psm_df.drop(PsmDfCols.MODIFIED_SEQUENCE, axis=1, inplace=True)
        self._precursor_df = self._psm_df

        self.calc_fragment_mz_df()
# legacy alias: the former name `SWATHLibraryReader` refers to the same class
SWATHLibraryReader = LibraryReaderBase