Source code for alphabase.psm_reader.maxquant_reader

"""Reader for MaxQuant data."""

import warnings
from abc import ABC
from typing import List, Optional

import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import ModificationKeys
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
    PSMReaderBase,
    psm_reader_provider,
    psm_reader_yaml,
)
from alphabase.psm_reader.utils import get_column_mapping_for_df

# Module-level constant for use in numba-compiled functions.
# `parse_mod_seq` (decorated with `numba.njit`) joins modification names and
# sites with this separator, so it is bound to a plain module global here.
_SEPARATOR = ModificationKeys.SEPARATOR

# make sure all warnings are shown
# NOTE(review): this runs at import time and installs an "always" filter for
# every warning category, so it affects the whole process, not just this module.
warnings.filterwarnings("always")


@numba.njit
def replace_parentheses_with_brackets(
    modseq: str,
) -> str:
    """Turn the outermost ``(...)`` pairs of a modified sequence into ``[...]``.

    The nesting depth is tracked across both parentheses and brackets. Only a
    ``(`` seen at depth zero (or below) and the ``)`` that brings the depth back
    to zero (or below) are rewritten; parentheses nested inside a modification
    name, e.g. ``(Oxidation (M))`` -> ``[Oxidation (M)]``, are left untouched.

    Parameters
    ----------
    modseq : str
        Modified sequence that may mix parentheses and brackets.

    Returns
    -------
    str
        The sequence with top-level parentheses replaced by brackets.

    """
    converted = ""
    depth = 0
    for char in modseq:
        if char == "(":
            # An opening parenthesis only starts a modification at top level.
            if depth <= 0:
                converted += "["
            else:
                converted += char
            depth += 1
        elif char == "[":
            converted += char
            depth += 1
        elif char == ")":
            depth -= 1
            # The parenthesis that closes the top-level group becomes a bracket.
            if depth <= 0:
                converted += "]"
            else:
                converted += char
        elif char == "]":
            converted += char
            depth -= 1
        else:
            converted += char
    return converted
@numba.njit
def parse_mod_seq(
    modseq: str,
    mod_sep: str = "()",
    fixed_C57: bool = True,  # noqa: FBT001, FBT002, N803 TODO: make this *,fixed_c57 (breaking)
) -> tuple:
    """Extract modifications and sites from the modified sequence (modseq).

    Parameters
    ----------
    modseq : str
        modified sequence to extract modifications.

    mod_sep : str, optional
        separator to indicate the modification section.
        Defaults to '()'

    fixed_C57 : bool
        If Carbamidomethyl@C is a fixed modification
        and not displayed in the sequence. Defaults to True.

    Returns
    -------
    tuple
        str: naked peptide sequence

        str: modification names, separated by ';'

        str: modification sites, separated by ';'.
        0 for N-term; -1 for C-term; 1 to N for normal modifications.

    Raises
    ------
    ValueError
        If an opening separator has no matching closing separator.

    """
    peptide_mod_seq = modseq
    # Guard the first-character peek so an empty sequence does not raise IndexError.
    underscore_for_ncterm = len(modseq) > 0 and modseq[0] == "_"
    mod_list = []
    site_list = []

    site = peptide_mod_seq.find(mod_sep[0])
    while site != -1:
        closing = peptide_mod_seq.find(mod_sep[1], site + 1)
        if closing == -1:
            # Without this check `site_end` would be 0 and the string would be
            # re-assembled as `s[:site] + s`, growing forever.
            raise ValueError("Unbalanced modification separators in modified sequence")
        site_end = closing + 1
        # A doubled closing separator ends a nested name such as "(Oxidation (M))".
        if site_end < len(peptide_mod_seq) and peptide_mod_seq[site_end] == mod_sep[1]:
            site_end += 1
        if underscore_for_ncterm:
            site_list.append(site - 1)
        else:
            site_list.append(site)
        # Include the preceding character (amino acid or '_') in the modification name.
        start_mod = site
        if start_mod > 0:
            start_mod -= 1
        mod_list.append(peptide_mod_seq[start_mod:site_end])
        peptide_mod_seq = peptide_mod_seq[:site] + peptide_mod_seq[site_end:]
        site = peptide_mod_seq.find(mod_sep[0], site)

    # patch for phos. How many other modification formats does MQ have?
    site = peptide_mod_seq.find("p")
    while site != -1:
        mod_list.append(peptide_mod_seq[site : site + 2])
        # Removing the 'p' shifts every already-recorded site behind it by one.
        site_list = [i - 1 if i > site else i for i in site_list]
        if underscore_for_ncterm:
            site_list.append(site)
        else:
            site_list.append(site + 1)
        peptide_mod_seq = peptide_mod_seq[:site] + peptide_mod_seq[site + 1 :]
        site = peptide_mod_seq.find("p", site)

    if fixed_C57:
        site = peptide_mod_seq.find("C")
        while site != -1:
            if underscore_for_ncterm:
                site_list.append(site)
            else:
                site_list.append(site + 1)
            # Wrap the name in the active separators, e.g. "C(Carbamidomethyl (C))".
            mod_list.append("C" + "Carbamidomethyl (C)".join(mod_sep))
            site = peptide_mod_seq.find("C", site + 1)

    sequence = peptide_mod_seq.strip("_")
    n_aa = len(sequence)
    return (
        sequence,
        _SEPARATOR.join(mod_list),
        # Any site beyond the last amino acid is reported as C-term (-1).
        _SEPARATOR.join([str(i) if i <= n_aa else "-1" for i in site_list]),
    )
class ModifiedSequenceReader(PSMReaderBase, ABC):
    """Base reader for search-engine outputs that use MaxQuant-style modified sequences."""

    _add_unimod_to_mod_mapping = True

    def __init__(  # noqa: PLR0913 # too many arguments in function definition, missing argument descriptions
        self,
        *,
        column_mapping: Optional[dict] = None,
        modification_mapping: Optional[dict] = None,
        mod_seq_columns: Optional[List[str]] = None,
        fdr: float = 0.01,
        keep_decoy: bool = False,
        rt_unit: Optional[str] = None,
        **kwargs,
    ):
        """Set up a reader that parses modifications and decoys the MaxQuant way.

        All arguments are forwarded unchanged to `PSMReaderBase`; see its
        documentation for their meaning.
        """
        super().__init__(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            mod_seq_columns=mod_seq_columns,
            fdr=fdr,
            keep_decoy=keep_decoy,
            rt_unit=rt_unit,
            **kwargs,
        )

        # Default for this base class: no implicit Carbamidomethyl@C is added.
        self.fixed_C57 = False

    def _translate_decoy(self) -> None:
        """Recode the decoy column as int8: 1 where the value equals "-", else 0."""
        if PsmDfCols.DECOY not in self._psm_df.columns:
            return
        decoy_flags = self._psm_df[PsmDfCols.DECOY] == "-"
        self._psm_df[PsmDfCols.DECOY] = decoy_flags.astype(np.int8)

    def _load_modifications(self, origin_df: pd.DataFrame) -> None:
        """Parse the modified-sequence column into mods, mod sites and sequence.

        If any entry contains "[", brackets are used as modification separators,
        and when parentheses occur as well the column in `origin_df` is rewritten
        in place so that top-level parentheses become brackets. Otherwise
        parentheses are used as separators.
        """
        column = self.mod_seq_column

        uses_brackets = origin_df[column].str.contains("[", regex=False).any()
        if uses_brackets:
            uses_parentheses = origin_df[column].str.contains("(", regex=False).any()
            if uses_parentheses:
                origin_df[column] = origin_df[column].apply(
                    replace_parentheses_with_brackets
                )
            mod_sep = "[]"
        else:
            mod_sep = "()"

        parsed = origin_df[column].apply(
            parse_mod_seq,
            mod_sep=mod_sep,
            fixed_C57=self.fixed_C57,
        )
        # Each parsed entry is (sequence, mods, mod_sites); transpose into columns.
        seqs, mods, mod_sites = zip(*parsed)

        self._psm_df[PsmDfCols.MODS] = mods
        self._psm_df[PsmDfCols.MOD_SITES] = mod_sites
        # Only fall back to the parsed sequence when none was mapped already.
        if PsmDfCols.SEQUENCE not in self._psm_df.columns:
            self._psm_df[PsmDfCols.SEQUENCE] = seqs
class MaxQuantReader(ModifiedSequenceReader):
    """Reader for MaxQuant data."""

    _reader_type = "maxquant"

    def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
        self,
        *,
        column_mapping: Optional[dict] = None,
        modification_mapping: Optional[dict] = None,
        mod_seq_columns: Optional[List[str]] = None,
        fdr: float = 0.01,
        keep_decoy: bool = False,
        rt_unit: Optional[str] = None,
        # MaxQuant reader-specific
        fixed_C57: Optional[bool] = None,  # noqa: N803 TODO: make this *,fixed_c57 (breaking)
        **kwargs,
    ):
        """Reader for MaxQuant data.

        See documentation of `PSMReaderBase` for more information.

        Parameters
        ----------
        fixed_C57 : bool, optional
            If true, the search engine will not show `Carbamidomethyl`
            in the modified sequences.
            by default read from psm_reader_yaml key `fixed_C57`.

        See documentation of `PSMReaderBase` for the rest of parameters.

        """
        super().__init__(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            mod_seq_columns=mod_seq_columns,
            fdr=fdr,
            keep_decoy=keep_decoy,
            rt_unit=rt_unit,
            **kwargs,
        )

        # set True if the search engine will not show Carbamidomethyl in the modified sequences
        self.fixed_C57 = (
            fixed_C57
            if fixed_C57 is not None
            else psm_reader_yaml[self._reader_type]["fixed_C57"]
        )

    def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
        """MaxQuant-specific preprocessing of output data.

        Drops rows without a value in the "Retention time" column, replaces the
        remaining missing values with empty strings, removes rows whose mapped
        scan-number column is empty (a warning reports how many) and casts that
        column to int. The input frame is not modified; a new frame is returned.
        """
        # Use non-inplace operations: mutating a boolean-filtered slice in place
        # makes pandas emit SettingWithCopyWarning on the later column assignment.
        df = df[~pd.isna(df["Retention time"])].fillna("")

        # remove MBR PSMs as they are currently not supported and will crash import
        mapped_columns = get_column_mapping_for_df(self.column_mapping, df)
        if PsmDfCols.SCAN_NUM in mapped_columns:
            scan_num_col = mapped_columns[PsmDfCols.SCAN_NUM]
            # After fillna("") a missing scan number shows up as an empty string.
            no_ms2_mask = df[scan_num_col] == ""
            if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0:
                warnings.warn(
                    f"MaxQuant PSM file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
                )
                df = df[~no_ms2_mask].reset_index(drop=True)
            df[scan_num_col] = df[scan_num_col].astype(int)

        # if 'K0' in df.columns:
        #     df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0
        # min_rt = df['Retention time'].min()
        return df
def register_readers() -> None:
    """Register `MaxQuantReader` with `psm_reader_provider` under the key "maxquant"."""
    reader_key = "maxquant"
    psm_reader_provider.register_reader(reader_key, MaxQuantReader)