Source code for alphabase.psm_reader.maxquant_reader

"""Reader for MaxQuant data."""

import warnings
from abc import ABC
from typing import List, Optional

import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import ModificationKeys
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
    PSMReaderBase,
    psm_reader_provider,
    psm_reader_yaml,
)
from alphabase.psm_reader.utils import get_column_mapping_for_df

# Module-level constant for use in numba-compiled functions:
# `parse_mod_seq` joins the modification names and the site numbers with it.
_SEPARATOR = ModificationKeys.SEPARATOR

# make sure all warnings are shown
# NOTE(review): this installs a catch-all "always" filter for the whole process
# as soon as this module is imported, not only for warnings raised in this
# module — confirm that overriding the application's filters is intended.
warnings.filterwarnings("always")



[docs]
@numba.njit
def replace_parentheses_with_brackets(
    modseq: str,
) -> str:
    """Turn the outermost modification parentheses of `modseq` into square brackets.

    A "(" found while the nesting depth is zero or below becomes "[", and a ")"
    that brings the depth back to zero or below becomes "]". Parentheses nested
    inside another pair (e.g. the "(M)" of "Oxidation (M)") and brackets that
    are already present are only counted, never rewritten.

    Parameters
    ----------
    modseq : str
        modified sequence that may mix "()" and "[]" delimiters.

    Returns
    -------
    str
        sequence of the same length with the top-level "(...)" pairs
        replaced by "[...]".

    """
    depth = 0
    # replacements are one-for-one, so the length (and every index) stays valid
    for pos in range(len(modseq)):
        char = modseq[pos]
        if char == "(" or char == "[":
            # an opening parenthesis outside of any pair starts a modification
            if char == "(" and depth <= 0:
                modseq = modseq[:pos] + "[" + modseq[pos + 1 :]
            depth += 1
        elif char == ")" or char == "]":
            depth -= 1
            # a closing parenthesis that leaves the outermost pair ends it
            if char == ")" and depth <= 0:
                modseq = modseq[:pos] + "]" + modseq[pos + 1 :]
    return modseq




[docs]
@numba.njit
def parse_mod_seq(
    modseq: str,
    mod_sep: str = "()",
    fixed_C57: bool = True,  # noqa: FBT001, FBT002, N803 TODO: make this  *,fixed_c57  (breaking)
) -> tuple:
    """Extract modifications and sites from the modified sequence (modseq).

    Modifications enclosed in `mod_sep` are cut out of the sequence first,
    then phospho sites written with a lowercase "p" prefix (e.g. "pS") are
    handled, and finally, if `fixed_C57` is set, one Carbamidomethyl entry is
    added for every "C" left in the sequence.

    Parameters
    ----------
    modseq : str
        modified sequence to extract modifications from. A leading "_" marks
        the underscore-flanked notation (e.g. "_AM(ox)K_"). An empty string
        yields three empty strings.

    mod_sep : str, optional
        two characters: opening and closing delimiter of the modification
        section. Defaults to '()'

    fixed_C57 : bool
        If Carbamidomethyl@C is a fixed modification
        and not displayed in the sequence. Defaults to True.

    Returns
    -------
    tuple
        str: naked peptide sequence (flanking "_" stripped)

        str: modification names, joined with the module-level separator

        str: modification sites, joined with the module-level separator.
        0 for N-term; -1 for C-term; 1 to N for normal modifications.

    Raises
    ------
    ValueError
        If an opening `mod_sep[0]` has no closing `mod_sep[1]` after it.
        (Previously such input made the loop run forever.)

    """
    peptide_mod_seq = modseq
    # an empty string cannot be indexed, so check the length before looking at modseq[0]
    underscore_for_ncterm = len(modseq) > 0 and modseq[0] == "_"
    mod_list = []
    site_list = []
    site = peptide_mod_seq.find(mod_sep[0])
    while site != -1:
        close_idx = peptide_mod_seq.find(mod_sep[1], site + 1)
        if close_idx == -1:
            # without this guard site_end would be 0 and the string would be
            # re-prepended to itself on every iteration, never terminating
            raise ValueError(
                "Unbalanced modification delimiters in modified sequence"
            )
        site_end = close_idx + 1
        # names such as "Carbamidomethyl (C)" end with their own closing
        # character, so the section really ends one character later
        if site_end < len(peptide_mod_seq) and peptide_mod_seq[site_end] == mod_sep[1]:
            site_end += 1
        # with a leading "_" the string index is one ahead of the residue number
        if underscore_for_ncterm:
            site_list.append(site - 1)
        else:
            site_list.append(site)
        # the modification key includes the character in front of the delimiter
        # (the modified residue, or "_" for a terminal modification)
        start_mod = site
        if start_mod > 0:
            start_mod -= 1
        mod_list.append(peptide_mod_seq[start_mod:site_end])
        peptide_mod_seq = peptide_mod_seq[:site] + peptide_mod_seq[site_end:]
        site = peptide_mod_seq.find(mod_sep[0], site)

    # patch for phos. How many other modification formats does MQ have?
    site = peptide_mod_seq.find("p")
    while site != -1:
        mod_list.append(peptide_mod_seq[site : site + 2])
        # removing the "p" shifts every site recorded behind it one to the left
        site_list = [i - 1 if i > site else i for i in site_list]
        if underscore_for_ncterm:
            site_list.append(site)
        else:
            site_list.append(site + 1)
        peptide_mod_seq = peptide_mod_seq[:site] + peptide_mod_seq[site + 1 :]
        site = peptide_mod_seq.find("p", site)

    if fixed_C57:
        site = peptide_mod_seq.find("C")
        while site != -1:
            if underscore_for_ncterm:
                site_list.append(site)
            else:
                site_list.append(site + 1)
            # join() over the two delimiter characters wraps the name in them
            mod_list.append("C" + "Carbamidomethyl (C)".join(mod_sep))
            site = peptide_mod_seq.find("C", site + 1)
    sequence = peptide_mod_seq.strip("_")
    n_aa = len(sequence)
    return (
        sequence,
        _SEPARATOR.join(mod_list),
        # a site beyond the last residue denotes a C-terminal modification
        _SEPARATOR.join([str(i) if i <= n_aa else "-1" for i in site_list]),
    )




[docs]
class ModifiedSequenceReader(PSMReaderBase, ABC):
    """Reader for MaxQuant-like data."""

    _add_unimod_to_mod_mapping = True

    def __init__(  # noqa: PLR0913 # too many arguments in function definition, missing argument descriptions
        self,
        *,
        column_mapping: Optional[dict] = None,
        modification_mapping: Optional[dict] = None,
        mod_seq_columns: Optional[List[str]] = None,
        fdr: float = 0.01,
        keep_decoy: bool = False,
        rt_unit: Optional[str] = None,
        **kwargs,
    ):
        """Reader for MaxQuant-like data (in terms of modification loading and decoy translation).

        All arguments are forwarded unchanged to `PSMReaderBase`;
        see its documentation for their meaning.
        """
        super().__init__(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            mod_seq_columns=mod_seq_columns,
            fdr=fdr,
            keep_decoy=keep_decoy,
            rt_unit=rt_unit,
            **kwargs,
        )

        # passed to `parse_mod_seq` by `_load_modifications`; subclasses may override it
        self.fixed_C57 = False

    def _translate_decoy(self) -> None:
        """Convert the decoy column, if present, into an int8 flag (1 where the value is "-")."""
        # NOTE(review): rows whose marker equals "-" are flagged as decoys —
        # confirm this matches the marker convention of the supported search engines.
        if PsmDfCols.DECOY in self._psm_df.columns:
            self._psm_df[PsmDfCols.DECOY] = (
                self._psm_df[PsmDfCols.DECOY] == "-"
            ).astype(np.int8)

    def _load_modifications(self, origin_df: pd.DataFrame) -> None:
        """Parse the modified-sequence column and store modifications, sites and sequences.

        If any modified sequence uses "[", brackets are taken as the delimiter
        and sequences still using parentheses are rewritten in `origin_df`
        accordingly; otherwise parentheses are the delimiter. The naked
        sequence is only written when the PSM frame has no sequence column yet.

        Parameters
        ----------
        origin_df : pd.DataFrame
            frame holding the `self.mod_seq_column` column. That column may be
            modified in place (parentheses replaced by brackets).

        """
        if len(origin_df) == 0:
            # zip(*...) over zero parsed rows yields nothing, which cannot be
            # unpacked into the three result columns
            seqs, mods, mod_sites = [], [], []
        else:
            if origin_df[self.mod_seq_column].str.contains("[", regex=False).any():
                if origin_df[self.mod_seq_column].str.contains("(", regex=False).any():
                    # mixed notation: unify everything to brackets before parsing
                    origin_df[self.mod_seq_column] = origin_df[
                        self.mod_seq_column
                    ].apply(replace_parentheses_with_brackets)
                mod_sep = "[]"
            else:
                mod_sep = "()"

            # each row parses to (sequence, mods, mod_sites); transpose into columns
            seqs, mods, mod_sites = zip(
                *origin_df[self.mod_seq_column].apply(
                    parse_mod_seq,
                    mod_sep=mod_sep,
                    fixed_C57=self.fixed_C57,
                )
            )

        self._psm_df[PsmDfCols.MODS] = mods
        self._psm_df[PsmDfCols.MOD_SITES] = mod_sites
        if PsmDfCols.SEQUENCE not in self._psm_df.columns:
            self._psm_df[PsmDfCols.SEQUENCE] = seqs




[docs]
class MaxQuantReader(ModifiedSequenceReader):
    """Reader for MaxQuant data."""

    _reader_type = "maxquant"

    def __init__(  # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
        self,
        *,
        column_mapping: Optional[dict] = None,
        modification_mapping: Optional[dict] = None,
        mod_seq_columns: Optional[List[str]] = None,
        fdr: float = 0.01,
        keep_decoy: bool = False,
        rt_unit: Optional[str] = None,
        # MaxQuant reader-specific
        fixed_C57: Optional[bool] = None,  # noqa: N803 TODO: make this  *,fixed_c57  (breaking)
        **kwargs,
    ):
        """Reader for MaxQuant data.

        See documentation of `PSMReaderBase` for more information.

        Parameters
        ----------
        fixed_C57 : bool, optional
            If true, the search engine will not show `Carbamidomethyl`
            in the modified sequences.
            by default read from psm_reader_yaml key `fixed_C57`.

        See documentation of `PSMReaderBase` for the rest of parameters.

        """
        super().__init__(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            mod_seq_columns=mod_seq_columns,
            fdr=fdr,
            keep_decoy=keep_decoy,
            rt_unit=rt_unit,
            **kwargs,
        )

        # set True if the search engine will not show Carbamidomethyl in the modified sequences
        # (an explicit argument wins; otherwise the reader's yaml entry decides)
        self.fixed_C57 = (
            fixed_C57
            if fixed_C57 is not None
            else psm_reader_yaml[self._reader_type]["fixed_C57"]
        )

    def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
        """MaxQuant-specific preprocessing of output data.

        Drops rows without a retention time, replaces remaining missing values
        with empty strings, removes match-between-runs (MBR) rows that have no
        MS2 scan number and casts the scan number column to int.

        Parameters
        ----------
        df : pd.DataFrame
            raw MaxQuant table; it must contain a "Retention time" column.
            The passed frame itself is not modified.

        Returns
        -------
        pd.DataFrame
            the filtered frame. Its index is only reset when MBR rows were removed.

        """
        # Filtering and filling are chained without `inplace=True`: mutating the
        # result of a boolean selection in place can raise SettingWithCopyWarning
        # on pandas versions without Copy-on-Write.
        df = df[~pd.isna(df["Retention time"])].fillna("")

        # remove MBR PSMs as they are currently not supported and will crash import
        mapped_columns = get_column_mapping_for_df(self.column_mapping, df)
        if PsmDfCols.SCAN_NUM in mapped_columns:
            scan_num_col = mapped_columns[PsmDfCols.SCAN_NUM]
            # after fillna("") a missing scan number shows up as an empty string
            no_ms2_mask = df[scan_num_col] == ""
            if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0:
                warnings.warn(
                    f"MaxQuant PSM file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed."
                )
                df = df[~no_ms2_mask].reset_index(drop=True)
            df[scan_num_col] = df[scan_num_col].astype(int)

        # if 'K0' in df.columns:
        #     df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0
        # min_rt = df['Retention time'].min()
        return df




[docs]
def register_readers() -> None:
    """Make `MaxQuantReader` available from the PSM reader provider as "maxquant"."""
    register = psm_reader_provider.register_reader
    register("maxquant", MaxQuantReader)