Source code for alphabase.pg_reader.mztab_pg_reader

"""MZTab protein group reader."""

from pathlib import Path
from typing import Literal, Optional, Union

import pandas as pd

from .pg_reader import PGReaderBase, pg_reader_provider



[docs]
class MZTabPGReader(PGReaderBase):
    """Reader for MZTab search engine output.

    MZTab is a standardized tab-delimited format for reporting proteomics and metabolomics results.
    The format organizes data into distinct sections: metadata (MTD), protein groups (PRH/PRT),
    peptides (PEH/PEP), PSMs (PSH/PSM), and small molecules (SMH/SML), with each section identified
    by specific three-letter prefixes. This reader extracts protein-level quantification data from
    the PRT lines, which contain protein abundances across samples or study variables.

    Example:
    -------
    Per default, the reader is configured with ``measurement_regex="assay"``; ``"study_variable"`` is the
    other named option accepted by the constructor. Additional protein features are stored
    in the dataframe index, samples are stored as columns.

    .. code-block:: python

        from alphabase.pg_reader import MZTabPGReader

        # Import the protein table with the default settings
        reader = MZTabPGReader()
        results = reader.import_file(path)


    References:
    ----------
    - Griss, J. et al. The mzTab Data Exchange Format: Communicating Mass-spectrometry-based Proteomics and Metabolomics Experimental Results to a Wider Audience*. Molecular & Cellular Proteomics 13, 2765-2775 (2014).
    - Official MZTab Repository: https://github.com/HUPO-PSI/mzTab.git
    - Official documentation: https://hupo-psi.github.io/mzTab/

    """

    _reader_type: str = "mztab"

    _PROTEIN_ROW_INDICATOR: str = "PRT"
    _PROTEIN_HEADER_INDICATOR: str = "PRH"
    _SEPARATOR: str = "\t"
    # ASCII whitespace *except* the tab separator. Trimming a data line with
    # this set removes line endings and padding without discarding the tabs
    # that delimit empty leading/trailing cells.
    _LINE_PADDING: str = " \n\r\x0b\x0c"

    def __init__(  # noqa: D107 inherited from base class
        self,
        *,
        column_mapping: Optional[dict[str, str]] = None,
        measurement_regex: Union[
            str, Literal["assay", "study_variable"], None  # noqa: PYI051 assay and study_variable are special cases and not equivalent to string
        ] = "assay",
    ):
        # Both arguments are forwarded unchanged; their interpretation lives in
        # the base class.
        super().__init__(
            column_mapping=column_mapping, measurement_regex=measurement_regex
        )

    def _load_file(self, file_path: str) -> pd.DataFrame:
        """Load MZTab file and extract protein data section.

        Parameters
        ----------
        file_path : str
            Path to MZTab file

        Returns
        -------
        pd.DataFrame
            DataFrame containing protein data from MZTab file. Column names are
            taken from the protein header line; cells hold the strings read from
            the file without any type conversion.

        Notes
        -----
        Protein lines are indicated with a leading `PRT`. The protein metadata header is
        indicated with a leading `PRH`. The file is tab separated.

        Empty cells are preserved wherever they occur in a protein line, including
        the first and the last cell. Surplus empty cells beyond the header width
        (e.g. produced by trailing tabs) are dropped.

        Raises
        ------
        ValueError
            If no protein data or metadata is found in the file

        """
        path = Path(file_path)
        protein_header = None
        protein_rows = []

        with path.open() as f:
            for line in f:
                # Leading whitespace is irrelevant for prefix detection, but on
                # the right only padding may go: trailing tabs mark empty cells.
                record = line.lstrip().rstrip(self._LINE_PADDING)

                if record.startswith(self._PROTEIN_HEADER_INDICATOR):
                    # Protein header line - remove 'PRH' prefix and parse columns.
                    # Column names are never empty, so surrounding tabs can be
                    # stripped together with other whitespace.
                    header_content = record[
                        len(self._PROTEIN_HEADER_INDICATOR) :
                    ].strip()
                    protein_header = header_content.split(self._SEPARATOR)

                elif record.startswith(self._PROTEIN_ROW_INDICATOR):
                    # Protein data line - remove 'PRT' prefix and parse data
                    protein_rows.append(self._split_protein_row(record))

        # Validate that we found protein data
        if protein_header is None:
            raise ValueError(
                f"No protein header ({self._PROTEIN_HEADER_INDICATOR}) found in MZTab file"
            )

        if not protein_rows:
            raise ValueError(
                f"No protein data rows ({self._PROTEIN_ROW_INDICATOR}) found in MZTab file"
            )

        # The header is only guaranteed to be known once the whole file was
        # read, so rows are reconciled with its width here.
        n_columns = len(protein_header)
        protein_rows = [
            self._drop_surplus_empty_cells(row, n_columns) for row in protein_rows
        ]

        return pd.DataFrame(protein_rows, columns=protein_header)

    def _split_protein_row(self, record: str) -> list[str]:
        """Split a `PRT` line into its cells, keeping empty cells in place."""
        content = record[len(self._PROTEIN_ROW_INDICATOR) :].lstrip(
            self._LINE_PADDING
        )
        # Remove exactly one separator after the prefix. Stripping all leading
        # tabs would swallow an empty first cell and shift every column left.
        if content.startswith(self._SEPARATOR):
            content = content[len(self._SEPARATOR) :]
        return content.split(self._SEPARATOR)

    @staticmethod
    def _drop_surplus_empty_cells(row: list[str], n_columns: int) -> list[str]:
        """Remove empty trailing cells that exceed the header width."""
        end = len(row)
        while end > n_columns and row[end - 1] == "":
            end -= 1
        return row[:end]



# Register the reader class with the provider under the "mztab" key at import time.
pg_reader_provider.register_reader(
    "mztab",
    reader_class=MZTabPGReader,
)