Source code for alphabase.pg_reader.mztab_pg_reader
"""FragPipe protein group reader."""
from pathlib import Path
from typing import Literal, Optional, Union
import pandas as pd
from .pg_reader import PGReaderBase, pg_reader_provider
[docs]
class MZTabPGReader(PGReaderBase):
"""Reader for MZTab search engine output.
MZTab is a standardized tab-delimited format for reporting proteomics and metabolomics results.
The format organizes data into distinct sections: metadata (MTD), protein groups (PRH/PRT),
peptides (PEH/PEP), PSMs (PSH/PSM), and small molecules (SMH/SML), with each section identified
by specific three-letter prefixes. This reader extracts protein-level quantification data from
the PRT lines, which contain protein abundances across samples or study variables.
Example:
-------
Per default, the reader will return the raw intensities from the `razor` method. Additional protein features are stored
in the dataframe index, samples are stored as columns.
.. code-block:: python
from alphabase.pg_reader import MZTabPGReader
# Get raw intensities
reader = MZTabPGReader()
results = reader.import_file(path)
References:
----------
- Griss, J. et al. The mzTab Data Exchange Format: Communicating Mass-spectrometry-based Proteomics and Metabolomics Experimental Results to a Wider Audience*. Molecular & Cellular Proteomics 13, 2765-2775 (2014).
- Official MZTab Repository: https://github.com/HUPO-PSI/mzTab.git
- Official documentation: https://hupo-psi.github.io/mzTab/
"""
_reader_type: str = "mztab"
_PROTEIN_ROW_INDICATOR: str = "PRT"
_PROTEIN_HEADER_INDICATOR: str = "PRH"
_SEPARATOR: str = "\t"
[docs]
def __init__( # noqa: D107 inherited from base class
self,
*,
column_mapping: Optional[dict[str, str]] = None,
measurement_regex: Union[
str, Literal["assay", "study_variable"], None # noqa: PYI051 raw and lfq are special cases and not equivalent to string
] = "assay",
):
super().__init__(
column_mapping=column_mapping, measurement_regex=measurement_regex
)
def _load_file(self, file_path: str) -> pd.DataFrame:
"""Load MZTab file and extract protein data section.
Parameters
----------
file_path : str
Path to MZTab file
Returns
-------
pd.DataFrame
DataFrame containing protein data from MZTab file
Notes
-----
Protein lines are indicated with a leading `PRT`. The protein metadata header is
indicated with a leading `PRH`. The file is tab separated.
Raises
------
ValueError
If no protein data or metadata is found in the file
"""
file_path = Path(file_path)
protein_header = None
protein_rows = []
with file_path.open() as f:
for line in f:
line_stripped = line.strip()
if line_stripped.startswith(self._PROTEIN_HEADER_INDICATOR):
# Protein header line - remove 'PRH' prefix and parse columns
header_content = line_stripped[3:].strip()
protein_header = header_content.split(self._SEPARATOR)
elif line_stripped.startswith(self._PROTEIN_ROW_INDICATOR):
# Protein data line - remove 'PRT' prefix and parse data
row_content = line_stripped[3:].strip()
protein_rows.append(row_content.split(self._SEPARATOR))
# Validate that we found protein data
if protein_header is None:
raise ValueError(
f"No protein header ({self._PROTEIN_HEADER_INDICATOR}) found in MZTab file"
)
if not protein_rows:
raise ValueError(
f"No protein data rows ({self._PROTEIN_ROW_INDICATOR}) found in MZTab file"
)
return pd.DataFrame(protein_rows, columns=protein_header)
pg_reader_provider.register_reader("mztab", reader_class=MZTabPGReader)