Source code for alphabase.pg_reader.alphapept_pg_reader

"""AlphaPept protein group reader."""

import re
import warnings
from typing import Any, Literal, Optional, Union

import pandas as pd

from alphabase.constants.modification import ModificationKeys

from .keys import PGCols
from .pg_reader import PGReaderBase, pg_reader_provider


class AlphaPeptPGReader(PGReaderBase):
    """Reader for protein group matrices from the alphapept search engine.

    Per default, the reader will read raw intensities from the protein group
    matrix. By passing a suitable regular expression, it is also possible to
    extract LFQ-corrected intensities from the reader.

    Notes
    -----
    AlphaPept protein group matrices contain both raw intensities and
    LFQ-corrected intensities. The LFQ-corrected intensities are marked by an
    `_LFQ` suffix.

    In order to read alphapept `.hdf` output, please install the package with
    extra optional dependencies `pip install "alphabase[hdf]"`.

    Examples
    --------
    Get example data

    .. code-block:: python

        import os
        import tempfile
        from alphabase.tools.data_downloader import DataShareDownloader
        from alphabase.pg_reader import AlphaPeptPGReader

        # Download to temporary directory
        URL = "https://datashare.biochem.mpg.de/s/6G6KHJqwcRPQiOO"
        download_dir = tempfile.mkdtemp()
        download_path = DataShareDownloader(url=URL, output_dir=download_dir).download()

    Per default, the reader will return the raw intensities. Additional protein
    features are stored in the dataframe index, samples are stored as columns.

    .. code-block:: python

        # Get raw intensities
        reader = AlphaPeptPGReader()
        results = reader.import_file(download_path)

        results.index.names
        > FrozenList(['proteins', 'uniprot_ids', 'ensembl_ids', 'source_db', 'is_decoy'])

        results.columns
        > Index(['A', 'B'], dtype='object')

    To read the LFQ values, pass the pre-configured key `lfq` to the reader,
    which represents a regular expression that automatically extracts the
    `LFQ` columns from the protein group table.

    .. code-block:: python

        # Get LFQ-corrected intensities
        reader = AlphaPeptPGReader(measurement_regex="lfq")
        results = reader.import_file(download_path)

        results.index.names
        > FrozenList(['proteins', 'uniprot_ids', 'ensembl_ids', 'source_db', 'is_decoy'])

        results.columns
        > Index(['A_LFQ', 'B_LFQ'], dtype='object')

    To check out all preconfigured regular expressions, use the
    `get_preconfigured_regex` method:

    .. code-block:: python

        AlphaPeptPGReader.get_preconfigured_regex()
        > {'raw': '^.*(?<!_LFQ)$', 'lfq': '_LFQ$'}

    """

    _reader_type: str = "alphapept"

    # Report file settings (delimiter + index column)
    _FILE_DELIMITER: str = ","
    # alphapept does not set a name for the feature column, i.e. it is set to the pandas default
    _INDEX_COL: str = "Unnamed: 0"
    # Default delimiter in fasta file headers
    _ENTRY_DELIMITER: str = "|"

    # Feature settings
    # Decoys are prefixed with REV__ in alphapept
    _DECOY_REGEX: str = "^REV__"
    # Ensembl IDs are identified with a ENSEMBL prefix
    _ENSEMBL_REGEX: str = "^ENSEMBL:"
    _ENSEMBL_NAME: str = "ENSEMBL"

    # The expected length of fasta headers is 3 (sp|Uniprot ID|Uniprot Name)
    _FASTA_HEADER_DEFAULT_LENGTH: int = 3

    # Placeholder written for components that an entry does not provide
    _NA_STR: str = "na"
    # Delimiter used to join the components of all entries of one protein group
    _PG_DELIMITER: str = ModificationKeys.SEPARATOR

    # Level names (and order) of the parsed feature index. Passing them explicitly
    # keeps the index well-formed even if the report contains no rows.
    _INDEX_LEVELS: tuple = (
        PGCols.PROTEINS,
        PGCols.UNIPROT_IDS,
        PGCols.ENSEMBL_IDS,
        PGCols.SOURCE_DB,
        PGCols.DECOY_INDICATOR,
    )

    def __init__(
        self,
        *,
        column_mapping: Optional[dict[str, Any]] = None,
        measurement_regex: Union[str, Literal["raw", "lfq"], None] = "raw",  # noqa: PYI051 raw and lfq are special cases and not equivalent to string
    ):
        """Initialize AlphaPept protein group matrix reader.

        Parameters
        ----------
        column_mapping
            Dictionary mapping alphabase column names (keys) to AlphaPept column
            names (values). If `None`, uses default mapping from configuration file.
        measurement_regex
            Pattern to select quantity columns

            - "raw" (default): Raw intensities (excludes _LFQ columns)
            - "lfq": LFQ-corrected intensities (_LFQ suffix)
            - str: Custom regular expression pattern
            - None: All quantity columns

            See class documentation for usage examples and
            `get_preconfigured_regex()` for available patterns.

        """
        super().__init__(
            column_mapping=column_mapping, measurement_regex=measurement_regex
        )

    def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess an alphapept protein group report and return a modified copy.

        Replaces the raw feature column by a parsed, streamlined multi-level index.

        Parameters
        ----------
        df
            alphapept protein group report.

        Returns
        -------
        :class:`pd.DataFrame`
            Modified copy of protein group report with parsed index. The index
            contains the levels

            - proteins: str
            - uniprot_ids: str
            - ensembl_ids: str
            - source_db: str
            - is_decoy: bool

        """
        # alphapept does not set a name for the feature column: it is loaded as a
        # regular column and moved to the index here. `set_index` is not in-place
        # and returns a new frame, so the caller's dataframe is left untouched.
        df = df.set_index(self._INDEX_COL)

        # Parse every raw identifier into its components
        parsed_index = [
            self._parse_alphapept_index(identifier) for identifier in df.index
        ]

        # Overwrite index with streamlined version. The level names are passed
        # explicitly so that a report without rows yields an empty index with the
        # expected levels instead of failing on a frame without columns.
        df.index = pd.MultiIndex.from_frame(
            pd.DataFrame(parsed_index, columns=list(self._INDEX_LEVELS))
        )

        return df

    def _parse_alphapept_index(self, identifier: str) -> dict[str, Union[str, bool]]:
        """Parse protein identifier from AlphaPept protein group table.

        The identifier is split at commas into the entries of the protein group.
        Every entry contributes exactly one value to each string component, so the
        joined components stay positionally aligned; components an entry does not
        provide are filled with `self._NA_STR`.

        Parameters
        ----------
        identifier : str
            Protein identifier string from AlphaPept

        Returns
        -------
        dict
            Dictionary with parsed components, each joined with `self._PG_DELIMITER`:

            - proteins: str, protein names
            - uniprot_ids: str, UniProt IDs
            - ensembl_ids: str, ENSEMBL IDs
            - source_db: str, data sources
            - is_decoy: bool, True if any identifier in a protein group starts with "REV__"

        Notes
        -----
        The decoy prefix is not stripped from the parsed components. A decoy entry
        with an ENSEMBL identifier (e.g. ``REV__ENSEMBL:...``) does not match the
        anchored ENSEMBL pattern and is therefore handled like a plain identifier.

        Examples
        --------
        The examples assume that `self._PG_DELIMITER` is ";" and `self._NA_STR` is "na".

        .. code-block:: python

            # sp|Q9NQT4|EXOS5_HUMAN
            {"source_db": "sp", "uniprot_ids": "Q9NQT4", "ensembl_ids": "na", "proteins": "EXOS5_HUMAN", "is_decoy": False}

            # Q0IIK2
            {"source_db": "na", "uniprot_ids": "Q0IIK2", "ensembl_ids": "na", "proteins": "na", "is_decoy": False}

            # "sp|Q9H2K8|TAOK3_HUMAN,sp|Q7L7X3|TAOK1_HUMAN"
            {"source_db": "sp;sp", "uniprot_ids": "Q9H2K8;Q7L7X3", "ensembl_ids": "na;na", "proteins": "TAOK3_HUMAN;TAOK1_HUMAN", "is_decoy": False}

            # ENSEMBL:ENSBTAP00000024146
            {"source_db": "ENSEMBL", "uniprot_ids": "na", "ensembl_ids": "ENSBTAP00000024146", "proteins": "na", "is_decoy": False}

            # ENSEMBL:ENSBTAP00000024146,sp|P35520|CBS_HUMAN
            {"source_db": "ENSEMBL;sp", "uniprot_ids": "na;P35520", "ensembl_ids": "ENSBTAP00000024146;na", "proteins": "na;CBS_HUMAN", "is_decoy": False}

            # REV__sp|Q13085|ACACA_HUMAN
            {"source_db": "REV__sp", "uniprot_ids": "Q13085", "ensembl_ids": "na", "proteins": "ACACA_HUMAN", "is_decoy": True}

        """
        decoy_pattern = re.compile(self._DECOY_REGEX)
        ensembl_pattern = re.compile(self._ENSEMBL_REGEX)

        source_db: list[str] = []
        uniprot_ids: list[str] = []
        ensembl_ids: list[str] = []
        proteins: list[str] = []
        group_is_decoy = False

        # Multiple proteins are separated by comma
        for entry in identifier.split(","):
            # Decoys only flag the whole group; the prefix stays part of the entry.
            # TODO: How to handle REV sequences here?
            # Currently they are only marked by the DECOY_INDICATOR flag, but should
            # the individual identifiers be flagged as well?
            if decoy_pattern.search(entry):
                group_is_decoy = True

            if ensembl_pattern.search(entry):
                # ENSEMBL format (ENSEMBL:IDENTIFIER): keep the ID without its prefix
                source_db.append(self._ENSEMBL_NAME)
                uniprot_ids.append(self._NA_STR)
                proteins.append(self._NA_STR)
                ensembl_ids.append(ensembl_pattern.sub("", entry))
            elif self._ENTRY_DELIMITER in entry:
                # Fasta header format, e.g. sp|Q9H2K8|TAOK3_HUMAN
                parts = entry.split(self._ENTRY_DELIMITER)
                if len(parts) == self._FASTA_HEADER_DEFAULT_LENGTH:
                    source_db.append(parts[0])
                    uniprot_ids.append(parts[1])
                    proteins.append(parts[2])
                    ensembl_ids.append(self._NA_STR)
                else:
                    # Unexpected number of header fields: keep the raw entry as protein name
                    warnings.warn(
                        f"Encountered unexpected format. Set {entry} to proteins.",
                        stacklevel=2,
                    )
                    source_db.append(self._NA_STR)
                    uniprot_ids.append(self._NA_STR)
                    proteins.append(entry)
                    ensembl_ids.append(self._NA_STR)
            else:
                # No pipes or ENSEMBL prefix, assume it's just a UniProt ID (e.g. Q9H2K8)
                uniprot_ids.append(entry)
                source_db.append(self._NA_STR)
                proteins.append(self._NA_STR)
                ensembl_ids.append(self._NA_STR)

        # `str.split` always yields at least one entry and every branch above adds
        # one value to each list, so none of the lists can be empty here.
        join = self._PG_DELIMITER.join
        return {
            PGCols.PROTEINS: join(proteins),
            PGCols.UNIPROT_IDS: join(uniprot_ids),
            PGCols.ENSEMBL_IDS: join(ensembl_ids),
            PGCols.SOURCE_DB: join(source_db),
            PGCols.DECOY_INDICATOR: group_is_decoy,
        }

# Register the reader with the provider under the "alphapept" key.
pg_reader_provider.register_reader(
    "alphapept",
    reader_class=AlphaPeptPGReader,
)