Source code for alphabase.psm_reader.alphapept_reader

"""Reader for AlphaPept's .ms_data.hdf files."""

from pathlib import Path
from typing import Tuple

import h5py
import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import ModificationKeys
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
    PSMReaderBase,
    psm_reader_provider,
)

# Module-level constant for use in numba-compiled functions
_SEPARATOR = ModificationKeys.SEPARATOR


[docs] @numba.njit def parse_ap(precursor: str) -> Tuple[str, str, str, str, int]: """Parser to parse peptide strings.""" items = precursor.split("_") decoy = 1 if len(items) == 3 else 0 # noqa: PLR2004 magic value modseq = items[0] charge = items[-1] parsed = [] mods = [] sites = [] string = "" for i in range(len(modseq)): if modseq[i].isupper(): break if i > 0: sites.append("0") mods.append(modseq[:i]) modseq = modseq[i:] for i in modseq: string += i if i.isupper(): parsed.append(i) if len(string) > 1: sites.append(str(len(parsed))) mods.append(string) string = "" return ( "".join(parsed), _SEPARATOR.join(mods), _SEPARATOR.join(sites), charge, decoy, )
[docs] class AlphaPeptReader(PSMReaderBase): """Reader for AlphaPept's .ms_data.hdf files.""" _reader_type = "alphapept" def _load_file(self, filename: str) -> pd.DataFrame: """Load an AlphaPept output file to a DataFrame.""" with h5py.File(filename, "r") as _hdf: dataset = _hdf[ "identifications" ] # TODO: "identifications" could be moved to yaml df = pd.DataFrame({col: dataset[col] for col in dataset}) # TODO: make this more stable df[PsmDfCols.RAW_NAME] = Path(filename).name[: -len(".ms_data.hdf")] return df def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame: """AlphaPept-specific preprocessing of output data.""" df["precursor"] = df["precursor"].str.decode("utf-8") # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8') if "scan_no" in df.columns: df["scan_no"] = df["scan_no"].astype("int") df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int) return df def _load_modifications(self, origin_df: pd.DataFrame) -> None: ( self._psm_df[PsmDfCols.SEQUENCE], self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES], _charges, self._psm_df[PsmDfCols.DECOY], ) = zip(*origin_df["precursor"].apply(parse_ap)) self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)
[docs] def register_readers() -> None: """Register readers for AlphaPept's .ms_data.hdf files.""" psm_reader_provider.register_reader("alphapept", AlphaPeptReader)