Source code for alphabase.psm_reader.alphapept_reader
"""Reader for AlphaPept's .ms_data.hdf files."""
from pathlib import Path
from typing import Tuple
import h5py
import numba
import numpy as np
import pandas as pd
from alphabase.constants.modification import ModificationKeys
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
)
# Module-level alias of the modification separator, kept as a plain global for
# use in numba-compiled functions (`parse_ap` joins modification names and
# modification sites with it).
_SEPARATOR = ModificationKeys.SEPARATOR
[docs]
@numba.njit
def parse_ap(precursor: str) -> Tuple[str, str, str, str, int]:
    """Parse an AlphaPept precursor string into its components.

    The precursor is split on "_": the first field is the modified sequence,
    the last field is the charge, and a precursor with exactly three fields is
    flagged as a decoy.

    Inside the modified sequence, upper-case characters are residues. Any
    non-upper-case characters in front of the first residue are reported as a
    modification at site "0". Non-upper-case characters in front of a later
    residue are reported, together with that residue letter, as a modification
    at the 1-based position of the residue.

    Parameters
    ----------
    precursor : str
        Precursor string, "_"-separated.

    Returns
    -------
    tuple
        ``(sequence, mods, mod_sites, charge, decoy)`` where ``sequence`` is
        the residues only, ``mods`` and ``mod_sites`` are joined with
        ``_SEPARATOR``, ``charge`` is the last "_"-field as a string and
        ``decoy`` is 1 or 0.

    """
    items = precursor.split("_")
    decoy = 1 if len(items) == 3 else 0  # noqa: PLR2004 magic value
    modseq = items[0]
    charge = items[-1]

    parsed = []
    mods = []
    sites = []

    # Start at 0 so that an empty modified sequence (the loop body never runs)
    # is treated as "no leading modification" instead of reading an index that
    # was never assigned.
    first_residue = 0
    for first_residue in range(len(modseq)):
        if modseq[first_residue].isupper():
            break

    # Everything in front of the first residue is a modification at site "0".
    if first_residue > 0:
        sites.append("0")
        mods.append(modseq[:first_residue])
        modseq = modseq[first_residue:]

    # Collect characters until a residue is hit; more than one collected
    # character means the residue carries a modification.
    string = ""
    for char in modseq:
        string += char
        if char.isupper():
            parsed.append(char)
            if len(string) > 1:
                sites.append(str(len(parsed)))
                mods.append(string)
            string = ""

    return (
        "".join(parsed),
        _SEPARATOR.join(mods),
        _SEPARATOR.join(sites),
        charge,
        decoy,
    )
[docs]
class AlphaPeptReader(PSMReaderBase):
    """Reader for AlphaPept's .ms_data.hdf files."""

    _reader_type = "alphapept"

    def _load_file(self, filename: str) -> pd.DataFrame:
        """Load an AlphaPept output file to a DataFrame.

        Reads every entry of the "identifications" group of the HDF file into
        one column each and adds the raw file name derived from `filename`.

        Parameters
        ----------
        filename : str
            Path to the AlphaPept HDF file.

        Returns
        -------
        pd.DataFrame
            The identifications plus a `PsmDfCols.RAW_NAME` column.

        """
        with h5py.File(filename, "r") as _hdf:
            dataset = _hdf[
                "identifications"
            ]  # TODO: "identifications" could be moved to yaml
            df = pd.DataFrame({col: dataset[col] for col in dataset})

            # Only strip the AlphaPept suffix when it is really there; blindly
            # cutting len(suffix) characters would mangle any other file name.
            suffix = ".ms_data.hdf"
            file_name = Path(filename).name
            if file_name.lower().endswith(suffix):
                raw_name = file_name[: -len(suffix)]
            else:
                # Fallback for unexpected names: drop the last extension only.
                raw_name = Path(filename).stem
            df[PsmDfCols.RAW_NAME] = raw_name

        return df

    def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
        """AlphaPept-specific preprocessing of output data.

        Decodes the "precursor" column from bytes to str, derives "raw_idx"
        from "scan_no" when that column exists, and casts the charge column
        to int. The passed DataFrame is modified in place and returned.
        """
        df["precursor"] = df["precursor"].str.decode("utf-8")
        # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
        if "scan_no" in df.columns:
            df["scan_no"] = df["scan_no"].astype("int")
            df["raw_idx"] = df["scan_no"] - 1  # if thermo, use scan-1 as spec_idx
        df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
        return df

    def _load_modifications(self, origin_df: pd.DataFrame) -> None:
        """Fill sequence, modification and decoy columns of `self._psm_df`.

        Each value of the "precursor" column of `origin_df` is parsed with
        `parse_ap`; the parsed charge is discarded here.
        """
        (
            self._psm_df[PsmDfCols.SEQUENCE],
            self._psm_df[PsmDfCols.MODS],
            self._psm_df[PsmDfCols.MOD_SITES],
            _charges,
            self._psm_df[PsmDfCols.DECOY],
        ) = zip(*origin_df["precursor"].apply(parse_ap))
        self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)
[docs]
def register_readers() -> None:
    """Add the AlphaPept reader to the PSM reader provider as "alphapept"."""
    reader_type = "alphapept"
    psm_reader_provider.register_reader(reader_type, AlphaPeptReader)