Source code for alphabase.psm_reader.pfind_reader
"""pFind reader."""
from typing import Optional, Tuple, Union
import numpy as np
import pandas as pd
from pandas._libs.missing import NAType
import alphabase.constants.modification as ap_mod
from alphabase.constants.modification import ModificationKeys
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
)
def _convert_one_pfind_mod(mod: str) -> Optional[str]: # noqa: C901 too complex (11 > 10) TODO: refactor
if mod[-1] == ")":
mod = mod[: (mod.find("(") - 1)]
idx = mod.rfind("[")
name = mod[:idx]
site = mod[(idx + 1) :]
else:
idx = mod.rfind("[")
name = mod[:idx]
site = mod[(idx + 1) : -1]
if len(site) == 1:
return_value = name + ModificationKeys.SITE_SEPARATOR + site
elif site == "AnyN-term":
return_value = (
name + ModificationKeys.SITE_SEPARATOR + ModificationKeys.ANY_N_TERM
)
elif site == "ProteinN-term":
return_value = (
name + ModificationKeys.SITE_SEPARATOR + ModificationKeys.PROTEIN_N_TERM
)
elif site.startswith("AnyN-term"):
return_value = (
name
+ ModificationKeys.SITE_SEPARATOR
+ site[-1]
+ ModificationKeys.ANY_N_TERM_SPECIFIC
)
elif site.startswith("ProteinN-term"):
return_value = (
name
+ ModificationKeys.SITE_SEPARATOR
+ site[-1]
+ ModificationKeys.PROTEIN_N_TERM_SPECIFIC
)
elif site == "AnyC-term":
return_value = (
name + ModificationKeys.SITE_SEPARATOR + ModificationKeys.ANY_C_TERM
)
elif site == "ProteinC-term":
return_value = (
name + ModificationKeys.SITE_SEPARATOR + ModificationKeys.PROTEIN_C_TERM
)
elif site.startswith("AnyC-term"):
return_value = (
name
+ ModificationKeys.SITE_SEPARATOR
+ site[-1]
+ ModificationKeys.ANY_C_TERM_SPECIFIC
)
elif site.startswith("ProteinC-term"):
return_value = (
name
+ ModificationKeys.SITE_SEPARATOR
+ site[-1]
+ ModificationKeys.PROTEIN_C_TERM_SPECIFIC
)
else:
return_value = None
return return_value
[docs]
def translate_pFind_mod(mod_str: str) -> Union[str, NAType]: # noqa: N802 name `get_pFind_mods` should be lowercase TODO: used by peptdeep
"""Translate pFind modification string."""
if not mod_str:
return ""
ret_mods = []
for mod_ in mod_str.split(ModificationKeys.SEPARATOR):
mod = _convert_one_pfind_mod(mod_)
if not mod or mod not in ap_mod.MOD_INFO_DICT:
return pd.NA
ret_mods.append(mod)
return ModificationKeys.SEPARATOR.join(ret_mods)
[docs]
def get_pFind_mods(pfind_mod_str: str) -> Tuple[str, str]: # noqa: N802 name `get_pFind_mods` should be lowercase TODO: used by peptdeep
"""Parse pFind modification string."""
pfind_mod_str = pfind_mod_str.strip(ModificationKeys.SEPARATOR)
if not pfind_mod_str:
return "", ""
items = [
item.split(",", 3) for item in pfind_mod_str.split(ModificationKeys.SEPARATOR)
]
items = [
("-1", mod)
if (mod.endswith("C-term]") or mod[:-2].endswith("C-term"))
# else ('0', mod) if mod.endswith('N-term]')
else (site, mod)
for site, mod in items
]
items = list(zip(*items))
return ModificationKeys.SEPARATOR.join(items[1]), ModificationKeys.SEPARATOR.join(
items[0]
)
[docs]
def parse_pfind_protein(protein: str, *, keep_reverse: bool = True) -> str:
"""Parse pFind protein string."""
proteins = protein.strip("/").split("/")
return ModificationKeys.SEPARATOR.join(
[
protein
for protein in proteins
if (not protein.startswith("REV_") or keep_reverse)
]
)
[docs]
class pFindReader(PSMReaderBase): # noqa: N801 name `pFindReader` should use CapWords convention TODO: used by peptdeep, alpharaw
"""Reader for pFind's .txt files."""
_reader_type = "pfind"
def _translate_modifications(self) -> None:
pass
def _load_file(self, filename: str) -> pd.DataFrame:
"""Load a pFind output file to a DataFrame."""
return pd.read_csv(filename, index_col=False, sep="\t", keep_default_na=False)
def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""pFind-specific preprocessing of output data."""
df.fillna("", inplace=True)
df = df[df["Sequence"] != ""]
df[PsmDfCols.RAW_NAME] = df["File_Name"].str.split(".").apply(lambda x: x[0])
df["Proteins"] = df["Proteins"].apply(parse_pfind_protein)
return df
def _translate_decoy(self) -> None:
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "decoy"
).astype(np.int8)
def _translate_score(self) -> None:
"""Translate pFind pvalue to AlphaBase score: the larger the better."""
self._psm_df[PsmDfCols.SCORE] = -np.log(
self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100
)
def _load_modifications(self, origin_df: pd.DataFrame) -> None:
mods, mod_sites = zip(*origin_df["Modification"].apply(get_pFind_mods))
self._psm_df[PsmDfCols.MODS] = [translate_pFind_mod(mod) for mod in mods]
self._psm_df[PsmDfCols.MOD_SITES] = mod_sites
[docs]
def register_readers() -> None:
"""Register pFind readers."""
psm_reader_provider.register_reader("pfind", pFindReader)
psm_reader_provider.register_reader("pfind3", pFindReader)