import copy
import multiprocessing as mp
from typing import Any
import pandas as pd
from alphabase.spectral_library.base import SpecLibBase
def _batchify_series(series, mp_batch_size):
    """Yield consecutive slices of ``series`` for multiprocessing.

    Parameters
    ----------
    series : pd.Series
        Series to split. It is sliced positionally with ``iloc``, so every
        batch keeps the index labels of its rows.
    mp_batch_size : int
        Maximum number of rows per batch. Must be positive.

    Yields
    ------
    pd.Series
        Slices of at most ``mp_batch_size`` rows, in their original order.

    Raises
    ------
    ValueError
        If ``mp_batch_size`` is not positive. Because this is a generator,
        the error is raised when iteration starts.
    """
    if mp_batch_size <= 0:
        # A negative step would make ``range`` empty and silently drop every
        # row; a zero step fails with an unrelated-looking ``range`` message.
        raise ValueError(f"mp_batch_size must be positive, got {mp_batch_size}.")
    for start in range(0, len(series), mp_batch_size):
        yield series.iloc[start : start + mp_batch_size]
class BaseDecoyGenerator:
    """
    Base class for decoy generators.

    A class is used instead of a plain function because the generator
    needs to be pickled for multiprocessing.
    """

    def __call__(self, series: pd.Series) -> pd.Series:
        """
        Main entry of this class: apply `self._decoy()` to every element.

        Parameters
        ----------
        series : pd.Series
            Target peptide sequences.

        Returns
        -------
        pd.Series
            Result of `series.apply(self._decoy)`.
        """
        return series.apply(self._decoy)

    def _decoy(self, sequence: str) -> str:
        """
        Turn one target sequence into a decoy sequence.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses provide the logic.
        """
        raise NotImplementedError("Subclass should implement this method.")
class DIANNDecoyGenerator(BaseDecoyGenerator):
    """
    DiaNN-like decoy generator.

    The second and the second-to-last residue of a sequence are replaced
    according to a fixed substitution table; all other residues are kept.
    """

    def __init__(
        self,
        raw_AAs: str = "GAVLIFMPWSCTYHKRQENDBJOUXZsty",
        mutated_AAs: str = "LLLVVLLLLTSSSSLLNDQEVVVVVVtss",
    ):
        """
        DiaNN-like decoy peptide generator

        Parameters
        ----------
        raw_AAs : str, optional
            Residues to mutate from.
            Defaults to 'GAVLIFMPWSCTYHKRQENDBJOUXZsty'.

        mutated_AAs : str, optional
            Residues to mutate to; the character at position ``i`` replaces
            ``raw_AAs[i]``.
            Defaults to 'LLLVVLLLLTSSSSLLNDQEVVVVVVtss'.
        """
        self.raw_AAs = raw_AAs
        self.mutated_AAs = mutated_AAs

    def _mutate(self, aa: str) -> str:
        """Return the replacement for one residue.

        Raises
        ------
        ValueError
            If ``aa`` does not occur in ``self.raw_AAs``.
        """
        return self.mutated_AAs[self.raw_AAs.index(aa)]

    def _decoy(self, sequence: str) -> str:
        """
        Mutate the second and the second-to-last residue of ``sequence``.

        The returned decoy always has the same length as the input.
        Sequences shorter than three residues have no interior residue and
        are returned unchanged.

        Raises
        ------
        ValueError
            If a residue to be mutated is not in ``self.raw_AAs``.
        """
        n_aa = len(sequence)
        if n_aa < 3:
            return sequence
        residues = list(sequence)
        # For a three-residue sequence both positions coincide; the set makes
        # sure the middle residue is mutated once rather than emitted twice.
        for pos in {1, n_aa - 2}:
            residues[pos] = self._mutate(sequence[pos])
        return "".join(residues)
class PseudoReverseDecoyGenerator(BaseDecoyGenerator):
    """Decoy generator that reverses the peptide sequence."""

    def __init__(self, fix_C_term: bool = True):
        """
        Pseudo-reverse decoy generator.

        Parameters
        ----------
        fix_C_term : bool, optional
            If True, the last residue stays in place and only the residues
            before it are reversed.
            Defaults to True.
        """
        self.fix_C_term = fix_C_term

    def _decoy(self, sequence: str) -> str:
        """
        Reverse ``sequence``, keeping its last residue in place when
        ``self.fix_C_term`` is set.
        """
        if not self.fix_C_term:
            return sequence[::-1]
        # Slice (``[-1:]``) instead of indexing so that an empty sequence
        # yields an empty decoy rather than raising IndexError.
        return sequence[:-1][::-1] + sequence[-1:]
class SpecLibDecoy(SpecLibBase):
    """
    Decoy spectral library derived from a target library.

    Decoy sequences are produced by a pluggable sequence generator
    (pseudo-reverse by default).
    """

    def __init__(
        self,
        target_lib: SpecLibBase,
        decoy_generator: Any = PseudoReverseDecoyGenerator,
        **kwargs,
    ):
        """
        Parameters
        ----------
        target_lib : SpecLibBase
            Target library to decoy.

        decoy_generator : Any, optional
            Callable that is invoked as `decoy_generator(**kwargs)` to
            create the sequence generator.
            Defaults to `PseudoReverseDecoyGenerator`.

        **kwargs
            Forwarded to `decoy_generator`, e.g. `fix_C_term` for
            `PseudoReverseDecoyGenerator`.

        Attributes
        ----------
        target_lib : SpecLibBase
            same as 'target_lib' in Args.

        generator
            The object returned by `decoy_generator(**kwargs)`.
        """
        # Start from a deep copy of the target library's attributes so the
        # decoy library does not share them with the target.
        self.__dict__ = copy.deepcopy(target_lib.__dict__)
        self.target_lib = target_lib
        self.generator = decoy_generator(**kwargs)

    def translate_to_decoy(
        self, multiprocessing: bool = True, mp_batch_size=10000, mp_process_num: int = 8
    ):
        """
        Main entry of this class, it calls the following methods:

        - self.decoy_sequence()

        Parameters
        ----------
        multiprocessing : bool, optional
            If true use multiprocessing.
            Defaults to True.

        mp_batch_size : int, optional
            Batch size for multiprocessing.
            Defaults to 10000.

        mp_process_num : int, optional
            Number of processes for multiprocessing.
            Defaults to 8.
        """
        self.decoy_sequence(
            multiprocessing=multiprocessing,
            mp_batch_size=mp_batch_size,
            mp_process_num=mp_process_num,
        )

    def append_to_target_lib(self):
        """
        A decoy method should define how to append itself to target_lib.
        Sub-classes should override this method when necessary.

        This implementation drops decoys whose sequence also occurs in the
        target library, sets the `decoy` column to 1 for decoys and 0 for
        targets, concatenates both precursor tables into
        `target_lib._precursor_df` and calls `target_lib.refine_df()`.
        """
        self._remove_target_seqs()
        self._precursor_df["decoy"] = 1
        self.target_lib._precursor_df["decoy"] = 0
        self.target_lib._precursor_df = pd.concat(
            (self.target_lib._precursor_df, self._precursor_df), ignore_index=True
        )
        self.target_lib.refine_df()

    def decoy_sequence(
        self, multiprocessing: bool = True, mp_batch_size=10000, mp_process_num: int = 8
    ):
        """
        Replace the `sequence` column of this library with decoy sequences
        produced by `self.generator`, then drop the rows whose decoy
        sequence also occurs in `self.target_lib`.

        Parameters
        ----------
        multiprocessing : bool, optional
            If true use multiprocessing. Libraries with fewer rows than
            `mp_batch_size` are always processed in the current process.
            Defaults to True.

        mp_batch_size : int, optional
            Batch size for multiprocessing.
            Defaults to 10000.

        mp_process_num : int, optional
            Number of processes for multiprocessing.
            Defaults to 8.
        """
        sequences = self._precursor_df["sequence"]
        if not multiprocessing or self._precursor_df.shape[0] < mp_batch_size:
            decoys = self.generator(sequences)
        else:
            sequence_batches = list(_batchify_series(sequences, mp_batch_size))
            with mp.get_context("spawn").Pool(mp_process_num) as pool:
                # `imap` returns results in submission order. They must be
                # collected before the `with` block exits and stops the pool.
                series_list = list(pool.imap(self.generator, sequence_batches))
            decoys = pd.concat(series_list)
        self._precursor_df["sequence"] = decoys
        self._remove_target_seqs()

    def _remove_target_seqs(self):
        """Drop, in place, every row whose sequence occurs in the target library."""
        target_seqs = set(self.target_lib._precursor_df.sequence.values)
        is_target = self._precursor_df.sequence.isin(target_seqs)
        self._precursor_df.drop(
            self._precursor_df.loc[is_target].index,
            inplace=True,
        )
class SpecLibDecoyProvider:
    """Registry that maps decoy method names to decoy generator classes."""

    def __init__(self):
        # Lower-cased method name -> registered generator class.
        self.decoy_dict = {}

    def register(self, name: str, decoy_class: Any):
        """Register a new decoy class

        Parameters
        ----------
        name : str
            Name of the decoy method; it is stored lower-cased, so lookups
            are case-insensitive.

        decoy_class : Any
            Object later passed to `SpecLibDecoy` as `decoy_generator`.
        """
        self.decoy_dict[name.lower()] = decoy_class

    def get_decoy_lib(
        self, name: str, target_lib: "SpecLibBase", **kwargs
    ) -> "SpecLibDecoy | None":
        """Get a `SpecLibDecoy` object that uses the decoy generator
        registered under `name`.

        Parameters
        ----------
        name : str
            Registered decoy class name (case-insensitive)

        target_lib : SpecLibBase
            Target library for decoy generation

        **kwargs
            Forwarded to `SpecLibDecoy`.

        Returns
        -------
        SpecLibDecoy
            Decoy library object, or None if `name` is empty, "none" or
            "null".

        Raises
        ------
        ValueError
            If `name` has not been registered.
        """
        if not name:
            return None
        name = name.lower()
        if name in ("none", "null"):
            return None
        if name not in self.decoy_dict:
            raise ValueError(f"Decoy method {name} not found.")
        return SpecLibDecoy(
            target_lib, decoy_generator=self.decoy_dict[name], **kwargs
        )
decoy_lib_provider: SpecLibDecoyProvider = SpecLibDecoyProvider()
"""
Factory object of `SpecLibDecoyProvider` to
register and get different types of decoy methods.
"""
# Register the decoy generators defined in this module.
# `register` stores names lower-cased.
decoy_lib_provider.register("pseudo_reverse", PseudoReverseDecoyGenerator)
decoy_lib_provider.register("diann", DIANNDecoyGenerator)