Source code for alphabase.spectral_library.flat

import warnings
from typing import Union

import pandas as pd

from alphabase.constants.spectral_library import LOSS_NUMBER_TO_TYPE
from alphabase.io.hdf import HDF_File
from alphabase.peptide.fragment import (
    create_dense_matrices,
    filter_valid_charged_frag_types,
    flatten_fragments,
    remove_unused_fragments,
    sort_charged_frag_types,
)
from alphabase.spectral_library.base import SpecLibBase, get_available_columns


class SpecLibFlat(SpecLibBase):
    """
    Flatten the spectral library (SpecLibBase) by using :meth:`parse_base_library`.

    Attributes
    ----------
    custom_fragment_df_columns : list of str
        'mz' and 'intensity' columns are required in :attr:`fragment_df`,
        others could be customized.
        It can include ['type','number','position','charge','loss_type'].

    min_fragment_intensity : float
        minimal intensity to keep in :attr:`fragment_df`.

    keep_top_k_fragments : int
        top k highest peaks to keep in :attr:`fragment_df`.
    """

    key_numeric_columns = SpecLibBase.key_numeric_columns + [
        "flat_frag_start_idx",
        "flat_frag_stop_idx",
    ]
    """
    :obj:`SpecLibBase.key_numeric_columns <alphabase.spectral_library.base.SpecLibBase.key_numeric_columns>`
    + `['flat_frag_start_idx','flat_frag_stop_idx']`.
    """

    def __init__(
        self,
        charged_frag_types: Union[list, None] = None,
        min_fragment_intensity: float = 0.001,
        keep_top_k_fragments: int = 1000,
        custom_fragment_df_columns: Union[list, None] = None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        charged_frag_types : list, optional
            Charged fragment types forwarded to ``SpecLibBase.__init__``,
            defaults to ['b_z1','b_z2','y_z1','y_z2'].

        min_fragment_intensity : float, optional
            minimal intensity to keep, by default 0.001

        keep_top_k_fragments : int, optional
            top k highest peaks to keep, by default 1000

        custom_fragment_df_columns : list, optional
            See :attr:`custom_fragment_df_columns`,
            defaults to ['type','number','position','charge','loss_type']

        **kwargs
            Accepted for call compatibility; not used by this constructor.
        """
        # ``None`` sentinels give every instance its own list objects instead of
        # sharing one mutable default between all instances.
        if charged_frag_types is None:
            charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"]
        if custom_fragment_df_columns is None:
            custom_fragment_df_columns = [
                "type",
                "number",
                "position",
                "charge",
                "loss_type",
            ]

        super().__init__(charged_frag_types=charged_frag_types)
        self.min_fragment_intensity = min_fragment_intensity
        self.keep_top_k_fragments = keep_top_k_fragments
        self.custom_fragment_df_columns = custom_fragment_df_columns

    @property
    def fragment_df(self) -> pd.DataFrame:
        """The flat fragment dataframe with columns
        (['mz', 'intensity'] + :attr:`custom_fragment_df_columns`.)
        """
        return self._fragment_df

    @property
    def protein_df(self) -> pd.DataFrame:
        """Protein dataframe"""
        return self._protein_df

    def remove_unused_fragments(self):
        """Remove unused fragments from the flat :attr:`fragment_df`.

        Overrides the method inherited from :class:`SpecLibBase`: the module-level
        ``remove_unused_fragments`` helper is called with the flat index columns
        (`flat_frag_start_idx`, `flat_frag_stop_idx`), and both ``_precursor_df``
        and ``_fragment_df`` are replaced by the dataframes it returns.
        """
        self._precursor_df, (self._fragment_df,) = remove_unused_fragments(
            self._precursor_df,
            (self._fragment_df,),
            frag_start_col="flat_frag_start_idx",
            frag_stop_col="flat_frag_stop_idx",
        )

    def parse_base_library(
        self,
        library: SpecLibBase,
        keep_original_frag_dfs: bool = False,
        copy_precursor_df: bool = False,
        **kwargs,
    ):
        """
        Flatten a library object of SpecLibBase or its inherited class.
        This method will generate :attr:`precursor_df` and :attr:`fragment_df`.
        The fragments in fragment_df can be located by
        `flat_frag_start_idx` and `flat_frag_stop_idx` in precursor_df.

        Parameters
        ----------
        library : SpecLibBase
            A library object with attributes
            `precursor_df`, `fragment_mz_df` and `fragment_intensity_df`.

        keep_original_frag_dfs : bool, default False
            If `fragment_mz_df` and `fragment_intensity_df` (and any other
            available dense fragment dataframes) are kept in this library.
            Emits a ``DeprecationWarning`` when True.

        copy_precursor_df : bool, default False
            If True, make a copy of `precursor_df` from `library`,
            otherwise `flat_frag_start_idx` and `flat_frag_stop_idx`
            columns will also append to the `library`.

        **kwargs
            Forwarded to ``flatten_fragments``.
        """
        self._precursor_df, self._fragment_df = flatten_fragments(
            library.precursor_df.copy() if copy_precursor_df else library.precursor_df,
            library.fragment_mz_df,
            library.fragment_intensity_df,
            min_fragment_intensity=self.min_fragment_intensity,
            keep_top_k_fragments=self.keep_top_k_fragments,
            custom_columns=self.custom_fragment_df_columns,
            **kwargs,
        )

        # Not every library carries protein information; fall back to an empty frame.
        if hasattr(library, "protein_df"):
            self._protein_df = library.protein_df
        else:
            self._protein_df = pd.DataFrame()

        if keep_original_frag_dfs:
            self.charged_frag_types = library.fragment_mz_df.columns.values
            for dense_frag_df in library.available_dense_fragment_dfs():
                setattr(self, dense_frag_df, getattr(library, dense_frag_df))

            # stacklevel=2 attributes the warning to the caller of this method.
            warnings.warn(
                "The SpecLibFlat object will have a strictly flat representation in the future. keep_original_frag_dfs=True will be deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )

    def save_hdf(self, hdf_file: str):
        """Save library dataframes into hdf_file.
        For `self.precursor_df`, this method will save it into two hdf groups:
            hdf_file: `library/precursor_df` and `library/mod_seq_df`.

        `library/precursor_df` contains all essential numeric columns those
        can be loaded faster from hdf file into memory:
        `['precursor_mz', 'charge', 'mod_seq_hash', 'mod_seq_charge_hash',
        'frag_start_idx', 'frag_stop_idx', 'flat_frag_start_idx', 'flat_frag_stop_idx',
        'decoy', 'rt_pred', 'ccs_pred', 'mobility_pred', 'miss_cleave', 'nAA',
        'isotope_mz_m1', 'isotope_intensity_m1', ...]`

        `library/mod_seq_df` contains all string columns and the other
        not essential columns:
        'sequence','mods','mod_sites', ['proteins', 'genes']...
        as well as 'mod_seq_hash', 'mod_seq_charge_hash' columns to map
        back to `precursor_df`

        After the base-class save, `fragment_df`, `protein_df`, `fragment_mz_df`
        and `fragment_intensity_df` are written to the `library` group.

        Parameters
        ----------
        hdf_file : str
            the hdf file path to save
        """
        super().save_hdf(hdf_file)
        # Re-open the file written by the base class and add the flat-specific frames.
        _hdf = HDF_File(hdf_file, read_only=False, truncate=True, delete_existing=False)
        _hdf.library.fragment_df = self.fragment_df
        _hdf.library.protein_df = self.protein_df
        _hdf.library.fragment_mz_df = self.fragment_mz_df
        _hdf.library.fragment_intensity_df = self.fragment_intensity_df

    def load_hdf(
        self,
        hdf_file: str,
        load_mod_seq: bool = False,
        infer_charged_frag_types: bool = True,
    ):
        """Load the hdf library from hdf_file

        Parameters
        ----------
        hdf_file : str
            hdf library path to load

        load_mod_seq : bool, optional
            if also load mod_seq_df.
            Defaults to False.

        infer_charged_frag_types : bool, optional
            if True, infer the charged fragment types as defined in the hdf file,
            defaults to True.
            This is the default as users most likely don't know the charged
            fragment types in the hdf file.
            If set to False, only charged frag types defined in
            `SpecLibBase.charged_frag_types` will be loaded.
        """
        super().load_hdf(hdf_file, load_mod_seq=load_mod_seq)
        _hdf = HDF_File(
            hdf_file,
        )
        self._fragment_df = _hdf.library.fragment_df.values
        self._protein_df = _hdf.library.protein_df.values

        if infer_charged_frag_types:
            self.charged_frag_types = sort_charged_frag_types(
                filter_valid_charged_frag_types(_hdf.library.fragment_mz_df.columns)
            )

        # Keep only the columns that match the selected charged fragment types.
        _fragment_intensity_df = _hdf.library.fragment_intensity_df.values
        self._fragment_intensity_df = _fragment_intensity_df[
            get_available_columns(_fragment_intensity_df, self.charged_frag_types)
        ]

        _fragment_mz_df = _hdf.library.fragment_mz_df.values
        self._fragment_mz_df = _fragment_mz_df[
            get_available_columns(_fragment_mz_df, self.charged_frag_types)
        ]

    def get_full_charged_types(self, frag_df: pd.DataFrame) -> list:
        """
        Infer the full set of charged fragment types from the fragment dataframe.

        By "full" we mean a complete set of fragment types for each charge:
        if we have a fragment b_z1 we should also have a fragment y_z1
        and vice versa.

        .. deprecated::
            Emits a ``DeprecationWarning`` on every call.

        As a side effect, ``self.frag_types_as_char`` is set to a mapping from
        the integer codes found in ``frag_df['type']`` to their characters.

        Parameters
        ----------
        frag_df : pd.DataFrame
            The fragment dataframe; must contain the columns
            'type', 'loss_type' and 'charge'.

        Returns
        -------
        charged_frag_types : list
            The full set of charged fragment types as a sorted list of strings
            such as ['a_z1','b_z1','c_z1','x_z1','y_z1','z_z1']
        """
        warnings.warn(
            "The get_full_charged_types method is deprecated. Use get_charged_frag_types instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        unique_charge_type_pairs = frag_df[
            ["type", "loss_type", "charge"]
        ].drop_duplicates()

        # Fragment types are stored as ASCII codes; map them back to characters.
        self.frag_types_as_char = {
            i: chr(i) for i in unique_charge_type_pairs["type"].unique()
        }

        charged_frag_types = set()
        # If we have a fragment type that is a, b or c we should also have the
        # corresponding x, y or z (and vice versa).
        corresponding = {"a": "x", "b": "y", "c": "z", "x": "a", "y": "b", "z": "c"}

        for frag_type, loss_type, max_charge in unique_charge_type_pairs.values:
            frag_char = self.frag_types_as_char[frag_type]
            loss_str = LOSS_NUMBER_TO_TYPE[loss_type]
            for possible_charge in range(1, max_charge + 1):
                # The observed series ...
                charged_frag_types.add(f"{frag_char}{loss_str}_z{possible_charge}")
                # ... and its complementary series.
                charged_frag_types.add(
                    f"{corresponding[frag_char]}{loss_str}_z{possible_charge}"
                )

        # Sorted so the result does not depend on set iteration order.
        return sorted(charged_frag_types)

    def calc_dense_fragments(
        self,
        additional_columns: Union[list, None] = None,
        charged_frag_types: Union[list, None] = None,
    ) -> None:
        """
        Create a hybrid SpecLibFlat which has both flat and dense fragment representations.

        Converts the flat fragment representation to dense matrices and stores
        them in the object. Creates fragment_mz_df (using calculated m/z values)
        and fragment_intensity_df by default. For each additional column
        specified (e.g., 'intensity'), creates a corresponding
        _fragment_<column>_df matrix. Including 'mz' in additional_columns will
        use observed rather than calculated m/z values.

        Fragment types can be specified explicitly or inherited from
        self.charged_frag_types. Only fragments matching these types will be
        included in the dense matrices. Each fragment type (e.g., 'b_z1', 'y_z2')
        becomes a column in the resulting dense matrices.

        Updates the precursor_df with new frag_start_idx and frag_stop_idx
        columns for the dense representation.

        Parameters
        ----------
        additional_columns : Union[list, None], optional
            Additional fragment columns to convert to dense format,
            defaults to ['intensity']

        charged_frag_types : Union[list, None], optional
            Fragment types to include in dense format,
            defaults to self.charged_frag_types

        Returns
        -------
        None
            Modifies the SpecLibFlat object in place
        """
        if charged_frag_types is None:
            charged_frag_types = self.charged_frag_types

        if additional_columns is None:
            additional_columns = ["intensity"]

        df_collection, frag_start_idx, frag_stop_idx = create_dense_matrices(
            self._precursor_df,
            self._fragment_df,
            charged_frag_types,
            flat_columns=additional_columns,
        )

        # Each returned matrix is stored as the private attribute _fragment_<col>_df.
        for col, df in df_collection.items():
            setattr(self, f"_fragment_{col}_df", df)

        self.precursor_df["frag_start_idx"] = frag_start_idx
        self.precursor_df["frag_stop_idx"] = frag_stop_idx

    def to_speclib_base(
        self,
        flat_columns: Union[list, None] = None,
        charged_frag_types: Union[list, None] = None,
    ) -> SpecLibBase:
        """
        Convert the flat library to a new SpecLibBase object with dense fragment matrices.

        Creates a new SpecLibBase containing fragment_mz_df (using calculated
        m/z values). Flat columns like 'intensity' are transformed into dense
        matrices as fragment_intensity_df. For all columns specified in
        flat_columns, a corresponding _fragment_<column>_df matrix is created
        and assigned to the new SpecLibBase object.

        Warning
        -------
        If the column 'mz' is added to flat_columns, it will override the
        calculated m/z values in fragment_mz_df. To keep both the observed and
        the calculated m/z values, rename the flat mz column to 'mz_observed'
        before calling to_speclib_base.

        Fragment types can be specified explicitly or inherited from
        self.charged_frag_types. Only fragments matching these types will be
        included in the dense matrices. Each fragment type (e.g., 'b_z1', 'y_z2')
        becomes a column in the resulting dense matrices.

        The precursor_df is copied and updated with new dense fragment indices,
        removing any flat-specific columns
        (flat_frag_start_idx, flat_frag_stop_idx).

        Parameters
        ----------
        flat_columns : Union[list, None], optional
            Fragment columns from the flat representation to convert to dense
            format, defaults to ['intensity']

        charged_frag_types : Union[list, None], optional
            Fragment types to include in dense format,
            defaults to self.charged_frag_types

        Returns
        -------
        SpecLibBase
            A new SpecLibBase object with dense fragment representations
        """
        # Create SpecLibBase object on a copy so this flat library stays untouched.
        speclib_base = SpecLibBase()
        speclib_base._precursor_df = self._precursor_df.copy()

        if charged_frag_types is None:
            charged_frag_types = self.charged_frag_types

        if flat_columns is None:
            flat_columns = ["intensity"]

        speclib_base.charged_frag_types = charged_frag_types

        df_collection, frag_start_idx, frag_stop_idx = create_dense_matrices(
            speclib_base._precursor_df,
            self._fragment_df,
            speclib_base.charged_frag_types,
            flat_columns=flat_columns,
        )

        speclib_base.precursor_df["frag_start_idx"] = frag_start_idx
        speclib_base.precursor_df["frag_stop_idx"] = frag_stop_idx

        for col, df in df_collection.items():
            setattr(speclib_base, f"_fragment_{col}_df", df)

        # Drop flat indices from precursor_df if they exist
        speclib_base._precursor_df = speclib_base._precursor_df.drop(
            ["flat_frag_start_idx", "flat_frag_stop_idx"], axis=1, errors="ignore"
        )

        return speclib_base

    def to_SpecLibBase(self) -> SpecLibBase:
        """Deprecated alias of :meth:`to_speclib_base`.

        Emits a ``DeprecationWarning`` and returns ``self.to_speclib_base()``
        called with its default arguments.
        """
        # stacklevel=2 attributes the warning to the caller of this method.
        warnings.warn(
            "The to_SpecLibBase method is deprecated. Use to_speclib_base instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.to_speclib_base()