# Source code for alphabase.spectral_library.flat

import warnings
from typing import Union

import pandas as pd

from alphabase.constants.spectral_library import LOSS_NUMBER_TO_TYPE
from alphabase.io.hdf import HDF_File
from alphabase.peptide.fragment import (
    create_dense_matrices,
    filter_valid_charged_frag_types,
    flatten_fragments,
    remove_unused_fragments,
    sort_charged_frag_types,
)
from alphabase.spectral_library.base import SpecLibBase, get_available_columns



[docs]
class SpecLibFlat(SpecLibBase):
    """
    Flatten the spectral library (SpecLibBase) by using :meth:`parse_base_library`.

    After flattening, the fragments of a precursor are located in
    :attr:`fragment_df` through the `flat_frag_start_idx` and
    `flat_frag_stop_idx` columns of the precursor dataframe.

    Attributes
    ----------
    custom_fragment_df_columns : list of str
        'mz' and 'intensity' columns are required in :attr:`fragment_df`,
        others could be customized.
        It can include ['type','number','position','charge','loss_type'].

    min_fragment_intensity : float
        minimal intensity to keep in :attr:`fragment_df`.

    keep_top_k_fragments : int
        top k highest peaks to keep in :attr:`fragment_df`.

    """

    key_numeric_columns = SpecLibBase.key_numeric_columns + [
        "flat_frag_start_idx",
        "flat_frag_stop_idx",
    ]
    """
    :obj:`SpecLibBase.key_numeric_columns <alphabase.spectral_library.base.SpecLibBase.key_numeric_columns>`
    + `['flat_frag_start_idx','flat_frag_stop_idx']`.
    """


[docs]
    def __init__(
        self,
        charged_frag_types: Union[list, None] = None,
        min_fragment_intensity: float = 0.001,
        keep_top_k_fragments: int = 1000,
        custom_fragment_df_columns: Union[list, None] = None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        charged_frag_types : list, optional
            Charged fragment types passed on to ``SpecLibBase.__init__``,
            defaults to ['b_z1','b_z2','y_z1','y_z2']

        min_fragment_intensity : float, optional
            minimal intensity to keep, by default 0.001

        keep_top_k_fragments : int, optional
            top k highest peaks to keep, by default 1000

        custom_fragment_df_columns : list, optional
            See :attr:`custom_fragment_df_columns`,
            defaults to ['type','number','position','charge','loss_type']

        **kwargs
            Accepted for call compatibility; not used by this constructor.
        """
        # Build the default lists per call: a list literal in the signature is
        # created once and would be shared (and mutable) across all instances.
        if charged_frag_types is None:
            charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"]
        if custom_fragment_df_columns is None:
            custom_fragment_df_columns = [
                "type",
                "number",
                "position",
                "charge",
                "loss_type",
            ]

        # NOTE(review): **kwargs is not forwarded to the base class, so extra
        # keyword arguments are silently ignored -- confirm this is intended.
        super().__init__(charged_frag_types=charged_frag_types)
        self.min_fragment_intensity = min_fragment_intensity
        self.keep_top_k_fragments = keep_top_k_fragments

        self.custom_fragment_df_columns = custom_fragment_df_columns


    @property
    def fragment_df(self) -> pd.DataFrame:
        """The flat fragment dataframe with columns
        (['mz', 'intensity'] + :attr:`custom_fragment_df_columns`.)

        Returns the internal ``_fragment_df``, which is assigned by
        :meth:`parse_base_library` and :meth:`load_hdf`. No setter is
        defined in this class.
        """
        return self._fragment_df

    @property
    def protein_df(self) -> pd.DataFrame:
        """Protein dataframe.

        Returns the internal ``_protein_df``, which is assigned by
        :meth:`parse_base_library` (an empty dataframe when the parsed
        library has no ``protein_df``) and by :meth:`load_hdf`.
        """
        return self._protein_df


[docs]
    def remove_unused_fragments(self):
        """Remove unused fragments from the flat :attr:`fragment_df`.

        Flat-library counterpart of the :class:`SpecLibBase` method of the
        same name: the module-level ``remove_unused_fragments`` helper is
        called on the flat fragment dataframe with the
        ``flat_frag_start_idx`` / ``flat_frag_stop_idx`` columns, and the
        internal precursor and fragment dataframes are replaced by what it
        returns.
        """
        flat_frag_dfs = (self._fragment_df,)
        updated_precursor_df, updated_frag_dfs = remove_unused_fragments(
            self._precursor_df,
            flat_frag_dfs,
            frag_start_col="flat_frag_start_idx",
            frag_stop_col="flat_frag_stop_idx",
        )
        self._precursor_df = updated_precursor_df
        # The helper hands back one dataframe per dataframe passed in.
        (self._fragment_df,) = updated_frag_dfs



[docs]
    def parse_base_library(
        self,
        library: SpecLibBase,
        keep_original_frag_dfs: bool = False,
        copy_precursor_df: bool = False,
        **kwargs,
    ):
        """
        Flatten a library object of SpecLibBase or its inherited class.
        This method will generate :attr:`precursor_df` and :attr:`fragment_df`
        The fragments in fragment_df can be located by
        `flat_frag_start_idx` and `flat_frag_stop_idx` in precursor_df.

        Parameters
        ----------
        library : SpecLibBase
            A library object with attributes
            `precursor_df`, `fragment_mz_df` and `fragment_intensity_df`.
            If it also has a `protein_df`, that dataframe is taken over;
            otherwise :attr:`protein_df` becomes an empty dataframe.

        keep_original_frag_dfs : bool, default False
            If True, the dense fragment dataframes of `library`
            (e.g. `fragment_mz_df` and `fragment_intensity_df`) are
            kept in this library as well. Deprecated: a
            ``DeprecationWarning`` is emitted when this is True.

        copy_precursor_df : bool, default False
            If True, make a copy of `precursor_df` from `library`,
            otherwise `flat_frag_start_idx` and `flat_frag_stop_idx`
            columns will also append to the `library`.

        **kwargs
            Forwarded unchanged to ``flatten_fragments``.
        """
        self._precursor_df, self._fragment_df = flatten_fragments(
            library.precursor_df.copy() if copy_precursor_df else library.precursor_df,
            library.fragment_mz_df,
            library.fragment_intensity_df,
            min_fragment_intensity=self.min_fragment_intensity,
            keep_top_k_fragments=self.keep_top_k_fragments,
            custom_columns=self.custom_fragment_df_columns,
            **kwargs,
        )

        # Libraries without protein information get an empty placeholder.
        self._protein_df = getattr(library, "protein_df", pd.DataFrame())

        if keep_original_frag_dfs:
            self.charged_frag_types = library.fragment_mz_df.columns.values
            # Share (not copy) every dense fragment dataframe of the source.
            for dense_frag_df in library.available_dense_fragment_dfs():
                setattr(self, dense_frag_df, getattr(library, dense_frag_df))

            # stacklevel=2 attributes the warning to the caller's line, which
            # is where keep_original_frag_dfs=True has to be changed.
            warnings.warn(
                "The SpecLibFlat object will have a strictly flat representation in the future. keep_original_frag_dfs=True will be deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )



[docs]
    def save_hdf(self, hdf_file: str):
        """Save library dataframes into hdf_file.

        The base-class ``save_hdf`` runs first. For `self.precursor_df`
        it writes two hdf groups:
        hdf_file: `library/precursor_df` and `library/mod_seq_df`.

        `library/precursor_df` contains all essential numeric columns which
        can be loaded faster from hdf file into memory:
        `['precursor_mz', 'charge', 'mod_seq_hash', 'mod_seq_charge_hash',
        'frag_start_idx', 'frag_stop_idx', 'flat_frag_start_idx', 'flat_frag_stop_idx',
        'decoy', 'rt_pred', 'ccs_pred', 'mobility_pred', 'miss_cleave', 'nAA',
        'isotope_mz_m1', 'isotope_intensity_m1', ...]`

        `library/mod_seq_df` contains all string columns and the other
        non-essential columns:
        'sequence','mods','mod_sites', ['proteins', 'genes']...
        as well as 'mod_seq_hash', 'mod_seq_charge_hash' columns to map
        back to `precursor_df`

        Afterwards this method writes `fragment_df`, `protein_df`,
        `fragment_mz_df` and `fragment_intensity_df` into the `library`
        group of the same file.

        Parameters
        ----------
        hdf_file : str
            the hdf file path to save

        """
        super().save_hdf(hdf_file)

        hdf = HDF_File(hdf_file, read_only=False, truncate=True, delete_existing=False)
        # Each dataframe is stored under the name of the attribute it is
        # read from, in this fixed order.
        for df_name in (
            "fragment_df",
            "protein_df",
            "fragment_mz_df",
            "fragment_intensity_df",
        ):
            dataframe = getattr(self, df_name)
            setattr(hdf.library, df_name, dataframe)



[docs]
    def load_hdf(
        self,
        hdf_file: str,
        load_mod_seq: bool = False,
        infer_charged_frag_types: bool = True,
    ):
        """Load the hdf library from hdf_file

        The base-class ``load_hdf`` runs first; then `fragment_df`,
        `protein_df`, `fragment_intensity_df` and `fragment_mz_df` are read
        from the `library` group of the file.

        Parameters
        ----------
        hdf_file : str
            hdf library path to load

        load_mod_seq : bool, optional
            if also load mod_seq_df.
            Defaults to False.

        infer_charged_frag_types : bool, optional
            if True, infer the charged fragment types from the columns of the
            stored `fragment_mz_df`, defaults to True.
            This is the default as users most likely don't know the charged fragment types in the hdf file.
            If set to False, only the charged frag types already set in
            `self.charged_frag_types` will be loaded.

        """
        super().load_hdf(hdf_file, load_mod_seq=load_mod_seq)

        hdf = HDF_File(hdf_file)
        self._fragment_df = hdf.library.fragment_df.values
        self._protein_df = hdf.library.protein_df.values

        if infer_charged_frag_types:
            stored_frag_types = filter_valid_charged_frag_types(
                hdf.library.fragment_mz_df.columns
            )
            self.charged_frag_types = sort_charged_frag_types(stored_frag_types)

        # Restrict both dense dataframes to the requested fragment types that
        # are actually present in the file.
        intensity_df = hdf.library.fragment_intensity_df.values
        intensity_columns = get_available_columns(
            intensity_df, self.charged_frag_types
        )
        self._fragment_intensity_df = intensity_df[intensity_columns]

        mz_df = hdf.library.fragment_mz_df.values
        mz_columns = get_available_columns(mz_df, self.charged_frag_types)
        self._fragment_mz_df = mz_df[mz_columns]



[docs]
    def get_full_charged_types(self, frag_df: pd.DataFrame) -> list:
        """
        Infer the full set of charged fragment types from the fragment dataframe.

        "Full" means a complete set of fragment types for each charge:
        if we have a fragment b_z1 we should also have a fragment y_z1 and
        vice versa (likewise for a/x and c/z). For every unique
        (type, loss_type, charge) row, all charges from 1 up to that row's
        charge are added.

        Deprecated: emits a ``DeprecationWarning`` pointing to
        ``get_charged_frag_types``.

        Side effect: ``self.frag_types_as_char`` is set to a mapping from
        each fragment type code in ``frag_df['type']`` to ``chr(code)``.

        Parameters
        ----------
        frag_df : pd.DataFrame
            The fragment dataframe; must contain the columns
            'type', 'loss_type' and 'charge'.

        Returns
        -------
        charged_frag_types : list
            The full set of charged fragment types in the form of a list of strings such as ['a_z1','b_z1','c_z1','x_z1','y_z1','z_z1'].
            The list is built from a set, so its order is not defined.

        Raises
        ------
        KeyError
            If a fragment type is not one of a, b, c, x, y, z, or a loss
            code is missing from ``LOSS_NUMBER_TO_TYPE``.

        """
        # stacklevel=2 attributes the warning to the caller of this method.
        warnings.warn(
            "The get_full_charged_types method is deprecated. Use get_charged_frag_types instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        unique_charge_type_pairs = frag_df[
            ["type", "loss_type", "charge"]
        ].drop_duplicates()
        # Fragment types are stored as character codes; map them back to chars.
        self.frag_types_as_char = {
            code: chr(code) for code in unique_charge_type_pairs["type"].unique()
        }

        # If we have a fragment type that is a, b or c we should also have
        # the corresponding x, y or z (and the other way round).
        corresponding = {"a": "x", "b": "y", "c": "z", "x": "a", "y": "b", "z": "c"}

        charged_frag_types = set()
        for frag_code, loss, max_charge in unique_charge_type_pairs.values:
            frag_char = self.frag_types_as_char[frag_code]
            counterpart_char = corresponding[frag_char]
            loss_name = LOSS_NUMBER_TO_TYPE[loss]
            for possible_charge in range(1, max_charge + 1):
                suffix = f"{loss_name}_z{possible_charge}"
                charged_frag_types.add(f"{frag_char}{suffix}")
                charged_frag_types.add(f"{counterpart_char}{suffix}")
        return list(charged_frag_types)



[docs]
    def calc_dense_fragments(
        self,
        additional_columns: Union[list, None] = None,
        charged_frag_types: Union[list, None] = None,
    ) -> None:
        """
        Add dense fragment matrices to this flat library, making it a hybrid
        with both flat and dense fragment representations.

        The flat fragments are converted with ``create_dense_matrices`` and
        every returned matrix is stored on the object as
        ``_fragment_<column>_df``.

        fragment_mz_df (using calculated m/z values) and fragment_intensity_df
        are created by default. Each entry of `additional_columns`
        (e.g., 'intensity') yields a corresponding _fragment_<column>_df
        matrix. Including 'mz' in additional_columns will use observed
        rather than calculated m/z values.

        Only fragments matching the selected fragment types are included in
        the dense matrices; each fragment type (e.g., 'b_z1', 'y_z2') becomes
        one column of the resulting matrices.

        The precursor_df receives new frag_start_idx and frag_stop_idx
        columns for the dense representation.

        Parameters
        ----------
        additional_columns : Union[list, None], optional
            Additional fragment columns to convert to dense format, defaults to ['intensity']
        charged_frag_types : Union[list, None], optional
            Fragment types to include in dense format, defaults to self.charged_frag_types

        Returns
        -------
        None
            Modifies the SpecLibFlat object in place
        """
        frag_types = (
            self.charged_frag_types if charged_frag_types is None else charged_frag_types
        )
        dense_columns = ["intensity"] if additional_columns is None else additional_columns

        dense_dfs, start_indices, stop_indices = create_dense_matrices(
            self._precursor_df,
            self._fragment_df,
            frag_types,
            flat_columns=dense_columns,
        )

        for column_name, dense_df in dense_dfs.items():
            setattr(self, f"_fragment_{column_name}_df", dense_df)

        self.precursor_df["frag_start_idx"] = start_indices
        self.precursor_df["frag_stop_idx"] = stop_indices



[docs]
    def to_speclib_base(
        self,
        flat_columns: Union[list, None] = None,
        charged_frag_types: Union[list, None] = None,
    ) -> SpecLibBase:
        """
        Convert the flat library to a new SpecLibBase object with dense fragment matrices.

        The new SpecLibBase contains fragment_mz_df (using calculated m/z values).
        Flat columns like 'intensity' are transformed into dense matrices such as
        fragment_intensity_df: for every column in flat_columns, a corresponding
        _fragment_<column>_df matrix is created and assigned to the new object.

        Fragment types can be specified explicitly or inherited from self.charged_frag_types.
        Only fragments matching these types will be included in the dense matrices. Each fragment
        type (e.g., 'b_z1', 'y_z2') becomes a column in the resulting dense matrices.

        The precursor_df is copied and updated with new dense fragment indices, removing any
        flat-specific columns (flat_frag_start_idx, flat_frag_stop_idx).

        Parameters
        ----------
        flat_columns : Union[list, None], optional
            Fragment columns from the flat representation to convert to dense format, defaults to ['intensity']

        charged_frag_types : Union[list, None], optional
            Fragment types to include in dense format, defaults to self.charged_frag_types

        Returns
        -------
        SpecLibBase
            A new SpecLibBase object with dense fragment representations

        Warnings
        --------
        If the column 'mz' is added to flat_columns, it will override the calculated m/z values in fragment_mz_df.
        To keep both observed and calculated m/z values, rename the flat mz column to 'mz_observed' before calling to_speclib_base.
        """
        frag_types = (
            self.charged_frag_types if charged_frag_types is None else charged_frag_types
        )
        dense_columns = ["intensity"] if flat_columns is None else flat_columns

        # Work on a copy so this flat library's precursor table is untouched.
        dense_lib = SpecLibBase()
        dense_lib._precursor_df = self._precursor_df.copy()
        dense_lib.charged_frag_types = frag_types

        dense_dfs, start_indices, stop_indices = create_dense_matrices(
            dense_lib._precursor_df,
            self._fragment_df,
            dense_lib.charged_frag_types,
            flat_columns=dense_columns,
        )

        dense_lib.precursor_df["frag_start_idx"] = start_indices
        dense_lib.precursor_df["frag_stop_idx"] = stop_indices

        for column_name, dense_df in dense_dfs.items():
            setattr(dense_lib, f"_fragment_{column_name}_df", dense_df)

        # The flat index columns are meaningless for a dense library; drop
        # them when present.
        dense_lib._precursor_df = dense_lib._precursor_df.drop(
            columns=["flat_frag_start_idx", "flat_frag_stop_idx"], errors="ignore"
        )

        return dense_lib



[docs]
    def to_SpecLibBase(self):
        # raise a deprecation warning
        warnings.warn(
            "The to_SpecLibBase method is deprecated. Use to_speclib_base instead.",
            DeprecationWarning,
        )
        return self.to_speclib_base()