{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# SpecLibFasta usage" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from alphabase.protein.fasta import SpecLibFasta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Proteins from a dict (or loaded from fasta files)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "prot1 = 'MABCDESTKAFGHIJKLMNOPQRAFGHIJK'\n", "prot2 = 'AFGHIJKLMNOPQR'\n", "protein_dict = {\n", " 'xx': {\n", " 'protein_id': 'xx',\n", " 'gene_name': '',\n", " 'sequence': prot1\n", " },\n", " 'yy': {\n", " 'protein_id': 'yy',\n", " 'gene_name': 'gene',\n", " 'sequence': prot2\n", " }\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`alphabase.protein.fasta.SpecLibFasta.get_peptides_from_protein_dict` will digest a protein dict into a peptide dataframe. \n", "\n", "`alphabase.protein.fasta.SpecLibFasta.get_peptides_from_fasta` will digest a fasta file or a fasta list into a peptide dataframe. " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequenceprotein_idxesmiss_cleavageis_prot_ntermis_prot_ctermmodsmod_sitesnAA
0AFGHIJK0;10TrueTrue7
1LMNOPQR0;10FalseTrue7
2ABCDESTK00TrueFalse8
3MABCDESTK00TrueFalse9
4AFGHIJKLMNOPQR0;11TrueTrue14
5LMNOPQRAFGHIJK01FalseTrue14
6ABCDESTKAFGHIJK01TrueFalse15
7MABCDESTKAFGHIJK01TrueFalse16
8AFGHIJKLMNOPQRAFGHIJK02FalseTrue21
9ABCDESTKAFGHIJKLMNOPQR02TrueFalse22
10MABCDESTKAFGHIJKLMNOPQR02TrueFalse23
\n", "
" ], "text/plain": [ " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", "0 AFGHIJK 0;1 0 True \n", "1 LMNOPQR 0;1 0 False \n", "2 ABCDESTK 0 0 True \n", "3 MABCDESTK 0 0 True \n", "4 AFGHIJKLMNOPQR 0;1 1 True \n", "5 LMNOPQRAFGHIJK 0 1 False \n", "6 ABCDESTKAFGHIJK 0 1 True \n", "7 MABCDESTKAFGHIJK 0 1 True \n", "8 AFGHIJKLMNOPQRAFGHIJK 0 2 False \n", "9 ABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "10 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "\n", " is_prot_cterm mods mod_sites nAA \n", "0 True 7 \n", "1 True 7 \n", "2 False 8 \n", "3 False 9 \n", "4 True 14 \n", "5 True 14 \n", "6 False 15 \n", "7 False 16 \n", "8 True 21 \n", "9 False 22 \n", "10 False 23 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib = SpecLibFasta(\n", " ['b_z1','y_z1'], I_to_L=False, decoy='pseudo_reverse',\n", " var_mods=['Acetyl@Protein_N-term', 'Oxidation@M'],\n", " fix_mods=['Carbamidomethyl@C'],\n", ")\n", "# fasta_lib.get_peptides_from_fasta(fasta_files)\n", "fasta_lib.get_peptides_from_protein_dict(protein_dict)\n", "fasta_lib.precursor_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
protein_idgene_namesequence
0xxMABCDESTKAFGHIJKLMNOPQRAFGHIJK
1yygeneAFGHIJKLMNOPQR
\n", "
" ], "text/plain": [ " protein_id gene_name sequence\n", "0 xx MABCDESTKAFGHIJKLMNOPQRAFGHIJK\n", "1 yy gene AFGHIJKLMNOPQR" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.protein_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also append the protein names to precursor_df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequenceprotein_idxesmiss_cleavageis_prot_ntermis_prot_ctermmodsmod_sitesnAAproteinsgenes
0AFGHIJK0;10TrueTrue7xx;yygene
1LMNOPQR0;10FalseTrue7xx;yygene
2ABCDESTK00TrueFalse8xx
3MABCDESTK00TrueFalse9xx
4AFGHIJKLMNOPQR0;11TrueTrue14xx;yygene
5LMNOPQRAFGHIJK01FalseTrue14xx
6ABCDESTKAFGHIJK01TrueFalse15xx
7MABCDESTKAFGHIJK01TrueFalse16xx
8AFGHIJKLMNOPQRAFGHIJK02FalseTrue21xx
9ABCDESTKAFGHIJKLMNOPQR02TrueFalse22xx
10MABCDESTKAFGHIJKLMNOPQR02TrueFalse23xx
\n", "
" ], "text/plain": [ " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", "0 AFGHIJK 0;1 0 True \n", "1 LMNOPQR 0;1 0 False \n", "2 ABCDESTK 0 0 True \n", "3 MABCDESTK 0 0 True \n", "4 AFGHIJKLMNOPQR 0;1 1 True \n", "5 LMNOPQRAFGHIJK 0 1 False \n", "6 ABCDESTKAFGHIJK 0 1 True \n", "7 MABCDESTKAFGHIJK 0 1 True \n", "8 AFGHIJKLMNOPQRAFGHIJK 0 2 False \n", "9 ABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "10 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "\n", " is_prot_cterm mods mod_sites nAA proteins genes \n", "0 True 7 xx;yy gene \n", "1 True 7 xx;yy gene \n", "2 False 8 xx \n", "3 False 9 xx \n", "4 True 14 xx;yy gene \n", "5 True 14 xx \n", "6 False 15 xx \n", "7 False 16 xx \n", "8 True 21 xx \n", "9 False 22 xx \n", "10 False 23 xx " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.append_protein_name()\n", "fasta_lib.precursor_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we have our own precursor_df loaded by psm_readers, we can directly assign it to fasta_lib. \n", "\n", "``` python\n", "fasta_lib._precursor_df = precursor_df\n", "```\n", "Thus, we can still use SpecLibFasta functionalities for this precursor_df." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add modifications including both var_mods (`Acetyl@Protein_N-term`, `Oxidation@M`, see initialization of fasta_lib) and fix_mods (`Carbamidomethyl@C`) into the precursor_df." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequencemodsmod_sites
0AFGHIJK
1AFGHIJKAcetyl@Protein_N-term0
2LMNOPQROxidation@M2
3LMNOPQR
4ABCDESTKCarbamidomethyl@C3
5ABCDESTKAcetyl@Protein_N-term;Carbamidomethyl@C0;3
6MABCDESTKOxidation@M;Carbamidomethyl@C1;4
7MABCDESTKCarbamidomethyl@C4
8MABCDESTKAcetyl@Protein_N-term;Oxidation@M;Carbamidomet...0;1;4
9MABCDESTKAcetyl@Protein_N-term;Carbamidomethyl@C0;4
10AFGHIJKLMNOPQROxidation@M9
11AFGHIJKLMNOPQR
12AFGHIJKLMNOPQRAcetyl@Protein_N-term;Oxidation@M0;9
13AFGHIJKLMNOPQRAcetyl@Protein_N-term0
14LMNOPQRAFGHIJKOxidation@M2
15LMNOPQRAFGHIJK
16ABCDESTKAFGHIJKCarbamidomethyl@C3
17ABCDESTKAFGHIJKAcetyl@Protein_N-term;Carbamidomethyl@C0;3
18MABCDESTKAFGHIJKOxidation@M;Carbamidomethyl@C1;4
19MABCDESTKAFGHIJKCarbamidomethyl@C4
20MABCDESTKAFGHIJKAcetyl@Protein_N-term;Oxidation@M;Carbamidomet...0;1;4
21MABCDESTKAFGHIJKAcetyl@Protein_N-term;Carbamidomethyl@C0;4
22AFGHIJKLMNOPQRAFGHIJKOxidation@M9
23AFGHIJKLMNOPQRAFGHIJK
24ABCDESTKAFGHIJKLMNOPQROxidation@M;Carbamidomethyl@C17;3
25ABCDESTKAFGHIJKLMNOPQRCarbamidomethyl@C3
26ABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Oxidation@M;Carbamidomet...0;17;3
27ABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C0;3
28MABCDESTKAFGHIJKLMNOPQROxidation@M;Carbamidomethyl@C1;4
29MABCDESTKAFGHIJKLMNOPQROxidation@M;Carbamidomethyl@C18;4
30MABCDESTKAFGHIJKLMNOPQROxidation@M;Oxidation@M;Carbamidomethyl@C1;18;4
31MABCDESTKAFGHIJKLMNOPQRCarbamidomethyl@C4
32MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Oxidation@M;Carbamidomet...0;1;4
33MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Oxidation@M;Carbamidomet...0;18;4
34MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Oxidation@M;Oxidation@M;...0;1;18;4
35MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C0;4
\n", "
" ], "text/plain": [ " sequence \\\n", "0 AFGHIJK \n", "1 AFGHIJK \n", "2 LMNOPQR \n", "3 LMNOPQR \n", "4 ABCDESTK \n", "5 ABCDESTK \n", "6 MABCDESTK \n", "7 MABCDESTK \n", "8 MABCDESTK \n", "9 MABCDESTK \n", "10 AFGHIJKLMNOPQR \n", "11 AFGHIJKLMNOPQR \n", "12 AFGHIJKLMNOPQR \n", "13 AFGHIJKLMNOPQR \n", "14 LMNOPQRAFGHIJK \n", "15 LMNOPQRAFGHIJK \n", "16 ABCDESTKAFGHIJK \n", "17 ABCDESTKAFGHIJK \n", "18 MABCDESTKAFGHIJK \n", "19 MABCDESTKAFGHIJK \n", "20 MABCDESTKAFGHIJK \n", "21 MABCDESTKAFGHIJK \n", "22 AFGHIJKLMNOPQRAFGHIJK \n", "23 AFGHIJKLMNOPQRAFGHIJK \n", "24 ABCDESTKAFGHIJKLMNOPQR \n", "25 ABCDESTKAFGHIJKLMNOPQR \n", "26 ABCDESTKAFGHIJKLMNOPQR \n", "27 ABCDESTKAFGHIJKLMNOPQR \n", "28 MABCDESTKAFGHIJKLMNOPQR \n", "29 MABCDESTKAFGHIJKLMNOPQR \n", "30 MABCDESTKAFGHIJKLMNOPQR \n", "31 MABCDESTKAFGHIJKLMNOPQR \n", "32 MABCDESTKAFGHIJKLMNOPQR \n", "33 MABCDESTKAFGHIJKLMNOPQR \n", "34 MABCDESTKAFGHIJKLMNOPQR \n", "35 MABCDESTKAFGHIJKLMNOPQR \n", "\n", " mods mod_sites \n", "0 \n", "1 Acetyl@Protein_N-term 0 \n", "2 Oxidation@M 2 \n", "3 \n", "4 Carbamidomethyl@C 3 \n", "5 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 \n", "6 Oxidation@M;Carbamidomethyl@C 1;4 \n", "7 Carbamidomethyl@C 4 \n", "8 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;4 \n", "9 Acetyl@Protein_N-term;Carbamidomethyl@C 0;4 \n", "10 Oxidation@M 9 \n", "11 \n", "12 Acetyl@Protein_N-term;Oxidation@M 0;9 \n", "13 Acetyl@Protein_N-term 0 \n", "14 Oxidation@M 2 \n", "15 \n", "16 Carbamidomethyl@C 3 \n", "17 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 \n", "18 Oxidation@M;Carbamidomethyl@C 1;4 \n", "19 Carbamidomethyl@C 4 \n", "20 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;4 \n", "21 Acetyl@Protein_N-term;Carbamidomethyl@C 0;4 \n", "22 Oxidation@M 9 \n", "23 \n", "24 Oxidation@M;Carbamidomethyl@C 17;3 \n", "25 Carbamidomethyl@C 3 \n", "26 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 
0;17;3 \n", "27 Acetyl@Protein_N-term;Carbamidomethyl@C 0;3 \n", "28 Oxidation@M;Carbamidomethyl@C 1;4 \n", "29 Oxidation@M;Carbamidomethyl@C 18;4 \n", "30 Oxidation@M;Oxidation@M;Carbamidomethyl@C 1;18;4 \n", "31 Carbamidomethyl@C 4 \n", "32 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;1;4 \n", "33 Acetyl@Protein_N-term;Oxidation@M;Carbamidomet... 0;18;4 \n", "34 Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... 0;1;18;4 \n", "35 Acetyl@Protein_N-term;Carbamidomethyl@C 0;4 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.add_modifications()\n", "fasta_lib.precursor_df[['sequence','mods','mod_sites']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`alphabase.protein.fasta.append_special_modifications` is specially designed for `Phospho`, as a peptide with multiple phospho sites may generate thousands of peptidoforms. " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequenceprotein_idxesmiss_cleavageis_prot_ntermis_prot_ctermmodsmod_sitesnAAproteinsgenes
0AFGHIJK0;10TrueTrue7xx;yygene
1AFGHIJK0;10TrueTrueAcetyl@Protein_N-term07xx;yygene
2LMNOPQR0;10FalseTrueOxidation@M27xx;yygene
3LMNOPQR0;10FalseTrue7xx;yygene
4ABCDESTK00TrueFalseCarbamidomethyl@C;Phospho@S3;68xx
.................................
79MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Oxidation@M;Oxidation@M;...0;1;18;4;823xx
80MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Oxidation@M;Oxidation@M;...0;1;18;423xx
81MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C;Phospho@S0;4;723xx
82MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C;Phospho@T0;4;823xx
83MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C0;423xx
\n", "

84 rows × 10 columns

\n", "
" ], "text/plain": [ " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", "0 AFGHIJK 0;1 0 True \n", "1 AFGHIJK 0;1 0 True \n", "2 LMNOPQR 0;1 0 False \n", "3 LMNOPQR 0;1 0 False \n", "4 ABCDESTK 0 0 True \n", ".. ... ... ... ... \n", "79 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "80 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "81 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "82 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "83 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "\n", " is_prot_cterm mods \\\n", "0 True \n", "1 True Acetyl@Protein_N-term \n", "2 True Oxidation@M \n", "3 True \n", "4 False Carbamidomethyl@C;Phospho@S \n", ".. ... ... \n", "79 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", "80 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", "81 False Acetyl@Protein_N-term;Carbamidomethyl@C;Phospho@S \n", "82 False Acetyl@Protein_N-term;Carbamidomethyl@C;Phospho@T \n", "83 False Acetyl@Protein_N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", "0 7 xx;yy gene \n", "1 0 7 xx;yy gene \n", "2 2 7 xx;yy gene \n", "3 7 xx;yy gene \n", "4 3;6 8 xx \n", ".. ... ... ... ... \n", "79 0;1;18;4;8 23 xx \n", "80 0;1;18;4 23 xx \n", "81 0;4;7 23 xx \n", "82 0;4;8 23 xx \n", "83 0;4 23 xx \n", "\n", "[84 rows x 10 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from alphabase.protein.fasta import append_special_modifications\n", "fasta_lib._precursor_df = append_special_modifications(\n", " fasta_lib.precursor_df, ['Phospho@S','Phospho@T'],\n", " min_mod_num=0, max_mod_num=1, max_peptidoform_num=100\n", ")\n", "fasta_lib.precursor_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Flexible method to add peptide labeling" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequenceprotein_idxesmiss_cleavageis_prot_ntermis_prot_ctermmodsmod_sitesnAAproteinsgeneslabeling_channel
0AFGHIJK0;10TrueTrue7xx;yygene
1AFGHIJK0;10TrueTrueAcetyl@Protein_N-term07xx;yygene
2LMNOPQR0;10FalseTrueOxidation@M27xx;yygene
3LMNOPQR0;10FalseTrue7xx;yygene
4ABCDESTK00TrueFalseCarbamidomethyl@C;Phospho@S3;68xx
....................................
247MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Oxidation@M;Oxidation@M;...0;1;18;4;8;9;1623xx8
248MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Oxidation@M;Oxidation@M;...0;1;18;4;9;1623xx8
249MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C;Phosph...0;4;7;9;1623xx8
250MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C;Phosph...0;4;8;9;1623xx8
251MABCDESTKAFGHIJKLMNOPQR02TrueFalseAcetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...0;4;9;1623xx8
\n", "

252 rows × 11 columns

\n", "
" ], "text/plain": [ " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", "0 AFGHIJK 0;1 0 True \n", "1 AFGHIJK 0;1 0 True \n", "2 LMNOPQR 0;1 0 False \n", "3 LMNOPQR 0;1 0 False \n", "4 ABCDESTK 0 0 True \n", ".. ... ... ... ... \n", "247 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "248 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "249 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "250 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "251 MABCDESTKAFGHIJKLMNOPQR 0 2 True \n", "\n", " is_prot_cterm mods \\\n", "0 True \n", "1 True Acetyl@Protein_N-term \n", "2 True Oxidation@M \n", "3 True \n", "4 False Carbamidomethyl@C;Phospho@S \n", ".. ... ... \n", "247 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", "248 False Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;... \n", "249 False Acetyl@Protein_N-term;Carbamidomethyl@C;Phosph... \n", "250 False Acetyl@Protein_N-term;Carbamidomethyl@C;Phosph... \n", "251 False Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... \n", "\n", " mod_sites nAA proteins genes labeling_channel \n", "0 7 xx;yy gene \n", "1 0 7 xx;yy gene \n", "2 2 7 xx;yy gene \n", "3 7 xx;yy gene \n", "4 3;6 8 xx \n", ".. ... ... ... ... ... \n", "247 0;1;18;4;8;9;16 23 xx 8 \n", "248 0;1;18;4;9;16 23 xx 8 \n", "249 0;4;7;9;16 23 xx 8 \n", "250 0;4;8;9;16 23 xx 8 \n", "251 0;4;9;16 23 xx 8 \n", "\n", "[252 rows x 11 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.add_peptide_labeling({\n", " '': [], # not labelled for reference\n", " '0': ['Dimethyl@Any_N-term','Dimethyl@K'],\n", " '8': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],\n", "})\n", "fasta_lib.precursor_df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sequencemodsmod_sitescharge
0AFGHIJK2
1AFGHIJK3
2AFGHIJK4
3AFGHIJKAcetyl@Protein_N-term02
4AFGHIJKAcetyl@Protein_N-term03
...............
751MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C;Phosph...0;4;8;9;163
752MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C;Phosph...0;4;8;9;164
753MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...0;4;9;162
754MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...0;4;9;163
755MABCDESTKAFGHIJKLMNOPQRAcetyl@Protein_N-term;Carbamidomethyl@C;Dimeth...0;4;9;164
\n", "

756 rows × 4 columns

\n", "
" ], "text/plain": [ " sequence \\\n", "0 AFGHIJK \n", "1 AFGHIJK \n", "2 AFGHIJK \n", "3 AFGHIJK \n", "4 AFGHIJK \n", ".. ... \n", "751 MABCDESTKAFGHIJKLMNOPQR \n", "752 MABCDESTKAFGHIJKLMNOPQR \n", "753 MABCDESTKAFGHIJKLMNOPQR \n", "754 MABCDESTKAFGHIJKLMNOPQR \n", "755 MABCDESTKAFGHIJKLMNOPQR \n", "\n", " mods mod_sites charge \n", "0 2 \n", "1 3 \n", "2 4 \n", "3 Acetyl@Protein_N-term 0 2 \n", "4 Acetyl@Protein_N-term 0 3 \n", ".. ... ... ... \n", "751 Acetyl@Protein_N-term;Carbamidomethyl@C;Phosph... 0;4;8;9;16 3 \n", "752 Acetyl@Protein_N-term;Carbamidomethyl@C;Phosph... 0;4;8;9;16 4 \n", "753 Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 0;4;9;16 2 \n", "754 Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 0;4;9;16 3 \n", "755 Acetyl@Protein_N-term;Carbamidomethyl@C;Dimeth... 0;4;9;16 4 \n", "\n", "[756 rows x 4 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.add_charge()\n", "fasta_lib.precursor_df[['sequence','mods','mod_sites','charge']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Append precursor mz and isotope information" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/wenfengzeng/workspace/alphabase/alphabase/peptide/precursor.py:613: RuntimeWarning: invalid value encountered in divide\n", " precursor_dist /= np.sum(precursor_dist, axis=1, keepdims=True)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
precursor_mzi_0i_1i_2i_3i_4i_5
03.932371e+020.6258220.2859180.0728830.0134110.0019660.0
12.624938e+020.6258220.2859180.0728830.0134110.0019660.0
21.971222e+020.6258220.2859180.0728830.0134110.0019660.0
34.142423e+020.6109210.2926990.0786900.0153120.0023780.0
42.764973e+020.6109210.2926990.0786900.0153120.0023780.0
........................
7514.000960e+06NaNNaNNaNNaNNaNNaN
7523.000720e+06NaNNaNNaNNaNNaNNaN
7536.001400e+06NaNNaNNaNNaNNaNNaN
7544.000934e+06NaNNaNNaNNaNNaNNaN
7553.000700e+06NaNNaNNaNNaNNaNNaN
\n", "

756 rows × 7 columns

\n", "
" ], "text/plain": [ " precursor_mz i_0 i_1 i_2 i_3 i_4 i_5\n", "0 3.932371e+02 0.625822 0.285918 0.072883 0.013411 0.001966 0.0\n", "1 2.624938e+02 0.625822 0.285918 0.072883 0.013411 0.001966 0.0\n", "2 1.971222e+02 0.625822 0.285918 0.072883 0.013411 0.001966 0.0\n", "3 4.142423e+02 0.610921 0.292699 0.078690 0.015312 0.002378 0.0\n", "4 2.764973e+02 0.610921 0.292699 0.078690 0.015312 0.002378 0.0\n", ".. ... ... ... ... ... ... ...\n", "751 4.000960e+06 NaN NaN NaN NaN NaN NaN\n", "752 3.000720e+06 NaN NaN NaN NaN NaN NaN\n", "753 6.001400e+06 NaN NaN NaN NaN NaN NaN\n", "754 4.000934e+06 NaN NaN NaN NaN NaN NaN\n", "755 3.000700e+06 NaN NaN NaN NaN NaN NaN\n", "\n", "[756 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.calc_precursor_mz()\n", "fasta_lib.calc_precursor_isotope()\n", "fasta_lib.precursor_df[['precursor_mz']+[col for col in fasta_lib.precursor_df.columns if col.startswith('i_')]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using `alphabase.spectral_library.base.SpecLibBase.calc_fragment_mz_df` to calculate fragment mz dataframe." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
b_z1y_z1
07.204439e+01714.429749
12.191128e+02567.361328
22.761343e+02510.339844
34.131932e+02373.280945
45.262772e+02260.196869
.........
119111.200205e+07751.420959
119121.200216e+07637.377991
119131.200240e+07400.230286
119141.200250e+07303.177521
119151.200262e+07175.118958
\n", "

11916 rows × 2 columns

\n", "
" ], "text/plain": [ " b_z1 y_z1\n", "0 7.204439e+01 714.429749\n", "1 2.191128e+02 567.361328\n", "2 2.761343e+02 510.339844\n", "3 4.131932e+02 373.280945\n", "4 5.262772e+02 260.196869\n", "... ... ...\n", "11911 1.200205e+07 751.420959\n", "11912 1.200216e+07 637.377991\n", "11913 1.200240e+07 400.230286\n", "11914 1.200250e+07 303.177521\n", "11915 1.200262e+07 175.118958\n", "\n", "[11916 rows x 2 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.calc_fragment_mz_df()\n", "fasta_lib.fragment_mz_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`calc_fragment_mz_df()` also generates pointers `frag_start_idx` and `frag_stop_idx` in the precursor_df to locate fragments of each precursor. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
frag_start_idxfrag_stop_idx
006
1612
21218
31824
42430
.........
7511180611828
7521182811850
7531185011872
7541187211894
7551189411916
\n", "

756 rows × 2 columns

\n", "
" ], "text/plain": [ " frag_start_idx frag_stop_idx\n", "0 0 6\n", "1 6 12\n", "2 12 18\n", "3 18 24\n", "4 24 30\n", ".. ... ...\n", "751 11806 11828\n", "752 11828 11850\n", "753 11850 11872\n", "754 11872 11894\n", "755 11894 11916\n", "\n", "[756 rows x 2 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasta_lib.precursor_df[['frag_start_idx','frag_stop_idx']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that all fragment ions are stored from the peptide's N-terminus to its C-terminus, so the b-ions are in ascending order (from b1 to bn) and the y-ions are in descending order (from yn to y1)." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
b_z1y_z1
672.044388714.429749
7219.112808567.361328
8276.134277510.339844
9413.193176373.280945
10526.277222260.196869
11639.361328147.112808
\n", "
" ], "text/plain": [ " b_z1 y_z1\n", "6 72.044388 714.429749\n", "7 219.112808 567.361328\n", "8 276.134277 510.339844\n", "9 413.193176 373.280945\n", "10 526.277222 260.196869\n", "11 639.361328 147.112808" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "start, end = fasta_lib.precursor_df[['frag_start_idx','frag_stop_idx']].values[1]\n", "fasta_lib.fragment_mz_df.iloc[start:end,:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save protein_df, precursor_df, fragment_mz_df, fragment_intensity_df into an HDF file." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# fasta_lib.save_hdf('path/to/hdf_file.hdf')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2" } } }, "nbformat": 4, "nbformat_minor": 2 }