Example of usage

MSCI is a Python package designed to evaluate the information content of peptide fragmentation spectra. Our objective was to compute an information-content index for all peptides within a given proteome. This would allow us to devise data acquisition and analysis strategies that generate and prioritize the most informative fragment ions for peptide quantification.

#Install the MSCI package and its dependencies

#!git clone https://github.com/proteomicsunitcrg/MSCI.git
#! pip install matchms
# Do not restart the session if asked (press Cancel; matchms is probably already installed)
#%cd MSCI
#import sys
#sys.path.append('/content/MSCI')
! pip install MSCI==0.2.0
Collecting MSCI==0.2.0
  Downloading MSCI-0.2.0-py2.py3-none-any.whl.metadata (903 bytes)
Requirement already satisfied: Click>=7.0 in /usr/local/lib/python3.10/dist-packages (from MSCI==0.2.0) (8.1.7)
Successfully installed MSCI-0.2.0 gitdb-4.0.11 gitpython-3.1.43 pydeck-0.9.1 smmap-5.0.1 streamlit-1.37.1 tenacity-8.5.0 watchdog-4.0.2
Requirement already satisfied: biopython in /usr/local/lib/python3.10/dist-packages (1.84)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from biopython) (1.26.4)
Requirement already satisfied: matchms in /usr/local/lib/python3.10/dist-packages (0.27.0)
Requirement already satisfied: deprecated>=1.2.14 in /usr/local/lib/python3.10/dist-packages (from matchms) (1.2.14)

Import

from MSCI.Preprocessing.Koina import PeptideProcessor
from MSCI.Grouping_MS1.Grouping_mw_irt import process_peptide_combinations
from MSCI.Preprocessing.read_msp_file import read_msp_file
from MSCI.Similarity.spectral_angle_similarity import process_spectra_pairs
from MSCI.data.digest import parse_fasta_and_digest, tryptic_digest, peptides_to_csv
from matchms.importing import load_from_msp
import random
import numpy as np
import pandas as pd

Generate predicted dataset

Parse fasta file

# Location of the tutorial FASTA file handed to parse_fasta_and_digest.
fasta_source = "https://raw.githubusercontent.com/proteomicsunitcrg/MSCI/refs/heads/main/tutorial/sp_human_2023_04.fasta"

# Run the digestion with digest_type="trypsin", then pass the result to
# peptides_to_csv together with the output file name.
result = parse_fasta_and_digest(fasta_source, digest_type="trypsin")
peptides_to_csv(result, "random_tryptic_peptides.txt")

Alternatively, generate a random list of tryptic peptides (this overwrites random_tryptic_peptides.txt from the previous step)

import random

# List of standard amino acids
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'


def generate_tryptic_peptide(min_length=8, max_length=20):
    """Return a random peptide sequence that ends in K or R.

    The total length, counting the C-terminal K/R, is drawn uniformly from
    ``min_length`` to ``max_length`` inclusive. The residues before the
    C-terminus are drawn from all 20 entries of ``amino_acids``, so K and R
    may also occur inside the sequence.

    Args:
        min_length: Smallest total peptide length. Values below 1 behave
            like 1, because the C-terminal residue is always present.
        max_length: Largest total peptide length.

    Returns:
        The peptide as a string of one-letter amino-acid codes.

    Raises:
        ValueError: If ``max_length`` is smaller than ``max(min_length, 1)``.
    """
    # One position is reserved for the C-terminal K/R at BOTH ends of the
    # range; reserving it only at the upper end would make min_length
    # unreachable and reject min_length == max_length.
    shortest_body = max(min_length - 1, 0)
    longest_body = max_length - 1
    if longest_body < shortest_body:
        raise ValueError("max_length must be >= max(min_length, 1)")

    body_length = random.randint(shortest_body, longest_body)
    body = ''.join(random.choices(amino_acids, k=body_length))
    return body + random.choice('KR')

# Generate a list of 90 random tryptic peptides
tryptic_peptides = [generate_tryptic_peptide() for _ in range(90)]

# Generate 5 pairs of peptides that are permutations of each other.
# Each pair is a freshly generated peptide plus a copy in which two residues
# have been swapped; both members are appended to tryptic_peptides and the
# pair is recorded in permuted_pairs.
permuted_pairs = []
while len(permuted_pairs) < 5:  # keep drawing until 5 usable pairs exist
    peptide = generate_tryptic_peptide()

    # Candidate swaps are two distinct positions that
    #   * exclude the last position, so the C-terminal residue stays in place
    #   * hold different residues, because swapping equal residues would
    #     reproduce the original sequence instead of a permutation.
    c_term = len(peptide) - 1
    swap_candidates = [
        (i, j)
        for i in range(c_term)
        for j in range(i + 1, c_term)
        if peptide[i] != peptide[j]
    ]
    if not swap_candidates:
        # Fewer than 3 residues, or all swappable residues identical:
        # draw another peptide rather than ending up with fewer pairs.
        continue

    pos1, pos2 = random.choice(swap_candidates)
    permuted_peptide_list = list(peptide)
    permuted_peptide_list[pos1], permuted_peptide_list[pos2] = (
        permuted_peptide_list[pos2],
        permuted_peptide_list[pos1],
    )
    permuted_peptide = ''.join(permuted_peptide_list)

    tryptic_peptides.extend((peptide, permuted_peptide))
    permuted_pairs.append((peptide, permuted_peptide))

# Include one peptide of random length (5-20) drawn from all amino acids.
# It is not forced to end in K/R.
# It overwrites the FIRST entry, one of the 90 unpaired random peptides.
# The last entry is the permuted partner appended for the final pair, and
# replacing that one would break the pair.
# NOTE(review): confirm that a possibly non-tryptic peptide is intended here.
last_peptide_length = random.randint(5, 20)
last_peptide = ''.join(random.choices(amino_acids, k=last_peptide_length))
tryptic_peptides[0] = last_peptide

# Randomise the order so the permuted pairs are mixed in with the rest.
random.shuffle(tryptic_peptides)

# Write the peptides to disk, one sequence per line.
with open('random_tryptic_peptides.txt', 'w') as peptide_file:
    peptide_file.writelines(f"{sequence}\n" for sequence in tryptic_peptides)


print("Generated 100 random tryptic peptides with permutation pairs and saved to 'random_tryptic_peptides.txt'.")
Generated 100 random tryptic peptides with permutation pairs and saved to 'random_tryptic_peptides.txt'.

Predict with Koina

If available, use your own list of peptides

# Settings forwarded to PeptideProcessor as keyword arguments.
koina_settings = {
    "input_file": "random_tryptic_peptides.txt",
    "collision_energy": 30,
    "charge": 2,
    "model_intensity": "Prosit_2020_intensity_HCD",
    "model_irt": "Prosit_2019_irt",
}
processor = PeptideProcessor(**koina_settings)

# The argument is the .msp file name that the "Load dataset" step reads.
processor.process('random_tryptic_peptides.msp')

Load dataset

# You can use your own spectra
File = 'random_tryptic_peptides.msp'
# Collect everything load_from_msp yields into a list so individual spectra
# can be picked by position later on.
spectra = [*load_from_msp(File)]
2024-08-22 13:30:02,993:WARNING:matchms:add_precursor_mz:No precursor_mz found in metadata.

Group within MS1 tolerance

# Tolerances handed to the grouping step (called with use_ppm=False).
mz_tolerance, irt_tolerance = 1, 5

# Read the .msp file into a table, then group its rows with the tolerances.
mz_irt_df = read_msp_file(File)
Groups_df = process_peptide_combinations(
    mz_irt_df,
    mz_tolerance,
    irt_tolerance,
    use_ppm=False,
)

# Display the grouped pairs.
Groups_df
Results DataFrame Columns: Index(['index1', 'index2', 'peptide 1', 'peptide 2', 'm/z  1', 'm/z 2',
       'iRT 1', 'iRT 2'],
      dtype='object')
index1 index2 peptide 1 peptide 2 m/z 1 m/z 2 iRT 1 iRT 2
0 2 15 FTCQIAHVCPHFNNPK/2 IDIDKYGKAISACHPPK/2 928.440166 928.490379 50.206707 49.247311
1 8 19 RTNYPMFEYHK/2 TLPRMTKYYGVR/2 743.350811 742.905754 35.316872 34.458534
2 46 73 HQEEAMMFHPLMNKNNTFR/2 QSAICREAEQTKFNMVSKFR/2 1188.045732 1187.093736 61.910671 62.716576

Calculate similarity within fragment tolerance

# Remove leading/trailing whitespace from the column labels before selecting
# the two index columns by name.
Groups_df.columns = Groups_df.columns.str.strip()
pair_columns = ['index1', 'index2']
index_array = Groups_df[pair_columns].values.astype(int)

# Score every index pair, store the table as CSV and display it.
result = process_spectra_pairs(
    index_array,
    spectra,
    mz_irt_df,
    tolerance=0,
    ppm=10,
)
result.to_csv("output.csv", index=False)
result
0.002814877157520823
0.0
0.0025644450471453695
index1 index2 peptide 1 peptide 2 m/z 1 m/z 2 iRT 1 iRT 2 similarity_score
0 2 15 FTCQIAHVCPHFNNPK/2 IDIDKYGKAISACHPPK/2 928.440166 928.490379 50.206707 49.247311 0.002815
1 8 19 RTNYPMFEYHK/2 TLPRMTKYYGVR/2 743.350811 742.905754 35.316872 34.458534 0.000000
2 46 73 HQEEAMMFHPLMNKNNTFR/2 QSAICREAEQTKFNMVSKFR/2 1188.045732 1187.093736 61.910671 62.716576 0.002564

Plot results

Plot spectra of interest using matchms

import matplotlib.pyplot as plt

# Positions of the two spectra to compare, named once so the printed rows and
# the plotted spectra always refer to the same entries.
# NOTE(review): 19 and 36 are fixed example positions; confirm they match a
# pair of interest (e.g. index1/index2 from the results table) for your run.
first_idx, second_idx = 19, 36

# Show the table row for each selected spectrum.
for row in (first_idx, second_idx):
    print(mz_irt_df.iloc[row])

# Draw the two spectra against each other and save the figure.
spectra[first_idx].plot_against(spectra[second_idx])
plt.savefig('spectra_comparison.png')
Name    MRIGTPEPWSTQSDKR/2
MW              944.970342
iRT              41.258202
Name: 19, dtype: object
Name    QAIMSISYHSCYNMFR/2
MW              975.936599
iRT              93.540787
Name: 36, dtype: object