Example Scripts¶

This collection of example scripts is designed to get familiar with the basic functionality of SugarPy. They can be modified and combined according to your needs. Please also check the folder “example_scripts” as well as Ursgal’s “example_scripts” for further use cases and implementation into more compelx workflows.

Simple Example Scripts¶

Parse peptide sequence input file¶

parse_peptide_sequence_input_file.main(input_path=None)¶

This script generates a peptide lookup that can be used as an input for further SugarPy functions. Here, an Ursgal result file is parsed and pepide sequences as well as corresponding informations are stored in a dictionary. However, This dictionary can also be generated in different ways, but should follow this structure:

peptide_lookup = {
    'sequence#unimod:pos': {    
        'rt': set(),
        'spec_id': set(),
        'accuracy': [],
        'protein': 'description',
    }
}

Usage:

./parse_peptide_sequence_input_file.py <inpu_result_file.csv>

#!/usr/bin/env python3
# encoding: utf-8

import sugarpy
import sys
import pprint

def main(input_path=None):
    '''
    This script generates a peptide lookup that can be used as an input
    for further SugarPy functions. Here, an Ursgal result file is parsed
    and pepide sequences as well as corresponding informations are stored
    in a dictionary. However, This dictionary can also be generated in
    different ways, but should follow this structure::

        peptide_lookup = {
            'sequence#unimod:pos': {
                'rt': set(),
                'spec_id': set(),
                'accuracy': [],
                'protein': 'description',
            }
        }

    Usage::

        ./parse_peptide_sequence_input_file.py <inpu_result_file.csv>
    '''
    sp_run = sugarpy.run.Run()

    peptide_lookup = sp_run.parse_ident_file(
        ident_file=input_path,
        unimod_glycans_incl_in_search=[
            'HexNAc',
            'HexNAc(2)',
        ],
    )
    pprint.pprint(peptide_lookup)

if __name__ == '__main__':
    main(input_path=sys.argv[1])

Build glycopeptide combinations¶

build_glycopeptide_combinations.main()¶

For a given list of peptides and a set of monosaccharides, this script builds all theoretical combinations of glycopeptides.

Usage:

./build_glycopeptide_combination.py

#!/usr/bin/env python3
# encoding: utf-8

import sugarpy
import pprint

def main():
    '''
    For a given list of peptides and a set of monosaccharides,
    this script builds all theoretical combinations of glycopeptides.

    Usage::

        ./build_glycopeptide_combination.py
    '''
    sp_run = sugarpy.run.Run(
        monosaccharides={
            "dHex": 'C6H10O4',
            "Hex": 'C6H10O5',
            "HexNAc": 'C8H13NO5',
            "NeuAc": 'C11H17NO8',
        },
    )

    pep_unimod_list = [
        'GLYCANSTPEPTIDE',
    ]

    glycopeptide_combinations = sp_run.add_glycans2peptide(
        peptide_list=pep_unimod_list,
        max_tree_length=5,
    )

    pprint.pprint(glycopeptide_combinations)

if __name__ == '__main__':
    main()

Full Workflow Example Scripts¶

SugarPy run example¶

sugarpy_run_example.main(ident_file=None, unimod_glycans_incl_in_search=['HexNAc', 'HexNAc(2)'], max_tree_length=10, monosaccharides={'Hex': 'C6H10O5', 'HexNAc': 'C8H13NO5', 'NeuAc': 'C11H17NO8', 'dHex': 'C6H10O4'}, mzml_file=None, scan_rt_lookup=None, rt_border_tolerance=1, charges=[1, 2], output_file=None, pyqms_params={}, min_spec_number=1, min_tree_length=1, max_trees_per_spec=5, min_sugarpy_score=0, min_sub_cov=0.5, ms_level=1, force=True)¶

This example script takes the following two files as input and than performs a search for intact glycopeptides:

Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
mzML file: the raw MS data in .mzML format

Usage:

./sugarpy_run_example.py <ursgal_result_file.csv> <ms_data_file.mzML>

#!/usr/bin/env python
# encoding: utf-8

import sugarpy
import ursgal
import sys
import os
import statistics
import pickle
from collections import namedtuple


def main(
    ident_file=None,
    unimod_glycans_incl_in_search=[
        'HexNAc',
        'HexNAc(2)',
    ],
    max_tree_length=10,
    monosaccharides={
        "dHex": 'C6H10O4',
        "Hex": 'C6H10O5',
        "HexNAc": 'C8H13NO5',
        "NeuAc": 'C11H17NO8',
    },
    mzml_file=None,
    scan_rt_lookup=None,
    rt_border_tolerance=1,
    charges=[1, 2],
    output_file=None,
    pyqms_params={},
    min_spec_number=1,
    min_tree_length=1,
    max_trees_per_spec=5,
    min_sugarpy_score=0,
    min_sub_cov=0.5,
    ms_level=1,
    force=True,
):
    '''
    This example script takes the following two files as input and
    than performs a search for intact glycopeptides:

    * Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
    * mzML file: the raw MS data in .mzML format

    Usage::

        ./sugarpy_run_example.py <ursgal_result_file.csv> <ms_data_file.mzML>
    '''

    # initialize Sugary run
    sp_run = sugarpy.run.Run(
        monosaccharides=monosaccharides
    )

    # read the Ursgal result file and store pepides in lookup
    peptide_lookup = sp_run.parse_ident_file(
        ident_file=ident_file,
        unimod_glycans_incl_in_search=unimod_glycans_incl_in_search,
    )

    # build all theoretical glycopeptide combinations
    pep_unimod_list = list(peptide_lookup.keys())
    peps_with_glycans = sp_run.add_glycans2peptide(
        peptide_list=pep_unimod_list,
        max_tree_length=max_tree_length,
    )

    lookup_dict = sp_run.build_rt_lookup(mzml_file, ms_level)
    mzml_basename = os.path.basename(mzml_file).replace('.mzML', '')
    scan2rt = lookup_dict[mzml_basename]['scan_2_rt']
    output_folder = os.path.dirname(output_file)

    # run pyqms to identify isotope envelopes for all theoretical glycopeptides
    pyqms_results = {}
    for n, peptide_unimod in enumerate(sorted(peps_with_glycans)):
        pkl_name = '-'.join(peptide_unimod.split(';'))
        pkl_name = '_'.join(pkl_name.split(':'))
        rt_min = min(peptide_lookup[peptide_unimod]
                     ['rt']) - rt_border_tolerance
        rt_max = max(peptide_lookup[peptide_unimod]
                     ['rt']) + rt_border_tolerance
        print(
            '[ SugarPy  ] Quantification for peptide ',
            peptide_unimod,
            '#{0} out of {1}'.format(
                n + 1,
                len(peps_with_glycans)
            )
        )
        results_pkl = sp_run.quantify(
            molecule_name_dict=peps_with_glycans[peptide_unimod],
            rt_window=(rt_min, rt_max),
            ms_level=ms_level,
            charges=charges,
            params=pyqms_params,
            pkl_name=os.path.join(
                output_folder,
                '{0}_{1}_pyQms_results.pkl'.format(
                    mzml_basename,
                    pkl_name
                )
            ),
            mzml_file=mzml_file,
        )
        pyqms_results[peptide_unimod] = results_pkl

    # combine matched glycopeptide fragments to inteact glycopeptides
    validated_results = sp_run.validate_results(
        pyqms_results_dict=pyqms_results,
        min_spec_number=min_spec_number,
        min_tree_length=min_tree_length,
    )

    # initiate results class and save Sugary results as csv file
    sp_results = sugarpy.results.Results(
        validated_results=validated_results,
        monosaccharides=monosaccharides
    )
    sp_results.write_results2csv(
        output_file=output_file,
        max_trees_per_spec=max_trees_per_spec,
        min_sugarpy_score=min_sugarpy_score,
        min_sub_cov=min_sub_cov,
        peptide_lookup=peptide_lookup,
        scan_rt_lookup=scan2rt,
        mzml_basename=mzml_basename,
    )
    output_pkl = output_file.replace('.csv', '.pkl')
    with open(output_pkl, 'wb') as out_pkl:
        pickle.dump(sp_results, out_pkl)
    print('Done.')

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(main.__doc__)
        sys.exit(1)
    mzml_file = sys.argv[1]
    ident_file = sys.argv[2]
    output_file = ident_file.replace('.csv', '_sugarpy.csv')
    main(
        ident_file=ident_file,
        mzml_file=mzml_file,
        output_file=output_file,
    )

SugarPy results example¶

sugarpy_results_example.main(mzml_file=None, validated_results_pkl=None, ursgal_result_file=None, ms_level=1, output_file=None, min_sugarpy_score=1, min_sub_cov=0.5, min_spec_number=1, charges=[1, 2, 3, 4, 5], monosaccharides={'Hex': 'C6H10O5', 'HexNAc': 'C8H13NO5', 'NeuAc': 'C11H17NO8', 'dHex': 'C6H10O4'}, rt_border_tolerance=1, max_tree_length=10, unimod_glycans_incl_in_search=['HexNAc', 'HexNAc(2)'], max_trees_per_spec=5)¶

This example script takes the following three files as input:

Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
mzML file: the raw MS data in .mzML format
SugarPy results pkl: pkl file that contains SugarPy results (see sugarpy_run_exampe.py on how to generate it)

From those files, a SugarPy results csv is generated, as well as a glycopeptide elution profile.

Usage:

./sugarpy_results_example.py <ursgal_result_file.csv> <ms_data_file.mzML> <sugarpy_results.pkl>

#!/usr/bin/env python
# encoding: utf-8

import sugarpy
import ursgal
import sys
import os
import pickle

def main(
    mzml_file=None,
    validated_results_pkl=None,
    ursgal_result_file=None,
    ms_level=1,
    output_file=None,
    min_sugarpy_score=1,
    min_sub_cov=0.5,
    min_spec_number=1,
    charges=[1, 2, 3, 4, 5],
    monosaccharides={
        "dHex": 'C6H10O4',
        "Hex": 'C6H10O5',
        "HexNAc": 'C8H13NO5',
        "NeuAc": 'C11H17NO8',
    },
    rt_border_tolerance=1,
    max_tree_length=10,
    unimod_glycans_incl_in_search=[
        'HexNAc',
        'HexNAc(2)',
    ],
    max_trees_per_spec=5,
):
    '''
    This example script takes the following three files as input:

    * Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
    * mzML file: the raw MS data in .mzML format
    * SugarPy results pkl: pkl file that contains SugarPy results (see sugarpy_run_exampe.py on how to generate it)

    From those files, a SugarPy results csv is generated, as well as a glycopeptide elution profile.

    Usage::

        ./sugarpy_results_example.py <ursgal_result_file.csv> <ms_data_file.mzML> <sugarpy_results.pkl>
    '''

    # load results from pkl
    with open(validated_results_pkl, 'rb') as results_pkl:
        validated_results = pickle.load(results_pkl)

    mzml_basename = os.path.basename(mzml_file)
    sp_run = sugarpy.run.Run()
    lookup_dict = sp_run.build_rt_lookup(mzml_file, ms_level)
    scan2rt = lookup_dict[mzml_basename]['scan_2_rt']

    # load results class
    sp_results = sugarpy.results.Results(
        monosaccharides=monosaccharides,
        scan_rt_lookup=scan2rt,
        validated_results=validated_results,
    )

    # write results csv
    sp_run = sugarpy.run.Run()
    peptide_lookup = sp_run.parse_ident_file(
        ident_file=ursgal_ident_file,
        unimod_glycans_incl_in_search=unimod_glycans_incl_in_search
    )
    sp_result_file = sp_results.write_results2csv(
        output_file=validated_results_pkl.replace('.pkl', '.csv'),
        max_trees_per_spec=max_trees_per_spec,
        min_sugarpy_score=min_sugarpy_score,
        min_sub_cov=min_sub_cov,
        peptide_lookup=peptide_lookup,
        monosaccharides=monosaccharides,
        scan_rt_lookup=scan2rt,
        mzml_basename=mzml_basename
    )


    # plot glycopeptide elution profile
    plot_molecule_dict = sp_results.parse_result_file(sp_result_file)
    elution_profile_file = sp_results.plot_glycan_elution_profile(
        peptide_list=sorted(plot_molecule_dict.keys()),
        min_sugarpy_score=min_sugarpy_score,
        min_sub_cov=min_sub_cov,
        x_axis_type='retention_time',
        score_type='top_scores',
        output_file=output_file.replace('.txt', '_glycan_profile.html'),
        plotly_layout={
            'font' : {
                'family':'Arial',
                'size': 26,
                'color' : 'rgb(0, 0, 0)',
            },
            'width':1700,
            'height':1200,
            'margin':{
                'l':120,
                'r':50,
                't':50,
                'b':170,
            },
            'xaxis':{
                'type':'linear',
                'color':'rgb(0, 0, 0)',
                'title': 'Retention time [min]',
                'title_font':{
                    'family':'Arial',
                    'size':26,
                    'color':'rgb(0, 0, 0)',
                },
                'autorange':True,
                'fixedrange':False,
                'tickmode':'auto',
                'showticklabels':True,
                'tickfont':{
                    'family':'Arial',
                    'size':22,
                    'color':'rgb(0, 0, 0)',
                },
                'tickangle':0,
                'ticklen':5,
                'tickwidth':1,
                'tickcolor':'rgb(0, 0, 0)',
                'ticks':'outside',
                'showline':True,
                'linecolor':'rgb(0, 0, 0)',
                'mirror':False,
                'showgrid':False,
                'anchor':'y',
                'side':'bottom',
            },
            'yaxis':{
                'type':'linear',
                'color':'rgb(0, 0, 0)',
                'title':'Max. SugarPy score',
                'title_font':{
                    'family':'Arial',
                    'size':26,
                    'color':'rgb(0, 0, 0)',
                },
                'autorange':True,
                'fixedrange':False,
                'tickmode':'auto',
                'showticklabels':True,
                'tickfont':{
                    'family':'Arial',
                    'size':22,
                    'color':'rgb(0, 0, 0)',
                },
                'tickangle':0,
                'ticklen':5,
                'tickwidth':1,
                'tickcolor':'rgb(0, 0, 0)',
                'ticks':'outside',
                'showline':True,
                'linecolor':'rgb(0, 0, 0)',
                'mirror':False,
                'showgrid':False,
                'zeroline':False,
                'anchor':'x',
                'side':'left',
            },
            'legend':{
                'font':{
                    'family':'Arial',
                    'size':5,
                    'color':'rgb(0, 0, 0)',
                },
                'orientation':'v',
                'traceorder':'normal',
            },
            'showlegend':True,
            'paper_bgcolor':'rgba(0,0,0,0)',
            'plot_bgcolor':'rgba(0,0,0,0)',
        },
    )

    print('Done.')
    return output_file


if __name__ == '__main__':
    ursgal_ident_file = sys.argv[1]
    mzml_file = sys.argv[2]
    validated_results_pkl = sys.argv[3]
    output_file = result_file.replace('.csv', '.html')
    main(
        mzml_file=mzml_file,
        validated_results_pkl=validated_results_pkl,
        ursgal_result_file=ursgal_result_file,
        output_file=output_file,
    )