Example Scripts¶
This collection of example scripts is designed to get familiar with the basic functionality of SugarPy. They can be modified and combined according to your needs. Please also check the folder “example_scripts” as well as Ursgal’s “example_scripts” for further use cases and implementation into more compelx workflows.
Simple Example Scripts¶
Parse peptide sequence input file¶
-
parse_peptide_sequence_input_file.
main
(input_path=None)¶ This script generates a peptide lookup that can be used as an input for further SugarPy functions. Here, an Ursgal result file is parsed and pepide sequences as well as corresponding informations are stored in a dictionary. However, This dictionary can also be generated in different ways, but should follow this structure:
peptide_lookup = { 'sequence#unimod:pos': { 'rt': set(), 'spec_id': set(), 'accuracy': [], 'protein': 'description', } }
Usage:
./parse_peptide_sequence_input_file.py <inpu_result_file.csv>
#!/usr/bin/env python3
# encoding: utf-8
import sugarpy
import sys
import pprint
def main(input_path=None):
'''
This script generates a peptide lookup that can be used as an input
for further SugarPy functions. Here, an Ursgal result file is parsed
and pepide sequences as well as corresponding informations are stored
in a dictionary. However, This dictionary can also be generated in
different ways, but should follow this structure::
peptide_lookup = {
'sequence#unimod:pos': {
'rt': set(),
'spec_id': set(),
'accuracy': [],
'protein': 'description',
}
}
Usage::
./parse_peptide_sequence_input_file.py <inpu_result_file.csv>
'''
sp_run = sugarpy.run.Run()
peptide_lookup = sp_run.parse_ident_file(
ident_file=input_path,
unimod_glycans_incl_in_search=[
'HexNAc',
'HexNAc(2)',
],
)
pprint.pprint(peptide_lookup)
if __name__ == '__main__':
main(input_path=sys.argv[1])
Build glycopeptide combinations¶
-
build_glycopeptide_combinations.
main
()¶ For a given list of peptides and a set of monosaccharides, this script builds all theoretical combinations of glycopeptides.
Usage:
./build_glycopeptide_combination.py
#!/usr/bin/env python3
# encoding: utf-8
import sugarpy
import pprint
def main():
'''
For a given list of peptides and a set of monosaccharides,
this script builds all theoretical combinations of glycopeptides.
Usage::
./build_glycopeptide_combination.py
'''
sp_run = sugarpy.run.Run(
monosaccharides={
"dHex": 'C6H10O4',
"Hex": 'C6H10O5',
"HexNAc": 'C8H13NO5',
"NeuAc": 'C11H17NO8',
},
)
pep_unimod_list = [
'GLYCANSTPEPTIDE',
]
glycopeptide_combinations = sp_run.add_glycans2peptide(
peptide_list=pep_unimod_list,
max_tree_length=5,
)
pprint.pprint(glycopeptide_combinations)
if __name__ == '__main__':
main()
Full Workflow Example Scripts¶
SugarPy run example¶
-
sugarpy_run_example.
main
(ident_file=None, unimod_glycans_incl_in_search=['HexNAc', 'HexNAc(2)'], max_tree_length=10, monosaccharides={'Hex': 'C6H10O5', 'HexNAc': 'C8H13NO5', 'NeuAc': 'C11H17NO8', 'dHex': 'C6H10O4'}, mzml_file=None, scan_rt_lookup=None, rt_border_tolerance=1, charges=[1, 2], output_file=None, pyqms_params={}, min_spec_number=1, min_tree_length=1, max_trees_per_spec=5, min_sugarpy_score=0, min_sub_cov=0.5, ms_level=1, force=True)¶ This example script takes the following two files as input and than performs a search for intact glycopeptides:
- Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
- mzML file: the raw MS data in .mzML format
Usage:
./sugarpy_run_example.py <ursgal_result_file.csv> <ms_data_file.mzML>
#!/usr/bin/env python
# encoding: utf-8
import sugarpy
import ursgal
import sys
import os
import statistics
import pickle
from collections import namedtuple
def main(
ident_file=None,
unimod_glycans_incl_in_search=[
'HexNAc',
'HexNAc(2)',
],
max_tree_length=10,
monosaccharides={
"dHex": 'C6H10O4',
"Hex": 'C6H10O5',
"HexNAc": 'C8H13NO5',
"NeuAc": 'C11H17NO8',
},
mzml_file=None,
scan_rt_lookup=None,
rt_border_tolerance=1,
charges=[1, 2],
output_file=None,
pyqms_params={},
min_spec_number=1,
min_tree_length=1,
max_trees_per_spec=5,
min_sugarpy_score=0,
min_sub_cov=0.5,
ms_level=1,
force=True,
):
'''
This example script takes the following two files as input and
than performs a search for intact glycopeptides:
* Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
* mzML file: the raw MS data in .mzML format
Usage::
./sugarpy_run_example.py <ursgal_result_file.csv> <ms_data_file.mzML>
'''
# initialize Sugary run
sp_run = sugarpy.run.Run(
monosaccharides=monosaccharides
)
# read the Ursgal result file and store pepides in lookup
peptide_lookup = sp_run.parse_ident_file(
ident_file=ident_file,
unimod_glycans_incl_in_search=unimod_glycans_incl_in_search,
)
# build all theoretical glycopeptide combinations
pep_unimod_list = list(peptide_lookup.keys())
peps_with_glycans = sp_run.add_glycans2peptide(
peptide_list=pep_unimod_list,
max_tree_length=max_tree_length,
)
lookup_dict = sp_run.build_rt_lookup(mzml_file, ms_level)
mzml_basename = os.path.basename(mzml_file).replace('.mzML', '')
scan2rt = lookup_dict[mzml_basename]['scan_2_rt']
output_folder = os.path.dirname(output_file)
# run pyqms to identify isotope envelopes for all theoretical glycopeptides
pyqms_results = {}
for n, peptide_unimod in enumerate(sorted(peps_with_glycans)):
pkl_name = '-'.join(peptide_unimod.split(';'))
pkl_name = '_'.join(pkl_name.split(':'))
rt_min = min(peptide_lookup[peptide_unimod]
['rt']) - rt_border_tolerance
rt_max = max(peptide_lookup[peptide_unimod]
['rt']) + rt_border_tolerance
print(
'[ SugarPy ] Quantification for peptide ',
peptide_unimod,
'#{0} out of {1}'.format(
n + 1,
len(peps_with_glycans)
)
)
results_pkl = sp_run.quantify(
molecule_name_dict=peps_with_glycans[peptide_unimod],
rt_window=(rt_min, rt_max),
ms_level=ms_level,
charges=charges,
params=pyqms_params,
pkl_name=os.path.join(
output_folder,
'{0}_{1}_pyQms_results.pkl'.format(
mzml_basename,
pkl_name
)
),
mzml_file=mzml_file,
)
pyqms_results[peptide_unimod] = results_pkl
# combine matched glycopeptide fragments to inteact glycopeptides
validated_results = sp_run.validate_results(
pyqms_results_dict=pyqms_results,
min_spec_number=min_spec_number,
min_tree_length=min_tree_length,
)
# initiate results class and save Sugary results as csv file
sp_results = sugarpy.results.Results(
validated_results=validated_results,
monosaccharides=monosaccharides
)
sp_results.write_results2csv(
output_file=output_file,
max_trees_per_spec=max_trees_per_spec,
min_sugarpy_score=min_sugarpy_score,
min_sub_cov=min_sub_cov,
peptide_lookup=peptide_lookup,
scan_rt_lookup=scan2rt,
mzml_basename=mzml_basename,
)
output_pkl = output_file.replace('.csv', '.pkl')
with open(output_pkl, 'wb') as out_pkl:
pickle.dump(sp_results, out_pkl)
print('Done.')
if __name__ == '__main__':
if len(sys.argv) < 3:
print(main.__doc__)
sys.exit(1)
mzml_file = sys.argv[1]
ident_file = sys.argv[2]
output_file = ident_file.replace('.csv', '_sugarpy.csv')
main(
ident_file=ident_file,
mzml_file=mzml_file,
output_file=output_file,
)
SugarPy results example¶
-
sugarpy_results_example.
main
(mzml_file=None, validated_results_pkl=None, ursgal_result_file=None, ms_level=1, output_file=None, min_sugarpy_score=1, min_sub_cov=0.5, min_spec_number=1, charges=[1, 2, 3, 4, 5], monosaccharides={'Hex': 'C6H10O5', 'HexNAc': 'C8H13NO5', 'NeuAc': 'C11H17NO8', 'dHex': 'C6H10O4'}, rt_border_tolerance=1, max_tree_length=10, unimod_glycans_incl_in_search=['HexNAc', 'HexNAc(2)'], max_trees_per_spec=5)¶ This example script takes the following three files as input:
- Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
- mzML file: the raw MS data in .mzML format
- SugarPy results pkl: pkl file that contains SugarPy results (see sugarpy_run_exampe.py on how to generate it)
From those files, a SugarPy results csv is generated, as well as a glycopeptide elution profile.
Usage:
./sugarpy_results_example.py <ursgal_result_file.csv> <ms_data_file.mzML> <sugarpy_results.pkl>
#!/usr/bin/env python
# encoding: utf-8
import sugarpy
import ursgal
import sys
import os
import pickle
def main(
mzml_file=None,
validated_results_pkl=None,
ursgal_result_file=None,
ms_level=1,
output_file=None,
min_sugarpy_score=1,
min_sub_cov=0.5,
min_spec_number=1,
charges=[1, 2, 3, 4, 5],
monosaccharides={
"dHex": 'C6H10O4',
"Hex": 'C6H10O5',
"HexNAc": 'C8H13NO5',
"NeuAc": 'C11H17NO8',
},
rt_border_tolerance=1,
max_tree_length=10,
unimod_glycans_incl_in_search=[
'HexNAc',
'HexNAc(2)',
],
max_trees_per_spec=5,
):
'''
This example script takes the following three files as input:
* Ursgal result file: this file contains the identified glycopeptide sequences in .csv format
* mzML file: the raw MS data in .mzML format
* SugarPy results pkl: pkl file that contains SugarPy results (see sugarpy_run_exampe.py on how to generate it)
From those files, a SugarPy results csv is generated, as well as a glycopeptide elution profile.
Usage::
./sugarpy_results_example.py <ursgal_result_file.csv> <ms_data_file.mzML> <sugarpy_results.pkl>
'''
# load results from pkl
with open(validated_results_pkl, 'rb') as results_pkl:
validated_results = pickle.load(results_pkl)
mzml_basename = os.path.basename(mzml_file)
sp_run = sugarpy.run.Run()
lookup_dict = sp_run.build_rt_lookup(mzml_file, ms_level)
scan2rt = lookup_dict[mzml_basename]['scan_2_rt']
# load results class
sp_results = sugarpy.results.Results(
monosaccharides=monosaccharides,
scan_rt_lookup=scan2rt,
validated_results=validated_results,
)
# write results csv
sp_run = sugarpy.run.Run()
peptide_lookup = sp_run.parse_ident_file(
ident_file=ursgal_ident_file,
unimod_glycans_incl_in_search=unimod_glycans_incl_in_search
)
sp_result_file = sp_results.write_results2csv(
output_file=validated_results_pkl.replace('.pkl', '.csv'),
max_trees_per_spec=max_trees_per_spec,
min_sugarpy_score=min_sugarpy_score,
min_sub_cov=min_sub_cov,
peptide_lookup=peptide_lookup,
monosaccharides=monosaccharides,
scan_rt_lookup=scan2rt,
mzml_basename=mzml_basename
)
# plot glycopeptide elution profile
plot_molecule_dict = sp_results.parse_result_file(sp_result_file)
elution_profile_file = sp_results.plot_glycan_elution_profile(
peptide_list=sorted(plot_molecule_dict.keys()),
min_sugarpy_score=min_sugarpy_score,
min_sub_cov=min_sub_cov,
x_axis_type='retention_time',
score_type='top_scores',
output_file=output_file.replace('.txt', '_glycan_profile.html'),
plotly_layout={
'font' : {
'family':'Arial',
'size': 26,
'color' : 'rgb(0, 0, 0)',
},
'width':1700,
'height':1200,
'margin':{
'l':120,
'r':50,
't':50,
'b':170,
},
'xaxis':{
'type':'linear',
'color':'rgb(0, 0, 0)',
'title': 'Retention time [min]',
'title_font':{
'family':'Arial',
'size':26,
'color':'rgb(0, 0, 0)',
},
'autorange':True,
'fixedrange':False,
'tickmode':'auto',
'showticklabels':True,
'tickfont':{
'family':'Arial',
'size':22,
'color':'rgb(0, 0, 0)',
},
'tickangle':0,
'ticklen':5,
'tickwidth':1,
'tickcolor':'rgb(0, 0, 0)',
'ticks':'outside',
'showline':True,
'linecolor':'rgb(0, 0, 0)',
'mirror':False,
'showgrid':False,
'anchor':'y',
'side':'bottom',
},
'yaxis':{
'type':'linear',
'color':'rgb(0, 0, 0)',
'title':'Max. SugarPy score',
'title_font':{
'family':'Arial',
'size':26,
'color':'rgb(0, 0, 0)',
},
'autorange':True,
'fixedrange':False,
'tickmode':'auto',
'showticklabels':True,
'tickfont':{
'family':'Arial',
'size':22,
'color':'rgb(0, 0, 0)',
},
'tickangle':0,
'ticklen':5,
'tickwidth':1,
'tickcolor':'rgb(0, 0, 0)',
'ticks':'outside',
'showline':True,
'linecolor':'rgb(0, 0, 0)',
'mirror':False,
'showgrid':False,
'zeroline':False,
'anchor':'x',
'side':'left',
},
'legend':{
'font':{
'family':'Arial',
'size':5,
'color':'rgb(0, 0, 0)',
},
'orientation':'v',
'traceorder':'normal',
},
'showlegend':True,
'paper_bgcolor':'rgba(0,0,0,0)',
'plot_bgcolor':'rgba(0,0,0,0)',
},
)
print('Done.')
return output_file
if __name__ == '__main__':
ursgal_ident_file = sys.argv[1]
mzml_file = sys.argv[2]
validated_results_pkl = sys.argv[3]
output_file = result_file.replace('.csv', '.html')
main(
mzml_file=mzml_file,
validated_results_pkl=validated_results_pkl,
ursgal_result_file=ursgal_result_file,
output_file=output_file,
)