In this tutorial, we will fold a protein structure using a very simple algorithm in PyRosetta, and compare the folded structure with the solved crystal structure of the protein.
We begin by importing the relevant libraries from Python. If running the following cell produces any errors or warnings, make sure you have followed all the steps in the "Setting up Pyrosetta" section.
import os
import glob
import shutil
import pandas as pd
import nglview as ngl
import pyrosetta as prs
prs.init()
from pyrosetta import rosetta
scorefxn_low = prs.create_score_function('score3')
scorefxn_high = prs.get_fa_scorefxn()
native_pose = prs.pose_from_pdb('data/1BL0/1BL0_chainA.pdb')
We can check the amino acid sequence of the structure with a very simple command.
native_pose.sequence()
We can also assign the correct secondary structure.
DSSP = prs.rosetta.protocols.moves.DsspMover()
DSSP.apply(native_pose) # populates the pose's Pose.secstruct
Let's view more information about the first residue of the protein.
print(native_pose.residue(1))
pose = prs.pose_from_sequence(native_pose.sequence())
test_pose = prs.Pose()
test_pose.assign(pose)
test_pose.pdb_info().name('Linearized Pose')
view = ngl.show_rosetta(test_pose)
view.add_ball_and_stick()
view # zoom in to see the atoms!
We will be using the centroid representation to perform rough and fast scoring in the initial stages of the folding algorithm. Later on, we will switch to the full atom represenation to do accurate minimization and get the final structures.
to_centroid = prs.SwitchResidueTypeSetMover('centroid')
to_full_atom = prs.SwitchResidueTypeSetMover('fa_standard')
to_full_atom.apply(test_pose)
print('Full Atom Score:', scorefxn_high(test_pose))
to_centroid.apply(test_pose)
print('Centroid Score:', scorefxn_low(test_pose))
# here write the code to visualize the centroid only structure and print the information of the 1st residue
print(native_pose.residue(1))
view = ngl.show_rosetta(test_pose)
view.add_ball_and_stick()
view
# Loading the files with the pre-computed fragmets
long_frag_filename = 'data/1BL0/9_fragments.txt'
long_frag_length = 9
short_frag_filename = 'data/1BL0/3_fragments.txt'
short_frag_length = 3
# Defining parameters of the folding algorithm
long_inserts=5 # How many 9-fragment pieces to insest during the search
short_inserts=10 # How many 3-fragment pieces to insest during the search
kT = 3.0 # Simulated Annealing temperature
cycles = 2 # How many cycles of Monte Carlo search to run
jobs = 50 # How many trajectories in parallel to compute.
job_output = 'outputs/1BL0/decoy' # The prefix of the filenames to store the results
movemap = prs.MoveMap()
movemap.set_bb(True)
fragset_long = rosetta.core.fragment.ConstantLengthFragSet(long_frag_length, long_frag_filename)
long_frag_mover = rosetta.protocols.simple_moves.ClassicFragmentMover(fragset_long, movemap)
fragset_short = rosetta.core.fragment.ConstantLengthFragSet(short_frag_length, short_frag_filename)
short_frag_mover = rosetta.protocols.simple_moves.ClassicFragmentMover(fragset_short, movemap)
insert_long_frag = prs.RepeatMover(long_frag_mover, long_inserts)
insert_short_frag = prs.RepeatMover(short_frag_mover, short_inserts)
200
# Making sure the structure is in centroid-only mode for the search
test_pose.assign(pose)
to_centroid.apply(test_pose)
# Defining what sequence of actions to do between each scoring step
folding_mover = prs.SequenceMover()
folding_mover.add_mover(insert_long_frag)
folding_mover.add_mover(insert_short_frag)
mc = prs.MonteCarlo(test_pose, scorefxn_low, kT)
trial = prs.TrialMover(folding_mover, mc)
# Setting up how many cycles of search to do in each trajectory
folding = prs.RepeatMover(trial, cycles)
fast_relax_mover = prs.rosetta.protocols.relax.FastRelax(scorefxn_high)
scores = [0] * (jobs + 1)
scores[0] = scorefxn_low(test_pose)
if os.path.isdir(os.path.dirname(job_output)):
shutil.rmtree(os.path.dirname(job_output), ignore_errors=True)
os.makedirs(os.path.dirname(job_output))
jd = prs.PyJobDistributor(job_output, nstruct=jobs, scorefxn=scorefxn_high)
counter = 0
while not jd.job_complete:
# a. set necessary variables for the new trajectory
# -reload the starting pose
test_pose.assign(pose)
to_centroid.apply(test_pose)
# -change the pose's PDBInfo.name, for the PyMOL_Observer
counter += 1
test_pose.pdb_info().name(job_output + '_' + str(counter))
# -reset the MonteCarlo object (sets lowest_score to that of test_pose)
mc.reset(test_pose)
#### if you create a custom protocol, you may have additional
#### variables to reset, such as kT
#### if you create a custom protocol, this section will most likely
#### change, many protocols exist as single Movers or can be
#### chained together in a sequence (see above) so you need
#### only apply the final Mover
# b. apply the refinement protocol
folding.apply(test_pose)
####
# c. export the lowest scoring decoy structure for this trajectory
# -recover the lowest scoring decoy structure
mc.recover_low(test_pose)
# -store the final score for this trajectory
# -convert the decoy to fullatom
# the sidechain conformations will all be default,
# normally, the decoys would NOT be converted to fullatom before
# writing them to PDB (since a large number of trajectories would
# be considered and their fullatom score are unnecessary)
# here the fullatom mode is reproduced to make the output easier to
# understand and manipulate, PyRosetta can load in PDB files of
# centroid structures, however you must convert to fullatom for
# nearly any other application
to_full_atom.apply(test_pose)
fast_relax_mover.apply(test_pose)
scores[counter] = scorefxn_high(test_pose)
# -output the fullatom decoy structure into a PDB file
jd.output_decoy(test_pose)
# -export the final structure to PyMOL
test_pose.pdb_info().name(job_output + '_' + str(counter) + '_fa')
decoy_poses = [prs.pose_from_pdb(f) for f in glob.glob(job_output + '*.pdb')]
def align_and_get_rmsds(native_pose, decoy_poses):
prs.rosetta.core.pose.full_model_info.make_sure_full_model_info_is_setup(native_pose)
rmsds = []
for p in decoy_poses:
prs.rosetta.core.pose.full_model_info.make_sure_full_model_info_is_setup(p)
rmsds += [prs.rosetta.protocols.stepwise.modeler.align.superimpose_with_stepwise_aligner(native_pose, p)]
return rmsds
rmsds = align_and_get_rmsds(native_pose, decoy_poses)
rmsd_data = []
for i in range(1, len(decoy_poses)): # print out the job scores
rmsd_data.append({'structure': decoy_poses[i].pdb_info().name(),
'rmsd': rmsds[i],
'energy_score': scores[i]})
rmsd_df = pd.DataFrame(rmsd_data)
rmsd_df.sort_values('rmsd')