Publications
Motmaen, Amir; Dauparas, Justas; Baek, Minkyung; Abedi, Mohamad H.; Baker, David; Bradley, Philip
Peptide-binding specificity prediction using fine-tuned protein structure prediction networks Journal Article
In: Proceedings of the National Academy of Sciences, 2023.
% Journal article (PNAS, 2023). Authors are joined with " and " so that each
% name is parsed as a separate person; the url value keeps the "URL, label" form.
@article{nokey,
  author     = {Motmaen, Amir and Dauparas, Justas and Baek, Minkyung and Abedi, Mohamad H. and Baker, David and Bradley, Philip},
  title      = {Peptide-binding specificity prediction using fine-tuned protein structure prediction networks},
  journal    = {Proceedings of the National Academy of Sciences},
  year       = {2023},
  date       = {2023-02-21},
  urldate    = {2023-02-21},
  doi        = {10.1073/pnas.2216697120},
  url        = {https://www.pnas.org/doi/10.1073/pnas.2216697120, PNAS (Open Access)},
  abstract   = {Peptide-binding proteins play key roles in biology, and predicting their binding specificity is a long-standing challenge. While considerable protein structural information is available, the most successful current methods use sequence information alone, in part because it has been a challenge to model the subtle structural changes accompanying sequence substitutions. Protein structure prediction networks such as AlphaFold model sequence-structure relationships very accurately, and we reasoned that if it were possible to specifically train such networks on binding data, more generalizable models could be created. We show that placing a classifier on top of the AlphaFold network and fine-tuning the combined network parameters for both classification and structure prediction accuracy leads to a model with strong generalizable performance on a wide range of Class I and Class II peptide-MHC interactions that approaches the overall performance of the state-of-the-art NetMHCpan sequence-based method. The peptide-MHC optimized model shows excellent performance in distinguishing binding and non-binding peptides to SH3 and PDZ domains. This ability to generalize well beyond the training set far exceeds that of sequence-only models and should be particularly powerful for systems where less experimental data are available.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Dauparas, J.; Anishchenko, I.; Bennett, N.; Bai, H.; Ragotte, R. J.; Milles, L. F.; Wicky, B. I. M.; Courbet, A.; de Haas, R. J.; Bethel, N.; Leung, P. J. Y.; Huddy, T. F.; Pellock, S.; Tischer, D.; Chan, F.; Koepnick, B.; Nguyen, H.; Kang, A.; Sankaran, B.; Bera, A. K.; King, N. P.; Baker, D.
Robust deep learning–based protein sequence design using ProteinMPNN Journal Article
In: Science, 2022.
% Journal article (Science, 2022). The url value holds one "URL, label" pair per line.
@article{Dauparas2022,
  author     = {Dauparas, J. and Anishchenko, I. and Bennett, N. and Bai, H.
                and Ragotte, R. J. and Milles, L. F. and Wicky, B. I. M. and Courbet, A.
                and de Haas, R. J. and Bethel, N. and Leung, P. J. Y. and Huddy, T. F.
                and Pellock, S. and Tischer, D. and Chan, F. and Koepnick, B.
                and Nguyen, H. and Kang, A. and Sankaran, B. and Bera, A. K.
                and King, N. P. and Baker, D.},
  title      = {Robust deep learning–based protein sequence design using ProteinMPNN},
  journal    = {Science},
  year       = {2022},
  date       = {2022-09-15},
  doi        = {10.1126/science.add2187},
  url        = {https://www.science.org/doi/abs/10.1126/science.add2187, Science
https://www.bakerlab.org/wp-content/uploads/2022/09/Dauparas_etal_Science2022_Sequence_design_via_ProteinMPNN.pdf, PDF},
  abstract   = {While deep learning has revolutionized protein structure prediction, almost all experimentally characterized de novo protein designs have been generated using physically based approaches such as Rosetta. Here we describe a deep learning–based protein sequence design method, ProteinMPNN, with outstanding performance in both in silico and experimental tests. On native protein backbones, ProteinMPNN has a sequence recovery of 52.4%, compared to 32.9% for Rosetta. The amino acid sequence at different positions can be coupled between single or multiple chains, enabling application to a wide range of current protein design challenges. We demonstrate the broad utility and high accuracy of ProteinMPNN using X-ray crystallography, cryoEM and functional studies by rescuing previously failed designs, made using Rosetta or AlphaFold, of protein monomers, cyclic homo-oligomers, tetrahedral nanoparticles, and target binding proteins},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Wang, Jue; Lisanza, Sidney; Juergens, David; Tischer, Doug; Watson, Joseph L.; Castro, Karla M.; Ragotte, Robert; Saragovi, Amijai; Milles, Lukas F.; Baek, Minkyung; Anishchenko, Ivan; Yang, Wei; Hicks, Derrick R.; Expòsit, Marc; Schlichthaerle, Thomas; Chun, Jung-Ho; Dauparas, Justas; Bennett, Nathaniel; Wicky, Basile I. M.; Muenks, Andrew; DiMaio, Frank; Correia, Bruno; Ovchinnikov, Sergey; Baker, David
Scaffolding protein functional sites using deep learning Journal Article
In: Science, 2022.
% Journal article (Science, 2022). Names are given in "Last, First" form;
% the url value holds one "URL, label" pair per line.
@article{Wang2022,
  author     = {Wang, Jue and Lisanza, Sidney and Juergens, David and Tischer, Doug
                and Watson, Joseph L. and Castro, Karla M. and Ragotte, Robert and Saragovi, Amijai
                and Milles, Lukas F. and Baek, Minkyung and Anishchenko, Ivan and Yang, Wei
                and Hicks, Derrick R. and Expòsit, Marc and Schlichthaerle, Thomas and Chun, Jung-Ho
                and Dauparas, Justas and Bennett, Nathaniel and Wicky, Basile I. M. and Muenks, Andrew
                and DiMaio, Frank and Correia, Bruno and Ovchinnikov, Sergey and Baker, David},
  title      = {Scaffolding protein functional sites using deep learning},
  journal    = {Science},
  year       = {2022},
  date       = {2022-07-21},
  urldate    = {2022-07-21},
  doi        = {10.1126/science.abn2100},
  url        = {https://www.science.org/doi/abs/10.1126/science.abn2100, Science
https://www.ipd.uw.edu/wp-content/uploads/2022/07/science.abn2100.pdf, Download PDF},
  abstract   = {The binding and catalytic functions of proteins are generally mediated by a small number of functional residues held in place by the overall protein structure. Here, we describe deep learning approaches for scaffolding such functional sites without needing to prespecify the fold or secondary structure of the scaffold. The first approach, “constrained hallucination,” optimizes sequences such that their predicted structures contain the desired functional site. The second approach, “inpainting,” starts from the functional site and fills in additional sequence and structure to create a viable protein scaffold in a single forward pass through a specifically trained RoseTTAFold network. We use these two methods to design candidate immunogens, receptor traps, metalloproteins, enzymes, and protein-binding proteins and validate the designs using a combination of in silico and experimental tests. Protein design has had success in finding sequences that fold into a desired conformation, but designing functional proteins remains challenging. Wang et al. describe two deep-learning methods to design proteins that contain prespecified functional sites. In the first, they found sequences predicted to fold into stable structures that contain the functional site. In the second, they retrained a structure prediction network to recover the sequence and full structure of a protein given only the functional site. The authors demonstrate their methods by designing proteins containing a variety of functional motifs. —VV Deep-learning methods enable the scaffolding of desired functional residues within a well-folded designed protein.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Sen, Neeladri; Anishchenko, Ivan; Bordin, N.; Sillitoe, Ian; Velankar, Sameer; Baker, David; Orengo, Christine
Characterizing and explaining the impact of disease-associated mutations in proteins without known structures or structural homologs Journal Article
In: Briefings in Bioinformatics, 2022.
% Journal article (Briefings in Bioinformatics, 2022). Every author uses the
% "Last, First" comma form so that "Bordin" is parsed as the surname.
@article{Sen2022,
  author     = {Sen, Neeladri
                and Anishchenko, Ivan
                and Bordin, N.
                and Sillitoe, Ian
                and Velankar, Sameer
                and Baker, David
                and Orengo, Christine},
  title      = {Characterizing and explaining the impact of disease-associated mutations in proteins without known structures or structural homologs},
  journal    = {Briefings in Bioinformatics},
  year       = {2022},
  date       = {2022-07-18},
  doi        = {10.1093/bib/bbac187},
  url        = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9294430/},
  abstract   = {Mutations in human proteins lead to diseases. The structure of these proteins can help understand the mechanism of such diseases and develop therapeutics against them. With improved deep learning techniques, such as RoseTTAFold and AlphaFold, we can predict the structure of proteins even in the absence of structural homologs. We modeled and extracted the domains from 553 disease-associated human proteins without known protein structures or close homologs in the Protein Databank. We noticed that the model quality was higher and the Root mean square deviation (RMSD) lower between AlphaFold and RoseTTAFold models for domains that could be assigned to CATH families as compared to those which could only be assigned to Pfam families of unknown structure or could not be assigned to either. We predicted ligand-binding sites, protein-protein interfaces and conserved residues in these predicted structures. We then explored whether the disease-associated missense mutations were in the proximity of these predicted functional sites, whether they destabilized the protein structure based on ddG calculations or whether they were predicted to be pathogenic. We could explain 80% of these disease-associated mutations based on proximity to functional sites, structural destabilization or pathogenicity. When compared to polymorphisms, a larger percentage of disease-associated missense mutations were buried, closer to predicted functional sites, predicted as destabilizing and pathogenic. Usage of models from the two state-of-the-art techniques provide better confidence in our predictions, and we explain 93 additional mutations based on RoseTTAFold models which could not be explained based solely on AlphaFold models.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Linder, Johannes; La Fleur, Alyssa; Chen, Zibo; Ljubetič, Ajasja; Baker, David; Kannan, Sreeram; Seelig, Georg
Interpreting neural networks for biological sequences by learning stochastic masks Journal Article
In: Nature Machine Intelligence, 2022.
% Journal article (Nature Machine Intelligence, 2022).
@article{Linder2022,
  author     = {Linder, Johannes
                and La Fleur, Alyssa
                and Chen, Zibo
                and Ljubetič, Ajasja
                and Baker, David
                and Kannan, Sreeram
                and Seelig, Georg},
  title      = {Interpreting neural networks for biological sequences by learning stochastic masks},
  journal    = {Nature Machine Intelligence},
  year       = {2022},
  date       = {2022-01-25},
  urldate    = {2022-01-25},
  doi        = {10.1038/s42256-021-00428-6},
  url        = {https://www.nature.com/articles/s42256-021-00428-6, Nature Machine Intelligence},
  abstract   = {Sequence-based neural networks can learn to make accurate predictions from large biological datasets, but model interpretation remains challenging. Many existing feature attribution methods are optimized for continuous rather than discrete input patterns and assess individual feature importance in isolation, making them ill-suited for interpreting nonlinear interactions in molecular sequences. Here, building on work in computer vision and natural language processing, we developed an approach based on deep learning—scrambler networks—wherein the most important sequence positions are identified with learned input masks. Scramblers learn to predict position-specific scoring matrices where unimportant nucleotides or residues are scrambled by raising their entropy. We apply scramblers to interpret the effects of genetic variants, uncover nonlinear interactions between cis-regulatory elements, explain binding specificity for protein–protein interactions, and identify structural determinants of de novo-designed proteins. We show that scramblers enable efficient attribution across large datasets and result in high-quality explanations, often outperforming state-of-the-art methods.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Baek, Minkyung; Baker, David
Deep learning and protein structure modeling Journal Article
In: Nature Methods, 2022.
% Journal article (Nature Methods, 2022). The url value holds exactly two
% "URL, label" lines, with no trailing empty line inside the braces.
@article{Baek2022,
  author     = {Baek, Minkyung and Baker, David},
  title      = {Deep learning and protein structure modeling},
  journal    = {Nature Methods},
  year       = {2022},
  date       = {2022-01-22},
  urldate    = {2022-01-22},
  doi        = {10.1038/s41592-021-01360-8},
  url        = {https://www.nature.com/articles/s41592-021-01360-8, Nature Methods
https://www.bakerlab.org/wp-content/uploads/2022/01/Baek_Baker_NatureMethods2022_Deep_Learning_and_Protein_Structure_Modeling.pdf, Download PDF},
  abstract   = {Deep learning has transformed protein structure modeling. Here we relate AlphaFold and RoseTTAFold to classical physically based approaches to protein structure prediction, and discuss the many areas of structural biology that are likely to be affected by further advances in deep learning.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Anishchenko, Ivan; Pellock, Samuel J.; Chidyausiku, Tamuka M.; Ramelot, Theresa A.; Ovchinnikov, Sergey; Hao, Jingzhou; Bafna, Khushboo; Norn, Christoffer; Kang, Alex; Bera, Asim K.; DiMaio, Frank; Carter, Lauren; Chow, Cameron M.; Montelione, Gaetano T.; Baker, David
De novo protein design by deep network hallucination Journal Article
In: Nature, 2021.
% Journal article (Nature, 2021). The abstract's first sentence no longer carries
% the fused reference marker "1–3" left over from the publisher page.
@article{Anishchenko2021,
  author     = {Anishchenko, Ivan
                and Pellock, Samuel J.
                and Chidyausiku, Tamuka M.
                and Ramelot, Theresa A.
                and Ovchinnikov, Sergey
                and Hao, Jingzhou
                and Bafna, Khushboo
                and Norn, Christoffer
                and Kang, Alex
                and Bera, Asim K.
                and DiMaio, Frank
                and Carter, Lauren
                and Chow, Cameron M.
                and Montelione, Gaetano T.
                and Baker, David},
  title      = {De novo protein design by deep network hallucination},
  journal    = {Nature},
  year       = {2021},
  date       = {2021-12-01},
  urldate    = {2021-12-01},
  doi        = {10.1038/s41586-021-04184-w},
  url        = {https://www.nature.com/articles/s41586-021-04184-w
https://www.bakerlab.org/wp-content/uploads/2022/01/Anishchenko_etal_Nature2021_DeepNetworkHallucination.pdf},
  abstract   = {There has been considerable recent progress in protein structure prediction using deep neural networks to predict inter-residue distances from amino acid sequences. Here we investigate whether the information captured by such networks is sufficiently rich to generate new folded proteins with sequences unrelated to those of the naturally occurring proteins used in training the models. We generate random amino acid sequences, and input them into the trRosetta structure prediction network to predict starting residue–residue distance maps, which, as expected, are quite featureless. We then carry out Monte Carlo sampling in amino acid sequence space, optimizing the contrast (Kullback–Leibler divergence) between the inter-residue distance distributions predicted by the network and background distributions averaged over all proteins. Optimization from different random starting points resulted in novel proteins spanning a wide range of sequences and predicted structures. We obtained synthetic genes encoding 129 of the network-‘hallucinated’ sequences, and expressed and purified the proteins in Escherichia coli; 27 of the proteins yielded monodisperse species with circular dichroism spectra consistent with the hallucinated structures. We determined the three-dimensional structures of three of the hallucinated proteins, two by X-ray crystallography and one by NMR, and these closely matched the hallucinated models. Thus, deep networks trained to predict native protein structures from their sequences can be inverted to design new proteins, and such networks and methods should contribute alongside traditional physics-based models to the de novo design of proteins with new functions.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Du, Zongyang; Su, Hong; Wang, Wenkai; Ye, Lisha; Wei, Hong; Peng, Zhenling; Anishchenko, Ivan; Baker, David; Yang, Jianyi
The trRosetta server for fast and accurate protein structure prediction Journal Article
In: Nature Protocols, 2021.
% Journal article (Nature Protocols, 2021). The url value holds one URL per line.
@article{Du2021,
  author     = {Du, Zongyang and Su, Hong and Wang, Wenkai and Ye, Lisha and Wei, Hong
                and Peng, Zhenling and Anishchenko, Ivan and Baker, David and Yang, Jianyi},
  title      = {The trRosetta server for fast and accurate protein structure prediction},
  journal    = {Nature Protocols},
  year       = {2021},
  date       = {2021-12-01},
  urldate    = {2021-12-01},
  doi        = {10.1038/s41596-021-00628-9},
  url        = {https://www.nature.com/articles/s41596-021-00628-9
https://www.bakerlab.org/wp-content/uploads/2022/01/Du_etal_NatProt2021_trRosetta_server.pdf},
  abstract   = {The trRosetta (transform-restrained Rosetta) server is a web-based platform for fast and accurate protein structure prediction, powered by deep learning and Rosetta. With the input of a protein’s amino acid sequence, a deep neural network is first used to predict the inter-residue geometries, including distance and orientations. The predicted geometries are then transformed as restraints to guide the structure prediction on the basis of direct energy minimization, which is implemented under the framework of Rosetta. The trRosetta server distinguishes itself from other similar structure prediction servers in terms of rapid and accurate de novo structure prediction. As an illustration, trRosetta was applied to two Pfam families with unknown structures, for which the predicted de novo models were estimated to have high accuracy. Nevertheless, to take advantage of homology modeling, homologous templates are used as additional inputs to the network automatically. In general, it takes ~1 h to predict the final structure for a typical protein with ~300 amino acids, using a maximum of 10 CPU cores in parallel in our cluster system. To enable large-scale structure modeling, a downloadable package of trRosetta with open-source codes is available as well. A detailed guidance for using the package is also available in this protocol. The server and the package are available at https://yanglab.nankai.edu.cn/trRosetta/ and https://yanglab.nankai.edu.cn/trRosetta/download/, respectively.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Baek, Minkyung; Anishchenko, Ivan; Park, Hahnbeom; Humphreys, Ian R.; Baker, David
Protein oligomer modeling guided by predicted inter-chain contacts in CASP14 Journal Article
In: Proteins, 2021.
% Journal article (Proteins, 2021). The abstract value closes directly after its
% final sentence, with no dangling line break inside the braces.
@article{Baek2021b,
  author     = {Baek, Minkyung and Anishchenko, Ivan and Park, Hahnbeom and Humphreys, Ian R. and Baker, David},
  title      = {Protein oligomer modeling guided by predicted inter-chain contacts in CASP14},
  journal    = {Proteins},
  year       = {2021},
  date       = {2021-07-29},
  urldate    = {2021-07-29},
  doi        = {10.1002/prot.26197},
  url        = {https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.26197, Proteins},
  abstract   = {For CASP14, we developed deep learning-based methods for predicting homo-oligomeric and hetero-oligomeric contacts and used them for oligomer modeling. To build structure models, we developed an oligomer structure generation method that utilizes predicted inter-chain contacts to guide iterative restrained minimization from random backbone structures. We supplemented this gradient-based fold-and-dock method with template-based and ab initio docking approaches using deep learning-based subunit predictions on 29 assembly targets. These methods produced oligomer models with summed Z-scores 5.5 units higher than the next best group, with the fold-and-dock method having the best relative performance. Over the eight targets for which this method was used, the best of the five submitted models had average oligomer TM-score of 0.71 (average oligomer TM-score of the next best group: 0.64), and explicit modeling of inter-subunit interactions improved modeling of six out of 40 individual domains (ΔGDT-TS > 2.0).},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Baek, Minkyung; DiMaio, Frank; Anishchenko, Ivan; Dauparas, Justas; Ovchinnikov, Sergey; Lee, Gyu Rie; Wang, Jue; Cong, Qian; Kinch, Lisa N.; Schaeffer, R. Dustin; Millán, Claudia; Park, Hahnbeom; Adams, Carson; Glassman, Caleb R.; DeGiovanni, Andy; Pereira, Jose H.; Rodrigues, Andria V.; van Dijk, Alberdina A.; Ebrecht, Ana C.; Opperman, Diederik J.; Sagmeister, Theo; Buhlheller, Christoph; Pavkov-Keller, Tea; Rathinaswamy, Manoj K.; Dalwadi, Udit; Yip, Calvin K.; Burke, John E.; Garcia, K. Christopher; Grishin, Nick V.; Adams, Paul D.; Read, Randy J.; Baker, David
Accurate prediction of protein structures and interactions using a three-track neural network Journal Article
In: Science, 2021.
% Journal article (Science, 2021). The url value holds one "URL, label" pair per line.
@article{Baek2021,
  author     = {Baek, Minkyung and DiMaio, Frank and Anishchenko, Ivan and Dauparas, Justas
                and Ovchinnikov, Sergey and Lee, Gyu Rie and Wang, Jue and Cong, Qian
                and Kinch, Lisa N. and Schaeffer, R. Dustin and Millán, Claudia and Park, Hahnbeom
                and Adams, Carson and Glassman, Caleb R. and DeGiovanni, Andy and Pereira, Jose H.
                and Rodrigues, Andria V. and van Dijk, Alberdina A. and Ebrecht, Ana C. and Opperman, Diederik J.
                and Sagmeister, Theo and Buhlheller, Christoph and Pavkov-Keller, Tea and Rathinaswamy, Manoj K.
                and Dalwadi, Udit and Yip, Calvin K. and Burke, John E. and Garcia, K. Christopher
                and Grishin, Nick V. and Adams, Paul D. and Read, Randy J. and Baker, David},
  title      = {Accurate prediction of protein structures and interactions using a three-track neural network},
  journal    = {Science},
  year       = {2021},
  date       = {2021-07-15},
  urldate    = {2021-07-15},
  doi        = {10.1126/science.abj8754},
  url        = {http://science.sciencemag.org/content/early/2021/07/14/science.abj8754, Science
https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf, Download PDF},
  abstract   = {DeepMind presented remarkably accurate predictions at the recent CASP14 protein structure prediction assessment conference. We explored network architectures incorporating related ideas and obtained the best performance with a three-track network in which information at the 1D sequence level, the 2D distance map level, and the 3D coordinate level is successively transformed and integrated. The three-track network produces structure predictions with accuracies approaching those of DeepMind in CASP14, enables the rapid solution of challenging X-ray crystallography and cryo-EM structure modeling problems, and provides insights into the functions of proteins of currently unknown structure. The network also enables rapid generation of accurate protein-protein complex models from sequence information alone, short-circuiting traditional approaches which require modeling of individual subunits followed by docking. We make the method available to the scientific community to speed biological research.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Norn, Christoffer; Wicky, Basile I. M.; Juergens, David; Liu, Sirui; Kim, David; Tischer, Doug; Koepnick, Brian; Anishchenko, Ivan; Baker, David; Ovchinnikov, Sergey
Protein sequence design by conformational landscape optimization Journal Article
In: Proceedings of the National Academy of Sciences, vol. 118, no. 11, 2021.
% Journal article (PNAS, 2021). In the abstract, the broken "{textquoteright}"
% macro is replaced by an apostrophe and the sentence break after
% "protein design methods." is restored.
@article{Norn2021,
  author     = {Norn, Christoffer and Wicky, Basile I. M. and Juergens, David and Liu, Sirui and Kim, David and Tischer, Doug and Koepnick, Brian and Anishchenko, Ivan and Baker, David and Ovchinnikov, Sergey},
  title      = {Protein sequence design by conformational landscape optimization},
  journal    = {Proceedings of the National Academy of Sciences},
  volume     = {118},
  number     = {11},
  year       = {2021},
  date       = {2021-03-16},
  urldate    = {2021-03-16},
  doi        = {10.1073/pnas.2017228118},
  url        = {https://www.pnas.org/content/118/11/e2017228118, PNAS
https://www.bakerlab.org/wp-content/uploads/2021/03/Norn_etal_PNAS2021_LandscapeOptimization.pdf, Download PDF},
  abstract   = {Almost all proteins fold to their lowest free energy state, which is determined by their amino acid sequence. Computational protein design has primarily focused on finding sequences that have very low energy in the target designed structure. However, what is most relevant during folding is not the absolute energy of the folded state but the energy difference between the folded state and the lowest-lying alternative states. We describe a deep learning approach that captures aspects of the folding landscape, in particular the presence of structures in alternative energy minima, and show that it can enhance current protein design methods. The protein design problem is to identify an amino acid sequence that folds to a desired structure. Given Anfinsen’s thermodynamic hypothesis of folding, this can be recast as finding an amino acid sequence for which the desired structure is the lowest energy state. As this calculation involves not only all possible amino acid sequences but also, all possible structures, most current approaches focus instead on the more tractable problem of finding the lowest-energy amino acid sequence for the desired structure, often checking by protein structure prediction in a second step that the desired structure is indeed the lowest-energy conformation for the designed sequence, and typically discarding a large fraction of designed sequences for which this is not the case. Here, we show that by backpropagating gradients through the transform-restrained Rosetta (trRosetta) structure prediction network from the desired structure to the input amino acid sequence, we can directly optimize over all possible amino acid sequences and all possible structures in a single calculation. We find that trRosetta calculations, which consider the full conformational landscape, can be more effective than Rosetta single-point energy estimations in predicting folding and stability of de novo designed proteins.
We compare sequence design by conformational landscape optimization with the standard energy-based sequence design methodology in Rosetta and show that the former can result in energy landscapes with fewer alternative energy minima. We show further that more funneled energy landscapes can be designed by combining the strengths of the two approaches: the low-resolution trRosetta model serves to disfavor alternative states, and the high-resolution Rosetta model serves to create a deep energy minimum at the design target structure.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Hiranuma, Naozumi; Park, Hahnbeom; Baek, Minkyung; Anishchenko, Ivan; Dauparas, Justas; Baker, David
Improved protein structure refinement guided by deep learning based accuracy estimation Journal Article
In: Nature Communications, vol. 12, no. 1340, 2021.
% Journal article (Nature Communications, 2021). 1340 is the article number, so it
% is stored in `pages`; `number` holds the issue (volume 12, issue 1).
% The author and abstract values no longer end with a dangling line break.
@article{Hiranuma2021,
  author     = {Hiranuma, Naozumi and Park, Hahnbeom and Baek, Minkyung and Anishchenko, Ivan and Dauparas, Justas and Baker, David},
  title      = {Improved protein structure refinement guided by deep learning based accuracy estimation},
  journal    = {Nature Communications},
  volume     = {12},
  number     = {1},
  pages      = {1340},
  year       = {2021},
  date       = {2021-02-26},
  urldate    = {2021-02-26},
  doi        = {10.1038/s41467-021-21511-x},
  url        = {https://www.nature.com/articles/s41467-021-21511-x, Nature Communications
https://www.bakerlab.org/wp-content/uploads/2021/02/Hiranuma_etal_NatureComms2021_DeepLearningStructureRefinement.pdf, Download PDF},
  abstract   = {We develop a deep learning framework (DeepAccNet) that estimates per-residue accuracy and residue-residue distance signed error in protein models and uses these predictions to guide Rosetta protein structure refinement. The network uses 3D convolutions to evaluate local atomic environments followed by 2D convolutions to provide their global contexts and outperforms other methods that similarly predict the accuracy of protein structure models. Overall accuracy predictions for X-ray and cryoEM structures in the PDB correlate with their resolution, and the network should be broadly useful for assessing the accuracy of both predicted structure models and experimentally determined structures and identifying specific regions likely to be in error. Incorporation of the accuracy predictions at multiple stages in the Rosetta refinement protocol considerably increased the accuracy of the resulting protein structure models, illustrating how deep learning can improve search for global energy minima of biomolecules.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Park, Hahnbeom; Zhou, Guangfeng; Baek, Minkyung; Baker, David; DiMaio, Frank
Force Field Optimization Guided by Small Molecule Crystal Lattice Data Enables Consistent Sub-Angstrom Protein–Ligand Docking Journal Article
In: Journal of Chemical Theory and Computation, 2021.
% Journal article (Journal of Chemical Theory and Computation, 2021).
% Names are given in "Last, First" form; the url value holds one URL per line.
@article{Park2021,
  author     = {Park, Hahnbeom and Zhou, Guangfeng and Baek, Minkyung and Baker, David and DiMaio, Frank},
  title      = {Force Field Optimization Guided by Small Molecule Crystal Lattice Data Enables Consistent Sub-Angstrom Protein–Ligand Docking},
  journal    = {Journal of Chemical Theory and Computation},
  year       = {2021},
  date       = {2021-02-12},
  doi        = {10.1021/acs.jctc.0c01184},
  url        = {https://pubs.acs.org/doi/full/10.1021/acs.jctc.0c01184
https://www.bakerlab.org/wp-content/uploads/2021/02/Park_etal_JCTC2021_Small_mol_force_field_optimization.pdf},
  abstract   = {Accurate and rapid calculation of protein-small molecule interaction free energies is critical for computational drug discovery. Because of the large chemical space spanned by drug-like molecules, classical force fields contain thousands of parameters describing atom-pair distance and torsional preferences; each parameter is typically optimized independently on simple representative molecules. Here, we describe a new approach in which small molecule force field parameters are jointly optimized guided by the rich source of information contained within thousands of available small molecule crystal structures. We optimize parameters by requiring that the experimentally determined molecular lattice arrangements have lower energy than all alternative lattice arrangements. Thousands of independent crystal lattice-prediction simulations were run on each of 1386 small molecule crystal structures, and energy function parameters of an implicit solvent energy model were optimized, so native crystal lattice arrangements had the lowest energy. The resulting energy model was implemented in Rosetta, together with a rapid genetic algorithm docking method employing grid-based scoring and receptor flexibility. The success rate of bound structure recapitulation in cross-docking on 1112 complexes was improved by more than 10% over previously published methods, with solutions within <1 Å in over half of the cases. Our results demonstrate that small molecule crystal structures are a rich source of information for guiding molecular force field development, and the improved Rosetta energy function should increase accuracy in a wide range of small molecule structure prediction and design studies.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Yang, Jianyi; Anishchenko, Ivan; Park, Hahnbeom; Peng, Zhenling; Ovchinnikov, Sergey; Baker, David
Improved protein structure prediction using predicted interresidue orientations Journal Article
In: Proceedings of the National Academy of Sciences, 2020, ISSN: 0027-8424.
% Journal article (PNAS, 2020). 0027-8424 is the journal's serial number, so it is
% stored in `issn` rather than `isbn`. The url value no longer ends with an empty
% line, and the sentence break after "de novo protein design." is restored.
@article{Yang2020,
  author     = {Yang, Jianyi and Anishchenko, Ivan and Park, Hahnbeom and Peng, Zhenling and Ovchinnikov, Sergey and Baker, David},
  title      = {Improved protein structure prediction using predicted interresidue orientations},
  journal    = {Proceedings of the National Academy of Sciences},
  year       = {2020},
  date       = {2020-01-02},
  issn       = {0027-8424},
  doi        = {10.1073/pnas.1914677117},
  url        = {https://www.pnas.org/content/early/2020/01/01/1914677117
https://www.bakerlab.org/wp-content/uploads/2020/01/Yang2020_ImprovedStructurePredictionInterresidueOrientations.pdf},
  abstract   = {Protein structure prediction is a longstanding challenge in computational biology. Through extension of deep learning-based prediction to interresidue orientations in addition to distances, and the development of a constrained optimization by Rosetta, we show that more accurate models can be generated. Results on a set of 18 de novo-designed proteins suggests the proposed method should be directly applicable to current challenges in de novo protein design. The prediction of interresidue contacts and distances from coevolutionary data using deep learning has considerably advanced protein structure prediction. Here, we build on these advances by developing a deep residual network for predicting interresidue orientations, in addition to distances, and a Rosetta-constrained energy-minimization protocol for rapidly and accurately generating structure models guided by these restraints. In benchmark tests on 13th Community-Wide Experiment on the Critical Assessment of Techniques for Protein Structure Prediction (CASP13)- and Continuous Automated Model Evaluation (CAMEO)-derived sets, the method outperforms all previously described structure-prediction methods. Although trained entirely on native proteins, the network consistently assigns higher probability to de novo-designed proteins, identifying the key fold-determining residues and providing an independent quantitative measure of the "ideality" of a protein structure. The method promises to be useful for a broad range of protein structure prediction and design problems.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Wu, Qi; Peng, Zhenling; Anishchenko, Ivan; Cong, Qian; Baker, David; Yang, Jianyi
Protein contact prediction using metagenome sequence data and residual neural networks Journal Article
In: Bioinformatics, vol. 36, no. 1, 2019.
@article{Wu2019,
  author    = {Qi Wu and Zhenling Peng and Ivan Anishchenko and Qian Cong and David Baker and Jianyi Yang},
  title     = {Protein contact prediction using metagenome sequence data and residual neural networks},
  journal   = {Bioinformatics},
  volume    = {36},
  number    = {1},
  year      = {2019},
  date      = {2019-06-07},
  doi       = {10.1093/bioinformatics/btz477},
  url       = {https://academic.oup.com/bioinformatics/article/36/1/41/5512356},
  abstract  = {Motivation: Almost all protein residue contact prediction methods rely on the availability of deep multiple sequence alignments (MSAs). However, many proteins from the poorly populated families do not have sufficient number of homologs in the conventional UniProt database. Here we aim to solve this issue by exploring the rich sequence data from the metagenome sequencing projects. Results: Based on the improved MSA constructed from the metagenome sequence data, we developed MapPred, a new deep learning-based contact prediction method. MapPred consists of two component methods, DeepMSA and DeepMeta, both trained with the residual neural networks. DeepMSA was inspired by the recent method DeepCov, which was trained on 441 matrices of covariance features. By considering the symmetry of contact map, we reduced the number of matrices to 231, which makes the training more efficient in DeepMSA. Experiments show that DeepMSA outperforms DeepCov by 10–13% in precision. DeepMeta works by combining predicted contacts and other sequence profile features. Experiments on three benchmark datasets suggest that the contribution from the metagenome sequence data is significant with P-values less than 4.04E-17. MapPred is shown to be complementary and comparable the state-of-the-art methods. The success of MapPred is attributed to three factors: the deeper MSA from the metagenome sequence data, improved feature design in DeepMSA and optimized training by the residual neural networks.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Day, Austin L; Greisen, Per; Doyle, Lindsey; Schena, Alberto; Stella, Nephi; Johnsson, Kai; Baker, David; Stoddard, Barry
Unintended specificity of an engineered ligand-binding protein facilitated by unpredicted plasticity of the protein fold Journal Article
In: Protein Engineering, Design and Selection, 2018.
@article{Day2018,
  author    = {Day, Austin L and Greisen, Per and Doyle, Lindsey and Schena, Alberto and Stella, Nephi and Johnsson, Kai and Baker, David and Stoddard, Barry},
  title     = {Unintended specificity of an engineered ligand-binding protein facilitated by unpredicted plasticity of the protein fold},
  journal   = {Protein Engineering, Design and Selection},
  year      = {2018},
  date      = {2018-12-19},
  doi       = {10.1093/protein/gzy031},
  url       = {https://dx.doi.org/10.1093/protein/gzy031
https://www.bakerlab.org/wp-content/uploads/2019/02/Day2018.pdf},
  abstract  = {Attempts to create novel ligand-binding proteins often focus on formation of a binding pocket with shape complementarity against the desired ligand (particularly for compounds that lack distinct polar moieties). Although designed proteins often exhibit binding of the desired ligand, in some cases they display unintended recognition behavior. One such designed protein, that was originally intended to bind tetrahydrocannabinol (THC), was found instead to display binding of 25-hydroxy-cholecalciferol (25-D3) and was subjected to biochemical characterization, further selections for enhanced 25-D3 binding affinity and crystallographic analyses. The deviation in specificity is due in part to unexpected alteration of its conformation, corresponding to a significant change of the orientation of an α-helix and an equally large movement of a loop, both of which flank the designed ligand-binding pocket. Those changes led to engineered protein constructs that exhibit significantly more contacts and complementarity towards the 25-D3 ligand than the initial designed protein had been predicted to form towards its intended THC ligand. Molecular dynamics simulations imply that the initial computationally designed mutations may contribute to the movement of the helix. These analyses collectively indicate that accurate prediction and control of backbone dynamics conformation, through a combination of improved conformational sampling and/or de novo structure design, represents a key area of further development for the design and optimization of engineered ligand-binding proteins.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Park, Hahnbeom; Ovchinnikov, Sergey; Kim, David E.; DiMaio, Frank; Baker, David
Protein homology model refinement by large-scale energy optimization Journal Article
In: Proceedings of the National Academy of Sciences, vol. 115, no. 12, pp. 3054–3059, 2018, ISSN: 0027-8424.
@article{Park2018,
  author    = {Park, Hahnbeom and Ovchinnikov, Sergey and Kim, David E. and DiMaio, Frank and Baker, David},
  title     = {Protein homology model refinement by large-scale energy optimization},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {115},
  number    = {12},
  pages     = {3054--3059},
  year      = {2018},
  date      = {2018-03-20},
  issn      = {0027-8424},
  doi       = {10.1073/pnas.1719115115},
  url       = {https://www.pnas.org/content/115/12/3054
https://www.bakerlab.org/wp-content/uploads/2019/01/Park2018_refinement.pdf},
  abstract  = {Protein structure refinement by direct global energy optimization has been a longstanding challenge in computational structural biology due to limitations in both energy function accuracy and conformational sampling. This manuscript demonstrates that with recent advances in both areas, refinement can significantly improve protein comparative models based on structures of distant homologues. Proteins fold to their lowest free-energy structures, and hence the most straightforward way to increase the accuracy of a partially incorrect protein structure model is to search for the lowest-energy nearby structure. This direct approach has met with little success for two reasons: first, energy function inaccuracies can lead to false energy minima, resulting in model degradation rather than improvement; and second, even with an accurate energy function, the search problem is formidable because the energy only drops considerably in the immediate vicinity of the global minimum, and there are a very large number of degrees of freedom. Here we describe a large-scale energy optimization-based refinement method that incorporates advances in both search and energy function accuracy that can substantially improve the accuracy of low-resolution homology models. The method refined low-resolution homology models into correct folds for 50 of 84 diverse protein families and generated improved models in recent blind structure prediction experiments. Analyses of the basis for these improvements reveal contributions from both the improvements in conformational sampling techniques and the energy function.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Ovchinnikov, Sergey; Park, Hahnbeom; Kim, David E.; DiMaio, Frank; Baker, David
Protein structure prediction using Rosetta in CASP12 Journal Article
In: Proteins, 2017.
@article{Ovchinnikov2017,
  author    = {Ovchinnikov, Sergey and Park, Hahnbeom and Kim, David E. and DiMaio, Frank and Baker, David},
  title     = {Protein structure prediction using Rosetta in CASP12},
  journal   = {Proteins},
  year      = {2017},
  date      = {2017-09-22},
  doi       = {10.1002/prot.25390},
  url       = {https://onlinelibrary.wiley.com/doi/epdf/10.1002/prot.25390
https://www.bakerlab.org/wp-content/uploads/2019/10/Ovchinnikov_et_al-2018-Proteins__Structure_Function_and_Bioinformatics.pdf},
  abstract  = {We describe several notable aspects of our structure predictions using Rosetta in CASP12 in the free modeling (FM) and refinement (TR) categories. First, we had previously generated (and published) models for most large protein families lacking experimentally determined structures using Rosetta guided by co-evolution based contact predictions, and for several targets these models proved better starting points for comparative modeling than any known crystal structure—our model database thus starts to fulfill one of the goals of the original protein structure initiative. Second, while our “human” group simply submitted ROBETTA models for most targets, for six targets expert intervention improved predictions considerably; the largest improvement was for T0886 where we correctly parsed two discontinuous domains guided by predicted contact maps to accurately identify a structural homolog of the same fold. Third, Rosetta all atom refinement followed by MD simulations led to consistent but small improvements when starting models were close to the native structure, and larger but less consistent improvements when starting models were further away.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Anishchenko, I; Ovchinnikov, S; Kamisetty, H; Baker, D
Origins of coevolution between residues distant in protein 3D structures Journal Article
In: Proceedings of the National Academy of Sciences, vol. 114, no. 34, pp. 9122-9127, 2017.
@article{1000,
  author    = {Anishchenko, Ivan and Ovchinnikov, Sergey and Kamisetty, Hetunandan and Baker, David},
  title     = {Origins of coevolution between residues distant in protein 3D structures},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {114},
  number    = {34},
  pages     = {9122--9127},
  year      = {2017},
  date      = {2017-08-22},
  doi       = {10.1073/pnas.1702664114},
  url       = {http://www.pnas.org/content/114/34/9122
https://www.bakerlab.org/wp-content/uploads/2018/08/9122.full1_.pdf},
  abstract  = {Residue pairs that directly coevolve in protein families are generally close in protein 3D structures. Here we study the exceptions to this general trend—directly coevolving residue pairs that are distant in protein structures—to determine the origins of evolutionary pressure on spatially distant residues and to understand the sources of error in contact-based structure prediction. Over a set of 4,000 protein families, we find that 25% of directly coevolving residue pairs are separated by more than 5 Å in protein structures and 3% by more than 15 Å. The majority (91%) of directly coevolving residue pairs in the 5–15 Å range are found to be in contact in at least one homologous structure—these exceptions arise from structural variation in the family in the region containing the residues. Thirty-five percent of the exceptions greater than 15 Å are at homo-oligomeric interfaces, 19% arise from family structural variation, and 27% are in repeat proteins likely reflecting alignment errors. Of the remaining long-range exceptions (<1% of the total number of coupled pairs), many can be attributed to close interactions in an oligomeric state. Overall, the results suggest that directly coevolving residue pairs not in repeat proteins are spatially proximal in at least one biologically relevant protein conformation within the family; we find little evidence for direct coupling between residues at spatially separated allosteric and functional sites or for increased direct coupling between residue pairs on putative allosteric pathways connecting them.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Ovchinnikov, Sergey; Park, Hahnbeom; Varghese, Neha; Huang, Po-Ssu; Pavlopoulos, Georgios A.; Kim, David E.; Kamisetty, Hetunandan; Kyrpides, Nikos C.; Baker, David
Protein structure determination using metagenome sequence data Journal Article
In: Science, vol. 355, no. 6322, pp. 294–298, 2017, ISSN: 0036-8075.
@article{Ovchinnikov294,
  author    = {Ovchinnikov, Sergey and Park, Hahnbeom and Varghese, Neha and Huang, Po-Ssu and Pavlopoulos, Georgios A. and Kim, David E. and Kamisetty, Hetunandan and Kyrpides, Nikos C. and Baker, David},
  title     = {Protein structure determination using metagenome sequence data},
  journal   = {Science},
  volume    = {355},
  number    = {6322},
  pages     = {294--298},
  publisher = {American Association for the Advancement of Science},
  year      = {2017},
  date      = {2017-01-01},
  issn      = {0036-8075},
  doi       = {10.1126/science.aah4043},
  url       = {https://www.bakerlab.org/wp-content/uploads/2017/01/ovchinnikov_science_2017.pdf
http://science.sciencemag.org/content/355/6322/294},
  abstract  = {Fewer than a third of the 14,849 known protein families have at least one member with an experimentally determined structure. This leaves more than 5000 protein families with no structural information. Protein modeling using residue-residue contacts inferred from evolutionary data has been successful in modeling unknown structures, but it requires large numbers of aligned sequences. Ovchinnikov et al. augmented such sequence alignments with metagenome sequence data (see the Perspective by Söding). They determined the number of sequences required to allow modeling, developed criteria for model quality, and, where possible, improved modeling by matching predicted contacts to known structures. Their method predicted quality structural models for 614 protein families, of which about 140 represent newly discovered protein folds. Science, this issue p. 294; see also p. 248. Despite decades of work by structural biologists, there are still ~5200 protein families with unknown structure outside the range of comparative modeling. We show that Rosetta structure prediction guided by residue-residue contacts inferred from evolutionary information can accurately model proteins that belong to large families and that metagenome sequence data more than triple the number of protein families with sufficient sequences for accurate modeling. We then integrate metagenome data, contact-based structure matching, and Rosetta structure calculations to generate models for 614 protein families with currently unknown structures; 206 are membrane proteins and 137 have folds not represented in the Protein Data Bank. This approach provides the representative models for large protein families originally envisioned as the goal of the Protein Structure Initiative at a fraction of the cost.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Preprints are available on bioRxiv.
2023
FROM THE LAB
Amir Motmaen, Justas Dauparas, Minkyung Baek, Mohamad H. Abedi, David Baker, Philip Bradley
Peptide-binding specificity prediction using fine-tuned protein structure prediction networks Journal Article
In: Proceedings of the National Academy of Sciences, 2023.
@article{nokey,
  author    = {Motmaen, Amir and Dauparas, Justas and Baek, Minkyung and Abedi, Mohamad H. and Baker, David and Bradley, Philip},
  title     = {Peptide-binding specificity prediction using fine-tuned protein structure prediction networks},
  journal   = {Proceedings of the National Academy of Sciences},
  year      = {2023},
  date      = {2023-02-21},
  urldate   = {2023-02-21},
  doi       = {10.1073/pnas.2216697120},
  url       = {https://www.pnas.org/doi/10.1073/pnas.2216697120, PNAS (Open Access)},
  abstract  = {Peptide-binding proteins play key roles in biology, and predicting their binding specificity is a long-standing challenge. While considerable protein structural information is available, the most successful current methods use sequence information alone, in part because it has been a challenge to model the subtle structural changes accompanying sequence substitutions. Protein structure prediction networks such as AlphaFold model sequence-structure relationships very accurately, and we reasoned that if it were possible to specifically train such networks on binding data, more generalizable models could be created. We show that placing a classifier on top of the AlphaFold network and fine-tuning the combined network parameters for both classification and structure prediction accuracy leads to a model with strong generalizable performance on a wide range of Class I and Class II peptide-MHC interactions that approaches the overall performance of the state-of-the-art NetMHCpan sequence-based method. The peptide-MHC optimized model shows excellent performance in distinguishing binding and non-binding peptides to SH3 and PDZ domains. This ability to generalize well beyond the training set far exceeds that of sequence-only models and should be particularly powerful for systems where less experimental data are available.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
COLLABORATOR LED
Sorry, no publications matched your criteria.
2022
FROM THE LAB
Dauparas, J. and Anishchenko, I. and Bennett, N. and Bai, H. and Ragotte, R. J. and Milles, L. F. and Wicky, B. I. M. and Courbet, A. and de Haas, R. J. and Bethel, N. and Leung, P. J. Y. and Huddy, T. F. and Pellock, S. and Tischer, D. and Chan, F. and Koepnick, B. and Nguyen, H. and Kang, A. and Sankaran, B. and Bera, A. K. and King, N. P. and Baker, D.
Robust deep learning–based protein sequence design using ProteinMPNN Journal Article
In: Science, 2022.
@article{Dauparas2022,
  author    = {Dauparas, J. and Anishchenko, I. and Bennett, N. and Bai, H. and Ragotte, R. J. and Milles, L. F. and Wicky, B. I. M. and Courbet, A. and de Haas, R. J. and Bethel, N. and Leung, P. J. Y. and Huddy, T. F. and Pellock, S. and Tischer, D. and Chan, F. and Koepnick, B. and Nguyen, H. and Kang, A. and Sankaran, B. and Bera, A. K. and King, N. P. and Baker, D.},
  title     = {Robust deep learning–based protein sequence design using ProteinMPNN},
  journal   = {Science},
  year      = {2022},
  date      = {2022-09-15},
  doi       = {10.1126/science.add2187},
  url       = {https://www.science.org/doi/abs/10.1126/science.add2187, Science
https://www.bakerlab.org/wp-content/uploads/2022/09/Dauparas_etal_Science2022_Sequence_design_via_ProteinMPNN.pdf, PDF},
  abstract  = {While deep learning has revolutionized protein structure prediction, almost all experimentally characterized de novo protein designs have been generated using physically based approaches such as Rosetta. Here we describe a deep learning–based protein sequence design method, ProteinMPNN, with outstanding performance in both in silico and experimental tests. On native protein backbones, ProteinMPNN has a sequence recovery of 52.4%, compared to 32.9% for Rosetta. The amino acid sequence at different positions can be coupled between single or multiple chains, enabling application to a wide range of current protein design challenges. We demonstrate the broad utility and high accuracy of ProteinMPNN using X-ray crystallography, cryoEM and functional studies by rescuing previously failed designs, made using Rosetta or AlphaFold, of protein monomers, cyclic homo-oligomers, tetrahedral nanoparticles, and target binding proteins.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Jue Wang, Sidney Lisanza, David Juergens, Doug Tischer, Joseph L. Watson, Karla M. Castro, Robert Ragotte, Amijai Saragovi, Lukas F. Milles, Minkyung Baek, Ivan Anishchenko, Wei Yang, Derrick R. Hicks, Marc Expòsit, Thomas Schlichthaerle, Jung-Ho Chun, Justas Dauparas, Nathaniel Bennett, Basile I. M. Wicky, Andrew Muenks, Frank DiMaio, Bruno Correia, Sergey Ovchinnikov, David Baker
Scaffolding protein functional sites using deep learning Journal Article
In: Science, 2022.
@article{Wang2022,
  author    = {Jue Wang and Sidney Lisanza and David Juergens and Doug Tischer and Joseph L. Watson and Karla M. Castro and Robert Ragotte and Amijai Saragovi and Lukas F. Milles and Minkyung Baek and Ivan Anishchenko and Wei Yang and Derrick R. Hicks and Marc Expòsit and Thomas Schlichthaerle and Jung-Ho Chun and Justas Dauparas and Nathaniel Bennett and Basile I. M. Wicky and Andrew Muenks and Frank DiMaio and Bruno Correia and Sergey Ovchinnikov and David Baker },
  title     = {Scaffolding protein functional sites using deep learning},
  journal   = {Science},
  year      = {2022},
  date      = {2022-07-21},
  urldate   = {2022-07-21},
  doi       = {10.1126/science.abn2100},
  url       = {https://www.science.org/doi/abs/10.1126/science.abn2100, Science
https://www.ipd.uw.edu/wp-content/uploads/2022/07/science.abn2100.pdf, Download PDF},
  abstract  = {The binding and catalytic functions of proteins are generally mediated by a small number of functional residues held in place by the overall protein structure. Here, we describe deep learning approaches for scaffolding such functional sites without needing to prespecify the fold or secondary structure of the scaffold. The first approach, “constrained hallucination,” optimizes sequences such that their predicted structures contain the desired functional site. The second approach, “inpainting,” starts from the functional site and fills in additional sequence and structure to create a viable protein scaffold in a single forward pass through a specifically trained RoseTTAFold network. We use these two methods to design candidate immunogens, receptor traps, metalloproteins, enzymes, and protein-binding proteins and validate the designs using a combination of in silico and experimental tests. Protein design has had success in finding sequences that fold into a desired conformation, but designing functional proteins remains challenging. Wang et al. describe two deep-learning methods to design proteins that contain prespecified functional sites. In the first, they found sequences predicted to fold into stable structures that contain the functional site. In the second, they retrained a structure prediction network to recover the sequence and full structure of a protein given only the functional site. The authors demonstrate their methods by designing proteins containing a variety of functional motifs. —VV Deep-learning methods enable the scaffolding of desired functional residues within a well-folded designed protein.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Minkyung Baek, David Baker
Deep learning and protein structure modeling Journal Article
In: Nature Methods, 2022.
@article{Baek2022,
  author    = {Minkyung Baek and David Baker},
  title     = {Deep learning and protein structure modeling},
  journal   = {Nature Methods},
  year      = {2022},
  date      = {2022-01-22},
  urldate   = {2022-01-22},
  doi       = {10.1038/s41592-021-01360-8},
  url       = {https://www.nature.com/articles/s41592-021-01360-8, Nature Methods
https://www.bakerlab.org/wp-content/uploads/2022/01/Baek_Baker_NatureMethods2022_Deep_Learning_and_Protein_Structure_Modeling.pdf, Download PDF
},
  abstract  = {Deep learning has transformed protein structure modeling. Here we relate AlphaFold and RoseTTAFold to classical physically based approaches to protein structure prediction, and discuss the many areas of structural biology that are likely to be affected by further advances in deep learning.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
COLLABORATOR LED
Sen, Neeladri and Anishchenko, Ivan and Bordin, N. and Sillitoe, Ian and Velankar, Sameer and Baker, David and Orengo, Christine
Characterizing and explaining the impact of disease-associated mutations in proteins without known structures or structural homologs Journal Article
In: Briefings in Bioinformatics, 2022.
@article{Sen2022,
  author    = {Sen, Neeladri and Anishchenko, Ivan and Bordin, N. and Sillitoe, Ian and Velankar, Sameer and Baker, David and Orengo, Christine},
  title     = {Characterizing and explaining the impact of disease-associated mutations in proteins without known structures or structural homologs},
  journal   = {Briefings in Bioinformatics},
  year      = {2022},
  date      = {2022-07-18},
  doi       = {10.1093/bib/bbac187},
  url       = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9294430/},
  abstract  = {Mutations in human proteins lead to diseases. The structure of these proteins can help understand the mechanism of such diseases and develop therapeutics against them. With improved deep learning techniques, such as RoseTTAFold and AlphaFold, we can predict the structure of proteins even in the absence of structural homologs. We modeled and extracted the domains from 553 disease-associated human proteins without known protein structures or close homologs in the Protein Databank. We noticed that the model quality was higher and the Root mean square deviation (RMSD) lower between AlphaFold and RoseTTAFold models for domains that could be assigned to CATH families as compared to those which could only be assigned to Pfam families of unknown structure or could not be assigned to either. We predicted ligand-binding sites, protein-protein interfaces and conserved residues in these predicted structures. We then explored whether the disease-associated missense mutations were in the proximity of these predicted functional sites, whether they destabilized the protein structure based on ddG calculations or whether they were predicted to be pathogenic. We could explain 80% of these disease-associated mutations based on proximity to functional sites, structural destabilization or pathogenicity. When compared to polymorphisms, a larger percentage of disease-associated missense mutations were buried, closer to predicted functional sites, predicted as destabilizing and pathogenic. Usage of models from the two state-of-the-art techniques provide better confidence in our predictions, and we explain 93 additional mutations based on RoseTTAFold models which could not be explained based solely on AlphaFold models.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Linder, Johannes, La Fleur, Alyssa, Chen, Zibo, Ljubetič, Ajasja, Baker, David, Kannan, Sreeram, Seelig, Georg
Interpreting neural networks for biological sequences by learning stochastic masks Journal Article
In: Nature Machine Intelligence, 2022.
@article{Linder2022,
  author    = {Linder, Johannes and La Fleur, Alyssa and Chen, Zibo and Ljubetič, Ajasja and Baker, David and Kannan, Sreeram and Seelig, Georg},
  title     = {Interpreting neural networks for biological sequences by learning stochastic masks},
  journal   = {Nature Machine Intelligence},
  year      = {2022},
  date      = {2022-01-25},
  urldate   = {2022-01-25},
  doi       = {10.1038/s42256-021-00428-6},
  url       = {https://www.nature.com/articles/s42256-021-00428-6, Nature Machine Intelligence},
  abstract  = {Sequence-based neural networks can learn to make accurate predictions from large biological datasets, but model interpretation remains challenging. Many existing feature attribution methods are optimized for continuous rather than discrete input patterns and assess individual feature importance in isolation, making them ill-suited for interpreting nonlinear interactions in molecular sequences. Here, building on work in computer vision and natural language processing, we developed an approach based on deep learning—scrambler networks—wherein the most important sequence positions are identified with learned input masks. Scramblers learn to predict position-specific scoring matrices where unimportant nucleotides or residues are scrambled by raising their entropy. We apply scramblers to interpret the effects of genetic variants, uncover nonlinear interactions between cis-regulatory elements, explain binding specificity for protein–protein interactions, and identify structural determinants of de novo-designed proteins. We show that scramblers enable efficient attribution across large datasets and result in high-quality explanations, often outperforming state-of-the-art methods.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
2021
FROM THE LAB
Anishchenko, Ivan and Pellock, Samuel J. and Chidyausiku, Tamuka M. and Ramelot, Theresa A. and Ovchinnikov, Sergey and Hao, Jingzhou and Bafna, Khushboo and Norn, Christoffer and Kang, Alex and Bera, Asim K. and DiMaio, Frank and Carter, Lauren and Chow, Cameron M. and Montelione, Gaetano T. and Baker, David
De novo protein design by deep network hallucination Journal Article
In: Nature, 2021.
@article{Anishchenko2021,
  author    = {Anishchenko, Ivan and Pellock, Samuel J. and Chidyausiku, Tamuka M. and Ramelot, Theresa A. and Ovchinnikov, Sergey and Hao, Jingzhou and Bafna, Khushboo and Norn, Christoffer and Kang, Alex and Bera, Asim K. and DiMaio, Frank and Carter, Lauren and Chow, Cameron M. and Montelione, Gaetano T. and Baker, David},
  title     = {De novo protein design by deep network hallucination},
  journal   = {Nature},
  year      = {2021},
  date      = {2021-12-01},
  urldate   = {2021-12-01},
  doi       = {10.1038/s41586-021-04184-w},
  url       = {https://www.nature.com/articles/s41586-021-04184-w
https://www.bakerlab.org/wp-content/uploads/2022/01/Anishchenko_etal_Nature2021_DeepNetworkHallucination.pdf},
  abstract  = {There has been considerable recent progress in protein structure prediction using deep neural networks to predict inter-residue distances from amino acid sequences. Here we investigate whether the information captured by such networks is sufficiently rich to generate new folded proteins with sequences unrelated to those of the naturally occurring proteins used in training the models. We generate random amino acid sequences, and input them into the trRosetta structure prediction network to predict starting residue–residue distance maps, which, as expected, are quite featureless. We then carry out Monte Carlo sampling in amino acid sequence space, optimizing the contrast (Kullback–Leibler divergence) between the inter-residue distance distributions predicted by the network and background distributions averaged over all proteins. Optimization from different random starting points resulted in novel proteins spanning a wide range of sequences and predicted structures. We obtained synthetic genes encoding 129 of the network-‘hallucinated’ sequences, and expressed and purified the proteins in Escherichia coli; 27 of the proteins yielded monodisperse species with circular dichroism spectra consistent with the hallucinated structures. We determined the three-dimensional structures of three of the hallucinated proteins, two by X-ray crystallography and one by NMR, and these closely matched the hallucinated models. Thus, deep networks trained to predict native protein structures from their sequences can be inverted to design new proteins, and such networks and methods should contribute alongside traditional physics-based models to the de novo design of proteins with new functions.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Minkyung Baek, Ivan Anishchenko, Hahnbeom Park, Ian R. Humphreys, David Baker
Protein oligomer modeling guided by predicted inter-chain contacts in CASP14 Journal Article
In: Proteins, 2021.
@article{Baek2021b,
  author    = {Minkyung Baek and Ivan Anishchenko and Hahnbeom Park and Ian R. Humphreys and David Baker},
  title     = {Protein oligomer modeling guided by predicted inter-chain contacts in CASP14},
  journal   = {Proteins},
  year      = {2021},
  date      = {2021-07-29},
  urldate   = {2021-07-29},
  doi       = {10.1002/prot.26197},
  url       = {https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.26197, Proteins},
  abstract  = {For CASP14, we developed deep learning-based methods for predicting homo-oligomeric and hetero-oligomeric contacts and used them for oligomer modeling. To build structure models, we developed an oligomer structure generation method that utilizes predicted inter-chain contacts to guide iterative restrained minimization from random backbone structures. We supplemented this gradient-based fold-and-dock method with template-based and ab initio docking approaches using deep learning-based subunit predictions on 29 assembly targets. These methods produced oligomer models with summed Z-scores 5.5 units higher than the next best group, with the fold-and-dock method having the best relative performance. Over the eight targets for which this method was used, the best of the five submitted models had average oligomer TM-score of 0.71 (average oligomer TM-score of the next best group: 0.64), and explicit modeling of inter-subunit interactions improved modeling of six out of 40 individual domains (ΔGDT-TS > 2.0).
},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Baek, Minkyung and DiMaio, Frank and Anishchenko, Ivan and Dauparas, Justas and Ovchinnikov, Sergey and Lee, Gyu Rie and Wang, Jue and Cong, Qian and Kinch, Lisa N. and Schaeffer, R. Dustin and Millán, Claudia and Park, Hahnbeom and Adams, Carson and Glassman, Caleb R. and DeGiovanni, Andy and Pereira, Jose H. and Rodrigues, Andria V. and van Dijk, Alberdina A. and Ebrecht, Ana C. and Opperman, Diederik J. and Sagmeister, Theo and Buhlheller, Christoph and Pavkov-Keller, Tea and Rathinaswamy, Manoj K. and Dalwadi, Udit and Yip, Calvin K. and Burke, John E. and Garcia, K. Christopher and Grishin, Nick V. and Adams, Paul D. and Read, Randy J. and Baker, David
Accurate prediction of protein structures and interactions using a three-track neural network Journal Article
In: Science, 2021.
@article{Baek2021,
  author     = {Baek, Minkyung and DiMaio, Frank and Anishchenko, Ivan and
                Dauparas, Justas and Ovchinnikov, Sergey and Lee, Gyu Rie and
                Wang, Jue and Cong, Qian and Kinch, Lisa N. and
                Schaeffer, R. Dustin and Millán, Claudia and Park, Hahnbeom and
                Adams, Carson and Glassman, Caleb R. and DeGiovanni, Andy and
                Pereira, Jose H. and Rodrigues, Andria V. and
                van Dijk, Alberdina A. and Ebrecht, Ana C. and
                Opperman, Diederik J. and Sagmeister, Theo and
                Buhlheller, Christoph and Pavkov-Keller, Tea and
                Rathinaswamy, Manoj K. and Dalwadi, Udit and Yip, Calvin K. and
                Burke, John E. and Garcia, K. Christopher and
                Grishin, Nick V. and Adams, Paul D. and Read, Randy J. and
                Baker, David},
  title      = {Accurate prediction of protein structures and interactions using a three-track neural network},
  journal    = {Science},
  year       = {2021},
  date       = {2021-07-15},
  urldate    = {2021-07-15},
  doi        = {10.1126/science.abj8754},
  url        = {http://science.sciencemag.org/content/early/2021/07/14/science.abj8754, Science
https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf, Download PDF},
  abstract   = {DeepMind presented remarkably accurate predictions at the recent CASP14 protein structure prediction assessment conference. We explored network architectures incorporating related ideas and obtained the best performance with a three-track network in which information at the 1D sequence level, the 2D distance map level, and the 3D coordinate level is successively transformed and integrated. The three-track network produces structure predictions with accuracies approaching those of DeepMind in CASP14, enables the rapid solution of challenging X-ray crystallography and cryo-EM structure modeling problems, and provides insights into the functions of proteins of currently unknown structure. The network also enables rapid generation of accurate protein-protein complex models from sequence information alone, short-circuiting traditional approaches which require modeling of individual subunits followed by docking. We make the method available to the scientific community to speed biological research.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Norn, Christoffer, Wicky, Basile I. M., Juergens, David, Liu, Sirui, Kim, David, Tischer, Doug, Koepnick, Brian, Anishchenko, Ivan, Baker, David, Ovchinnikov, Sergey
Protein sequence design by conformational landscape optimization Journal Article
In: Proceedings of the National Academy of Sciences, vol. 118, no. 11, 2021.
@article{Norn2021,
  author     = {Norn, Christoffer and Wicky, Basile I. M. and Juergens, David and Liu, Sirui and Kim, David and Tischer, Doug and Koepnick, Brian and Anishchenko, Ivan and Baker, David and Ovchinnikov, Sergey},
  title      = {Protein sequence design by conformational landscape optimization},
  journal    = {Proceedings of the National Academy of Sciences},
  volume     = {118},
  number     = {11},
  year       = {2021},
  date       = {2021-03-16},
  urldate    = {2021-03-16},
  doi        = {10.1073/pnas.2017228118},
  url        = {https://www.pnas.org/content/118/11/e2017228118, PNAS
https://www.bakerlab.org/wp-content/uploads/2021/03/Norn_etal_PNAS2021_LandscapeOptimization.pdf, Download PDF},
  abstract   = {Almost all proteins fold to their lowest free energy state, which is determined by their amino acid sequence. Computational protein design has primarily focused on finding sequences that have very low energy in the target designed structure. However, what is most relevant during folding is not the absolute energy of the folded state but the energy difference between the folded state and the lowest-lying alternative states. We describe a deep learning approach that captures aspects of the folding landscape, in particular the presence of structures in alternative energy minima, and show that it can enhance current protein design methods. The protein design problem is to identify an amino acid sequence that folds to a desired structure. Given Anfinsen’s thermodynamic hypothesis of folding, this can be recast as finding an amino acid sequence for which the desired structure is the lowest energy state. As this calculation involves not only all possible amino acid sequences but also, all possible structures, most current approaches focus instead on the more tractable problem of finding the lowest-energy amino acid sequence for the desired structure, often checking by protein structure prediction in a second step that the desired structure is indeed the lowest-energy conformation for the designed sequence, and typically discarding a large fraction of designed sequences for which this is not the case. Here, we show that by backpropagating gradients through the transform-restrained Rosetta (trRosetta) structure prediction network from the desired structure to the input amino acid sequence, we can directly optimize over all possible amino acid sequences and all possible structures in a single calculation. We find that trRosetta calculations, which consider the full conformational landscape, can be more effective than Rosetta single-point energy estimations in predicting folding and stability of de novo designed proteins.
We compare sequence design by conformational landscape optimization with the standard energy-based sequence design methodology in Rosetta and show that the former can result in energy landscapes with fewer alternative energy minima. We show further that more funneled energy landscapes can be designed by combining the strengths of the two approaches: the low-resolution trRosetta model serves to disfavor alternative states, and the high-resolution Rosetta model serves to create a deep energy minimum at the design target structure.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
COLLABORATOR LED
Du, Zongyang and Su, Hong and Wang, Wenkai and Ye, Lisha and Wei, Hong and Peng, Zhenling and Anishchenko, Ivan and Baker, David and Yang, Jianyi
The trRosetta server for fast and accurate protein structure prediction Journal Article
In: Nature Protocols, 2021.
@article{Du2021,
  author     = {Du, Zongyang and Su, Hong and Wang, Wenkai and Ye, Lisha and Wei, Hong and Peng, Zhenling and Anishchenko, Ivan and Baker, David and Yang, Jianyi},
  title      = {The trRosetta server for fast and accurate protein structure prediction},
  journal    = {Nature Protocols},
  year       = {2021},
  date       = {2021-12-01},
  urldate    = {2021-12-01},
  doi        = {10.1038/s41596-021-00628-9},
  url        = {https://www.nature.com/articles/s41596-021-00628-9
https://www.bakerlab.org/wp-content/uploads/2022/01/Du_etal_NatProt2021_trRosetta_server.pdf},
  abstract   = {The trRosetta (transform-restrained Rosetta) server is a web-based platform for fast and accurate protein structure prediction, powered by deep learning and Rosetta. With the input of a protein’s amino acid sequence, a deep neural network is first used to predict the inter-residue geometries, including distance and orientations. The predicted geometries are then transformed as restraints to guide the structure prediction on the basis of direct energy minimization, which is implemented under the framework of Rosetta. The trRosetta server distinguishes itself from other similar structure prediction servers in terms of rapid and accurate de novo structure prediction. As an illustration, trRosetta was applied to two Pfam families with unknown structures, for which the predicted de novo models were estimated to have high accuracy. Nevertheless, to take advantage of homology modeling, homologous templates are used as additional inputs to the network automatically. In general, it takes ~1 h to predict the final structure for a typical protein with ~300 amino acids, using a maximum of 10 CPU cores in parallel in our cluster system. To enable large-scale structure modeling, a downloadable package of trRosetta with open-source codes is available as well. A detailed guidance for using the package is also available in this protocol. The server and the package are available at https://yanglab.nankai.edu.cn/trRosetta/ and https://yanglab.nankai.edu.cn/trRosetta/download/, respectively.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Naozumi Hiranuma, Hahnbeom Park, Minkyung Baek, Ivan Anishchenko, Justas Dauparas, David Baker
Improved protein structure refinement guided by deep learning based accuracy estimation Journal Article
In: Nature Communications, vol. 12, article no. 1340, 2021.
@article{Hiranuma2021,
  author     = {Hiranuma, Naozumi and Park, Hahnbeom and Baek, Minkyung and Anishchenko, Ivan and Dauparas, Justas and Baker, David},
  title      = {Improved protein structure refinement guided by deep learning based accuracy estimation},
  journal    = {Nature Communications},
  volume     = {12},
  pages      = {1340},
  year       = {2021},
  date       = {2021-02-26},
  urldate    = {2021-02-26},
  doi        = {10.1038/s41467-021-21511-x},
  url        = {https://www.nature.com/articles/s41467-021-21511-x, Nature Communications
https://www.bakerlab.org/wp-content/uploads/2021/02/Hiranuma_etal_NatureComms2021_DeepLearningStructureRefinement.pdf, Download PDF},
  abstract   = {We develop a deep learning framework (DeepAccNet) that estimates per-residue accuracy and residue-residue distance signed error in protein models and uses these predictions to guide Rosetta protein structure refinement. The network uses 3D convolutions to evaluate local atomic environments followed by 2D convolutions to provide their global contexts and outperforms other methods that similarly predict the accuracy of protein structure models. Overall accuracy predictions for X-ray and cryoEM structures in the PDB correlate with their resolution, and the network should be broadly useful for assessing the accuracy of both predicted structure models and experimentally determined structures and identifying specific regions likely to be in error. Incorporation of the accuracy predictions at multiple stages in the Rosetta refinement protocol considerably increased the accuracy of the resulting protein structure models, illustrating how deep learning can improve search for global energy minima of biomolecules.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Hahnbeom Park, Guangfeng Zhou, Minkyung Baek, David Baker, Frank DiMaio
Force Field Optimization Guided by Small Molecule Crystal Lattice Data Enables Consistent Sub-Angstrom Protein–Ligand Docking Journal Article
In: Journal of Chemical Theory and Computation, 2021.
@article{Park2021,
  author     = {Hahnbeom Park and Guangfeng Zhou and Minkyung Baek and David Baker and Frank DiMaio},
  title      = {Force Field Optimization Guided by Small Molecule Crystal Lattice Data Enables Consistent Sub-Angstrom Protein–Ligand Docking},
  journal    = {Journal of Chemical Theory and Computation},
  year       = {2021},
  date       = {2021-02-12},
  doi        = {10.1021/acs.jctc.0c01184},
  url        = {https://pubs.acs.org/doi/full/10.1021/acs.jctc.0c01184
https://www.bakerlab.org/wp-content/uploads/2021/02/Park_etal_JCTC2021_Small_mol_force_field_optimization.pdf},
  abstract   = {Accurate and rapid calculation of protein-small molecule interaction free energies is critical for computational drug discovery. Because of the large chemical space spanned by drug-like molecules, classical force fields contain thousands of parameters describing atom-pair distance and torsional preferences; each parameter is typically optimized independently on simple representative molecules. Here, we describe a new approach in which small molecule force field parameters are jointly optimized guided by the rich source of information contained within thousands of available small molecule crystal structures. We optimize parameters by requiring that the experimentally determined molecular lattice arrangements have lower energy than all alternative lattice arrangements. Thousands of independent crystal lattice-prediction simulations were run on each of 1386 small molecule crystal structures, and energy function parameters of an implicit solvent energy model were optimized, so native crystal lattice arrangements had the lowest energy. The resulting energy model was implemented in Rosetta, together with a rapid genetic algorithm docking method employing grid-based scoring and receptor flexibility. The success rate of bound structure recapitulation in cross-docking on 1112 complexes was improved by more than 10% over previously published methods, with solutions within <1 Å in over half of the cases. Our results demonstrate that small molecule crystal structures are a rich source of information for guiding molecular force field development, and the improved Rosetta energy function should increase accuracy in a wide range of small molecule structure prediction and design studies.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2020
FROM THE LAB
Yang, Jianyi, Anishchenko, Ivan, Park, Hahnbeom, Peng, Zhenling, Ovchinnikov, Sergey, Baker, David
Improved protein structure prediction using predicted interresidue orientations Journal Article
In: Proceedings of the National Academy of Sciences, 2020, ISSN: 0027-8424.
@article{Yang2020,
  author     = {Yang, Jianyi and Anishchenko, Ivan and Park, Hahnbeom and Peng, Zhenling and Ovchinnikov, Sergey and Baker, David},
  title      = {Improved protein structure prediction using predicted interresidue orientations},
  journal    = {Proceedings of the National Academy of Sciences},
  year       = {2020},
  date       = {2020-01-02},
  doi        = {10.1073/pnas.1914677117},
  issn       = {0027-8424},
  url        = {https://www.pnas.org/content/early/2020/01/01/1914677117
https://www.bakerlab.org/wp-content/uploads/2020/01/Yang2020_ImprovedStructurePredictionInterresidueOrientations.pdf},
  abstract   = {Protein structure prediction is a longstanding challenge in computational biology. Through extension of deep learning-based prediction to interresidue orientations in addition to distances, and the development of a constrained optimization by Rosetta, we show that more accurate models can be generated. Results on a set of 18 de novo-designed proteins suggests the proposed method should be directly applicable to current challenges in de novo protein design. The prediction of interresidue contacts and distances from coevolutionary data using deep learning has considerably advanced protein structure prediction. Here, we build on these advances by developing a deep residual network for predicting interresidue orientations, in addition to distances, and a Rosetta-constrained energy-minimization protocol for rapidly and accurately generating structure models guided by these restraints. In benchmark tests on 13th Community-Wide Experiment on the Critical Assessment of Techniques for Protein Structure Prediction (CASP13)- and Continuous Automated Model Evaluation (CAMEO)-derived sets, the method outperforms all previously described structure-prediction methods. Although trained entirely on native proteins, the network consistently assigns higher probability to de novo-designed proteins, identifying the key fold-determining residues and providing an independent quantitative measure of the "ideality" of a protein structure. The method promises to be useful for a broad range of protein structure prediction and design problems.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
COLLABORATOR LED
Sorry, no publications matched your criteria.
2019
FROM THE LAB
Sorry, no publications matched your criteria.
COLLABORATOR LED
Qi Wu, Zhenling Peng, Ivan Anishchenko, Qian Cong, David Baker, Jianyi Yang
Protein contact prediction using metagenome sequence data and residual neural networks Journal Article
In: Bioinformatics, vol. 36, no. 1, 2019.
@article{Wu2019,
  author     = {Qi Wu and Zhenling Peng and Ivan Anishchenko and Qian Cong and David Baker and Jianyi Yang},
  title      = {Protein contact prediction using metagenome sequence data and residual neural networks},
  journal    = {Bioinformatics},
  volume     = {36},
  number     = {1},
  year       = {2019},
  date       = {2019-06-07},
  doi        = {10.1093/bioinformatics/btz477},
  url        = {https://academic.oup.com/bioinformatics/article/36/1/41/5512356},
  abstract   = {Motivation: Almost all protein residue contact prediction methods rely on the availability of deep multiple sequence alignments (MSAs). However, many proteins from the poorly populated families do not have sufficient number of homologs in the conventional UniProt database. Here we aim to solve this issue by exploring the rich sequence data from the metagenome sequencing projects. Results: Based on the improved MSA constructed from the metagenome sequence data, we developed MapPred, a new deep learning-based contact prediction method. MapPred consists of two component methods, DeepMSA and DeepMeta, both trained with the residual neural networks. DeepMSA was inspired by the recent method DeepCov, which was trained on 441 matrices of covariance features. By considering the symmetry of contact map, we reduced the number of matrices to 231, which makes the training more efficient in DeepMSA. Experiments show that DeepMSA outperforms DeepCov by 10–13% in precision. DeepMeta works by combining predicted contacts and other sequence profile features. Experiments on three benchmark datasets suggest that the contribution from the metagenome sequence data is significant with P-values less than 4.04E-17. MapPred is shown to be complementary and comparable the state-of-the-art methods. The success of MapPred is attributed to three factors: the deeper MSA from the metagenome sequence data, improved feature design in DeepMSA and optimized training by the residual neural networks.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2018
FROM THE LAB
Park, Hahnbeom, Ovchinnikov, Sergey, Kim, David E., DiMaio, Frank, Baker, David
Protein homology model refinement by large-scale energy optimization Journal Article
In: Proceedings of the National Academy of Sciences, vol. 115, no. 12, pp. 3054–3059, 2018, ISSN: 0027-8424.
@article{Park2018,
  author     = {Park, Hahnbeom and Ovchinnikov, Sergey and Kim, David E. and DiMaio, Frank and Baker, David},
  title      = {Protein homology model refinement by large-scale energy optimization},
  journal    = {Proceedings of the National Academy of Sciences},
  volume     = {115},
  number     = {12},
  pages      = {3054--3059},
  year       = {2018},
  date       = {2018-03-20},
  doi        = {10.1073/pnas.1719115115},
  issn       = {0027-8424},
  url        = {https://www.pnas.org/content/115/12/3054
https://www.bakerlab.org/wp-content/uploads/2019/01/Park2018_refinement.pdf},
  abstract   = {Protein structure refinement by direct global energy optimization has been a longstanding challenge in computational structural biology due to limitations in both energy function accuracy and conformational sampling. This manuscript demonstrates that with recent advances in both areas, refinement can significantly improve protein comparative models based on structures of distant homologues. Proteins fold to their lowest free-energy structures, and hence the most straightforward way to increase the accuracy of a partially incorrect protein structure model is to search for the lowest-energy nearby structure. This direct approach has met with little success for two reasons: first, energy function inaccuracies can lead to false energy minima, resulting in model degradation rather than improvement; and second, even with an accurate energy function, the search problem is formidable because the energy only drops considerably in the immediate vicinity of the global minimum, and there are a very large number of degrees of freedom. Here we describe a large-scale energy optimization-based refinement method that incorporates advances in both search and energy function accuracy that can substantially improve the accuracy of low-resolution homology models. The method refined low-resolution homology models into correct folds for 50 of 84 diverse protein families and generated improved models in recent blind structure prediction experiments. Analyses of the basis for these improvements reveal contributions from both the improvements in conformational sampling techniques and the energy function.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
COLLABORATOR LED
Day, Austin L, Greisen, Per, Doyle, Lindsey, Schena, Alberto, Stella, Nephi, Johnsson, Kai, Baker, David, Stoddard, Barry
Unintended specificity of an engineered ligand-binding protein facilitated by unpredicted plasticity of the protein fold Journal Article
In: Protein Engineering, Design and Selection, 2018.
@article{Day2018,
  author     = {Day, Austin L and Greisen, Per and Doyle, Lindsey and Schena, Alberto and Stella, Nephi and Johnsson, Kai and Baker, David and Stoddard, Barry},
  title      = {Unintended specificity of an engineered ligand-binding protein facilitated by unpredicted plasticity of the protein fold},
  journal    = {Protein Engineering, Design and Selection},
  year       = {2018},
  date       = {2018-12-19},
  doi        = {10.1093/protein/gzy031},
  url        = {https://dx.doi.org/10.1093/protein/gzy031
https://www.bakerlab.org/wp-content/uploads/2019/02/Day2018.pdf},
  abstract   = {Attempts to create novel ligand-binding proteins often focus on formation of a binding pocket with shape complementarity against the desired ligand (particularly for compounds that lack distinct polar moieties). Although designed proteins often exhibit binding of the desired ligand, in some cases they display unintended recognition behavior. One such designed protein, that was originally intended to bind tetrahydrocannabinol (THC), was found instead to display binding of 25-hydroxy-cholecalciferol (25-D3) and was subjected to biochemical characterization, further selections for enhanced 25-D3 binding affinity and crystallographic analyses. The deviation in specificity is due in part to unexpected alteration of its conformation, corresponding to a significant change of the orientation of an α-helix and an equally large movement of a loop, both of which flank the designed ligand-binding pocket. Those changes led to engineered protein constructs that exhibit significantly more contacts and complementarity towards the 25-D3 ligand than the initial designed protein had been predicted to form towards its intended THC ligand. Molecular dynamics simulations imply that the initial computationally designed mutations may contribute to the movement of the helix. These analyses collectively indicate that accurate prediction and control of backbone dynamics conformation, through a combination of improved conformational sampling and/or de novo structure design, represents a key area of further development for the design and optimization of engineered ligand-binding proteins.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2017–1998
ALL PAPERS
2017
Sergey Ovchinnikov, Hahnbeom Park, David E. Kim, Frank DiMaio, David Baker
Protein structure prediction using Rosetta in CASP12 Journal Article
In: Proteins, 2017.
@article{Ovchinnikov2017,
  author     = {Ovchinnikov, Sergey and Park, Hahnbeom and Kim, David E. and DiMaio, Frank and Baker, David},
  title      = {Protein structure prediction using Rosetta in CASP12},
  journal    = {Proteins},
  year       = {2017},
  date       = {2017-09-22},
  doi        = {10.1002/prot.25390},
  url        = {https://onlinelibrary.wiley.com/doi/epdf/10.1002/prot.25390
https://www.bakerlab.org/wp-content/uploads/2019/10/Ovchinnikov_et_al-2018-Proteins__Structure_Function_and_Bioinformatics.pdf},
  abstract   = {We describe several notable aspects of our structure predictions using Rosetta in CASP12 in the free modeling (FM) and refinement (TR) categories. First, we had previously generated (and published) models for most large protein families lacking experimentally determined structures using Rosetta guided by co-evolution based contact predictions, and for several targets these models proved better starting points for comparative modeling than any known crystal structure—our model database thus starts to fulfill one of the goals of the original protein structure initiative. Second, while our “human” group simply submitted ROBETTA models for most targets, for six targets expert intervention improved predictions considerably; the largest improvement was for T0886 where we correctly parsed two discontinuous domains guided by predicted contact maps to accurately identify a structural homolog of the same fold. Third, Rosetta all atom refinement followed by MD simulations led to consistent but small improvements when starting models were close to the native structure, and larger but less consistent improvements when starting models were further away.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
I Anishchenko, S Ovchinnikov, H Kamisetty, D Baker
Origins of coevolution between residues distant in protein 3D structures Journal Article
In: Proceedings of the National Academy of Sciences, vol. 114, no. 34, pp. 9122-9127, 2017.
@article{1000,
  author     = {Anishchenko, Ivan and Ovchinnikov, Sergey and Kamisetty, Hetunandan and Baker, David},
  title      = {Origins of coevolution between residues distant in protein 3D structures},
  journal    = {Proceedings of the National Academy of Sciences},
  volume     = {114},
  number     = {34},
  pages      = {9122--9127},
  year       = {2017},
  date       = {2017-08-22},
  doi        = {10.1073/pnas.1702664114},
  url        = {http://www.pnas.org/content/114/34/9122
https://www.bakerlab.org/wp-content/uploads/2018/08/9122.full1_.pdf},
  abstract   = {Residue pairs that directly coevolve in protein families are generally close in protein 3D structures. Here we study the exceptions to this general trend—directly coevolving residue pairs that are distant in protein structures—to determine the origins of evolutionary pressure on spatially distant residues and to understand the sources of error in contact-based structure prediction. Over a set of 4,000 protein families, we find that 25% of directly coevolving residue pairs are separated by more than 5 Å in protein structures and 3% by more than 15 Å. The majority (91%) of directly coevolving residue pairs in the 5–15 Å range are found to be in contact in at least one homologous structure—these exceptions arise from structural variation in the family in the region containing the residues. Thirty-five percent of the exceptions greater than 15 Å are at homo-oligomeric interfaces, 19% arise from family structural variation, and 27% are in repeat proteins likely reflecting alignment errors. Of the remaining long-range exceptions (<1% of the total number of coupled pairs), many can be attributed to close interactions in an oligomeric state. Overall, the results suggest that directly coevolving residue pairs not in repeat proteins are spatially proximal in at least one biologically relevant protein conformation within the family; we find little evidence for direct coupling between residues at spatially separated allosteric and functional sites or for increased direct coupling between residue pairs on putative allosteric pathways connecting them.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Sergey Ovchinnikov, Hahnbeom Park, Neha Varghese, Po-Ssu Huang, Georgios A. Pavlopoulos, David E. Kim, Hetunandan Kamisetty, Nikos C. Kyrpides, David Baker
Protein structure determination using metagenome sequence data Journal Article
In: Science, vol. 355, no. 6322, pp. 294–298, 2017, ISSN: 0036-8075.
@article{Ovchinnikov294,
  author     = {Ovchinnikov, Sergey and Park, Hahnbeom and Varghese, Neha and Huang, Po-Ssu and Pavlopoulos, Georgios A. and Kim, David E. and Kamisetty, Hetunandan and Kyrpides, Nikos C. and Baker, David},
  title      = {Protein structure determination using metagenome sequence data},
  journal    = {Science},
  volume     = {355},
  number     = {6322},
  pages      = {294--298},
  publisher  = {American Association for the Advancement of Science},
  year       = {2017},
  date       = {2017-01-01},
  doi        = {10.1126/science.aah4043},
  issn       = {0036-8075},
  url        = {https://www.bakerlab.org/wp-content/uploads/2017/01/ovchinnikov_science_2017.pdf
http://science.sciencemag.org/content/355/6322/294},
  abstract   = {Fewer than a third of the 14,849 known protein families have at least one member with an experimentally determined structure. This leaves more than 5000 protein families with no structural information. Protein modeling using residue-residue contacts inferred from evolutionary data has been successful in modeling unknown structures, but it requires large numbers of aligned sequences. Ovchinnikov et al. augmented such sequence alignments with metagenome sequence data (see the Perspective by Söding). They determined the number of sequences required to allow modeling, developed criteria for model quality, and, where possible, improved modeling by matching predicted contacts to known structures. Their method predicted quality structural models for 614 protein families, of which about 140 represent newly discovered protein folds. Science, this issue p. 294; see also p. 248. Despite decades of work by structural biologists, there are still ~5200 protein families with unknown structure outside the range of comparative modeling. We show that Rosetta structure prediction guided by residue-residue contacts inferred from evolutionary information can accurately model proteins that belong to large families and that metagenome sequence data more than triple the number of protein families with sufficient sequences for accurate modeling. We then integrate metagenome data, contact-based structure matching, and Rosetta structure calculations to generate models for 614 protein families with currently unknown structures; 206 are membrane proteins and 137 have folds not represented in the Protein Data Bank. This approach provides the representative models for large protein families originally envisioned as the goal of the Protein Structure Initiative at a fraction of the cost.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2016
Ovchinnikov, Sergey, Park, Hahnbeom, Kim, David E., Liu, Yuan, Wang, Ray Yu-Ruei, Baker, David
Structure prediction using sparse simulated NOE restraints with Rosetta in CASP11 Journal Article
In: Proteins: Structure, Function, and Bioinformatics, 2016, ISSN: 1097-0134.
@article{PROT:PROT25006,
  author     = {Ovchinnikov, Sergey and Park, Hahnbeom and Kim, David E. and Liu, Yuan and Wang, Ray Yu-Ruei and Baker, David},
  title      = {Structure prediction using sparse simulated NOE restraints with Rosetta in CASP11},
  journal    = {Proteins: Structure, Function, and Bioinformatics},
  year       = {2016},
  date       = {2016-01-01},
  doi        = {10.1002/prot.25006},
  issn       = {1097-0134},
  url        = {http://dx.doi.org/10.1002/prot.25006
https://www.bakerlab.org/wp-content/uploads/2016/05/Ovchinnikov_et_al-2016-Proteins__Structure_Function_and_Bioinformatics.pdf},
  abstract   = {In CASP11 we generated protein structure models using simulated ambiguous and unambiguous nuclear Overhauser effect (NOE) restraints with a two stage protocol. Low resolution models were generated guided by the unambiguous restraints using continuous chain folding for alpha and alpha-beta proteins, and iterative annealing for all beta proteins to take advantage of the strand pairing information implicit in the restraints. The Rosetta fragment/model hybridization protocol was then used to recombine and regularize these models, and refine them in the Rosetta full atom energy function guided by both the unambiguous and the ambiguous restraints. Fifteen out of 19 targets were modeled with GDT-TS quality scores greater than 60 for Model 1, significantly improving upon the non-assisted predictions. Our results suggest that atomic level accuracy is achievable using sparse NOE data when there is at least one correctly assigned NOE for every residue. Proteins 2016. © 2016 Wiley Periodicals, Inc.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
2015
S Ovchinnikov, DE Kim, RY Wang, Y Liu, F DiMaio, D Baker
Improved de novo structure prediction in CASP11 by incorporating Co-evolution information into Rosetta Journal Article
In: Proteins, 2015.
@article{S2015,
title = {Improved de novo structure prediction in {CASP11} by incorporating Co-evolution information into {Rosetta}},
author = {Ovchinnikov, S. and Kim, D. E. and Wang, R. Y. and Liu, Y. and DiMaio, F. and Baker, D.},
url = {https://www.bakerlab.org/wp-content/uploads/2015/12/Ovchinnikov_Proteins_2015.pdf},
doi = {10.1002/prot.24974},
year = {2015},
date = {2015-12-17},
journal = {Proteins},
abstract = {We describe CASP11 de novo blind structure predictions made using the Rosetta structure prediction methodology with both automatic and human assisted protocols. Model accuracy was generally improved using co-evolution derived residue-residue contact information as restraints during Rosetta conformational sampling and refinement, particularly when the number of sequences in the family was more than three times the length of the protein. The highlight was the human assisted prediction of T0806, a large and topologically complex target with no homologs of known structure, which had unprecedented accuracy - <3.0 Å root-mean-square deviation (RMSD) from the crystal structure over 223 residues. For this target, we increased the amount of conformational sampling over our fully automated method by employing an iterative hybridization protocol. Our results clearly demonstrate, in a blind prediction scenario, that co-evolution derived contacts can considerably increase the accuracy of template-free structure modeling. This article is protected by copyright. All rights reserved.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
S Ovchinnikov, L Kinch, H Park, Y Liao, J Pei, DE Kim, H Kamisetty, NV Grishin, D Baker
Large-scale determination of previously unsolved protein structures using evolutionary information Journal Article
In: eLife, 2015.
@article{S2015b,
title = {Large-scale determination of previously unsolved protein structures using evolutionary information},
author = {Ovchinnikov, S. and Kinch, L. and Park, H. and Liao, Y. and Pei, J. and Kim, D. E. and Kamisetty, H. and Grishin, N. V. and Baker, D.},
url = {https://www.bakerlab.org/wp-content/uploads/2016/01/Ovchinnikov_eLife_2015.pdf},
doi = {10.7554/eLife.09248},
year = {2015},
date = {2015-09-03},
journal = {eLife},
abstract = {The prediction of the structures of proteins without detectable sequence similarity to any protein of known structure remains an outstanding scientific challenge. Here we report significant progress in this area. We first describe de novo blind structure predictions of unprecedented accuracy we made for two proteins in large families in the recent CASP11 blind test of protein structure prediction methods by incorporating residue-residue co-evolution information in the Rosetta structure prediction program. We then describe the use of this method to generate structure models for 58 of the 121 large protein families in prokaryotes for which three-dimensional structures are not available. These models, which are posted online for public access, provide structural information for the over 400,000 proteins belonging to the 58 families and suggest hypotheses about mechanism for the subset for which the function is known, and hypotheses about function for the remainder.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2014
George A Khoury, Adam Liwo, Firas Khatib, Hongyi Zhou, Gaurav Chopra, Jaume Bacardit, Leandro O Bortot, Rodrigo A Faccioli, Xin Deng, Yi He, Pawel Krupa, Jilong Li, Magdalena A Mozolewska, Adam K Sieradzan, James Smadbeck, Tomasz Wirecki, Seth Cooper, Jeff Flatten, Kefan Xu, David Baker, Jianlin Cheng, Alexandre C B Delbem, Christodoulos A Floudas, Chen Keasar, Michael Levitt, Zoran Popović, Harold A Scheraga, Jeffrey Skolnick, Silvia N Crivelli
WeFold: a coopetition for protein structure prediction. Journal Article
In: Proteins, vol. 82, pp. 1850-68, 2014, ISSN: 1097-0134.
@article{625,
title = {{WeFold}: a coopetition for protein structure prediction},
author = {George A Khoury and Adam Liwo and Firas Khatib and Hongyi Zhou and Gaurav Chopra and Jaume Bacardit and Leandro O Bortot and Rodrigo A Faccioli and Xin Deng and Yi He and Pawel Krupa and Jilong Li and Magdalena A Mozolewska and Adam K Sieradzan and James Smadbeck and Tomasz Wirecki and Seth Cooper and Jeff Flatten and Kefan Xu and David Baker and Jianlin Cheng and Alexandre C B Delbem and Christodoulos A Floudas and Chen Keasar and Michael Levitt and Zoran Popovi{\'c} and Harold A Scheraga and Jeffrey Skolnick and Silvia N Crivelli},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Khoury_Proteins_2014.pdf},
doi = {10.1002/prot.24538},
issn = {1097-0134},
year = {2014},
date = {2014-09-01},
journal = {Proteins},
volume = {82},
pages = {1850--1868},
abstract = {The protein structure prediction problem continues to elude scientists. Despite the introduction of many methods, only modest gains were made over the last decade for certain classes of prediction targets. To address this challenge, a social-media based worldwide collaborative effort, named WeFold, was undertaken by 13 labs. During the collaboration, the laboratories were simultaneously competing with each other. Here, we present the first attempt at "coopetition" in scientific research applied to the protein structure prediction and refinement problems. The coopetition was possible by allowing the participating labs to contribute different components of their protein structure prediction pipelines and create new hybrid pipelines that they tested during CASP10. This manuscript describes both successes and areas needing improvement as identified throughout the first WeFold experiment and discusses the efforts that are underway to advance this initiative. A footprint of all contributions and structures are publicly accessible at http://www.wefold.org.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Kuang-Yui M Chen, Jiaming Sun, Jason S Salvo, David Baker, Patrick Barth
High-resolution modeling of transmembrane helical protein structures from distant homologues. Journal Article
In: PLoS computational biology, vol. 10, pp. e1003636, 2014, ISSN: 1553-7358.
@article{622,
title = {High-resolution modeling of transmembrane helical protein structures from distant homologues},
author = {Kuang-Yui M Chen and Jiaming Sun and Jason S Salvo and David Baker and Patrick Barth},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Chen_PLOS_2014.pdf},
doi = {10.1371/journal.pcbi.1003636},
issn = {1553-7358},
year = {2014},
date = {2014-05-01},
journal = {PLoS Computational Biology},
volume = {10},
pages = {e1003636},
abstract = {Eukaryotic transmembrane helical (TMH) proteins perform a wide diversity of critical cellular functions, but remain structurally largely uncharacterized and their high-resolution structure prediction is currently hindered by the lack of close structural homologues. To address this problem, we present a novel and generic method for accurately modeling large TMH protein structures from distant homologues exhibiting distinct loop and TMH conformations. Models of the adenosine A2AR and chemokine CXCR4 receptors were first ranked in GPCR-DOCK blind prediction contests in the receptor structure accuracy category. In a benchmark of 50 TMH protein homolog pairs of diverse topology (from 5 to 12 TMHs), size (from 183 to 420 residues) and sequence identity (from 15\% to 70\%), the method improves most starting templates, and achieves near-atomic accuracy prediction of membrane-embedded regions. Unlike starting templates, the models are of suitable quality for computer-based protein engineering: redesigned models and redesigned X-ray structures exhibit very similar native interactions. The method should prove useful for the atom-level modeling and design of a large fraction of structurally uncharacterized TMH proteins from a wide range of structural homologues.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Sergey Ovchinnikov, Hetunandan Kamisetty, David Baker
Robust and accurate prediction of residue-residue interactions across protein interfaces using evolutionary information. Journal Article
In: eLife, vol. 3, pp. e02030, 2014, ISSN: 2050-084X.
@article{540,
title = {Robust and accurate prediction of residue-residue interactions across protein interfaces using evolutionary information},
author = {Sergey Ovchinnikov and Hetunandan Kamisetty and David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Ovchinnikov_2014A.pdf},
doi = {10.7554/eLife.02030},
issn = {2050-084X},
year = {2014},
date = {2014-05-01},
journal = {eLife},
volume = {3},
pages = {e02030},
abstract = {Do the amino acid sequence identities of residues that make contact across protein interfaces covary during evolution? If so, such covariance could be used to predict contacts across interfaces and assemble models of biological complexes. We find that residue pairs identified using a pseudo-likelihood-based method to covary across protein-protein interfaces in the 50S ribosomal unit and 28 additional bacterial protein complexes with known structure are almost always in contact in the complex, provided that the number of aligned sequences is greater than the average length of the two proteins. We use this method to make subunit contact predictions for an additional 36 protein complexes with unknown structures, and present models based on these predictions for the tripartite ATP-independent periplasmic (TRAP) transporter, the tripartite efflux system, the pyruvate formate lyase-activating enzyme complex, and the methionine ABC transporter. DOI: http://dx.doi.org/10.7554/eLife.02030.001.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
David Baker
Protein folding, structure prediction and design. Journal Article
In: Biochemical Society transactions, vol. 42, pp. 225-9, 2014, ISSN: 1470-8752.
@article{529,
title = {Protein folding, structure prediction and design},
author = {David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Baker_BiochemSocTrans_2014.pdf},
doi = {10.1042/BST20130055},
issn = {1470-8752},
year = {2014},
date = {2014-04-01},
journal = {Biochemical Society Transactions},
volume = {42},
pages = {225--229},
abstract = {I describe how experimental studies of protein folding have led to advances in protein structure prediction and protein design. I describe the finding that protein sequences are not optimized for rapid folding, the contact order-protein folding rate correlation, the incorporation of experimental insights into protein folding into the Rosetta protein structure production methodology and the use of this methodology to determine structures from sparse experimental data. I then describe the inverse problem (protein design) and give an overview of recent work on designing proteins with new structures and functions. I also describe the contributions of the general public to these efforts through the Rosetta@home distributed computing project and the FoldIt interactive protein folding and design game.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2013
Rocco Moretti, Sarel J Fleishman, Rudi Agius, Mieczyslaw Torchala, Paul A Bates, Panagiotis L Kastritis, João P G L M Rodrigues, Mikaël Trellet, Alexandre M J J Bonvin, Meng Cui, Marianne Rooman, Dimitri Gillis, Yves Dehouck, Iain Moal, Miguel Romero-Durana, Laura Perez-Cano, Chiara Pallara, Brian Jimenez, Juan Fernandez-Recio, Samuel Flores, Michael Pacella, Krishna Praneeth Kilambi, Jeffrey J Gray, Petr Popov, Sergei Grudinin, Juan Esquivel-Rodríguez, Daisuke Kihara, Nan Zhao, Dmitry Korkin, Xiaolei Zhu, Omar N A Demerdash, Julie C Mitchell, Eiji Kanamori, Yuko Tsuchiya, Haruki Nakamura, Hasup Lee, Hahnbeom Park, Chaok Seok, Jamica Sarmiento, Shide Liang, Shusuke Teraguchi, Daron M Standley, Hiromitsu Shimoyama, Genki Terashi, Mayuko Takeda-Shitaka, Mitsuo Iwadate, Hideaki Umeyama, Dmitri Beglov, David R Hall, Dima Kozakov, Sandor Vajda, Brian G Pierce, Howook Hwang, Thom Vreven, Zhiping Weng, Yangyu Huang, Haotian Li, Xiufeng Yang, Xiaofeng Ji, Shiyong Liu, Yi Xiao, Martin Zacharias, Sanbo Qin, Huan-Xiang Zhou, Sheng-You Huang, Xiaoqin Zou, Sameer Velankar, Joël Janin, Shoshana J Wodak, David Baker
Community-wide evaluation of methods for predicting the effect of mutations on protein-protein interactions. Journal Article
In: Proteins, vol. 81, pp. 1980-7, 2013, ISSN: 1097-0134.
@article{505,
title = {Community-wide evaluation of methods for predicting the effect of mutations on protein-protein interactions},
author = {Rocco Moretti and Sarel J Fleishman and Rudi Agius and Mieczyslaw Torchala and Paul A Bates and Panagiotis L Kastritis and Jo{\~a}o P G L M Rodrigues and Mika{\"e}l Trellet and Alexandre M J J Bonvin and Meng Cui and Marianne Rooman and Dimitri Gillis and Yves Dehouck and Iain Moal and Miguel Romero-Durana and Laura Perez-Cano and Chiara Pallara and Brian Jimenez and Juan Fernandez-Recio and Samuel Flores and Michael Pacella and Krishna Praneeth Kilambi and Jeffrey J Gray and Petr Popov and Sergei Grudinin and Juan Esquivel-Rodr{\'\i}guez and Daisuke Kihara and Nan Zhao and Dmitry Korkin and Xiaolei Zhu and Omar N A Demerdash and Julie C Mitchell and Eiji Kanamori and Yuko Tsuchiya and Haruki Nakamura and Hasup Lee and Hahnbeom Park and Chaok Seok and Jamica Sarmiento and Shide Liang and Shusuke Teraguchi and Daron M Standley and Hiromitsu Shimoyama and Genki Terashi and Mayuko Takeda-Shitaka and Mitsuo Iwadate and Hideaki Umeyama and Dmitri Beglov and David R Hall and Dima Kozakov and Sandor Vajda and Brian G Pierce and Howook Hwang and Thom Vreven and Zhiping Weng and Yangyu Huang and Haotian Li and Xiufeng Yang and Xiaofeng Ji and Shiyong Liu and Yi Xiao and Martin Zacharias and Sanbo Qin and Huan-Xiang Zhou and Sheng-You Huang and Xiaoqin Zou and Sameer Velankar and Jo{\"e}l Janin and Shoshana J Wodak and David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Moretti_Proteins_2013.pdf},
doi = {10.1002/prot.24356},
issn = {1097-0134},
year = {2013},
date = {2013-11-01},
journal = {Proteins},
volume = {81},
pages = {1980--1987},
abstract = {Community-wide blind prediction experiments such as CAPRI and CASP provide an objective measure of the current state of predictive methodology. Here we describe a community-wide assessment of methods to predict the effects of mutations on protein-protein interactions. Twenty-two groups predicted the effects of comprehensive saturation mutagenesis for two designed influenza hemagglutinin binders and the results were compared with experimental yeast display enrichment data obtained using deep sequencing. The most successful methods explicitly considered the effects of mutation on monomer stability in addition to binding affinity, carried out explicit side-chain sampling and backbone relaxation, evaluated packing, electrostatic, and solvation effects, and correctly identified around a third of the beneficial mutations. Much room for improvement remains for even the best techniques, and large-scale fitness landscapes should continue to provide an excellent test bed for continued evaluation of both existing and new prediction methodologies.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Robert Vernon, Yang Shen, David Baker, Oliver F Lange
Improved chemical shift based fragment selection for CS-Rosetta using Rosetta3 fragment picker. Journal Article
In: Journal of biomolecular NMR, vol. 57, pp. 117-27, 2013, ISSN: 1573-5001.
@article{508,
title = {Improved chemical shift based fragment selection for {CS-Rosetta} using {Rosetta3} fragment picker},
author = {Robert Vernon and Yang Shen and David Baker and Oliver F Lange},
doi = {10.1007/s10858-013-9772-4},
issn = {1573-5001},
year = {2013},
date = {2013-10-01},
journal = {Journal of Biomolecular NMR},
volume = {57},
pages = {117--127},
abstract = {A new fragment picker has been developed for CS-Rosetta that combines beneficial features of the original fragment picker, MFR, used with CS-Rosetta, and the fragment picker, NNMake, that was used for purely sequence based fragment selection in the context of ROSETTA de-novo structure prediction. Additionally, the new fragment picker has reduced sensitivity to outliers and other difficult to match data points rendering the protocol more robust and less likely to introduce bias towards wrong conformations in cases where data is bad, missing or inconclusive. The fragment picker protocol gives significant improvements on 6 of 23 CS-Rosetta targets. An independent benchmark on 39 protein targets, whose NMR data sets were published only after protocol optimization had been finished, also show significantly improved performance for the new fragment picker (van der Schot et al. in J Biomol NMR, 2013).},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Hetunandan Kamisetty, Sergey Ovchinnikov, David Baker
Assessing the utility of coevolution-based residue-residue contact predictions in a sequence- and structure-rich era. Journal Article
In: Proceedings of the National Academy of Sciences of the United States of America, vol. 110, pp. 15674-9, 2013, ISSN: 1091-6490.
@article{498,
title = {Assessing the utility of coevolution-based residue-residue contact predictions in a sequence- and structure-rich era},
author = {Hetunandan Kamisetty and Sergey Ovchinnikov and David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Kamisetty_PNAS_2013.pdf},
doi = {10.1073/pnas.1314045110},
issn = {1091-6490},
year = {2013},
date = {2013-09-01},
journal = {Proceedings of the National Academy of Sciences of the United States of America},
volume = {110},
pages = {15674--15679},
abstract = {Recently developed methods have shown considerable promise in predicting residue-residue contacts in protein 3D structures using evolutionary covariance information. However, these methods require large numbers of evolutionarily related sequences to robustly assess the extent of residue covariation, and the larger the protein family, the more likely that contact information is unnecessary because a reasonable model can be built based on the structure of a homolog. Here we describe a method that integrates sequence coevolution and structural context information using a pseudolikelihood approach, allowing more accurate contact predictions from fewer homologous sequences. We rigorously assess the utility of predicted contacts for protein structure prediction using large and representative sequence and structure databases from recent structure prediction experiments. We find that contact predictions are likely to be accurate when the number of aligned sequences (with sequence redundancy reduced to 90\%) is greater than five times the length of the protein, and that accurate predictions are likely to be useful for structure modeling if the aligned sequences are more similar to the protein of interest than to the closest homolog of known structure. These conditions are currently met by 422 of the protein families collected in the Pfam database.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Gijs van der Schot, Zaiyong Zhang, Robert Vernon, Yang Shen, Wim F Vranken, David Baker, Alexandre M J J Bonvin, Oliver F Lange
Improving 3D structure prediction from chemical shift data. Journal Article
In: Journal of biomolecular NMR, vol. 57, pp. 27-35, 2013, ISSN: 1573-5001.
@article{507,
title = {Improving {3D} structure prediction from chemical shift data},
author = {Gijs van der Schot and Zaiyong Zhang and Robert Vernon and Yang Shen and Wim F Vranken and David Baker and Alexandre M J J Bonvin and Oliver F Lange},
doi = {10.1007/s10858-013-9762-6},
issn = {1573-5001},
year = {2013},
date = {2013-09-01},
journal = {Journal of Biomolecular NMR},
volume = {57},
pages = {27--35},
abstract = {We report advances in the calculation of protein structures from chemical shift nuclear magnetic resonance data alone. Our previously developed method, CS-Rosetta, assembles structures from a library of short protein fragments picked from a large library of protein structures using chemical shifts and sequence information. Here we demonstrate that combination of a new and improved fragment picker and the iterative sampling algorithm RASREC yield significant improvements in convergence and accuracy. Moreover, we introduce improved criteria for assessing the accuracy of the models produced by the method. The method was tested on 39 proteins in the 50-100 residue size range and yields reliable structures in 70~\% of the cases. All structures that passed the reliability filter were accurate (<2~{\AA} RMSD from the reference).},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
David E Kim, Frank DiMaio, Ray Yu-Ruei Wang, Yifan Song, David Baker
One contact for every twelve residues allows robust and accurate topology-level protein structure modeling. Journal Article
In: Proteins, 2013, ISSN: 1097-0134.
@article{506,
title = {One contact for every twelve residues allows robust and accurate topology-level protein structure modeling},
author = {David E Kim and Frank DiMaio and Ray Yu-Ruei Wang and Yifan Song and David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Kim_Proteins_2013.pdf},
doi = {10.1002/prot.24374},
issn = {1097-0134},
year = {2013},
date = {2013-07-01},
journal = {Proteins},
abstract = {A number of methods have been described for identifying pairs of contacting residues in protein three-dimensional structures, but it is unclear how many contacts are required for accurate structure modeling. The CASP10 assisted contact experiment provided a blind test of contact guided protein structure modeling. We describe the models generated for these contact guided prediction challenges using the Rosetta structure modeling methodology. For nearly all cases, the submitted models had the correct overall topology, and in some cases, they had near atomic-level accuracy; for example the model of the 384 residue homo-oligomeric tetramer (Tc680o) had only 2.9 {\AA} root-mean-square deviation (RMSD) from the crystal structure. Our results suggest that experimental and bioinformatic methods for obtaining contact information may need to generate only one correct contact for every 12 residues in the protein to allow accurate topology level modeling. Proteins 2013;. \textcopyright{} 2013 Wiley Periodicals, Inc.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Gert Kiss, Nihan Celebi-Ölçüm, Rocco Moretti, David Baker, K N Houk
Computational enzyme design Journal Article
In: Angewandte Chemie (International ed. in English), vol. 52, pp. 5700-25, 2013, ISSN: 1521-3773.
@article{472,
title = {Computational enzyme design},
author = {Gert Kiss and Nihan Celebi-{\"O}l{\c{c}}{\"u}m and Rocco Moretti and David Baker and K N Houk},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Kiss_AngewChemIntEd_2013.pdf},
doi = {10.1002/anie.201204077},
issn = {1521-3773},
year = {2013},
date = {2013-05-01},
journal = {Angewandte Chemie (International ed. in English)},
volume = {52},
pages = {5700--5725},
abstract = {Recent developments in computational chemistry and biology have come together in the "inside-out" approach to enzyme engineering. Proteins have been designed to catalyze reactions not previously accelerated in nature. Some of these proteins fold and act as catalysts, but the success rate is still low. The achievements and limitations of the current technology are highlighted and contrasted to other protein engineering techniques. On its own, computational "inside-out" design can lead to the production of catalytically active and selective proteins, but their kinetic performances fall short of natural enzymes. When combined with directed evolution, molecular dynamics simulations, and crowd-sourced structure-prediction approaches, however, computational designs can be significantly improved in terms of binding, turnover, and thermal stability.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
MA Molski, JL Goodman, FC Chou, D Baker, R Das, A Schepartz
Remodeling a beta-peptide bundle Journal Article
In: Chemical Science, vol. 4, pp. 319-324, 2013, ISSN: 2041-6520.
@article{605,
title = {Remodeling a beta-peptide bundle},
author = {Molski, M. A. and Goodman, J. L. and Chou, F. C. and Baker, D. and Das, R. and Schepartz, A.},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/remodelingabeta_Baker2013.pdf},
doi = {10.1039/c2sc21117c},
issn = {2041-6520},
year = {2013},
date = {2013-01-01},
journal = {Chemical Science},
volume = {4},
pages = {319--324},
abstract = {Natural biopolymers fold with fidelity, burying diverse side chains into well-packed cores and protecting their backbones from solvent. Certain beta-peptide oligomers assemble into bundles of defined octameric stoichiometry that resemble natural proteins in many respects. These beta-peptide bundles are thermostable, fold cooperatively, exchange interior amide N-H protons slowly, exclude hydrophobic dyes, and can be characterized at high resolution using X-ray crystallography - just like many proteins found in nature. But unlike natural proteins, all octameric beta-peptide bundles contain a sequence-uniform hydrophobic core composed of 32 leucine side chains. Here we apply rational design principles, including the Rosetta computational design methodology, to introduce sequence diversity into the bundle core while retaining the characteristic beta-peptide bundle fold. Using circular dichroism spectroscopy and analytical ultracentrifugation, we confirmed the prediction that an octameric bundle still assembles upon a major remodelling of its core: the mutation of sixteen core beta-homo-leucine side chains into sixteen beta-homo-phenylalanine side chains. Nevertheless, the bundle containing a partially beta-homo-phenylalanine core poorly protects interior amide protons from exchange, suggesting molten-globule-like properties. We further improve stability by the incorporation of eight beta-homo-pentafluorophenylalanine side chains, giving an assembly with amide protection factors comparable to prior well-structured bundles. By demonstrating that their cores tolerate significant sequence variation, the beta-peptide bundles reported here represent a starting point for the "bottom-up" construction of beta-peptide assemblies possessing both structure and sophisticated function.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2012
Troy C Krzysiak, Jinwon Jung, James Thompson, David Baker, Angela M Gronenborn
APOBEC2 is a monomer in solution: implications for APOBEC3G models Journal Article
In: Biochemistry, vol. 51, pp. 2008-17, 2012, ISSN: 1520-4995.
@article{604,
title = {{APOBEC2} is a monomer in solution: implications for {APOBEC3G} models},
author = {Troy C Krzysiak and Jinwon Jung and James Thompson and David Baker and Angela M Gronenborn},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/apobec2isamonomer_Baker2012.pdf},
doi = {10.1021/bi300021s},
issn = {1520-4995},
year = {2012},
date = {2012-03-01},
journal = {Biochemistry},
volume = {51},
pages = {2008--2017},
abstract = {Although the physiological role of APOBEC2 is still largely unknown, a crystal structure of a truncated variant of this protein was determined several years ago [Prochnow, C. (2007) Nature 445, 447-451]. This APOBEC2 structure had considerable impact in the HIV field because it was considered a good model for the structure of APOBEC3G, an important HIV restriction factor that abrogates HIV infectivity in the absence of the viral accessory protein Vif. The quaternary structure and the arrangement of the monomers of APOBEC2 in the crystal were taken as being representative for APOBEC3G and exploited in explaining its enzymatic and anti-HIV activity. Here we show, unambiguously, that in contrast to the findings for the crystal, APOBEC2 is monomeric in solution. The nuclear magnetic resonance solution structure of full-length APOBEC2 reveals that the N-terminal tail that was removed for crystallization resides close to strand β2, the dimer interface in the crystal structure, and shields this region of the protein from engaging in intermolecular contacts. In addition, the presence of the N-terminal region drastically alters the aggregation propensity of APOBEC2, rendering the full-length protein highly soluble and not prone to precipitation. In summary, our results cast doubt on all previous structure-function predictions for APOBEC3G that were based on the crystal structure of APOBEC2.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Oliver F Lange, David Baker
Resolution-adapted recombination of structural features significantly improves sampling in restraint-guided structure calculation. Journal Article
In: Proteins, vol. 80, pp. 884-95, 2012, ISSN: 1097-0134.
@article{460,
title = {Resolution-adapted recombination of structural features significantly improves sampling in restraint-guided structure calculation},
author = {Oliver F Lange and David Baker},
url = {http://www.bakerlab.org/wp-content/uploads/2015/12/Lange_Proteins_2012.pdf},
issn = {1097-0134},
year = {2012},
date = {2012-03-01},
journal = {Proteins},
volume = {80},
pages = {884--895},
abstract = {Recent work has shown that NMR structures can be determined by integrating sparse NMR data with structure prediction methods such as Rosetta. The experimental data serve to guide the search for the lowest energy state towards the deep minimum at the native state which is frequently missed in Rosetta de novo structure calculations. However, as the protein size increases, sampling again becomes limiting; for example, the standard Rosetta protocol involving Monte Carlo fragment insertion starting from an extended chain fails to converge for proteins over 150 amino acids even with guidance from chemical shifts (CS-Rosetta) and other NMR data. The primary limitation of this protocol--that every folding trajectory is completely independent of every other--was recently overcome with the development of a new approach involving resolution-adapted structural recombination (RASREC). Here we describe the RASREC approach in detail and compare it to standard CS-Rosetta. We show that the improved sampling of RASREC is essential in obtaining accurate structures over a benchmark set of 11 proteins in the 15-25 kDa size range using chemical shifts, backbone RDCs and HN-HN NOE data; in a number of cases the improved sampling methodology makes a larger contribution than incorporation of additional experimental data. Experimental data are invaluable for guiding sampling to the vicinity of the global energy minimum, but for larger proteins, the standard Rosetta fold-from-extended-chain protocol does not converge on the native minimum even with experimental data and the more powerful RASREC approach is necessary to converge to accurate solutions.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Julia Handl, Joshua Knowles, Robert Vernon, David Baker, Simon C Lovell
The dual role of fragments in fragment-assembly methods for de novo protein structure prediction Journal Article
In: Proteins, vol. 80, pp. 490-504, 2012, ISSN: 1097-0134.
@article{601,
title = {The dual role of fragments in fragment-assembly methods for de novo protein structure prediction},
author = {Julia Handl and Joshua Knowles and Robert Vernon and David Baker and Simon C Lovell},
url = {https://www.bakerlab.org/wp-content/uploads/2018/06/Handl_et_al-2012-Proteins3A_Structure2C_Function2C_and_Bioinformatics.pdf
https://onlinelibrary.wiley.com/doi/full/10.1002/prot.23215},
doi = {10.1002/prot.23215},
issn = {1097-0134},
year = {2012},
date = {2012-02-01},
journal = {Proteins},
volume = {80},
pages = {490--504},
abstract = {In fragment-assembly techniques for protein structure prediction, models of protein structure are assembled from fragments of known protein structures. This process is typically guided by a knowledge-based energy function and uses a heuristic optimization method. The fragments play two important roles in this process: they define the set of structural parameters available, and they also assume the role of the main variation operators that are used by the optimiser. Previous analysis has typically focused on the first of these roles. In particular, the relationship between local amino acid sequence and local protein structure has been studied by a range of authors. The correlation between the two has been shown to vary with the window length considered, and the results of these analyses have informed directly the choice of fragment length in state-of-the-art prediction techniques. Here, we focus on the second role of fragments and aim to determine the effect of fragment length from an optimization perspective. We use theoretical analyses to reveal how the size and structure of the search space changes as a function of insertion length. Furthermore, empirical analyses are used to explore additional ways in which the size of the fragment insertion influences the search both in a simulation model and for the fragment-assembly technique, Rosetta.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2011
Sarel J Fleishman, Timothy A Whitehead, Eva-Maria Strauch, Jacob E Corn, Sanbo Qin, Huan-Xiang Zhou, Julie C Mitchell, Omar N A Demerdash, Mayuko Takeda-Shitaka, Genki Terashi, Iain H Moal, Xiaofan Li, Paul A Bates, Martin Zacharias, Hahnbeom Park, Jun-su Ko, Hasup Lee, Chaok Seok, Thomas Bourquard, Julie Bernauer, Anne Poupon, Jérôme Azé, Seren Soner, Sefik Kerem Ovali, Pemra Ozbek, Nir Ben Tal, Türkan Haliloglu, Howook Hwang, Thom Vreven, Brian G Pierce, Zhiping Weng, Laura Pérez-Cano, Carles Pons, Juan Fernández-Recio, Fan Jiang, Feng Yang, Xinqi Gong, Libin Cao, Xianjin Xu, Bin Liu, Panwen Wang, Chunhua Li, Cunxin Wang, Charles H Robert, Mainak Guharoy, Shiyong Liu, Yangyu Huang, Lin Li, Dachuan Guo, Ying Chen, Yi Xiao, Nir London, Zohar Itzhaki, Ora Schueler-Furman, Yuval Inbar, Vladimir Potapov, Mati Cohen, Gideon Schreiber, Yuko Tsuchiya, Eiji Kanamori, Daron M Standley, Haruki Nakamura, Kengo Kinoshita, Camden M Driggers, Robert G Hall, Jessica L Morgan, Victor L Hsu, Jian Zhan, Yuedong Yang, Yaoqi Zhou, Panagiotis L Kastritis, Alexandre M J J Bonvin, Weiyi Zhang, Carlos J Camacho, Krishna P Kilambi, Aroop Sircar, Jeffrey J Gray, Masahito Ohue, Nobuyuki Uchikoga, Yuri Matsuzaki, Takashi Ishida, Yutaka Akiyama, Raed Khashan, Stephen Bush, Denis Fouches, Alexander Tropsha, Juan Esquivel-Rodríguez, Daisuke Kihara, P Benjamin Stranges, Ron Jacak, Brian Kuhlman, Sheng-You Huang, Xiaoqin Zou, Shoshana J Wodak, Joel Janin, David Baker
Community-wide assessment of protein-interface modeling suggests improvements to design methodology Journal Article
In: Journal of Molecular Biology, vol. 414, pp. 289-302, 2011, ISSN: 1089-8638.
@article{598,
title = {Community-wide assessment of protein-interface modeling suggests improvements to design methodology},
author = {Sarel J Fleishman and Timothy A Whitehead and Eva-Maria Strauch and Jacob E Corn and Sanbo Qin and Huan-Xiang Zhou and Julie C Mitchell and Omar N A Demerdash and Mayuko Takeda-Shitaka and Genki Terashi and Iain H Moal and Xiaofan Li and Paul A Bates and Martin Zacharias and Hahnbeom Park and Jun-su Ko and Hasup Lee and Chaok Seok and Thomas Bourquard and Julie Bernauer and Anne Poupon and J{\'e}r{\^o}me Az{\'e} and Seren Soner and Sefik Kerem Ovali and Pemra Ozbek and Nir Ben Tal and T{\"u}rkan Haliloglu and Howook Hwang and Thom Vreven and Brian G Pierce and Zhiping Weng and Laura P{\'e}rez-Cano and Carles Pons and Juan Fern{\'a}ndez-Recio and Fan Jiang and Feng Yang and Xinqi Gong and Libin Cao and Xianjin Xu and Bin Liu and Panwen Wang and Chunhua Li and Cunxin Wang and Charles H Robert and Mainak Guharoy and Shiyong Liu and Yangyu Huang and Lin Li and Dachuan Guo and Ying Chen and Yi Xiao and Nir London and Zohar Itzhaki and Ora Schueler-Furman and Yuval Inbar and Vladimir Potapov and Mati Cohen and Gideon Schreiber and Yuko Tsuchiya and Eiji Kanamori and Daron M Standley and Haruki Nakamura and Kengo Kinoshita and Camden M Driggers and Robert G Hall and Jessica L Morgan and Victor L Hsu and Jian Zhan and Yuedong Yang and Yaoqi Zhou and Panagiotis L Kastritis and Alexandre M J J Bonvin and Weiyi Zhang and Carlos J Camacho and Krishna P Kilambi and Aroop Sircar and Jeffrey J Gray and Masahito Ohue and Nobuyuki Uchikoga and Yuri Matsuzaki and Takashi Ishida and Yutaka Akiyama and Raed Khashan and Stephen Bush and Denis Fouches and Alexander Tropsha and Juan Esquivel-Rodr{\'\i}guez and Daisuke Kihara and P Benjamin Stranges and Ron Jacak and Brian Kuhlman and Sheng-You Huang and Xiaoqin Zou and Shoshana J Wodak and Joel Janin and David Baker},
url = {https://www.bakerlab.org/wp-content/uploads/2018/06/1-s2.0-S0022283611010552-main.pdf
https://www.sciencedirect.com/science/article/pii/S0022283611010552?via%3Dihub},
doi = {10.1016/j.jmb.2011.09.031},
issn = {1089-8638},
year = {2011},
date = {2011-11-01},
journal = {Journal of Molecular Biology},
volume = {414},
pages = {289--302},
abstract = {The CAPRI (Critical Assessment of Predicted Interactions) and CASP (Critical Assessment of protein Structure Prediction) experiments have demonstrated the power of community-wide tests of methodology in assessing the current state of the art and spurring progress in the very challenging areas of protein docking and structure prediction. We sought to bring the power of community-wide experiments to bear on a very challenging protein design problem that provides a complementary but equally fundamental test of current understanding of protein-binding thermodynamics. We have generated a number of designed protein-protein interfaces with very favorable computed binding energies but which do not appear to be formed in experiments, suggesting that there may be important physical chemistry missing in the energy calculations. A total of 28 research groups took up the challenge of determining what is missing: we provided structures of 87 designed complexes and 120 naturally occurring complexes and asked participants to identify energetic contributions and/or structural features that distinguish between the two sets. The community found that electrostatics and solvation terms partially distinguish the designs from the natural complexes, largely due to the nonpolar character of the designed interactions. Beyond this polarity difference, the community found that the designed binding surfaces were, on average, structurally less embedded in the designed monomers, suggesting that backbone conformational rigidity at the designed surface is important for realization of the designed function. These results can be used to improve computational design strategies, but there is still much to be learned; for example, one designed complex, which does form in experiments, was classified by all metrics as a nonbinder.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
James Thompson, David Baker
Incorporation of evolutionary information into Rosetta comparative modeling. Journal Article
In: Proteins, vol. 79, pp. 2380-8, 2011, ISSN: 1097-0134.
@article{421,
title = {Incorporation of evolutionary information into Rosetta comparative modeling.},
author = {James Thompson and David Baker},
url = {https://www.bakerlab.org/wp-content/uploads/2018/06/7a8a6bd9c93cfb06e1f3c0416a914b7494ffd1d2e15654117ed9e259a487cf33.pdf
https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.23046},
doi = {10.1002/prot.23046},
issn = {1097-0134},
year = {2011},
date = {2011-08-01},
journal = {Proteins},
volume = {79},
pages = {2380--2388},
abstract = {Prediction of protein structures from sequences is a fundamental problem in computational biology. Algorithms that attempt to predict a structure from sequence primarily use two sources of information. The first source is physical in nature: proteins fold into their lowest energy state. Given an energy function that describes the interactions governing folding, a method for constructing models of protein structures, and the amino acid sequence of a protein of interest, the structure prediction problem becomes a search for the lowest energy structure. Evolution provides an orthogonal source of information: proteins of similar sequences have similar structure, and therefore proteins of known structure can guide modeling. The relatively successful Rosetta approach takes advantage of the first, but not the second source of information during model optimization. Following the classic work by Andrej Sali and colleagues, we develop a probabilistic approach to derive spatial restraints from proteins of known structure using advances in alignment technology and the growth in the number of structures in the Protein Data Bank. These restraints define a region of conformational space that is high-probability, given the template information, and we incorporate them into Rosetta's comparative modeling protocol. The combined approach performs considerably better on a benchmark based on previous CASP experiments. Incorporating evolutionary information into Rosetta is analogous to incorporating sparse experimental data: in both cases, the additional information eliminates large regions of conformational space and increases the probability that energy-based refinement will hone in on the deep energy minimum at the native state.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Elizabeth H Kellogg, Andrew Leaver-Fay, David Baker
Role of conformational sampling in computing mutation-induced changes in protein structure and stability Journal Article
In: Proteins, vol. 79, pp. 830-8, 2011, ISSN: 1097-0134.
@article{354,
title = {Role of conformational sampling in computing mutation-induced changes in protein structure and stability},
author = {Elizabeth H Kellogg and Andrew Leaver-Fay and David Baker},
doi = {10.1002/prot.22921},
issn = {1097-0134},
year = {2011},
date = {2011-03-01},
journal = {Proteins},
volume = {79},
pages = {830--838},
abstract = {The prediction of changes in protein stability and structure resulting from single amino acid substitutions is both a fundamental test of macromolecular modeling methodology and an important current problem as high throughput sequencing reveals sequence polymorphisms at an increasing rate. In principle, given the structure of a wild-type protein and a point mutation whose effects are to be predicted, an accurate method should recapitulate both the structural changes and the change in the folding-free energy. Here, we explore the performance of protocols which sample an increasing diversity of conformations. We find that surprisingly similar performances in predicting changes in stability are achieved using protocols that involve very different amounts of conformational sampling, provided that the resolution of the force field is matched to the resolution of the sampling method. Methods involving backbone sampling can in some cases closely recapitulate the structural changes accompanying mutations but not surprisingly tend to do more harm than good in cases where structural changes are negligible. Analysis of the outliers in the stability change calculations suggests areas needing particular improvement; these include the balance between desolvation and the formation of favorable buried polar interactions, and unfolded state modeling.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Frank DiMaio, Andrew Leaver-Fay, Phil Bradley, David Baker, Ingemar André
Modeling symmetric macromolecular structures in Rosetta3 Journal Article
In: PloS One, vol. 6, pp. e20450, 2011, ISSN: 1932-6203.
@article{590,
title = {Modeling symmetric macromolecular structures in Rosetta3},
author = {Frank DiMaio and Andrew Leaver-Fay and Phil Bradley and David Baker and Ingemar Andr{\'e}},
doi = {10.1371/journal.pone.0020450},
issn = {1932-6203},
year = {2011},
date = {2011},
journal = {PLoS ONE},
volume = {6},
pages = {e20450},
abstract = {Symmetric protein assemblies play important roles in many biochemical processes. However, the large size of such systems is challenging for traditional structure modeling methods. This paper describes the implementation of a general framework for modeling arbitrary symmetric systems in Rosetta3. We describe the various types of symmetries relevant to the study of protein structure that may be modeled using Rosetta's symmetric framework. We then describe how this symmetric framework is efficiently implemented within Rosetta, which restricts the conformational search space by sampling only symmetric degrees of freedom, and explicitly simulates only a subset of the interacting monomers. Finally, we describe structure prediction and design applications that utilize the Rosetta3 symmetric modeling capabilities, and provide a guide to running simulations on symmetric systems.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2010
Sarel J Fleishman, Jacob E Corn, Eva M Strauch, Tim A Whitehead, Ingemar Andre, James Thompson, James J Havranek, Rhiju Das, Philip Bradley, David Baker
Rosetta in CAPRI rounds 13-19. Journal Article
In: Proteins, vol. 78, pp. 3212-8, 2010, ISSN: 1097-0134.
@article{578,
title = {Rosetta in CAPRI rounds 13-19.},
author = {Sarel J Fleishman and Jacob E Corn and Eva M Strauch and Tim A Whitehead and Ingemar Andre and James Thompson and James J Havranek and Rhiju Das and Philip Bradley and David Baker},
doi = {10.1002/prot.22784},
issn = {1097-0134},
year = {2010},
date = {2010-11-01},
journal = {Proteins},
volume = {78},
pages = {3212--3218},
abstract = {Modeling the conformational changes that occur on binding of macromolecules is an unsolved challenge. In previous rounds of the Critical Assessment of PRediction of Interactions (CAPRI), it was demonstrated that the Rosetta approach to macromolecular modeling could capture side chain conformational changes on binding with high accuracy. In rounds 13-19 we tested the ability of various backbone remodeling strategies to capture the main-chain conformational changes observed during binding events. These approaches span a wide range of backbone motions, from limited refinement of loops to relieve clashes in homologous docking, through extensive remodeling of loop segments, to large-scale remodeling of RNA. Although the results are encouraging, major improvements in sampling and energy evaluation are clearly required for consistent high accuracy modeling. Analysis of our failures in the CAPRI challenges suggest that conformational sampling at the termini of exposed beta strands is a particularly pressing area for improvement.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael D Tyka, Daniel A Keedy, Ingemar André, Frank DiMaio, Yifan Song, David C Richardson, Jane S Richardson, David Baker
Alternate States of Proteins Revealed by Detailed Energy Landscape Mapping Journal Article
In: Journal of molecular biology, 2010, ISSN: 1089-8638.
@article{260,
title = {Alternate States of Proteins Revealed by Detailed Energy Landscape Mapping},
author = {Michael D Tyka and Daniel A Keedy and Ingemar Andr{\'e} and Frank DiMaio and Yifan Song and David C Richardson and Jane S Richardson and David Baker},
issn = {1089-8638},
year = {2010},
date = {2010-11-01},
journal = {Journal of Molecular Biology},
abstract = {What conformations do protein molecules populate in solution? Crystallography provides a high-resolution description of protein structure in the crystal environment, while NMR describes structure in solution but using less data. NMR structures display more variability, but is this because crystal contacts are absent or because of fewer data constraints? Here we report unexpected insight into this issue obtained through analysis of detailed protein energy landscapes generated by large-scale, native-enhanced sampling of conformational space with Rosetta@home for 111 protein domains. In the absence of tightly associating binding partners or ligands, the lowest-energy Rosetta models were nearly all $<$2.5~{\AA} C(α)RMSD from the experimental structure; this result demonstrates that structure prediction accuracy for globular proteins is limited mainly by the ability to sample close to the native structure. While the lowest-energy models are similar to deposited structures, they are not identical; the largest deviations are most often in regions involved in ligand, quaternary, or crystal contacts. For ligand binding proteins, the low energy models may resemble the apo structures, and for oligomeric proteins, the monomeric assembly intermediates. The deviations between the low energy models and crystal structures largely disappear when landscapes are computed in the context of the crystal lattice or multimer. The computed low-energy ensembles, with tight crystal-structure-like packing in the core, but more NMR-structure-like variability in loops, may in some cases resemble the native state ensembles of proteins better than individual crystal or NMR structures, and can suggest experimentally testable hypotheses relating alternative states and structural heterogeneity to function.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Seth Cooper, Firas Khatib, Adrien Treuille, Janos Barbero, Jeehyung Lee, Michael Beenen, Andrew Leaver-Fay, David Baker, Zoran Popović, Foldit Players
Predicting protein structures with a multiplayer online game Journal Article
In: Nature, vol. 466, pp. 756-60, 2010, ISSN: 1476-4687.
@article{16,
title = {Predicting protein structures with a multiplayer online game},
author = {Seth Cooper and Firas Khatib and Adrien Treuille and Janos Barbero and Jeehyung Lee and Michael Beenen and Andrew Leaver-Fay and David Baker and Zoran Popovi{\'c} and {Foldit Players}},
issn = {1476-4687},
year = {2010},
date = {2010-08-01},
journal = {Nature},
volume = {466},
pages = {756--760},
abstract = {People exert large amounts of problem-solving effort playing computer games. Simple image- and text-recognition tasks have been successfully `crowd-sourced' through games, but it is not clear if more complex scientific problems can be solved with human-directed computing. Protein structure prediction is one such problem: locating the biologically relevant native conformation of a protein is a formidable computational challenge given the very large size of the search space. Here we describe Foldit, a multiplayer online game that engages non-scientists in solving hard prediction problems. Foldit players interact with protein structures using direct manipulation tools and user-friendly versions of algorithms from the Rosetta structure prediction methodology, while they compete and collaborate to optimize the computed energy. We show that top-ranked Foldit players excel at solving challenging structure refinement problems in which substantial backbone rearrangements are necessary to achieve the burial of hydrophobic residues. Players working collaboratively develop a rich assortment of new strategies and algorithms; unlike computational approaches, they explore not only the conformational space but also the space of possible search strategies. The integration of human visual problem-solving and strategy development capabilities with traditional computational algorithms through interactive multiplayer games is a powerful new approach to solving computationally-limited scientific problems.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Ben Blum, Michael I Jordan, David Baker
Feature space resampling for protein conformational search Journal Article
In: Proteins, vol. 78, pp. 1583-93, 2010, ISSN: 1097-0134.
@article{271,
title = {Feature space resampling for protein conformational search},
author = {Ben Blum and Michael I Jordan and David Baker},
issn = {1097-0134},
year = {2010},
date = {2010-05-01},
journal = {Proteins},
volume = {78},
pages = {1583--1593},
abstract = {De novo protein structure prediction requires location of the lowest energy state of the polypeptide chain among a vast set of possible conformations. Powerful approaches include conformational space annealing, in which search progressively focuses on the most promising regions of conformational space, and genetic algorithms, in which features of the best conformations thus far identified are recombined. We describe a new approach that combines the strengths of these two approaches. Protein conformations are projected onto a discrete feature space which includes backbone torsion angles, secondary structure, and beta pairings. For each of these there is one "native" value: the one found in the native structure. We begin with a large number of conformations generated in independent Monte Carlo structure prediction trajectories from Rosetta. Native values for each feature are predicted from the frequencies of feature value occurrences and the energy distribution in conformations containing them. A second round of structure prediction trajectories are then guided by the predicted native feature distributions. We show that native features can be predicted at much higher than background rates, and that using the predicted feature distributions improves structure prediction in a benchmark of 28 proteins. The advantages of our approach are that features from many different input structures can be combined simultaneously without producing atomic clashes or otherwise physically inviable models, and that the features being recombined have a relatively high chance of being correct.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Chu Wang, Robert Vernon, Oliver Lange, Michael Tyka, David Baker
Prediction of structures of zinc-binding proteins through explicit modeling of metal coordination geometry Journal Article
In: Protein science, vol. 19, pp. 494-506, 2010, ISSN: 1469-896X.
@article{257,
title = {Prediction of structures of zinc-binding proteins through explicit modeling of metal coordination geometry},
author = {Chu Wang and Robert Vernon and Oliver Lange and Michael Tyka and David Baker},
issn = {1469-896X},
year = {2010},
date = {2010-03-01},
journal = {Protein Science},
volume = {19},
pages = {494--506},
abstract = {Metal ions play an essential role in stabilizing protein structures and contributing to protein function. Ions such as zinc have well-defined coordination geometries, but it has not been easy to take advantage of this knowledge in protein structure prediction efforts. Here, we present a computational method to predict structures of zinc-binding proteins given knowledge of the positions of zinc-coordinating residues in the amino acid sequence. The method takes advantage of the "atom-tree" representation of molecular systems and modular architecture of the Rosetta3 software suite to incorporate explicit metal ion coordination geometry into previously developed de novo prediction and loop modeling protocols. Zinc cofactors are tethered to their interacting residues based on coordination geometries observed in natural zinc-binding proteins. The incorporation of explicit zinc atoms and their coordination geometry in both de novo structure prediction and loop modeling significantly improves sampling near the native conformation. The method can be readily extended to predict protein structures bound to other metal and/or small chemical cofactors with well-defined coordination or ligation geometry.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Yang Shen, Philip N Bryan, Yanan He, John Orban, David Baker, Ad Bax
De novo structure generation using chemical shifts for proteins with high-sequence identity but different folds Journal Article
In: Protein Science : A Publication of the Protein Society, vol. 19, pp. 349-56, 2010, ISSN: 1469-896X.
@article{584,
title = {De novo structure generation using chemical shifts for proteins with high-sequence identity but different folds},
author = {Yang Shen and Philip N Bryan and Yanan He and John Orban and David Baker and Ad Bax},
doi = {10.1002/pro.303},
issn = {1469-896X},
year = {2010},
date = {2010-02-01},
journal = {Protein Science},
volume = {19},
pages = {349--356},
abstract = {Proteins with high-sequence identity but very different folds present a special challenge to sequence-based protein structure prediction methods. In particular, a 56-residue three-helical bundle protein (GA(95)) and an alpha/beta-fold protein (GB(95)), which share 95\% sequence identity, were targets in the CASP-8 structure prediction contest. With only 12 out of 300 submitted server-CASP8 models for GA(95) exhibiting the correct fold, this protein proved particularly challenging despite its small size. Here, we demonstrate that the information contained in NMR chemical shifts can readily be exploited by the CS-Rosetta structure prediction program and yields adequate convergence, even when input chemical shifts are limited to just amide (1)H(N) and (15)N or (1)H(N) and (1)H(alpha) values.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}