Model-o-Matic: Modelling turned easy(er)

This small set of functions can be used in many situations, but they were made to fulfill one single objective: save me time when modelling new proteins. To model a protein, I would have to blast its sequence, retrieve the significant results with a matching PDB file, download the PDB file and its fasta sequence, run a needle alignment on it against the original sequence, and finally write the scripts for Modeller. With this script, I just have to input one sequence and then have a look at the details file it generates to assess which model is the best. And by then, I’ll already have its PDB, its sequence aligned, and some additional information. In the near future, I’ll make it generate the python script for the templates as well, and probably make it a little bit more organized (not to leave your desktop full of pdbs and fastas). It’s not perfect, it may not be useful for everyone, but that’s the joy of scripting ;) Use and abuse if you need.

Requires BioPython and SOAPpy. It was scripted in Linux but in principle it should work everywhere. Bugs and suggestions welcome ;)

CODE IN PASTEBIN

#!/usr/bin/python

# Generates a details file with info necessary for modelling protocol
# Aimed at saving time with blasting, needleing, etc.
# Joao Rodrigues @ Utrecht 2008
# Last Update: 07.12.2008
# Email: anaryin@gmail.com

# IMPORTANT: Rememeber to edit your email in the needleIt function! I've been blacklisted once by NCBI already :x

## Imports

from SOAPpy import WSDL

from time import sleep

import re, os, urllib2,sys

from Bio import AlignIO, SeqIO, ExPASy

import cStringIO
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.PDB.PDBParser import PDBParser

## Code

def blastSequence(path, output2file=False):
"""Submits a protein sequence to the BLAST service at NCBI servers. Returns a list of ranked dictionaries containing results with matching PDB structures
Input: Sequence file path in FASTA format ; True/False flag determining creation of a file with the results
Output: List of dictionaries containing information on the structures and their BLAST output values"""

# Find & Parse Sequence File
if os.path.exists(path):
handle = open(path, 'rU')
for record in SeqIO.parse(handle, "fasta"): # Parses the fasta format
fastaName = record.id
proteinSeq = record.seq.tostring()
handle.close()
else:
print "ERROR: The path you inserted is not valid."
sys.exit(0)

print "\n## Blasting your sequence(s)"
print "\t## Sequence %s: %s(...)" %(fastaName, proteinSeq[0:20])

# Run Blast
outputBlast = cStringIO.StringIO(NCBIWWW.qblast('blastp', 'nr', proteinSeq).read()) # StringIO needed for parser function

# Parse Results
blastResults = NCBIXML.parse(outputBlast) # Parse Results
print "\n## Retrieving information from BLAST results with structure solved"

PDBs = [] # Holds the results with a structure ; List to preserve ranking

for record in blastResults:
for alignment in record.alignments:
if alignment.title.split('|')[2] == 'pdb':
structChain = alignment.title.split('|')[4][0]
structID = alignment.title.split('|')[3]
for hsp in alignment.hsps:
e_value = float(hsp.expect)
score = "%d" %int(hsp.bits)

PDBs.append({'Id':structID, 'Chain':structChain, 'Score':score, 'E-Value':e_value})

# Write Results to File if requested
if output2file:
file = open('%s_blast.results' %fastaName, 'w')

printableSeq = '' # Just turning the long seq into 60 AA per line
for i in range(len(proteinSeq)):
if i%60 == 0 and i != 0:
printableSeq += '\n'
else:
printableSeq += proteinSeq[i]

file.write("""#### AUTOMATICALLY GENERATED BLAST RESULTS FILE ####\n\n>> Sequence Name: %s\n>> Input Sequence:\n%s\n""" %(fastaName,printableSeq))
for structure in PDBs:
file.write("\n\n# Result %s\n" %(PDBs.index(structure)+1))
file.write(">> PDB Identifier: %(Id)s (Chain %(Chain)s)\n>> Score: %(Score)s\n>> E-Value: %(E-Value)s" %structure)
file.close()

# Return results as normal
return PDBs

def queryPDB(blastResults, retmax=5):
"""Retrieves the PDB files associated with the X best structures
Input: List of dictionaries with info
Output: Dictionary with information on several structures"""

print "\n## Analyzing PDB Files for the %s best BLAST results" %retmax

rcsbLink = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=pdb&compression=NO&structureId="
fastaLink = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=FASTA&compression=NO&structureId="

pdbInformation = {}

for record in blastResults:

if retmax == 0:
break
else:
retmax -= 1

if not os.path.exists("%s.pdb" %record["Id"]): # Download the PDB file
print "\t## Downloading PDB File %s" %record["Id"]
pdbData = urllib2.urlopen(rcsbLink+record["Id"]).read()
pdbFile = open("%s.pdb" %(record["Id"]), 'w') # Save it
pdbFile.write(pdbData)
pdbFile.close()

# Parse the information on the PDB file

print "\t## Retrieving information from PDB File: %s" %record["Id"]

structParser = PDBParser()

structure = structParser.get_structure(record, "%s.pdb" %record["Id"])
header = structParser.get_header()

structMethod = header["structure_method"]
try:
structResolution = "%.2f" %float(header["resolution"])
except TypeError:
structResolution = "NA"
depositionDate = header["deposition_date"]
organism = header["source"]["1"]["organism_scientific"]

if not os.path.exists("template_%s.fasta" %record["Id"]): # Download the FASTA file of the PDB structure
print "\t## Downloading correspondent FASTA files"
fastaData = cStringIO.StringIO(urllib2.urlopen(fastaLink+record["Id"]).read())
print "\t## Retrieving information from FASTA file"
for fasta in SeqIO.parse(fastaData, "fasta"):
if record['Chain'] == fasta.id.split(':')[1][0]: # Retrieve information of the same chain as the BLAST result
pdbSeq = fasta.seq

tempFastaFile = open("template_%s.fasta" %record["Id"], 'w')
tempFastaFile.write('>%s\n%s' %(record["Id"], pdbSeq)) # Save it
tempFastaFile.close()

pdbInformation[record["Id"]] = {'StructSeq':"template_%s.fasta"%record["Id"], 'StructMethod':structMethod, 'StructResolution':structResolution, 'DepositDate':depositionDate, 'Source Organism':organism}

return pdbInformation

def needleIt(path1, path2, output2file=False):
"""Queries EBI EMBOSS webservice with 2 sequences for the NEEDLE pair-wise alignment tool"""

seq1 = open(path1).read() # Where e1.seq and e2.seq are FASTA formatted files on

seq2 = open(path2).read() # the same directory as this python script.

wsdlLink = "http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl"

server = WSDL.Proxy(wsdlLink)

blast_params = {

'tool':'needle',

'email':'email@user.com',    # User e-mail address < ------------------------------------------------ EDIT THIS YOURSELF!!
# uncomment the 3 # below to alter parameters of alignment
'aformat': 'markx10' #,
#      'gapopen': 10.0
#      'gapextend': 0.5

}

blast_data = &#91;{'type':'asequence', 'content':seq1}, {'type':'bsequence', 'content':seq2}&#93;

print "## Running NEEDLE: %s vs %s" %(path1.split('.')&#91;0&#93;,path2.split('.')&#91;0&#93;)

jobId = server.run(params=blast_params,content=blast_data) # Submit to Webservice

status = 'PENDING'

while status=='PENDING' or status == 'RUNNING':

status = server.checkStatus(jobId)

print "\tNeedle Status:",status

if status == 'RUNNING' or status == 'PENDING':

sleep(15)

needleOutput = server.poll(jobId, 'tooloutput') # Get Results

# Write Results to File if requested
if output2file:
file = open('%s_%s.needle' %(path1.split('.')&#91;0&#93;,path2.split('.')&#91;0&#93;), 'w')
file.write(needleOutput)
file.close()

# Parse Results to extract relevant information

infoRegex = re.compile('.Length:.(\d+)\s# Identity:\s+(&#91;\d/\(\)\.% &#93;+)\s# Similarity:\s+(&#91;\d/\(\)\.% &#93;+)\s# Gaps:\s+(&#91;\d/\(\)\.% &#93;+)\s# Score:\s+(&#91;\d\.&#93;+)')

needleNumbers = re.findall(infoRegex, needleOutput)&#91;0&#93;

alignment = '\n'.join(&#91;line for line in needleOutput.split('\n') if len(line)>1 and line[0] != '#' and line[0] != ';'])

return {"Length":needleNumbers[0], "Identity":needleNumbers[1], "Similarity":needleNumbers[2], "Gaps":needleNumbers[3], "Score":needleNumbers[4], "Alignment":alignment}

def writeDetails(oriSequence, blastResults, PDBs):
"""Creates Details file with all relevant information for modelling"""

file = open("%s.details" %oriSequence.split('.')[0], 'w')

file.write("### Details File for Modelling of %s\n\n" %oriSequence.split('.')[0])

modelSequence = ''.join(open(oriSequence).readlines()[1:])

file.write("## Model Sequence:\n%s\n\n" %modelSequence)

file.write("## Possible Templates Information\n## Ordered by Blast Rank")

for pdb, pdbInfo in PDBs.iteritems():
templateName = pdb
templateSequence = pdbInfo["StructSeq"]
blastInfo = [dictionary for dictionary in blastResults if dictionary["Id"] == pdb][0]
alignment = needleIt(oriSequence, templateSequence)

file.write("\n\nTemplate Name: %(Id)s (Chain %(Chain)s)\n\n# BLAST RESULTS\n\nScore: %(Score)s\nE-Value: %(E-Value)s\n" %blastInfo)
file.write("\n# NEEDLE RESULTS\n\nScore: %(Score)s\nLength: %(Length)s\nIdentity: %(Identity)s\nSimilarity: %(Similarity)s\nGaps: %(Gaps)s\n\n%(Alignment)s\n\n" %alignment)
file.write("-"*60)
file.write("-"*60)

file.close()

## Usage ##
blastResults = blastSequence('e1.seq')
PDBs = queryPDB(blastResults)
writeDetails('e1.seq', blastResults, PDBs)

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s