Model-o-Matic.py: A new version

We sometimes learn a small detail that changes our way of thinking about a particular problem. That happened to me the past week when I learned what the line of code below meant:

if __name__ == '__main__'

Essentially it means: “If the script is being run on itself, do this, if not (being imported for example into another script), don’t”.

I applied this line to my previous model-o-matic.py script and a new better version came out. It is essentially the same idea and processing. It changed a little bit on flexibility and usability:

  • Can now accept a fasta format file with how many sequences you want or a single sequence in the command line.
  • Does not produce all the intermediate files (NEEDLE output, BLAST output, PDBs, etc) unless stated so by -k flag
  • More object-oriented programming makes it easier to 1) understand 2) reuse 3) import

And, with no further ado, here’s the script:

DOWNLOAD HERE (Use the new server version here)

#!/usr/bin/python

# Generates a details file with info necessary for modelling protocol
# Aimed at saving time with blasting, needleing, etc.
# Joao Rodrigues @ Utrecht
# Last Update: 28/02/2009
# Email: anaryin@gmail.com

############# Imports

import cStringIO, sys, urllib2, re, time, os # Included in Python
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.PDB.PDBParser import PDBParser
from Bio import SeqIO
from SOAPpy import WSDL

############# Model-o-Matic Class

class ModeloMatic():
“””Gathers information to model by homology a given protein aminoacid sequence”””

def __init__(self):
print “## MODEL-O-MATIC ##\n”

def blastSequence(self, sequence, nresults, keep):
“””Retrieves Homologous Sequences with PDB structures solved”””

print “\n## Performing BLAST on %s(…)” %sequence[0:15]
resultsWithPDB = [] # List of Dictionaries

blastOutput = cStringIO.StringIO(
NCBIWWW.qblast(
‘blastp’, # Program
‘pdb’, # Database
sequence,
hitlist_size=nresults # How many results to parse?
).read())

if keep:
print “\t## Saving BLAST results to BLAST_OUTPUT.txt”
file = open(‘BLAST_OUTPUT.txt’, ‘w’)
file.write(blastOutput.getvalue())
file.close()

print “\n## Parsing BLAST results”
blastResults = NCBIXML.parse(blastOutput)

for hit in blastResults:
for alignment in hit.alignments:
chain = alignment.title.split(‘|’)[4][0]
pdbName = alignment.title.split(‘|’)[3]
# Blast Numbers
for hsp in alignment.hsps:
e_value = float(hsp.expect)
score = “%d” %int(hsp.bits)
PDBSequence = hsp.sbjct.replace(‘-‘, ”)

resultsWithPDB.append({‘PDBID’:pdbName, ‘Chain’:chain, ‘E-Value’:e_value, ‘Blast_Score’:score, ‘PDBSequence’:PDBSequence})

return resultsWithPDB

def validateSequence(self, sequence):
“””Checks if a given protein sequence is valid for blasting or not”””

# One-Letter-Code Options from http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml
#
# A alanine P proline
# B aspartate/asparagine Q glutamine
# C cystine R arginine
# D aspartate S serine
# E glutamate T threonine
# F phenylalanine U selenocysteine
# G glycine V valine
# H histidine W tryptophan
# I isoleucine Y tyrosine
# K lysine Z glutamate/glutamine
# L leucine X any
# M methionine
# N asparagine – gap of indeterminate length

allowed = [‘A’,’B’,’C’,’D’,’E’,’F’,’G’,’H’,’I’,’K’,’L’,’M’,’N’,’P’,’Q’,’R’,’S’,’T’,’U’,’V’,’W’,’Y’,’Z’,’X”-‘]

for letter in [i.upper() for i in set(sequence)]:
if letter not in allowed:
return (0, sequence.index(letter))

return (1,)

def getPDBs(self, blastResults, nresults, keep):
“””Queries Brookhaven PDB for info on the results of the BLAST”””

print “\n## Processing BLAST results”

results = []

rcsbLink = “http://www.rcsb.org/pdb/files/”

P = PDBParser()

for structure in blastResults[:nresults]: # Processes X values of the BLAST results

PDBID = structure[‘PDBID’]
print “\t## Getting PDB information for %s” %PDBID

pdb = urllib2.urlopen(rcsbLink+PDBID.lower()+’.pdb’).read()

if keep:
print “\t## Saving %s.pdb” %PDBID
file = open(‘%s.pdb’%PDBID, ‘w’)
file.write(pdb)
file.close()

P.get_structure(PDBID, cStringIO.StringIO(pdb))
info = P.get_header()

try:
resolution = “%.2f” %float(info[‘resolution’])
except TypeError:
resolution = “NA”

structure.update({‘method’:info[“structure_method”], ‘resolution’:resolution, ‘deposition’:info[‘deposition_date’], ‘organism’:info[‘source’][‘1’][‘organism_scientific’]})

results.append(structure)

return results

def pairwiseAlign(self, modelseq, templateseq, keep):
“””Pairwise Alignment using Needleman-Wunsch Algorithm available at EBI server”””

wsdlLink = “http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl”

sequence1 = cStringIO.StringIO(‘>SEQUENCE\n’+str(modelseq)+’\n’).getvalue()
sequence2 = cStringIO.StringIO(‘>TEMPLATE\n’+str(templateseq)+’\n’).getvalue()

server = WSDL.Proxy(wsdlLink)

align_params = {
‘tool’:’needle’,
’email’:’anaryin@gmail.com’,
‘aformat’: ‘markx10’,
# ‘gapopen’: 10.0
# ‘gapextend’: 0.5
}

align_data = [{‘type’:’asequence’, ‘content’:sequence1}, {‘type’:’bsequence’, ‘content’:sequence2}]
print “## Running NEEDLE”
jobId = server.run(params=align_params,content=align_data) # Submit to Webservice

status = ‘PENDING’
while status==’PENDING’ or status == ‘RUNNING’:
status = server.checkStatus(jobId)
print “\tNeedle Status:”,status
if status == ‘RUNNING’ or status == ‘PENDING’:
time.sleep(15)

needleOutput = server.poll(jobId, ‘tooloutput’) # Get Results

if keep:
print “\t## Saving NEEDLE output”
file = open(‘NEEDLE.output’, ‘a’)
file.write(needleOutput)
file.close()

return needleOutput

def needleParser(self, rawOutput, keep):
“””Parses the output of needle (format markx10)”””

infoRegex = re.compile(‘.Length:.(\d+)\s# Identity:\s+([\d/\(\)\.% ]+)\s# Similarity:\s+([\d/\(\)\.% ]+)\s# Gaps:\s+([\d/\(\)\.% ]+)\s# Score:\s+([\d\.]+)’)

needleStats = re.findall(infoRegex, rawOutput)[0]
alignment = ‘\n’.join([line for line in rawOutput.split(‘\n’) if len(line)>1 and line[0] != ‘#’ and line[0] != ‘;’])

return {“Length”:needleStats[0], “Identity”:needleStats[1], “Similarity”:needleStats[2], “Gaps”:needleStats[3], “Needle_Score”:needleStats[4], “Alignment”:alignment}

def detailsFile(self, sequence, dictionary, output):
“””From a dictionary with all the information writes the Details file”””

k = 0
while os.path.exists(output+’.details’): # Safety measure
k += 1
output = output+str(k)

print “\n## Creating output file: %s.details” %output

file = open(output+’.details’, ‘w’)
file.write(‘### Model-o-Matic: Automated Template Analysis Script\n###\n’)
file.write(‘### Submitted Sequence %s\n###\n’ %sequence)
file.write(‘### Possible Templates\n### Ordered by Blast Rank\n\n’)
k = 0
for record in dictionary:
file.write(‘-‘*60)
k +=1
file.write(‘\n# %s\n’ %k)
file.write(‘-‘*60)

file.write(‘\n\nTemplate Name: %(PDBID)s (Chain %(Chain)s)\n\n# BLAST RESULTS\n\nScore: %(Blast_Score)s\nE-Value: %(E-Value)s\n\n# PDB INFORMATION\n\nAcquisition Method: %(method)s\nResolution: %(resolution)s\nPDB Deposition Date: %(deposition)s\nSource Organism: %(organism)s\n\n# NEEDLE RESULTS\n\nScore: %(Needle_Score)s\nLength: %(Length)s\nIdentity: %(Identity)s\nSimilarity: %(Similarity)s\nGaps: %(Gaps)s\n\n%(Alignment)s\n\n’ %record)

file.close()
print “\n## Details file created”

############## Arguments For Running Script ####################

def process_args():
“””To process arguments in the command line when calling the script”””

from optparse import OptionParser

parser = OptionParser()
parser.add_option(“-s”, “–sequence”, action=”store”, dest=”sequence”, help=”Aminoacid Sequence”)
parser.add_option(“-B”, “–blastresults”, action=”store”, dest=”blast_results”, default=50, help=”(BLAST) Number of results to keep”)
parser.add_option(“-P”, “–pdbresults”, action=”store”, dest=”pdb_results”, default=10, help=”(PDB) Number of results to process”)
parser.add_option(“-k”, “–keepdata”, action=”store_true”, dest=”keep”, default=False, help=”Saves PDB files and NEEDLE output to local folder”)
parser.add_option(“-o”, “–output”, action=”store”, dest=”output”, help=”Name of details file (Not required if fasta file given)”)
parser.add_option(“-f”, “–inputfile”, action=”store”, dest=”input”, help=”Name of input file”)

return parser.parse_args()

############## MAIN ##################

USAGE = “python Model-o-Matic.py -s [-o ]/-f [-M -P -k

if __name__ == ‘__main__’:

(opts, args) = process_args()

if len(sys.argv) < 2: print USAGE else: M = ModeloMatic() if opts.input != None and os.path.exists(opts.input): # If the user gives a file instead of sequence sequences = [] output = [] print "## Extracting Sequence(s) from file..." handle = open(opts.input, 'rU') for record in SeqIO.parse(handle, "fasta"): # Parses the fasta format output.append(record.id) sequences.append(record.seq.tostring()) handle.close() else: sequences = [opts.sequence] output = [opts.output] for i in range(len(sequences)): if M.validateSequence(sequences[i])[0]: blast = M.blastSequence(sequences[i], int(opts.blast_results), bool(opts.keep)) mydict = M.getPDBs(blast, int(opts.pdb_results), bool(opts.keep)) for adict in mydict: needle = M.pairwiseAlign(sequences[i], adict['PDBSequence'], bool(opts.keep)) adict.update(M.needleParser(needle, bool(opts.keep))) M.detailsFile(sequences[i], mydict, output[i]) else: print "Bad Sequence: %s" %sequence[/sourcecode]

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s