Model-o-Matic.py: A new version

We sometimes learn a small detail that changes our way of thinking about a particular problem. That happened to me the past week when I learned what the line of code below meant:

if __name__ == '__main__'

Essentially it means: “If the script is being run on itself, do this, if not (being imported for example into another script), don’t”.

I applied this line to my previous model-o-matic.py script and a new better version came out. It is essentially the same idea and processing. It changed a little bit on flexibility and usability:

  • Can now accept a fasta format file with how many sequences you want or a single sequence in the command line.
  • Does not produce all the intermediate files (NEEDLE output, BLAST output, PDBs, etc) unless stated so by -k flag
  • More object-oriented programming makes it easier to 1) understand 2) reuse 3) import

And, with no further ado, here’s the script:

DOWNLOAD HERE (Use the new server version here)

#!/usr/bin/python

# Generates a details file with info necessary for modelling protocol
# Aimed at saving time with blasting, needleing, etc.
# Joao Rodrigues @ Utrecht
# Last Update: 28/02/2009
# Email: anaryin@gmail.com

############# Imports

import cStringIO, sys, urllib2, re, time, os # Included in Python
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.PDB.PDBParser import PDBParser
from Bio import SeqIO
from SOAPpy import WSDL

############# Model-o-Matic Class

class ModeloMatic():
“””Gathers information to model by homology a given protein aminoacid sequence”””

def __init__(self):
print “## MODEL-O-MATIC ##\n”

def blastSequence(self, sequence, nresults, keep):
“””Retrieves Homologous Sequences with PDB structures solved”””

print “\n## Performing BLAST on %s(…)” %sequence[0:15]
resultsWithPDB = [] # List of Dictionaries

blastOutput = cStringIO.StringIO(
NCBIWWW.qblast(
‘blastp’, # Program
‘pdb’, # Database
sequence,
hitlist_size=nresults # How many results to parse?
).read())

if keep:
print “\t## Saving BLAST results to BLAST_OUTPUT.txt”
file = open(‘BLAST_OUTPUT.txt’, ‘w’)
file.write(blastOutput.getvalue())
file.close()

print “\n## Parsing BLAST results”
blastResults = NCBIXML.parse(blastOutput)

for hit in blastResults:
for alignment in hit.alignments:
chain = alignment.title.split(‘|’)[4][0]
pdbName = alignment.title.split(‘|’)[3]
# Blast Numbers
for hsp in alignment.hsps:
e_value = float(hsp.expect)
score = “%d” %int(hsp.bits)
PDBSequence = hsp.sbjct.replace(‘-‘, ”)

resultsWithPDB.append({‘PDBID’:pdbName, ‘Chain’:chain, ‘E-Value’:e_value, ‘Blast_Score’:score, ‘PDBSequence’:PDBSequence})

return resultsWithPDB

def validateSequence(self, sequence):
“””Checks if a given protein sequence is valid for blasting or not”””

# One-Letter-Code Options from http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml
#
# A alanine P proline
# B aspartate/asparagine Q glutamine
# C cystine R arginine
# D aspartate S serine
# E glutamate T threonine
# F phenylalanine U selenocysteine
# G glycine V valine
# H histidine W tryptophan
# I isoleucine Y tyrosine
# K lysine Z glutamate/glutamine
# L leucine X any
# M methionine
# N asparagine – gap of indeterminate length

allowed = [‘A’,’B’,’C’,’D’,’E’,’F’,’G’,’H’,’I’,’K’,’L’,’M’,’N’,’P’,’Q’,’R’,’S’,’T’,’U’,’V’,’W’,’Y’,’Z’,’X”-‘]

for letter in [i.upper() for i in set(sequence)]:
if letter not in allowed:
return (0, sequence.index(letter))

return (1,)

def getPDBs(self, blastResults, nresults, keep):
“””Queries Brookhaven PDB for info on the results of the BLAST”””

print “\n## Processing BLAST results”

results = []

rcsbLink = “http://www.rcsb.org/pdb/files/”

P = PDBParser()

for structure in blastResults[:nresults]: # Processes X values of the BLAST results

PDBID = structure[‘PDBID’]
print “\t## Getting PDB information for %s” %PDBID

pdb = urllib2.urlopen(rcsbLink+PDBID.lower()+’.pdb’).read()

if keep:
print “\t## Saving %s.pdb” %PDBID
file = open(‘%s.pdb’%PDBID, ‘w’)
file.write(pdb)
file.close()

P.get_structure(PDBID, cStringIO.StringIO(pdb))
info = P.get_header()

try:
resolution = “%.2f” %float(info[‘resolution’])
except TypeError:
resolution = “NA”

structure.update({‘method’:info[“structure_method”], ‘resolution’:resolution, ‘deposition’:info[‘deposition_date’], ‘organism’:info[‘source’][‘1’][‘organism_scientific’]})

results.append(structure)

return results

def pairwiseAlign(self, modelseq, templateseq, keep):
“””Pairwise Alignment using Needleman-Wunsch Algorithm available at EBI server”””

wsdlLink = “http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl”

sequence1 = cStringIO.StringIO(‘>SEQUENCE\n’+str(modelseq)+’\n’).getvalue()
sequence2 = cStringIO.StringIO(‘>TEMPLATE\n’+str(templateseq)+’\n’).getvalue()

server = WSDL.Proxy(wsdlLink)

align_params = {
‘tool’:’needle’,
’email’:’anaryin@gmail.com’,
‘aformat’: ‘markx10’,
# ‘gapopen’: 10.0
# ‘gapextend’: 0.5
}

align_data = [{‘type’:’asequence’, ‘content’:sequence1}, {‘type’:’bsequence’, ‘content’:sequence2}]
print “## Running NEEDLE”
jobId = server.run(params=align_params,content=align_data) # Submit to Webservice

status = ‘PENDING’
while status==’PENDING’ or status == ‘RUNNING’:
status = server.checkStatus(jobId)
print “\tNeedle Status:”,status
if status == ‘RUNNING’ or status == ‘PENDING’:
time.sleep(15)

needleOutput = server.poll(jobId, ‘tooloutput’) # Get Results

if keep:
print “\t## Saving NEEDLE output”
file = open(‘NEEDLE.output’, ‘a’)
file.write(needleOutput)
file.close()

return needleOutput

def needleParser(self, rawOutput, keep):
“””Parses the output of needle (format markx10)”””

infoRegex = re.compile(‘.Length:.(\d+)\s# Identity:\s+([\d/\(\)\.% ]+)\s# Similarity:\s+([\d/\(\)\.% ]+)\s# Gaps:\s+([\d/\(\)\.% ]+)\s# Score:\s+([\d\.]+)’)

needleStats = re.findall(infoRegex, rawOutput)[0]
alignment = ‘\n’.join([line for line in rawOutput.split(‘\n’) if len(line)>1 and line[0] != ‘#’ and line[0] != ‘;’])

return {“Length”:needleStats[0], “Identity”:needleStats[1], “Similarity”:needleStats[2], “Gaps”:needleStats[3], “Needle_Score”:needleStats[4], “Alignment”:alignment}

def detailsFile(self, sequence, dictionary, output):
“””From a dictionary with all the information writes the Details file”””

k = 0
while os.path.exists(output+’.details’): # Safety measure
k += 1
output = output+str(k)

print “\n## Creating output file: %s.details” %output

file = open(output+’.details’, ‘w’)
file.write(‘### Model-o-Matic: Automated Template Analysis Script\n###\n’)
file.write(‘### Submitted Sequence %s\n###\n’ %sequence)
file.write(‘### Possible Templates\n### Ordered by Blast Rank\n\n’)
k = 0
for record in dictionary:
file.write(‘-‘*60)
k +=1
file.write(‘\n# %s\n’ %k)
file.write(‘-‘*60)

file.write(‘\n\nTemplate Name: %(PDBID)s (Chain %(Chain)s)\n\n# BLAST RESULTS\n\nScore: %(Blast_Score)s\nE-Value: %(E-Value)s\n\n# PDB INFORMATION\n\nAcquisition Method: %(method)s\nResolution: %(resolution)s\nPDB Deposition Date: %(deposition)s\nSource Organism: %(organism)s\n\n# NEEDLE RESULTS\n\nScore: %(Needle_Score)s\nLength: %(Length)s\nIdentity: %(Identity)s\nSimilarity: %(Similarity)s\nGaps: %(Gaps)s\n\n%(Alignment)s\n\n’ %record)

file.close()
print “\n## Details file created”

############## Arguments For Running Script ####################

def process_args():
“””To process arguments in the command line when calling the script”””

from optparse import OptionParser

parser = OptionParser()
parser.add_option(“-s”, “–sequence”, action=”store”, dest=”sequence”, help=”Aminoacid Sequence”)
parser.add_option(“-B”, “–blastresults”, action=”store”, dest=”blast_results”, default=50, help=”(BLAST) Number of results to keep”)
parser.add_option(“-P”, “–pdbresults”, action=”store”, dest=”pdb_results”, default=10, help=”(PDB) Number of results to process”)
parser.add_option(“-k”, “–keepdata”, action=”store_true”, dest=”keep”, default=False, help=”Saves PDB files and NEEDLE output to local folder”)
parser.add_option(“-o”, “–output”, action=”store”, dest=”output”, help=”Name of details file (Not required if fasta file given)”)
parser.add_option(“-f”, “–inputfile”, action=”store”, dest=”input”, help=”Name of input file”)

return parser.parse_args()

############## MAIN ##################

USAGE = “python Model-o-Matic.py -s [-o ]/-f [-M -P -k

if __name__ == ‘__main__’:

(opts, args) = process_args()

if len(sys.argv) < 2: print USAGE else: M = ModeloMatic() if opts.input != None and os.path.exists(opts.input): # If the user gives a file instead of sequence sequences = [] output = [] print "## Extracting Sequence(s) from file..." handle = open(opts.input, 'rU') for record in SeqIO.parse(handle, "fasta"): # Parses the fasta format output.append(record.id) sequences.append(record.seq.tostring()) handle.close() else: sequences = [opts.sequence] output = [opts.output] for i in range(len(sequences)): if M.validateSequence(sequences[i])[0]: blast = M.blastSequence(sequences[i], int(opts.blast_results), bool(opts.keep)) mydict = M.getPDBs(blast, int(opts.pdb_results), bool(opts.keep)) for adict in mydict: needle = M.pairwiseAlign(sequences[i], adict['PDBSequence'], bool(opts.keep)) adict.update(M.needleParser(needle, bool(opts.keep))) M.detailsFile(sequences[i], mydict, output[i]) else: print "Bad Sequence: %s" %sequence[/sourcecode]

Advertisements

One thought on “Model-o-Matic.py: A new version

  1. Skype has established its website-based client beta for
    the world, right after introducing it broadly in the U.S.
    and You.K. previously this month. Skype for Internet also now
    facilitates Linux and Chromebook for instant online messaging
    communication (no voice and video however, those call for a connect-in installing).

    The expansion from the beta contributes help for a longer selection of languages to aid reinforce
    that international functionality

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s