#!/usr/bin/python import os import sys import random import re from string import maketrans from optparse import OptionParser from optparse import OptionGroup from rna.hairpin import * from rna import FileUtil from rna import RNAStructure from rna import RNAforester from rna import RNAforesterDatabaseSearch def databaseSearch(options, databaseFile, inputFile=sys.stdin): """options is a dictionary like: {'verbose': False, 'databaseFile': 'data/structures.fa', 'inputFile': 'data/hbv.fa'} """ if not databaseFile: print "ERROR: Database file not defined!\nUsage: rna_forester_database_search.py -d database_file [options] -i [input.fa]\nor\n rna_forester_database_search.py -d database_file [options] < input.fa" exit(1) rnaforesterDS = RNAforesterDatabaseSearch() rnaforesterDS.setDatabaseFile(databaseFile) fileUtil = FileUtil() searchStructure = fileUtil.loadFASTA(inputFile)[0] rnaforesterDS.run(searchStructure) if options.varna_align: return printResultsVarnaAlign( rnaforesterDS ) elif options.varna_align_applet: return printResultsVarnaAlign( rnaforesterDS ).translate(maketrans(';"', '__')).translate(maketrans('\n', ';')) else: return printResultsBlastLike( options, rnaforesterDS ) def printResultsVarnaAlign( rnaforesterDS ): _results = '' for i, alignment in enumerate( rnaforesterDS.getAlignmentList() ): # print alignment.getSmallStructure(), '\n' # print alignment.getLargeStructure(), '\n' # return _results += ''.join(( # str(alignment.getSmallStructure()), '>', "IDS:%-3d " % i + " |" + alignment.getSmallStructure().getName(), '\n', alignment.getSmallStructure().getSequence(), '\n', alignment.getSmallStructure().getStructure(), '\n', alignment.getAlignedSmallStructure().translate( maketrans("(.)[*]", "---(.)")), '\n', '\n', # str(alignment.getLargeStructure()), # structure.setName( "IDL:%-3d " % i + "score: %4d" % alignment.getScore() + " |" + structure.getName()) '>', "IDL:%-3d " % i + "score: %4d" % alignment.getScore() + " |" + alignment.getLargeStructure().getName(), '\n', alignment.getLargeStructure().getSequence(), '\n', alignment.getLargeStructure().getStructure(), '\n', alignment.getAlignedLargeStructure().translate( maketrans("(.)[*]", "---(.)")), '\n', )) return _results def printResultsBlastLike( options, rnaforesterDS ): # Get length of the longest line _longestLineLength = 0 for alignment in rnaforesterDS.getAlignmentList(): if alignment.getAlignment(0).getLongestString() > _longestLineLength: _longestLineLength = alignment.getAlignment(0).getLongestString() if alignment.getAlignment(1).getLongestString() > _longestLineLength: _longestLineLength = alignment.getAlignment(1).getLongestString() _results = '' for alignment in rnaforesterDS.getAlignmentList(): # _results += ''.join(('-'*_longestLineLength+1,'\nDatabaseId:',str(alignment.getDatabaseId()), '\tScore:', str(alignment.getScore()), '\tAlignment starting at position:', str(alignment.getStartingPosition()), '\n', str(alignment.getAlignment(0)), '\n', str(alignment.getAlignment(1)), '\n')) # smallLen = len(alignment.getAlignment(0).getStructure().translate(maketrans('', ''), '-')) # largeLen = len(alignment.getAlignment(0).getStructure().translate(maketrans('', ''), '-')) smallGaps = alignment.getAlignment(0).getStructure().count('-') largeGaps = alignment.getAlignment(1).getStructure().count('-') smallLen = len(alignment.getAlignment(0).getStructure()) largeLen = len(alignment.getAlignment(1).getStructure()) smallIdentities = smallLen - smallGaps largeIdentities = largeLen - largeGaps largeStartPos = alignment.getStartingPosition() + 1 largeEndPos = largeStartPos + largeLen - 1 smallStartPos = 1 smallEndPos = smallStartPos + smallLen - 1 #figure out whether we need to pad the position numbers with spaces if len(str(largeStartPos)) != 1: _padding_left = len(str(largeStartPos)) - 1 # raise RuntimeError, "PADDING: largeStartPos: %s, _padding_left: %s" % (largeStartPos, _padding_left) else: _padding_left = 0 if len(str(largeEndPos)) != len(str(smallEndPos)): _padding_right = len(str(largeEndPos)) - len(str(smallEndPos)) else: _padding_right = 0 if options.html_output: _results += ''.join(( '\n', '-' * (_longestLineLength + 1), '\n', 'Score = %d' % alignment.getScore(), '\n', 'Identities with small structure = %d/%d (%d%%), Gaps = %d/%d (%d%%)' % (smallIdentities, smallLen, float(smallIdentities)/smallLen*100, smallGaps, smallLen, float(smallGaps)/smallLen*100), '\n', 'Identities with large structure = %d/%d (%d%%), Gaps = %d/%d (%d%%)' % (largeIdentities, largeLen, float(largeIdentities)/largeLen*100, largeGaps, largeLen, float(largeGaps)/largeLen*100), '\n', 'Strand = Plus / Plus (Minus strand search not supported)', '\n', # 'Start: %d' % alignment.getStartingPosition(), '\n', '\n', 'Your query modified by gaps: Length = %d' % smallLen, '\n', '\n', '>', alignment.getAlignment(0).getName(), '\n', # '%d ' % smallStartPos, alignment.getAlignment(0).getSequence(), ' %d' % smallEndPos, '\n', # '%d ' % smallStartPos, alignment.getAlignment(0).getStructure(), ' %d' % smallEndPos, '\n', '%d ' % smallStartPos, '%s' % ' ' * _padding_left, htmlizeStructure( alignment.getAlignment(0).getSequence(), alignment.getAlignment(1).getSequence() ), '%s' % ' ' * _padding_right, ' %d' % smallEndPos, '\n', '%d ' % smallStartPos, '%s' % ' ' * _padding_left, htmlizeStructure( alignment.getAlignment(0).getStructure(), alignment.getAlignment(1).getSequence() ), '%s' % ' ' * _padding_right, ' %d' % smallEndPos, '\n', '\n', 'The target hit adjusted by gaps:', '\n', '>', alignment.getAlignment(1).getName(), '\n', # '%d ' % largeStartPos, alignment.getAlignment(1).getSequence(), ' %d' % largeEndPos, '\n', # '%d ' % largeStartPos, alignment.getAlignment(1).getStructure(), ' %d' % largeEndPos, '\n\n', '%d ' % largeStartPos, htmlizeStructure( alignment.getAlignment(1).getSequence(), alignment.getAlignment(1).getSequence() ), ' %d' % largeEndPos, '\n', '%d ' % largeStartPos, htmlizeStructure( alignment.getAlignment(1).getStructure(), alignment.getAlignment(1).getSequence() ), ' %d' % largeEndPos, '\n', )) else: _results += ''.join(( '\n', '-' * (_longestLineLength + 1), '\n', 'Score = %d' % alignment.getScore(), '\n', 'Identities with small structure = %d/%d (%d%%), Gaps = %d/%d (%d%%)' % (smallIdentities, smallLen, float(smallIdentities)/smallLen*100, smallGaps, smallLen, float(smallGaps)/smallLen*100), '\n', 'Identities with large structure = %d/%d (%d%%), Gaps = %d/%d (%d%%)' % (largeIdentities, largeLen, float(largeIdentities)/largeLen*100, largeGaps, largeLen, float(largeGaps)/largeLen*100), '\n', 'Strand = Plus / Plus (Minus strand search not supported)', '\n', # 'Start: %d' % alignment.getStartingPosition(), '\n', '\n', 'Your query modified by gaps: Length = %d' % smallLen, '\n', '\n', '>', alignment.getAlignment(0).getName(), '\n', '%d ' % smallStartPos, alignment.getAlignment(0).getSequence(), ' %d' % smallEndPos, '\n', '%d ' % smallStartPos, alignment.getAlignment(0).getStructure(), ' %d' % smallEndPos, '\n', '\n', 'The target hit adjusted by gaps:', '\n', '>', alignment.getAlignment(1).getName(), '\n', '%d ' % largeStartPos, alignment.getAlignment(1).getSequence(), ' %d' % largeEndPos, '\n', '%d ' % largeStartPos, alignment.getAlignment(1).getStructure(), ' %d' % largeEndPos, '\n\n', )) return _results def htmlizeStructure(smallAlignment, largeAlignment): smallRes = [ '' ] largeRes = [] for i, smallChar in enumerate( smallAlignment ): if largeAlignment[i] == '-' : smallRes.append( '%s' % smallChar ) else: smallRes.append( smallChar ) smallRes.append( '' ) return ''.join(smallRes) #------------------------------------------------------------------------------------------------------------------------ #Your modified query: #> HBV encapsidation signal # Length = XXXX # #Score = 65 #Identities = XX/XX (X%), Gaps = X/X (X%) #Strand = Plus / Plus # #11 uguacaugucccacuguucaagcc-uccaagcugugccuug-gguggcuuugg--ggcauggaca 72 #11 (((.(((((((((......(((((-((((((......))))-)).))))))))--)))))).))) 72 # #The target hit: #> IRESite_2DExp_Struct_Id:21 HAV #XX gguucaggguuc--u-u--aaaucuguuucucuaua-agaacacucauuuuucacgcuuucuguc XX #XX (((..(((((..--.-.--((((..((((((.....-)))).))..)))).....)))))..))) XX # ['__doc__', '__init__', '__module__', '__str__', 'getName', 'getSequence', 'getStructure', 'getTranslatedString', 'loadFASTA', 'name', 'sequence', 'setName', 'structure'] # name is 'HBV encapsidation signal' # ------------------------------------------------------------------------------------------------------------------------ # DatabaseId:0 Score:65 Alignment starting at position:11 # >HBVencapsidationsignal # uguacaugucccacuguucaagcc-uccaagcugugccuug-gguggcuuugg--ggcauggaca # (((.(((((((((......(((((-((((((......))))-)).))))))))--)))))).))) # # >IRESite_2DExp_Struct_Id:21 HAV # gguucaggguuc--u-u--aaaucuguuucucuaua-agaacacucauuuuucacgcuuucuguc # (((..(((((..--.-.--((((..((((((.....-)))).))..)))).....)))))..))) return _results def main(): parser = OptionParser(usage="\n%prog -d database_file [options] -i input.fa\n%prog -d database_file [options] < input.fa") parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="be verbose") parser.add_option("-i", action="store", type="str", dest="inputFile") parser.add_option("-d", action="store", type="str", dest="databaseFile") parser.add_option("-a", "--varna-align", dest="varna_align", action="store_true", default=False, help="varna align output") parser.add_option("", "--varna-align-applet", dest="varna_align_applet", action="store_true", default=False, help="varna align output") parser.add_option("", "--html-output", dest="html_output", action="store_true", default=False, help="htmlize output") (options, args) = parser.parse_args() if options.verbose: print 'Database: ' + str(options.databaseFile) + ', Input file: ' + str(options.inputFile) # rna_forester_database_search.py executed with {'verbose': False, 'databaseFile': 'structure_ss_experiments_public_all.fa', 'inputFile': '/home/ires/IRESite/tmp/RNAforester_query_wO1zld.in'} # sys.argv=['/home/ires/public_html/IRES2/RNAforester/rna_forester_database_search.py', '-d', 'structure_ss_experiments_public_all.fa', '-i', '/home/ires/IRESite/tmp/RNAforester_query_wO1zld.in'] # ignore the broken value of databaseFile with full path and use sys.argv[2] instead # return "rna_forester_database_search.py executed with %s\nsys.argv=%s" % (str(options), str(sys.argv)) # {'verbose': False, 'databaseFile': 'data/structures.fa', 'inputFile': 'data/hbv.fa'} _results = databaseSearch(options, sys.argv[2], options.inputFile) return _results if __name__ == "__main__": print main()