# Parse matrix data and sequence data files

 

def parseArray(fn, dataset=1, key='PO', term='/'):
    '''
    Read one data set from a matrix-format file and return it as a nested
    dictionary of normalized values.

    A data set starts at a header line beginning with `key`
    (e.g. "PO A C G T") and ends at the first following line beginning
    with `term`, or at the end of the file.  Every non-blank line in
    between is a row: a row label followed by one number per header column.

    Each value has 1.0 added to it and is then divided by the sum of its
    row after that addition, so the values of one row add up to 1.

    Arguments:
        fn      -- path of the file to read
        dataset -- 1-based ordinal of the data set to read
        key     -- prefix that marks a header line
        term    -- prefix that marks the end of a data set

    Returns a dictionary {column name: {row label: value}}, or False
    (after printing a message) when the file holds fewer than `dataset`
    data sets.

    Raises ValueError if `dataset` is smaller than 1, if a row does not
    hold exactly one value per header column, or if a value is not a number.
    '''
    if dataset < 1:
        raise ValueError('dataset must be >= 1')

    rows = []
    # The context manager closes the file on every exit path, including
    # the early return below and any parsing error.
    with open(fn) as f:
        # skip to required data set
        for _ in range(dataset):
            try:
                headerLine = next(f)
                while not headerLine.startswith(key):
                    headerLine = next(f)
            except StopIteration:
                print('We have reached the end of the file!')
                return False

        # Collect rows up to the terminator; running out of lines is
        # treated as an implicit terminator.
        for rawLine in f:
            row = rawLine.strip()
            if row.startswith(term):
                break
            if row:
                rows.append(row.split())

    # The first header token is the key itself, the rest are column names.
    headerList = headerLine.strip().split()[1:]
    dataDict = dict((name, {}) for name in headerList)

    for fields in rows:
        rowKey = fields[0]
        counts = [float(s) for s in fields[1:]]
        if len(counts) != len(headerList):
            raise ValueError('row %r has %d values, expected %d'
                             % (rowKey, len(counts), len(headerList)))
        # 1.0 is added to every value, so the normalizing total grows by
        # one per column.  The total is computed from the row itself, so
        # row labels need not be consecutive integers.
        total = sum(counts) + len(counts)
        for name, count in zip(headerList, counts):
            dataDict[name][rowKey] = (count + 1.0) / total

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):
    '''
    Read one sequence data set from a formatted file of alpha sequences.

    A data set starts at a header line beginning with `key` and runs up to
    the next line beginning with `key`, or to the end of the file.

    Arguments:
        fn      -- path of the file to read
        dataset -- 1-based ordinal of the data set to read
        key     -- prefix that marks a header line

    Returns a list whose first element is the header line exactly as read
    (including its line ending) and whose remaining elements are the
    following lines with surrounding whitespace stripped.  Returns False
    (after printing a message) when the file holds fewer than `dataset`
    data sets.

    Raises ValueError if `dataset` is smaller than 1.
    '''
    if dataset < 1:
        raise ValueError('dataset must be >= 1')

    # The context manager closes the file on every exit path.
    with open(fn) as f:
        # skip to required data set
        for _ in range(dataset):
            try:
                header = next(f)
                while not header.startswith(key):
                    header = next(f)
            except StopIteration:
                print('We have reached the end of the file!')
                return False

        # The header is kept unstripped so callers that print it get the
        # same output as before.
        dataList = [header]
        for line in f:
            if line.startswith(key):
                break
            dataList.append(line.strip())

    return dataList

 

def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
    '''
    Score every window of one sequence against one matrix data set and
    return the results as a formatted report string.

    The matrix is read with parseArray(fnArray, arraySet) and the sequence
    with parseData(fnSeq, seqSet).  The window length equals the number of
    matrix rows.  For each window the score is the product of the matrix
    values of its letters (row = position in the window) divided by the
    product of the fixed per-letter factors defined below.

    Example: a sequence 'ATCGATA' with 3 matrix rows gives the windows
    'ATC', 'TCG', 'CGA', 'GAT', 'ATA'.

    Returns the report string, None when the matrix data set could not be
    read (or is empty), or False when the sequence data set could not be
    read.  Callers rely on the None/False distinction to tell which file
    ran out of data sets.

    Raises KeyError if the sequence contains a letter that is missing from
    the matrix columns or from the factor table.
    '''
    # sequence factor dictionary
    value = {"A": 0.3, "T": 0.3, "C": 0.2, "G": 0.2}

    dataArray = parseArray(fnArray, arraySet)
    if not dataArray:
        return None
    dataSeq = parseData(fnSeq, seqSet)
    if not dataSeq:
        return False

    # This is the complete sequence
    seq = ''.join(dataSeq[1:])

    # These are the subkeys of dataArray - '01', '02', '03', ...
    # They are ordered numerically where possible, so unpadded labels
    # ('2' before '10') sort correctly too; otherwise as plain strings.
    positions = list(dataArray['A'])
    try:
        subKeys = sorted(positions, key=int)
    except ValueError:
        subKeys = sorted(positions)
    width = len(subKeys)

    # Slide a window of `width` letters over the sequence by index rather
    # than repeatedly copying the remaining sequence.
    resultLines = []
    for start in range(len(seq) - width + 1):
        subseq = seq[start:start + width]
        num, den = 1.0, 1.0
        for rowKey, letter in zip(subKeys, subseq):
            num *= dataArray[letter][rowKey]
            den *= value[letter]
        resultLines.append('Sequence = %s Calculation = %0.12f'
                           % (subseq, num / den))

    outStr = '\n'.join(resultLines)
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)

 

if __name__ == '__main__':

    # Input files: matrix data sets and sequence data sets.
    fnArray = r'H:\TEMP\temsys\data9.txt'
    fnSeq = r'H:\TEMP\temsys\data12.txt'

    # Report file; overwritten on every run.
    outputfile = r'H:\TEMP\temsys\sequence_calc_data.txt'

    arraySet = 1
    outList = []
    calcdata = 1

    # Pair every array set with every sequence set.  compileData returns
    # False once seqSet is past the last sequence set (move on to the next
    # array set) and None once arraySet is past the last array set (stop).
    while calcdata is not None:
        seqSet = 1
        while True:
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            if not calcdata:
                break
            outList.append(calcdata)
            seqSet += 1
        arraySet += 1

    # The context manager closes the file even if the write fails.
    with open(outputfile, 'w') as f:
        f.write('\n'.join(outList))

 

''' Sample output (array set 1, sequence set 3)

>>> Array set # = 1

Sequence set # = 3

Sequence Header: >Cp36_PRR|Drosophila melanogaster|Cp36|FBgn0000359|X:8324430..8324513

 

Sequence = TCTAGAGATCTGGGCA Calculation = 0.000520377928

Sequence = CTAGAGATCTGGGCAC Calculation = 0.000011324924

Sequence = TAGAGATCTGGGCACG Calculation = 0.000010676845

Sequence = AGAGATCTGGGCACGA Calculation = 0.000043154836

Sequence = GAGATCTGGGCACGAT Calculation = 0.000049390322

Sequence = AGATCTGGGCACGATG Calculation = 0.000000078869

Sequence = GATCTGGGCACGATGG Calculation = 0.003679435071

Sequence = ATCTGGGCACGATGGC Calculation = 0.000004580993

Sequence = TCTGGGCACGATGGCG Calculation = 0.000025964167

Sequence = CTGGGCACGATGGCGA Calculation = 0.000190953272

Sequence = TGGGCACGATGGCGAG Calculation = 0.000209084862

Sequence = GGGCACGATGGCGAGA Calculation = 0.000349499483

Sequence = GGCACGATGGCGAGAC Calculation = 0.000014551293

Sequence = GCACGATGGCGAGACA Calculation = 0.000253266698

Sequence = CACGATGGCGAGACAA Calculation = 0.000002088444

Sequence = ACGATGGCGAGACAAA Calculation = 0.000085759837

Sequence = CGATGGCGAGACAAAG Calculation = 0.000719291466

Sequence = GATGGCGAGACAAAGA Calculation = 0.108603646410

Sequence = ATGGCGAGACAAAGAT Calculation = 0.000022105017

Sequence = TGGCGAGACAAAGATG Calculation = 0.074916911295

Sequence = GGCGAGACAAAGATGC Calculation = 0.000654673006

Sequence = GCGAGACAAAGATGCG Calculation = 0.002905350767

Sequence = CGAGACAAAGATGCGG Calculation = 0.040711263424

Sequence = GAGACAAAGATGCGGC Calculation = 0.000066332349

Sequence = AGACAAAGATGCGGCG Calculation = 0.000844706696

Sequence = GACAAAGATGCGGCGC Calculation = 0.001363986600

Sequence = ACAAAGATGCGGCGCA Calculation = 0.000000158236

Sequence = CAAAGATGCGGCGCAA Calculation = 0.000248960708

Sequence = AAAGATGCGGCGCAAA Calculation = 0.000003482795

Sequence = AAGATGCGGCGCAAAA Calculation = 0.000003790517

Sequence = AGATGCGGCGCAAAAT Calculation = 0.000062906122

Sequence = GATGCGGCGCAAAATC Calculation = 0.000000630359

Sequence = ATGCGGCGCAAAATCG Calculation = 0.000041339176

Sequence = TGCGGCGCAAAATCGG Calculation = 0.007412276588

Sequence = GCGGCGCAAAATCGGA Calculation = 0.000109927284

Sequence = CGGCGCAAAATCGGAA Calculation = 0.032381958151

Sequence = GGCGCAAAATCGGAAA Calculation = 0.027066447384

Sequence = GCGCAAAATCGGAAAT Calculation = 0.000038441301

Sequence = CGCAAAATCGGAAATG Calculation = 0.016863436369

Sequence = GCAAAATCGGAAATGG Calculation = 0.016099091359

Sequence = CAAAATCGGAAATGGA Calculation = 0.000929346454

Sequence = AAAATCGGAAATGGAG Calculation = 0.000186989034

Sequence = AAATCGGAAATGGAGA Calculation = 0.003120608869

Sequence = AATCGGAAATGGAGAT Calculation = 0.000031851876

Sequence = ATCGGAAATGGAGATG Calculation = 0.000387934984

Sequence = TCGGAAATGGAGATGG Calculation = 0.000028928662

Sequence = CGGAAATGGAGATGGA Calculation = 0.858721770074

Sequence = GGAAATGGAGATGGAT Calculation = 0.000032582474

Sequence = GAAATGGAGATGGATC Calculation = 0.000194328378

Sequence = AAATGGAGATGGATCA Calculation = 0.000000025115

Sequence = AATGGAGATGGATCAC Calculation = 0.000005746845

Sequence = ATGGAGATGGATCACG Calculation = 0.000000225826

Sequence = TGGAGATGGATCACGT Calculation = 0.093243689191

Sequence = GGAGATGGATCACGTA Calculation = 0.000581140752

Sequence = GAGATGGATCACGTAG Calculation = 0.000002101908

Sequence = AGATGGATCACGTAGC Calculation = 0.000016524721

Sequence = GATGGATCACGTAGCC Calculation = 0.000029313806

Sequence = ATGGATCACGTAGCCG Calculation = 0.000535232860

Sequence = TGGATCACGTAGCCGG Calculation = 0.000015091041

Sequence = GGATCACGTAGCCGGC Calculation = 0.000010864488

Sequence = GATCACGTAGCCGGCC Calculation = 0.000023539371

Sequence = ATCACGTAGCCGGCCA Calculation = 0.001552014384

Sequence = TCACGTAGCCGGCCAT Calculation = 0.000000040841

Sequence = CACGTAGCCGGCCATG Calculation = 0.000005420914

Sequence = ACGTAGCCGGCCATGG Calculation = 0.000010765295

Sequence = CGTAGCCGGCCATGGC Calculation = 0.002425152785

Sequence = GTAGCCGGCCATGGCG Calculation = 0.000000198520

Sequence = TAGCCGGCCATGGCGG Calculation = 0.000220954056

>>>

'''

 

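''' Sample data files: the matrix data sets (array file) come first, followed by the '>'-headed sequence data sets (sequence file)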

NA bin

PO A C G T

01 0.45 8.27 0.00 11.39

02 0.00 0.00 10.02 10.09

03 5.80 1.39 0.00 12.93

04 12.33 5.18 2.60 0.00

05 12.43 0.00 0.00 7.68

06 18.55 0.00 1.57 0.00

07 0.05 0.58 0.00 19.48

08 20.11 0.00 0.00 0.00

09 20.06 0.05 0.00 0.00

10 20.11 0.00 0.00 0.00

11 0.00 15.33 0.00 4.78

12 20.06 0.05 0.00 0.00

13 14.99 0.35 4.78 0.00

14 13.64 2.42 3.37 0.68

15 5.03 0.00 15.08 0.00

16 7.23 0.45 10.94 1.49

//

//

NA bap

PO A C G T

01 0.00 3.67 0.00 0.00

02 0.00 0.00 3.67 0.00

03 0.00 0.00 0.00 3.67

04 0.00 3.67 0.00 0.00

05 3.67 0.00 0.00 0.00

06 3.46 0.00 0.22 0.00

07 0.00 0.00 3.67 0.00

08 0.00 0.00 0.00 3.67

09 0.00 0.00 0.00 3.67

10 0.00 3.67 0.00 0.00

11 3.67 0.00 0.00 0.00

12 3.67 0.00 0.00 0.00

13 0.00 0.00 3.67 0.00

14 0.00 0.00 0.00 3.67

15 0.00 0.00 3.67 0.00

16 0.00 3.67 0.00 0.00

//

//

NA bcd

PO A C G T

01 42.55 8.75 145.86 8.14

02 0.14 0.53 204.64 0.00

03 126.83 78.02 0.11 0.34

04 0.21 0.17 0.00 204.92

05 0.00 12.38 0.43 192.50

06 174.48 0.95 1.32 28.56

07 79.53 4.70 100.44 20.64

//

//

NA bin

PO A C G T

01 0.45 8.27 0.00 11.39

02 0.00 0.00 10.02 10.09

03 5.80 1.39 0.00 12.93

04 12.33 5.18 2.60 0.00

05 12.43 0.00 0.00 7.68

06 18.55 0.00 1.57 0.00

07 0.05 0.58 0.00 19.48

08 20.11 0.00 0.00 0.00

09 20.06 0.05 0.00 0.00

10 20.11 0.00 0.00 0.00

11 0.00 15.33 0.00 4.78

12 20.06 0.05 0.00 0.00

13 14.99 0.35 4.78 0.00

14 13.64 2.42 3.37 0.68

15 5.03 0.00 15.08 0.00

16 7.23 0.45 10.94 1.49

//

//

 

>CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133

CCAGTCCACCGGCCGCCGATCTATTTATACGAGAGGAAGAGGCTGAACTCGAGGATTACCCGTGTATCCTGGGACGCG

GATTAGCGATCCATTCCCCTTTTAATCGCCGCGCAAACAGATTCATGAAAGCCTTCGGATTCATTCATTGATCCACAT

CTACGGGAACGGGAGTCGCAAACGTTTTCGGATTAGCGCTGGACTAGCGGTTTCTAAATTGGATTATTTCTACCTGAC

CCTGGAGCCATCGTCCTCGTCCTCCGTCCCTTAGCGCCTCCTGCATGGATGTCGTTTTTGGGTTTCATACCTTTTCAC

ACTGGAAAAATACGGAATTTGTTGTAAGCCCTTTCAAGACGAATGGGATTTAGCTTCGGATGTCAACGTCACCATAAT

CATATTAGGAATATTTCTACTCAATTGCAATATTGGTACTTTTCTGACTGTAAACGCGATGATAATTACAAATATGCC

TAATTTGCTGTCTTTATAATCAAATGGAGTTCTTTATATTTCCAAAATATTGAAATTCCGATTCCCTAGAAAATAATA

CGTTTTTCTGTTATTAATAAAAAACCAATAGGAAAGTTCTCAAAAATTACTCTGTTGTATTTGATCATTTCTTTTCCG

GTATAATCTTTTATTTTAAGCATTCCCATGTGAATAAATTTCAGACTAATGTATTAATAAGATGTCGTGTTTTTCCAC

TTACAAATTTCTCATACAGCTGGATATATACTACGAGTACTATACACATGCTCTGGG

>Cp36_DRR|Drosophila melanogaster|Cp36|FBgn0000359|X:8323349..8324136

AGTCGACCAGCACGAGATCTCACCTACCTTCTTTATAAGCGGGGTCTCTAGAAGCTAAATCCATGTCCACGTCAAACC

AAAGACTTGCGGTCTCCAGACCATTGAGTTCTATAAATGGGACTGAGCCACACCATACACCACACACCACACATACAC

ACACGCCAACACATTACACACAACACGAACTACACAAACACTGAGATTAAGGAAATTATTAAAAAAAATAATAAAATT

AATACAAAAAAAATATATATATATACAAAAATTTGTTGTGTTTGAATTGAATTAAGAGCTTATCAAGAAAAAAATTTC

AGTGACTCATAATACACTACTCTACAAGTTTAAATTGAATCAACAATTTAACTTTCATTGCTCAGGTTTTTAGTAACA

ATGTTTATATAAGTTTAGGTATAACAAATGATTTAAATATAAGATACTGTATTTCACATTGAGACGAAACAATCCACC

GAAAATCATAAAATATAAGAATGTTGCATTTTATTTTTAAAAATAAAGATGCCTTTTAAGAGGAATAACTTAAATGTC

TTTAATACCTTTGAATTTAATTATATGGCTAATAAACACAAACTTAAAGCTTAAAACTGCATCGAATTGAATGCGGTT

ATAAATGTACTTATATATCTAATATAATCTGCTAATATGGTTTACATGGTATATCTTTCTCGGAAATTTTTACAAAAA

TTATCTATTCATATATCTCGAGCGTAAGATATTTATCAGTTTATAGATAACATCTTTAAATTTGGGTGATTAAAAAAA

AACATTG

>Cp36_PRR|Drosophila melanogaster|Cp36|FBgn0000359|X:8324430..8324513

TCTAGAGATCTGGGCACGATGGCGAGACAAAGATGCGGCGCAAAATCGGAAATGGAGATGGATCACGTAGCCGGCCAT

GGCGG

>Him_distal|Drosophila melanogaster|Him|FBgn0030900|X:18039896..18043470

GGTTTTCTGCGATGGCTTCCGCGCCAGCTGAAGTATCTGATTTGCTGCCTTGTTTTTGTTGATATTTCTGCGAAGGGA

CTTGTGCTTTTCAAATGGCCTTTTTTTGGGATTACGGCAAGGGCGCGTTTCCCACGCTCGATCCCCACTTACCATTGG

TGCACGCGATTGCGGCAAGCTGCTGAGGCAAGCTATTAAACGCCACACTGGGCCGGGGGGCGGTACCGGTGGGCGTGG

CAGGGGAGTCGACACATGTTGTGTGCCAGAGAACTTTGCTCCGATCCCCAGATCATCAAATAGTTGTCGCTGTCTGCT

CGTGCGCAAATTGCAATACTTTGCATACCCTTACTGCAGGGTATCTGAGCTTGGACTTTAAATAAGGGGGTATAACAT

AGCTTATACTCTCTATCTCTGTTATAAAGTCAATTTTCCTTAGATCTTTAGTACAGTGGGTAGTTAAGGAGACATAAC

TTCCAAAAAAAAAAACTATAAAATTGCAATAATTTATGCAAAATATGTATTTTATTGAATGGGATGAATAATTTACCT

TATACGACTGTAAAACATTTCTAACGATTAAATGCACTTCTAAAAGTTTTCCCACAAGTAGGTGAGCTATTATGCTAA

GCGTTCCATGACTTGGAATCTAAGATCTTGTTTTGATCTTCGCTGATCTTTGAGAACTCGGGGATTACTTACACATTT

CTGGGCAGGCACAAGTGGGCCGAGGCAGTGTAGATTCATCACGTTTTCACTCAACACACGCAGCTCATTAACAGCCCC

GCTGACAACTTGTCAGGACTTCCCCCTCGTGAATCCCCCTGCTACGCAACCCCCATTCCCCGCCCATTCCAACACTTC

CCGCCGGGAGCGTGGGAAATTATGCGTGTTGGTGGGACGTCGGGCGGTGAAAATTGGCGCGCTCTTCGGGGGGCCACA

CCGCGTGGCATTGACAACTCTTCCACATTTCGCGCCCAACGATGCGTTGGCATCAGTGGGTCACAGGGATTACGGCTG

GCTGGGATTCCAGAGCCAGATCTTTTTCAGCCAAAACTTTCAGCTTTCGAAGACCTCAAGCGATAGGAGAGTGTCGGA

AGTCCAGAAATAGACGCGTAGCACATAAATTATGGATCGTATCGAGTATCGATTAGCCCGGGACAAGCGAAGCGATAG

GGAGACATATTTTTATTACCCTCTCGGGGACCTGCACTTGTTGGCTTCGCTTCTATGAAAGATCCCTCTACCATATCA

CGTATGTGGGCTCCCCCAATCGAACCGAGTTGTGGGAAATGTTTTCCCAGGCCAACAGCTAATTGTCACTCCAAGGGT

TGTCCCCGCAGCCCAGACGACAGATAAGCGGGCAAGTGAAGCCCAGCGATCTGAGTCAAGTGAAGGGCTTCAATTTCT

TTCCCGAGTGGAACTGGGATATCGAAATTACATTTGTAACAGACGTTTTAGTCCGCAATCCTCAGCTAATGGGACTTA

CGAACATATATTCATCTGAAATTCAAGAACATGCGCACTTAAAGAGCAGGGAAGTCGCACACGCGCAAGTCAGGCGCT

CAAAAAGGGATCTTCGGAGGTACAGTGGGCAAAAGACTGTAAATAAATAATATAAATAAAATAATATTTAGCTCTATG

TGTTTATATAATCTACAAAGTAGTTAACAAAAAATATAAAATGGATATAAAAATACATCTTATATATCCCTATAATAA

GAAATAAATAATAATTTTAGTAAATTAATTTTGTTACACAAAGTACCTGTATTATTACCTCTTTTTTGTTGGTTGGTT

CTTTTTTGATGTGGCCCCACTGTGCTCTCTTATCAGTGCGACAATCAGGCATTGCCTTTCCCCATCGGGGGATTCTAA

TTCCGTGGACGATGGGCCGAAACGCCTATAAAGTCGCTCATTAAAAATGTTTAATTATGGCCCATCTTGCATCTTGCA

CCGATGTGGATGGGGTTTGTCGGCAATGATTTACATTATAAAAATGCCCGTTATCTGAGCATTTTGTACGCTCCACTC

CCTCTTCCCCCCTCCAAAAAAAAAAAAAACAGATATGTATATTCCCCGAGATATTCCCAAGCGGCCAAAAATAGACGC

AAATTGTAACGCACTTGAAGTGCACTCTGAAACATCTTGAAGTCCAAATAAAATAGCAGAGAGACCCACAATAATATA

CGTTGATATACACATGTATATATGTATGTATGTACATAAAGGGCCAGGAGCAGGAACGTTAGGCATGCGGTGGTACGA

GCACCGTGGTGCGAGCGAGAGCGCTGTGCTGCCTGAGGGAGAGGTAGCGAGTGGGTTGCATTGCGCACACAGAACATG

TGAATGCAGAGTTCAAGTGCATGCCGTGACACAGACACGCACACACACACACGCACACACAGATGAGTAGCCGCTGCA

AAGTGTTTTTTCCCAGGCGCTATTTATAATATGCATCCCGTCGCCGATCCGATCCGATCCAATCCAATCCGATTGGAT

CCCATCTTGCGGCACTACGATTATGACGCTCGACACGATGATGCATTCGCAGAGTTTCCCGATCGCAGAGTACCCTGT

ACTCGAGTAGTTTTTAGATGCAGTATTATTAAGTAGAAAATTGTAACCGTATAATATTCCATTATATTAAATATTTTT

ATAGCACTAAAGAAATAAAAGCCCATTTTATAATTTATATTACAAAAATACTTAACCATAGAAACTTATGATATGATA

CCAATATTTAAGTTCCAAAAAATGTAGAACATTTTTAAGTATATACTCGAAAATATTAATTTTCAAAATTGATATTCA

AGAGATATTATAAAAAGATCCCCATTCTAAATATCTAACATCATGCCATGCTTTCTAATGAGTATAGTATACCCCTGC

TACCCTGTCAATCCGCAAAACAGGCGCCGAAACATGCGGTTTCTCGCAGCAGACTGCCACGGGAAAAATTCGGTTCGA

GATTTGGGAATGGATGTATGACGGAGCAGAAGGAGCAGGACCCGGATTTCGGATTTCGGAATGGATATGGAAATGAAG

ATGGAAATGGGACTTTGACTGCGCGACGGCCACATGCGCCGCTGGCGATGCCGCTGGATGTTGCATGTGGCAGCGGTC

GGTGCAGCAGCGAAAGTGTTGCAGCTGTATGAGAGGGTCTATTTTTGGGGCGATTGTGCGGCGCTGGTGCTGCCACAT

GTGTTCTGTGTTGGGCTGCTAAAAGGCATTGTAATGAGAGCAGAAAATAGAATTGACTCCACTTGAGCAATGTCCCAT

AAAGCGGGAGTTTCGAGTTTGGCGCGCAATGTGCCGCACCAGCAAACGAACAAAAGAAAAAAAAAAAAAAAAAACACA

GCCAGTAACACATGGGCCCACGAGTTATGTTTTATTTTTAATCCCACAAAGAGTCGATCTCCAAAACAAACCCGCAGA

GAGCACATATAAAGAGACTCGGTGGACGAGTGGTTCGAAACAGTCTTCCGCCGCAGCTCGACGCGCTCGCATATCGGG

AATATATAGATCGGAGATATCGCAGGACCCACAGCAGAGCAGAGCCGCAGAGCCACCAACCTCG

>Him_proximal|Drosophila melanogaster|Him|FBgn0030900|X:18041232..18043470

GCCCAGACGACAGATAAGCGGGCAAGTGAAGCCCAGCGATCTGAGTCAAGTGAAGGGCTTCAATTTCTTTCCCGAGTG

GAACTGGGATATCGAAATTACATTTGTAACAGACGTTTTAGTCCGCAATCCTCAGCTAATGGGACTTACGAACATATA

TTCATCTGAAATTCAAGAACATGCGCACTTAAAGAGCAGGGAAGTCGCACACGCGCAAGTCAGGCGCTCAAAAAGGGA

TCTTCGGAGGTACAGTGGGCAAAAGACTGTAAATAAATAATATAAATAAAATAATATTTAGCTCTATGTGTTTATATA

ATCTACAAAGTAGTTAACAAAAAATATAAAATGGATATAAAAATACATCTTATATATCCCTATAATAAGAAATAAATA

ATAATTTTAGTAAATTAATTTTGTTACACAAAGTACCTGTATTATTACCTCTTTTTTGTTGGTTGGTTCTTTTTTGAT

GTGGCCCCACTGTGCTCTCTTATCAGTGCGACAATCAGGCATTGCCTTTCCCCATCGGGGGATTCTAATTCCGTGGAC

GATGGGCCGAAACGCCTATAAAGTCGCTCATTAAAAATGTTTAATTATGGCCCATCTTGCATCTTGCACCGATGTGGA

TGGGGTTTGTCGGCAATGATTTACATTATAAAAATGCCCGTTATCTGAGCATTTTGTACGCTCCACTCCCTCTTCCCC

CCTCCAAAAAAAAAAAAAACAGATATGTATATTCCCCGAGATATTCCCAAGCGGCCAAAAATAGACGCAAATTGTAAC

GCACTTGAAGTGCACTCTGAAACATCTTGAAGTCCAAATAAAATAGCAGAGAGACCCACAATAATATACGTTGATATA

CACATGTATATATGTATGTATGTACATAAAGGGCCAGGAGCAGGAACGTTAGGCATGCGGTGGTACGAGCACCGTGGT

GCGAGCGAGAGCGCTGTGCTGCCTGAGGGAGAGGTAGCGAGTGGGTTGCATTGCGCACACAGAACATGTGAATGCAGA

GTTCAAGTGCATGCCGTGACACAGACACGCACACACACACACGCACACACAGATGAGTAGCCGCTGCAAAGTGTTTTT

TCCCAGGCGCTATTTATAATATGCATCCCGTCGCCGATCCGATCCGATCCAATCCAATCCGATTGGATCCCATCTTGC

GGCACTACGATTATGACGCTCGACACGATGATGCATTCGCAGAGTTTCCCGATCGCAGAGTACCCTGTACTCGAGTAG

TTTTTAGATGCAGTATTATTAAGTAGAAAATTGTAACCGTATAATATTCCATTATATTAAATATTTTTATAGCACTAA

AGAAATAAAAGCCCATTTTATAATTTATATTACAAAAATACTTAACCATAGAAACTTATGATATGATACCAATATTTA

AGTTCCAAAAAATGTAGAACATTTTTAAGTATATACTCGAAAATATTAATTTTCAAAATTGATATTCAAGAGATATTA

TAAAAAGATCCCCATTCTAAATATCTAACATCATGCCATGCTTTCTAATGAGTATAGTATACCCCTGCTACCCTGTCA

ATCCGCAAAACAGGCGCCGAAACATGCGGTTTCTCGCAGCAGACTGCCACGGGAAAAATTCGGTTCGAGATTTGGGAA

TGGATGTATGACGGAGCAGAAGGAGCAGGACCCGGATTTCGGATTTCGGAATGGATATGGAAATGAAGATGGAAATGG

GACTTTGACTGCGCGACGGCCACATGCGCCGCTGGCGATGCCGCTGGATGTTGCATGTGGCAGCGGTCGGTGCAGCAG

CGAAAGTGTTGCAGCTGTATGAGAGGGTCTATTTTTGGGGCGATTGTGCGGCGCTGGTGCTGCCACATGTGTTCTGTG

TTGGGCTGCTAAAAGGCATTGTAATGAGAGCAGAAAATAGAATTGACTCCACTTGAGCAATGTCCCATAAAGCGGGAG

TTTCGAGTTTGGCGCGCAATGTGCCGCACCAGCAAACGAACAAAAGAAAAAAAAAAAAAAAAAACACAGCCAGTAACA

CATGGGCCCACGAGTTATGTTTTATTTTTAATCCCACAAAGAGTCGATCTCCAAAACAAACCCGCAGAGAGCACATAT

AAAGAGACTCGGTGGACGAGTGGTTCGAAACAGTCTTCCGCCGCAGCTCGACGCGCTCGCATATCGGGAATATATAGA

TCGGAGATATCGCAGGACCCACAGCAGAGCAGAGCCGCAGAGCCACCAACCTCG

>Obp18a_prom|Drosophila melanogaster|Obp18a|FBgn0030985|X:18969778..18972746

ATGGCGAAAATCTGTTTCCCAACTAACAATGAGCGCATCATCACAGCTCTATATATATAACCCATCGATTTGCTAATT

CAGCTCAAAAGTAGACAGGAGATTTTAATTAAATAATTGGATGCTACTTTACATTCGCCACACACCAACAAATAAAGT

CTATAATTGAAATTTTAAGCGCAGTTCCCGATTATGAGCTACACGTATGTCGTATGCGCAATATCTGCATTACAATTG

CCAATAGTAAATTACCAACTTGGTTTTCTTCATATTTATTAAGATAGAAAACATACAATTTTTGGCTTTTACACTCCA

AGCATCTCTGAAGTTTAAACAAAAAACATATGTGTAGCCTATCTACTGTATTGGACTTTATTCGTATATTTTATATGG

TTCATTAATATAGGTATAAATACAAATTATATTCACGCTTTGCGATTTGCAGCGAATATCACATCTTATACACGATGT

AAAAAAAAAAAAAATATTTCGTCATGTTTTTAGGTTGGCCGCAGGCAGTGCTCACTGTACCGCCACAATGTTTATCGT

TTTGCATTTTTTTTTTCTTTGTTTTCTTGCGGTTTCCCCTAATTATCTTTAGTATAAACTTAGTCTACTGTCTTTTTT

GGTAAGTATTTTCGTGATGGGCTCGTCTATGCGAATTCCCATTTCCAATGAATAAATAAAGTAATTAGAACATTAAAA

TTAGCAATAAAACACGTACATTTAAAGCTGACAACAAAAAAAAAAAGTATTCTTATGTTAAACTGTAGTATGTGCCTA

TGCAATATTAAGAACAATTAAATAAAATAGCATATTAACTTATGGCAGCACTTTGTTGCTATGTTTATGTTTATGTTT

ATGCACGCAGTTAGGCCAGGGCGGATGTAACATGATCACCCACTCGAAGGCAAAAAGTATAAGTGCATGGTCAGCATT

CACACGCCGACCAAATACATATTACATACGTACATACATATCTCGCTCTCCCGATAAGCCTAGATATATAAGATATAC

ATAAGAACGCCGCTCCGCTGCTGGCGTACCCGGCAGCGCAGCTACGCGGATTAGCCTAAGTCCAAATATATTAAAAAC

TGTAAAATCAGAGAGACTCTGTAGACGTTGAGCTGACAGAACCATTTCTGCCTACTCTAAAATCAAAAGAAGAAATTG

AATAAATATATGTCAGCCCGACGGCTGCCTTCAACTTAAAACGGACTTGTGTTCTGAATTGGAGTTCATCATTACATG

GCGACCGTGACAGTCGTCCAACGCTGGACGAATTGACCAAAGCTGGTGAAAACAAAGGAACAAAGGAACACTGGACTG

GAAGAAGACTGGACTAATTAAATGGAACTGCAAAAACCAAGGAAAAATCTGAGTGAGTAGAGTTCTATTGAGTATGGG

CAAACACCGTGGCGGTTTGAAAACTAAGCTGAATAAACGTATAGCCCACGTAAGGTGGCTAATATACGGTCAGCAAAC

GCCACCGGTTTGGTCGAAAGCTCTAAAGCTACATGCAGAGCTAGACCACTTGTTGCAATATCAGCAAGAATTAAAGAC

CCATAAGCTCGAGAAAACTCACTCAGATAATATTAAAAATATACCCACAATTAATGAAGTTCCAAAATACCAGGCATG

TCCAGCACCAGCACCAGCATTAACAAAACCAAAGAAGTCCTGCCCCCCTGGCTGCGAAGGAATCTGGAGTCCCCACTG

CCTGGGGACTTGTGAGCGACCATCGACGTCTTCAGCGGCGAAGAAATAGACAGCAGCGAGGGAGTGTCAGCGTGCCAC

CCCCGGCGACGCCCAGCTGACACCTGATGAGCATCATCAACAGCAGAATATAATAATAAATATATATAAATATAAAGT

AAATATAAAATATATATAGATAAGAAAAATTGTAAGAAATATTGTAAAACGGAGCATATACTATTATGCCCTGTTAAC

CCAATATGGCCCGTGAAGCCATAGCTAGAATCAGGCAGGCAACAATGTAAAATACAATTTTTTTTTACTCTTGCGAAC

ATTGAAAGATTTTATAAATAGATAATTCCAAACATAAATGTCTATAGAGACAAATGAAATAAGTAAAACTGAAAATAA

AAGTATATACAAAGGAAATTTTCTATTCTATTCTCCAAAATATAAAATTAGTATACCCAAAATGGGTCTAATAGACAC

TAAAACTGTGGACTCTACAGCCAATGTAATAAATAAAGTAGAAGTCCAAAATGCAGACTTGTTCTGGATAACCATAAT

ACTAATTGTAATTGCATTAATTATGGTATCCAATGCATTAATAAAAATATACAAACTGCATAACAAGTGTCTTAAGAA

ACGATACCGTAGCACTGCTAACGGTATAGATAATATTTAAGGAAGATCTTTAATAAAGTCAATTATGAATGAAAATAT

GAGAAAAATTATATGAAAAAAAAAAAATAATAAATAAAAAAAAAAATATAAAACGTAATATTGAATTTATCTACGTTA

AAAAAAAAAATATATACAAATGAATAAATTTGAAGTTATGAGTATACCACAGCATGGACTGGGAAAAGCTTGTTGATC

AGATAAAAGATCAAAATGAAAATTTCAGAAAATCCTATAAGTGCTTAACGCAAAACAGATCAACACAAGCTGTAACAA

TCAATAGGAATGCCCAAGTCTTGGTAAATAGTTATAATGAAATCAGAGAGTTGATCCAACAAAATAGAAAGAATTTGG

AACGCAAACAGTGTGCTAAGGCTTTGAACCTACTGGTGACATTAAGAGAAAAATTAATATTTATAAAAAATAAATTCA

GTCTCCAGATAGAAATTCCAACCATAGTAAACACCCCACTAAGAATAAATTTGAATGAAGACAGCACTAACTCTGACG

AGGAAGATAGGACTATAGTCAAGGAAGACATTAAAGAGGAAGATCTTCACGATCTAACTATACCAGCAAAATTAATGC

TGAA

>Obp19a_prom|Drosophila melanogaster|Obp19a|FBgn0031109|X:20223943..20226446

CCACCTGCGAAATGGGTCATAGTATATGTATTTGTAAAAAATGTATGTAAAAAAATGTTAAATTAATAATTTTGAATT

TCAATTTGGAGCTGAAAATAATATTTTGTGTCCATCAACAGCTCCAAAGCGATGGTTCATTTTATCTTGTGTGCGTTC

AATAGAATCACTCTTACGTTAGCGCGTCCATTGATGGTTGTCCCATTGAAGTACTTCTTAAAGCCGTCGGCCATTGCT

ACTGGACTGGATCTGGAGATCTGGAGATCTGGATTTGGGGTCGGGTCCGGGTGAGAGCTGAGTGTGTTCTGCCTATAG

CTCCGAGCGAGAACCTAATGACAAGCAGCGAAGTGCAAAGCTCGGCCAACTAGATTACAAAGTCGATTCATTGGCAGG

ATTCGATTTTTATTGACTCAACGAGGTGGTACATGAGTTTGGTCCCCAAGCCTTTAACTGTGGCATCGAGGACCGGAA

AGGGGGTGCTGATTATAAATAGTTATGGATTGCTGACGGGTCGAATGGGTCGGAGCGGTGGGGAGCCATGACTTCAAT

GATTTGGCAGCATCGGCGCCCTAGCCATGGAGCATGGCCTGCTGGCAGCCCTTGCAGTAGAGCTTGGTCTCGCGCCGC

TTCGTGTTGCGGCGGTGCATCTTGACCAGGACGTAGACGAGTCCCAACGAGGCCCAGGTGGCCTTGGCTACCTGTGGG

TTTCGGTGGCGTATTTGGGCGCATCTTGTGTACTGCCGTGTACTGAATCACTTACATTGGCGCGACCACGCATGGTCT

GGCTGTTGAAGGCTTCGTTGAAGTTGAAATGATCGGACATCTTTGGATCGTTGTTGACCGGATTGGCGTGGCTTTTAA

CAAAAGATTAAAATTTGGATTCGATATTCGACCTGTATTTTAGACCGGGATTCGGATTGTGACTTTTAAACGTTCGAA

ATGAAAGGAATGTTACTGACAGTCGTCAAAGCCGACTCGGGTTTCCCAACTAGAGAGAATGCTGAAGTCTAGTACCGA

CTAATGGGATACCCATTAATTACTGCTTAAATACTGTGATGAAAATTGAGATATGCAAGAGGCAAATCGAAAGTTTTG

GACATTTTCATATTGTACCTTTAACCAACTTCAGAATTCATTGAGCTAAATACCATTTACAATTTTATGAAATTTTTA

AGCATGTTACAGCTATAACTATTTTTAAACCAGTTACTAGATTCGTTGAAAATTGTATGTCACACAGAACTTCTTGCC

ATCCTGGTCGGAATTAGGATCACTAGCCAAGCCGATATGGCTATGTCTGTCCGTATGAAAGTCTTGGAATCTGATATT

AACATCGCATATCGATCGACCATTATATATCTAATATATCCTCTACAAATGTATTTTATCACCTAGCTAGCATGTAAA

CATTCTGGCCTATTTAGCTGTACGCTTCAGTTATGCTAATGCAAACATAAGCCTTTTGTGATATTATAATTTACATTT

ATTATTTATTGCAGTTAGCTTTATCAGCGATTTGGGCTCATGCCACACGCAATACTACTTATTTCAACGTCATCAGTT

GTACTAAATGCACAAATGAAATACATTTCGCCAAATAAATGCCAACTTGCAACTAATTTGAATGCTAATCAAACCGAA

CTACTCATTTGCATACAAGGTAATAGGTGGTTAAAGTGAGTGTAATGGACTTACTTAAGGGGTTACAAGGCTTATATT

TAAAATGCCTGCCTTGTAATTAAATTTTTAAATATATTGGAAAAAAATGGCCACTTGTTATGTGAGTCTCCAGAAAAA

AAACAAAAAAACAGCAACCATCTGGTATGCAAAATATCTGGTGGTAGCAAAATATCTGGTGGTATCTGGTGGACTATC

AAAATATAAAAACTTTTTTTTCCAGATAGTATATCTTAAAATCAGCATCTTGAAGGAGTATATGTAAATAGCAAACTA

TTTGTAAAAATAGATTTTATTTTATAATTTTTTAAGATATATACCAAACATTATTACCGATTGTGATTATCTTTACAT

TGTTTGACCTCAAAACGGAAAACTGGATGCGCGGTATCCATGCGACCCTAACTCTGGAACCGATTTTGGAACCGCCCC

GTTAGATCTCAGATTGAAACCTTATTTGCATTCGCATGATCGCTGATGAACACTGGGGAAATGCGGCCCAGCAATGGG

ATTGTCAACGCATCTCGGCCAGAATCGCGCCTCGCATGCCACCTCGCACGGTGACCACATACCTGTGTACACTGTCAA

TTAACGTGGCAAGATTATAGCCCGGCCAGAAAGTAATCCGCCCCAGGAACACCACCCACCGCCCGCCCATTTGGATAT

GGAAATGGGCAGTGGGGGCGGCGATTGGCGCTAACCCATAATTCCCACACCCACTTAGCGGTTCGATCGAACCAATAT

GAAGTCATTTGCATGTCGGGGGCCGTGTATAAAAGGAGTCGCCGATGGGTCTGGAGTCTGGAATCCGCCAAATCGTCT

CGGAAAT

>Obp19b_prom|Drosophila melanogaster|Obp19b|FBgn0031110|X:20224439..20227440

ATTGCTGACGGGTCGAATGGGTCGGAGCGGTGGGGAGCCATGACTTCAATGATTTGGCAGCATCGGCGCCCTAGCCAT

GGAGCATGGCCTGCTGGCAGCCCTTGCAGTAGAGCTTGGTCTCGCGCCGCTTCGTGTTGCGGCGGTGCATCTTGACCA

GGACGTAGACGAGTCCCAACGAGGCCCAGGTGGCCTTGGCTACCTGTGGGTTTCGGTGGCGTATTTGGGCGCATCTTG

TGTACTGCCGTGTACTGAATCACTTACATTGGCGCGACCACGCATGGTCTGGCTGTTGAAGGCTTCGTTGAAGTTGAA

ATGATCGGACATCTTTGGATCGTTGTTGACCGGATTGGCGTGGCTTTTAACAAAAGATTAAAATTTGGATTCGATATT

CGACCTGTATTTTAGACCGGGATTCGGATTGTGACTTTTAAACGTTCGAAATGAAAGGAATGTTACTGACAGTCGTCA

AAGCCGACTCGGGTTTCCCAACTAGAGAGAATGCTGAAGTCTAGTACCGACTAATGGGATACCCATTAATTACTGCTT

AAATACTGTGATGAAAATTGAGATATGCAAGAGGCAAATCGAAAGTTTTGGACATTTTCATATTGTACCTTTAACCAA

CTTCAGAATTCATTGAGCTAAATACCATTTACAATTTTATGAAATTTTTAAGCATGTTACAGCTATAACTATTTTTAA

ACCAGTTACTAGATTCGTTGAAAATTGTATGTCACACAGAACTTCTTGCCATCCTGGTCGGAATTAGGATCACTAGCC

AAGCCGATATGGCTATGTCTGTCCGTATGAAAGTCTTGGAATCTGATATTAACATCGCATATCGATCGACCATTATAT

ATCTAATATATCCTCTACAAATGTATTTTATCACCTAGCTAGCATGTAAACATTCTGGCCTATTTAGCTGTACGCTTC

AGTTATGCTAATGCAAACATAAGCCTTTTGTGATATTATAATTTACATTTATTATTTATTGCAGTTAGCTTTATCAGC

GATTTGGGCTCATGCCACACGCAATACTACTTATTTCAACGTCATCAGTTGTACTAAATGCACAAATGAAATACATTT

CGCCAAATAAATGCCAACTTGCAACTAATTTGAATGCTAATCAAACCGAACTACTCATTTGCATACAAGGTAATAGGT

GGTTAAAGTGAGTGTAATGGACTTACTTAAGGGGTTACAAGGCTTATATTTAAAATGCCTGCCTTGTAATTAAATTTT

TAAATATATTGGAAAAAAATGGCCACTTGTTATGTGAGTCTCCAGAAAAAAAACAAAAAAACAGCAACCATCTGGTAT

GCAAAATATCTGGTGGTAGCAAAATATCTGGTGGTATCTGGTGGACTATCAAAATATAAAAACTTTTTTTTCCAGATA

GTATATCTTAAAATCAGCATCTTGAAGGAGTATATGTAAATAGCAAACTATTTGTAAAAATAGATTTTATTTTATAAT

TTTTTAAGATATATACCAAACATTATTACCGATTGTGATTATCTTTACATTGTTTGACCTCAAAACGGAAAACTGGAT

GCGCGGTATCCATGCGACCCTAACTCTGGAACCGATTTTGGAACCGCCCCGTTAGATCTCAGATTGAAACCTTATTTG

CATTCGCATGATCGCTGATGAACACTGGGGAAATGCGGCCCAGCAATGGGATTGTCAACGCATCTCGGCCAGAATCGC

GCCTCGCATGCCACCTCGCACGGTGACCACATACCTGTGTACACTGTCAATTA

'''