from sets import Set as set

import re

import time

 

def anagrams(s):
    """Return a list of the distinct rearrangements of the string ``s``.

    The result is built recursively: the first character of ``s`` is
    inserted at every possible position of every rearrangement of the
    remaining characters.  Duplicates (which arise when ``s`` contains
    repeated characters) are collapsed.

    Parameters:
        s: the string to rearrange.

    Returns:
        A real ``list`` of strings, so callers can index and slice it.
        For the empty string the result is ``[""]``.  The order of the
        entries follows the iteration order of the dict used for
        de-duplication and should not be relied upon.
    """
    if s == "":
        return [s]

    head, tail = s[0], s[1:]

    # A dict is used as an ordered-by-implementation set of results;
    # inserting directly avoids building a throw-away list of candidates.
    seen = {}
    for an in anagrams(tail):
        for pos in range(len(an) + 1):
            seen[an[:pos] + head + an[pos:]] = 1

    # list(...) rather than .keys(): on Python 3 .keys() is a view that
    # cannot be sliced, which breaks callers such as anagrams(x)[:100].
    return list(seen)

 

# Input sequences for the pattern search.  Alternative inputs that were
# used during experimentation are kept below for reference:
#
# strList = ['ACGCATTCA', 'ACTGGATAC', 'TCAGCCATC', 'CAGCCATCT', 'TCATAGACC', 'ATACGCATC', 'TCAGTCATC']
# strList = [s for i,s in enumerate(anagrams('ACGCATTCA')) if not i%100]
# strList = anagrams('ACGCATTCA')[:1299]

# list(...) guarantees the value can be sliced even if anagrams() hands
# back a non-sliceable iterable (e.g. a dict view on Python 3).
strList = list(anagrams('ACGCATTCA'))[:100]

 

def patt_match(sList):
    """Collect the position-wise common patterns of every pair of sequences.

    Each sequence is compared with every sequence that follows it in
    ``sList``.  For a pair, a pattern string as long as the earlier
    sequence is built: a position keeps the character when both sequences
    have the same character there, and is ``'.'`` otherwise.  A pattern is
    recorded only if it contains at least one of ``A``, ``C``, ``G``, ``T``.

    Parameters:
        sList: the sequences (strings) to compare.  It is not modified.

    Returns:
        A dict mapping each pattern string to a list of ``[i, j]`` pairs
        (``i < j``), the positions in ``sList`` of the two sequences that
        produced the pattern, in the order the pairs were compared.

    Raises:
        IndexError: if a sequence is shorter than an earlier sequence it
            is compared against.

    Side effects:
        Resets and then updates the module globals ``count`` (character
        comparisons made), ``linecount`` (pairs compared) and
        ``itemcount`` (sequences processed).
    """
    global count, linecount, itemcount
    count = linecount = itemcount = 0

    # Work on a private copy so the caller's sequence is left untouched.
    seqs = list(sList)
    patt = re.compile('[ACGT]')
    dd = {}
    total = len(seqs)

    # Walk index pairs directly instead of repeatedly slicing and popping
    # the list, which copied the remaining sequences on every iteration.
    for indx in range(total):
        s1 = seqs[indx]
        for other in range(indx + 1, total):
            item = seqs[other]
            chars = []
            for i, s in enumerate(s1):
                count += 1
                if s == item[i]:
                    chars.append(s)
                else:
                    chars.append('.')
            res = ''.join(chars)
            # Skip pairs that share no A/C/G/T at any position.
            if patt.search(res):
                dd.setdefault(res, []).append([indx, other])
            linecount += 1
        itemcount += 1

    return dd

 

def patt_match_str(dd):
    """Format the result of ``patt_match`` as a list of report lines.

    Patterns are reported in sorted order.  Each pattern contributes a
    header line followed by one indented line per recorded pair.

    Parameters:
        dd: dict mapping a pattern string to a list of two-item index
            pairs, as returned by ``patt_match``.

    Returns:
        A list of strings (without trailing newlines).  The number in a
        header line is the count of distinct sequence indices that appear
        in that pattern's pairs, not the number of pairs.
    """
    outList = []

    # sorted(dd) works whether dd.keys() is a list or a view.
    for key in sorted(dd):
        pairs = dd[key]
        involved = set([idx for pair in pairs for idx in (pair[0], pair[1])])
        outList.append('(%s) %d occurrences:' % (key, len(involved)))
        for v in pairs:
            outList.append('    Pattern between sequence %d and %d' % (v[0], v[1]))

    return outList

 

# Driver: runs when the module is executed or imported.  It compares the
# sequences in strList, formats the report, brackets it with timestamped
# status lines, and writes everything to a fixed output file.
s = 'Opening output file: %s' % time.ctime()
dd = patt_match(strList)
s += '\nBuilding output string: %s' % time.ctime()
dataList = patt_match_str(dd)
dataList.insert(0, s)
dataList.append('Writing data to output file: %s' % time.ctime())

# NOTE(review): hard-coded Windows path; the directory must already exist.
fn = r'H:\TEMP\temsys\string_patterns1.txt'
f = open(fn, 'w')
try:
    f.write('\n'.join(dataList))
finally:
    # Close the handle even if the write fails.
    f.close()

 

 

''' Sample output

Opening output file: Wed Aug 01 08:50:53 2007

(........A) 676 occurrences:

    Pattern between sequence 0 and 72

    Pattern between sequence 0 and 89

    Pattern between sequence 0 and 154

.............................................

(.C.GAC.A.) 8 occurrences:

    Pattern between sequence 274 and 327

    Pattern between sequence 274 and 848

    Pattern between sequence 590 and 1649

    Pattern between sequence 880 and 1885

    Pattern between sequence 1636 and 1649

(.C.GAC.AT) 3 occurrences:

    Pattern between sequence 848 and 1649

    Pattern between sequence 848 and 1885

    Pattern between sequence 1649 and 1885

(.C.GAC.C.) 2 occurrences:

    Pattern between sequence 34 and 294

(.C.GAC.T.) 2 occurrences:

    Pattern between sequence 307 and 631

(.C.GACA..) 10 occurrences:

    Pattern between sequence 34 and 307

    Pattern between sequence 34 and 574

    Pattern between sequence 307 and 327

    Pattern between sequence 327 and 574

    Pattern between sequence 880 and 1966

    Pattern between sequence 1036 and 1162

    Pattern between sequence 1036 and 1649

    Pattern between sequence 1963 and 1966

(.C.GACA.T) 5 occurrences:

    Pattern between sequence 34 and 1649

    Pattern between sequence 327 and 1162

    Pattern between sequence 327 and 1966

    Pattern between sequence 1162 and 1966

(.C.GACAAT) 2 occurrences:

    Pattern between sequence 327 and 1649

...........................................

(TTGCCA..A) 2 occurrences:

    Pattern between sequence 33 and 612

(TTGCCA.A.) 2 occurrences:

    Pattern between sequence 612 and 1016

(TTGCCAA..) 2 occurrences:

    Pattern between sequence 33 and 1016

Writing data to output file: Wed Aug 01 08:51:53 2007

'''

 

'''

s1 = 'ACGCATTCA'

s2 = 'ACTGGATAC'

s3 = 'TCAGCCATC'

s4 = 'CAGCCATCT'

 

set(s1).intersection(set(s2))

'''