from sets import
Set as set
import re
import time
def anagrams(s):
    """Return every distinct rearrangement of the characters of *s*.

    The result is a real list (so callers can slice it), with each
    rearrangement appearing once, in the order it is first generated.
    The empty string yields ``[""]``.
    """
    if s == "":
        return [s]

    head, tail = s[0], s[1:]
    seen = {}  # dict used as a membership table for duplicate suppression
    ans = []
    for an in anagrams(tail):
        # Insert the first character at every possible position of each
        # rearrangement of the remaining characters.
        for pos in range(len(an) + 1):
            candidate = an[:pos] + head + an[pos:]
            if candidate not in seen:
                seen[candidate] = 1
                ans.append(candidate)
    return ans
# Input sequences for the pattern search.  Alternative inputs are kept
# below for reference; enable exactly one strList assignment.
# strList = ['ACGCATTCA', 'ACTGGATAC', 'TCAGCCATC', 'CAGCCATCT', 'TCATAGACC',
#            'ATACGCATC', 'TCAGTCATC']
# strList = [s for i, s in enumerate(anagrams('ACGCATTCA')) if not i % 100]
# strList = anagrams('ACGCATTCA')[:1299]

# list() keeps the slice valid even when anagrams() returns an iterable
# that does not support slicing.
strList = list(anagrams('ACGCATTCA'))[:100]
def patt_match(sList):
    """Compare every pair of sequences and group the pairs by shared pattern.

    For each pair (earlier, later) a pattern string is built position by
    position: the character itself where both sequences agree, '.' where
    they differ.  Patterns containing at least one of A, C, G or T are
    recorded; patterns with no such character are skipped.

    sList -- sequence of strings; it is not modified.

    Returns a dict mapping each pattern to a list of ``[i, j]`` index pairs
    (positions in sList, i < j) that produced it.

    Side effect: resets and updates the module globals ``count`` (character
    comparisons made), ``linecount`` and ``itemcount`` (sequences processed).
    """
    global count, linecount, itemcount
    count = linecount = itemcount = 0

    seqs = list(sList)  # private copy; the caller's list is left untouched
    has_base = re.compile('[ACGT]').search
    dd = {}

    for indx, s1 in enumerate(seqs):
        # Index into the copy instead of repeatedly popping and slicing it.
        for other in range(indx + 1, len(seqs)):
            item = seqs[other]
            chars = []
            # zip() stops at the shorter string, so sequences of unequal
            # length are compared over their common prefix length.
            for a, b in zip(s1, item):
                if a == b:
                    chars.append(a)
                else:
                    chars.append('.')
            res = ''.join(chars)
            count += len(res)
            if has_base(res):
                dd.setdefault(res, []).append([indx, other])
            # NOTE(review): the original indentation was lost; linecount is
            # assumed to count every compared pair -- confirm.
            linecount += 1
        itemcount += 1
    return dd
def patt_match_str(dd):
    """Format the pattern dictionary from patt_match() as report lines.

    dd -- dict mapping a pattern string to a list of two-item index pairs.

    Returns a list of strings: for each pattern, in sorted order, a header
    line giving the number of distinct sequence indices involved in that
    pattern, followed by one line per index pair.
    """
    outList = []
    # list(dd) yields a sortable list of keys on every Python version;
    # dd.keys() is an unsortable view object on Python 3.
    keys = list(dd)
    keys.sort()
    for key in keys:
        pairs = dd[key]
        # Count distinct sequences taking part in the pattern, not pairs.
        quan = len(set([pair[j] for j in range(2) for pair in pairs]))
        outList.append('(%s) %d occurrences:' % (key, quan))
        for v in pairs:
            outList.append(' Pattern between sequence %d and %d' % (v[0],
                                                                    v[1]))
    return outList
# Run the pattern search over strList and write a timestamped report.
s = 'Opening output file: %s' % time.ctime()
dd = patt_match(strList)
s += '\nBuilding output string: %s' % time.ctime()
dataList = patt_match_str(dd)
# The timing header goes first, the closing timestamp last.
dataList.insert(0, s)
dataList.append('Writing data to output file: %s' % time.ctime())

fn = r'H:\TEMP\temsys\string_patterns1.txt'
f = open(fn, 'w')
try:
    f.write('\n'.join(dataList))
finally:
    # Close the file even if the write fails.
    f.close()
''' Sample output
Opening output file: Wed Aug 01 08:50:53 2007
(........A) 676 occurrences:
Pattern between sequence 0 and 72
Pattern between sequence 0 and 89
Pattern between sequence 0 and 154
.............................................
(.C.GAC.A.) 8 occurrences:
Pattern between sequence 274 and 327
Pattern between sequence 274 and 848
Pattern between sequence 590 and 1649
Pattern between sequence 880 and 1885
Pattern between sequence 1636 and 1649
(.C.GAC.AT) 3 occurrences:
Pattern between sequence 848 and 1649
Pattern between sequence 848 and 1885
Pattern between sequence 1649 and 1885
(.C.GAC.C.) 2 occurrences:
Pattern between sequence 34 and 294
(.C.GAC.T.) 2 occurrences:
Pattern between sequence 307 and 631
(.C.GACA..) 10 occurrences:
Pattern between sequence 34 and 307
Pattern between sequence 34 and 574
Pattern between sequence 307 and 327
Pattern between sequence 327 and 574
Pattern between sequence 880 and 1966
Pattern between sequence 1036 and 1162
Pattern between sequence 1036 and 1649
Pattern between sequence 1963 and 1966
(.C.GACA.T) 5 occurrences:
Pattern between sequence 34 and 1649
Pattern between sequence 327 and 1162
Pattern between sequence 327 and 1966
Pattern between sequence 1162 and 1966
(.C.GACAAT) 2 occurrences:
Pattern between sequence 327 and 1649
...........................................
(TTGCCA..A) 2 occurrences:
Pattern between sequence 33 and 612
(TTGCCA.A.) 2 occurrences:
Pattern between sequence 612 and 1016
(TTGCCAA..) 2 occurrences:
Pattern between sequence 33 and 1016
Writing data to output file: Wed Aug 01 08:51:53 2007
'''
'''
s1 = 'ACGCATTCA'
s2 = 'ACTGGATAC'
s3 = 'TCAGCCATC'
s4 = 'CAGCCATCT'
set(s1).intersection(set(s2))
'''