# Back to SDS/2 Parametric Scripts
"""
Extract gene data from a text file (sample input at the end of this file)
and format it into comma-delimited rows and columns:
No.,gene,locus_tag,protein_id,GeneID
1,1..1317,CRP_001,CRP_001,YP_802398.1,4414829
2,1314..2816,CRP_002,CRP_002,YP_802399.1,4414830
3,2785..3477,CRP_003,CRP_003,YP_802400.1,4414831
"""
import os
# Default input and output file locations passed to geneData() when this
# module is run as a script.
fn = os.path.join('H:\\', 'TEMP', 'temsys', 'gene.txt')
fOut = os.path.join('H:\\', 'TEMP', 'temsys', 'gene_.txt')
def geneData(fn, fOut):
    """Extract gene records from a feature listing and write them as CSV.

    The input file is scanned line by line. A row is started by a
    ``gene <location>`` feature line, extended by every line containing
    ``locus_tag`` and by the ``protein_id`` line, and completed by the first
    ``GeneID`` line that follows a ``protein_id``. ``GeneID`` lines seen
    before the ``protein_id`` are ignored so the ID is recorded only once.

    When ``locus_tag`` appears under both the gene and the CDS feature (as in
    the sample input), the row carries it twice and therefore has one more
    field than the header line.

    NOTE(review): a gene with no ``protein_id`` never completes its row, so
    its fields run on into the next gene's row -- confirm whether such
    records can occur in the input.

    Args:
        fn: Path of the text file to read.
        fOut: Path of the comma-delimited file to write (overwritten).

    Returns:
        The header labels followed by one list of strings per completed
        row, i.e. ``['No.', 'gene', ..., 'GeneID', [row 1], [row 2], ...]``.
    """
    labels = ['No.', 'gene', 'locus_tag', 'protein_id', 'GeneID']
    rows = []
    current = []
    row_no = 1
    seen_protein_id = False

    with open(fn, 'r') as src:
        for line in src:
            tokens = line.split()
            # Match only a real "gene <location>" feature line. A plain
            # substring test also fires on qualifiers such as /gene="..."
            # or /product="hydrogenase ...", which corrupts the row or
            # raises IndexError when there is no second token.
            if len(tokens) >= 2 and tokens[0] == 'gene':
                current += [str(row_no), tokens[1]]
            elif 'locus_tag' in line:
                current.append(line.split('=')[1].strip('"\n'))
            elif 'protein_id' in line:
                current.append(line.split('=')[1].strip('"\n'))
                seen_protein_id = True
            elif 'GeneID' in line and seen_protein_id:
                # The GeneID after the protein_id closes the record.
                current.append(line.split(':')[1].strip('"\n'))
                rows.append(current)
                current = []
                row_no += 1
                seen_protein_id = False

    with open(fOut, 'w') as dst:
        dst.write(','.join(labels) + '\n')
        for row in rows:
            dst.write(','.join(row) + '\n')

    return labels + rows
# Run the extraction on the default paths when executed as a script.
if __name__ == '__main__':
    geneData(fn, fOut)
"""
gene 1..1317
/locus_tag="CRP_001"
/db_xref="GeneID:4414829"
CDS 1..1317
/locus_tag="CRP_001"
/codon_start=1
/transl_table=11
/product="tRNA modification GTPase"
/protein_id="YP_802398.1"
/db_xref="GI:116334903"
/db_xref="GeneID:4414829"
/translation="KNLKCFINKIVDNKDFSKNNYSDVKILFNKFSF"
gene 1314..2816
/locus_tag="CRP_002"
/db_xref="GeneID:4414830"
CDS 1314..2816
/locus_tag="CRP_002"
/codon_start=1
/transl_table=11
/product="glucose
inhibited division protein A"
/protein_id="YP_802399.1"
/db_xref="GI:116334904"
/db_xref="GeneID:4414830"
/translation="KIKLFDNFYLFKLIIIMSKYYGYIKKKYFK"
gene 2785..3477
/locus_tag="CRP_003"
/db_xref="GeneID:4414831"
CDS 2785..3477
/locus_tag="CRP_003"
/codon_start=1
/transl_table=11
/product="F0F1-type
ATP synthase A subunit"
/protein_id="YP_802400.1"
/db_xref="GI:116334905"
/db_xref="GeneID:4414831"
/translation="MVILKKNILNNFLNFKIIDLNLIILL"
"""