# Back to SDS/2 Parametric Scripts

 

"""

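Extract gene data from a text file and write it as comma-delimited rows, one row per gene.  In the example below the locus_tag value appears twice in each row because the sample input lists it under both the gene and the CDS entry: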

No.,gene,locus_tag,protein_id,GeneID

1,1..1317,CRP_001,CRP_001,YP_802398.1,4414829

2,1314..2816,CRP_002,CRP_002,YP_802399.1,4414830

3,2785..3477,CRP_003,CRP_003,YP_802400.1,4414831

"""

import os

 

# Default input and output paths handed to geneData() when run as a script.
# Both files live in the same directory, so build that part once.
_dataDir = os.path.join('H:\\', 'TEMP', 'temsys')

fn = os.path.join(_dataDir, 'gene.txt')

fOut = os.path.join(_dataDir, 'gene_.txt')

 

def geneData(fn, fOut):
    """Extract gene records from a feature listing and write them as CSV.

    The input is scanned line by line:

    * a line whose first token is ``gene`` starts a record with the record
      number and the second token (the location);
    * every line containing ``locus_tag`` or ``protein_id`` adds the text
      after ``=``, stripped of quotes and the newline;
    * the first line containing ``GeneID`` that follows a ``protein_id``
      line adds the text after ``:`` and completes the record.  A
      ``GeneID`` line seen before any ``protein_id`` is skipped.

    Because every ``locus_tag`` line contributes a field, a record whose
    input lists the tag more than once repeats it in the output row.

    Args:
        fn: Path of the text file to read.
        fOut: Path of the comma-delimited file to write; an existing file
            is overwritten.

    Returns:
        One flat list: the five header labels followed by one list of
        strings per completed record.
    """
    labelLst = ['No.', 'gene', 'locus_tag', 'protein_id', 'GeneID']
    records = []
    current = []
    record_no = 1
    # Set once a protein_id has been read, so that only the GeneID line
    # after it closes the record.
    seen_protein_id = False

    # Context managers close the files even if a malformed line raises.
    with open(fn, 'r') as src:
        for line in src:
            tokens = line.split()
            # Match on the first token rather than a substring, so that a
            # /gene="..." qualifier or a product name containing "gene" is
            # not mistaken for the start of a record.
            if len(tokens) > 1 and tokens[0] == 'gene':
                current += [str(record_no), tokens[1]]
            elif 'locus_tag' in line:
                current.append(line.split('=')[1].strip('"\n'))
            elif 'protein_id' in line:
                current.append(line.split('=')[1].strip('"\n'))
                seen_protein_id = True
            elif 'GeneID' in line and seen_protein_id:
                current.append(line.split(':')[1].strip('"\n'))
                records.append(current)
                current = []
                record_no += 1
                seen_protein_id = False

    with open(fOut, 'w') as dst:
        dst.write(','.join(labelLst) + '\n')
        for row in records:
            dst.write(','.join(row) + '\n')

    # Flat concatenation kept as-is for backward compatibility with callers.
    return labelLst + records

 

 

# Script entry point: convert the default input file to the default output.
if __name__ == "__main__":
    geneData(fn, fOut)

 

   

"""

gene 1..1317

/locus_tag="CRP_001"

/db_xref="GeneID:4414829"

CDS 1..1317

/locus_tag="CRP_001"

/codon_start=1

/transl_table=11

/product="tRNA modification GTPase"

/protein_id="YP_802398.1"

/db_xref="GI:116334903"

/db_xref="GeneID:4414829"

/translation="KNLKCFINKIVDNKDFSKNNYSDVKILFNKFSF"

 

gene 1314..2816

/locus_tag="CRP_002"

/db_xref="GeneID:4414830"

CDS 1314..2816

/locus_tag="CRP_002"

/codon_start=1

/transl_table=11

/product="glucose inhibited division protein A"

/protein_id="YP_802399.1"

/db_xref="GI:116334904"

/db_xref="GeneID:4414830"

/translation="KIKLFDNFYLFKLIIIMSKYYGYIKKKYFK"

 

gene 2785..3477

/locus_tag="CRP_003"

/db_xref="GeneID:4414831"

CDS 2785..3477

/locus_tag="CRP_003"

/codon_start=1

/transl_table=11

/product="F0F1-type ATP synthase A subunit"

/protein_id="YP_802400.1"

/db_xref="GI:116334905"

/db_xref="GeneID:4414831"

/translation="MVILKKNILNNFLNFKIIDLNLIILL"

"""