Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import sys
import glob
from Bio import SeqIO
from datetime import date
today = date.today()
# /db/genbank/current/flat/gbbct1.seq
# Criteres : 16S > 800pb
# NCBI taxa reference
# taxo_ref = "/projet/florilege/Florilege_GenBank/taxa+id_microorganisms.txt"
# Functions
def prep_ncbi(ncbi_file):
with open(ncbi_file, 'r') as ncbi:
dict_ncbi = {}
count = 0
for line in ncbi:
list_ncbi = line.split('\t')
if list_ncbi[1].replace('ncbi:', '') not in dict_ncbi:
dict_ncbi[list_ncbi[1].replace('ncbi:','')] = list_ncbi[2]
count += 1
print("=> Nombre de taxID : " + str(count))
return dict_ncbi
def clean_gb(gb_file):
os.system("LC_CTYPE=C tr -d '\256' < " + file + " > " + os.path.basename(file).replace('.seq', '_clean.seq'))
def get_values(record):
accession = record.name
length = len(record.seq)
species = record.annotations["organism"]
journal = record.annotations["references"][0].pubmed_id
strain = host = source = country = ''
for feature in record.features:
if feature.type == 'source':
# TaxID
if 'db_xref' in feature.qualifiers:
taxID = feature.qualifiers['db_xref'][0].replace('taxon:', '')
else:
taxID = 0
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Strain
if 'strain' in feature.qualifiers:
if strain == '':
strain = feature.qualifiers['strain'][0]
else:
strain += '|' + feature.qualifiers['strain'][0]
# Host
if 'host' in feature.qualifiers:
if host == '':
host = feature.qualifiers['host'][0]
else:
host += '|' + feature.qualifiers['host'][0]
# Isolation_source
if 'isolation_source' in feature.qualifiers:
if source == '':
source = feature.qualifiers['isolation_source'][0]
else:
source += '|' + feature.qualifiers['isolation_source'][0]
# Country
if 'country' in feature.qualifiers:
if country == '':
country = feature.qualifiers['country'][0]
else:
country += '|' + feature.qualifiers['country'][0]
return accession, length, species, strain, taxID, journal, source, host, country
# Main
if __name__ == '__main__':
import sys
from optparse import OptionParser, OptionGroup
optparser = OptionParser(usage="%extract data from genbank bank")
group = OptionGroup(optparser, "Main Options", "")
optparser.add_option("-t", "--taxoref", default="None", dest="taxo_ref", help="", metavar="FILE")
optparser.add_option("-d", "--dbpath", default="None", dest="db_path", help="", metavar="FILE")
optparser.add_option("-o", "--fout", default="None", dest="f_out", help="tsv file that will contain the genbank extract", metavar="FILE")
(options, args) = optparser.parse_args()
dict_ncbi = prep_ncbi(options.taxo_ref)
ok = 0
#with open('GenBank_extraction_'+today.strftime("%Y%m%d")+'.tsv', 'w') as outfile:
with open(options.f_out, 'w') as outfile:
# accession | length | species | strain | ncbi_taxid | journal | isolation_source
#outfile.write("accession\tlength\tspecies\tstrain\tncbi_taxid\tjournal\tisolation_source\thost\tcountry\n")
outfile.write("accession\tlength\tspecies\tstrain\tncbi_taxid\tjournal\tisolation_source\thost\n")
f_bct = glob.glob(options.db_path + '/gbbct*.seq')
f_inv = glob.glob(options.db_path + '/gbinv*.seq')
f_vrl = glob.glob(options.db_path + '/gbvrl*.seq')
#f_bct = glob.glob('/db/genbank/current/flat/gbbct*.seq')
#f_inv = glob.glob('/db/genbank/current/flat/gbinv*.seq')
#f_vrl = glob.glob('/db/genbank/current/flat/gbvrl*.seq')
files = f_bct + f_inv + f_vrl
#files = ['/db/genbank/current/flat/gbbct411.seq']
for file in files:
clean_gb(file)
for record in SeqIO.parse(os.path.basename(file).replace('.seq', '_clean.seq'), "genbank"):
if ' 16S ' in record.description and len(record.seq) > 800:
accession, length, species, strain, taxID, journal, source, host, country = get_values(record)
# --- Ecriture --- #
#if (source != "" or host != "" or country != "") and taxID in dict_ncbi :
if (source != "" or host != "") and taxID != 0 and taxID in dict_ncbi :
#outfile.write(accession + "\t" + str(length) + "\t" + species + "\t" + strain + "\t" + taxID + "\t" + journal + "\t" + source + "\t" + host + "\t" + country + " \n")
outfile.write(accession + "\t" + str(length) + "\t" + species + "\t" + strain + "\t" + taxID + "\t" + journal + "\t" + source + "\t" + host + " \n")
ok += 1
os.remove(os.path.basename(file).replace('.seq', '_clean.seq'))
print("=> Number of data collected : ", str(ok))