import os import sys import glob from Bio import SeqIO from datetime import date today = date.today() # /db/genbank/current/flat/gbbct1.seq # Criteres : 16S > 800pb # NCBI taxa reference # taxo_ref = "/projet/florilege/Florilege_GenBank/taxa+id_microorganisms.txt" # Functions def prep_ncbi(ncbi_file): with open(ncbi_file, 'r') as ncbi: dict_ncbi = {} count = 0 for line in ncbi: list_ncbi = line.split('\t') if list_ncbi[1].replace('ncbi:', '') not in dict_ncbi: dict_ncbi[list_ncbi[1].replace('ncbi:','')] = list_ncbi[2] count += 1 print("=> Nombre de taxID : " + str(count)) return dict_ncbi def clean_gb(gb_file): os.system("LC_CTYPE=C tr -d '\256' < " + file + " > " + os.path.basename(file).replace('.seq', '_clean.seq')) def get_values(record): accession = record.name length = len(record.seq) species = record.annotations["organism"] journal = record.annotations["references"][0].pubmed_id strain = host = source = country = '' for feature in record.features: if feature.type == 'source': # TaxID if 'db_xref' in feature.qualifiers: taxID = feature.qualifiers['db_xref'][0].replace('taxon:', '') else: taxID = 0 # Strain if 'strain' in feature.qualifiers: if strain == '': strain = feature.qualifiers['strain'][0] else: strain += '|' + feature.qualifiers['strain'][0] # Host if 'host' in feature.qualifiers: if host == '': host = feature.qualifiers['host'][0] else: host += '|' + feature.qualifiers['host'][0] # Isolation_source if 'isolation_source' in feature.qualifiers: if source == '': source = feature.qualifiers['isolation_source'][0] else: source += '|' + feature.qualifiers['isolation_source'][0] # Country if 'country' in feature.qualifiers: if country == '': country = feature.qualifiers['country'][0] else: country += '|' + feature.qualifiers['country'][0] return accession, length, species, strain, taxID, journal, source, host, country # Main if __name__ == '__main__': import sys from optparse import OptionParser, OptionGroup optparser = OptionParser(usage="%extract data from genbank bank") group = OptionGroup(optparser, "Main Options", "") optparser.add_option("-t", "--taxoref", default="None", dest="taxo_ref", help="", metavar="FILE") optparser.add_option("-d", "--dbpath", default="None", dest="db_path", help="", metavar="FILE") optparser.add_option("-o", "--fout", default="None", dest="f_out", help="tsv file that will contain the genbank extract", metavar="FILE") (options, args) = optparser.parse_args() dict_ncbi = prep_ncbi(options.taxo_ref) ok = 0 #with open('GenBank_extraction_'+today.strftime("%Y%m%d")+'.tsv', 'w') as outfile: with open(options.f_out, 'w') as outfile: # accession | length | species | strain | ncbi_taxid | journal | isolation_source #outfile.write("accession\tlength\tspecies\tstrain\tncbi_taxid\tjournal\tisolation_source\thost\tcountry\n") outfile.write("accession\tlength\tspecies\tstrain\tncbi_taxid\tjournal\tisolation_source\thost\n") f_bct = glob.glob(options.db_path + '/gbbct*.seq') f_inv = glob.glob(options.db_path + '/gbinv*.seq') f_vrl = glob.glob(options.db_path + '/gbvrl*.seq') #f_bct = glob.glob('/db/genbank/current/flat/gbbct*.seq') #f_inv = glob.glob('/db/genbank/current/flat/gbinv*.seq') #f_vrl = glob.glob('/db/genbank/current/flat/gbvrl*.seq') files = f_bct + f_inv + f_vrl #files = ['/db/genbank/current/flat/gbbct411.seq'] for file in files: clean_gb(file) for record in SeqIO.parse(os.path.basename(file).replace('.seq', '_clean.seq'), "genbank"): if ' 16S ' in record.description and len(record.seq) > 800: accession, length, species, strain, taxID, journal, source, host, country = get_values(record) # --- Ecriture --- # #if (source != "" or host != "" or country != "") and taxID in dict_ncbi : if (source != "" or host != "") and taxID != 0 and taxID in dict_ncbi : #outfile.write(accession + "\t" + str(length) + "\t" + species + "\t" + strain + "\t" + taxID + "\t" + journal + "\t" + source + "\t" + host + "\t" + country + " \n") outfile.write(accession + "\t" + str(length) + "\t" + species + "\t" + strain + "\t" + taxID + "\t" + journal + "\t" + source + "\t" + host + " \n") ok += 1 os.remove(os.path.basename(file).replace('.seq', '_clean.seq')) print("=> Number of data collected : ", str(ok))