diff --git a/ancillaries/extended-microorganisms-taxonomy/README b/ancillaries/extended-microorganisms-taxonomy/README
deleted file mode 100644
index 52a8a7668aae8f96165bfd58ff4875f00df570da..0000000000000000000000000000000000000000
--- a/ancillaries/extended-microorganisms-taxonomy/README
+++ /dev/null
@@ -1 +0,0 @@
-taxonomie : 21 octobre 2021
diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile
index 733b679a802c62963c39f6c53b9ceb1d19bf3717..34d530e68130a5cbbff5eab8ff3f8c1c4f86c87d 100644
--- a/process_DSMZ_corpus.snakefile
+++ b/process_DSMZ_corpus.snakefile
@@ -1,6 +1,5 @@
 ## Processing data sources DSMZ
 
-## config file
 configfile: 'config/config.yaml'
 
 ## variables, check values into config file
@@ -20,62 +19,87 @@ _map_bacdive_taxid = config['MAP_BACDIVE_TAXID']
 
 ''' all '''
 rule all:
-	input:
-		'corpora/florilege/dsmz/dsmz-results.txt'
-
+    input:
+        'corpora/florilege/dsmz/dsmz-results.txt'
+
+
+
+rule extract_sample_type:
+    output:
+        'corpora/dsmz/dsmz-data/sample_type.txt'
+
+    input:
+        script='softwares/scripts/bacdive-extract-isolation.py',
+        bacdive_entries='ancillaries/extended-microorganisms-taxonomy/bacdive-strains/entries'
+
+    shell: '''{input.script} {input.bacdive_entries} >{output}'''
+
 
 
 
 
 ''' get habitats '''
 rule get_dsmz_habitat:
-	input:
-		file={_dsmz_habitat_corpus}
-	output:
-		habitats='corpora/dsmz/habitats.txt'
-	shell: """cut -f4 {input.file} |sort -u > {output.habitats}"""
+    input:
+        'corpora/dsmz/dsmz-data/sample_type.txt'
+
+    output:
+        'corpora/dsmz/habitats.txt'
+
+    shell: '''cut -f2 {input} |sort -u > {output}'''
 
 
 
 
 ''' map habitats of microorganisms '''
 rule map_dsmz_habitats:
-	input:
-		habitats='corpora/dsmz/habitats.txt'
-	output:
-		mapped_habitats='corpora/dsmz/mapped_habitats.txt'
-	params:
-		plan='plans/map_habitats.plan',
-		onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
-		tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
-		graylist='ancillaries/graylist_extended.heads',
-		emptywords='ancillaries/stopwords_EN.ttg',
-		outdir='corpora/dsmz',
-		outfile='mapped_habitats.txt'
-	singularity:config["SINGULARITY_IMG"]
-	shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \
-	-alias input {input.habitats} \
+    input:
+        'corpora/dsmz/habitats.txt'
+
+    output:
+        'corpora/dsmz/mapped_habitats.txt'
+
+    params:
+        plan='plans/map_habitats.plan',
+        onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
+        tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
+        graylist='ancillaries/graylist_extended.heads',
+        emptywords='ancillaries/stopwords_EN.ttg',
+        outdir='corpora/dsmz',
+        outfile='mapped_habitats.txt'
+
+    singularity:
+        config["SINGULARITY_IMG"]
+
+    shell: '''alvisnlp -J-Xmx32g -cleanTmp -verbose \
+    -alias input {input} \
     -outputDir {params.outdir} \
-	-alias output {params.outfile} \
-	-alias ontobiotope {params.onto} \
-	-xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \
-	{params.plan}
-	"""
+    -feat inhibit-syntax inhibit-syntax \
+    -alias output {params.outfile} \
+    -alias ontobiotope {params.onto} \
+    -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \
+    {params.plan}'''
 
 
 
 
 ''' format results '''
 rule format_dsmz_results:
-	input:
-		habitats_file={_dsmz_habitat_corpus},
-		habitats='corpora/dsmz/mapped_habitats.txt'
-	output:
-		result='corpora/florilege/dsmz/dsmz-results.txt'
-	params:
-		taxa_id_full={_ncbi_taxo_id}',
-		bacdive_taxid={_map_bacdive_taxid},
-	conda: 'softwares/envs/obo-utils-env.yaml'
-	shell: 'python softwares/scripts/format-dsmz-results.py --bacdive-taxid {params.bacdive_taxid} --taxonomy {params.taxa_id_full} --dsmz-habitats {input.habitats_file} --mapped-habitats {input.habitats} > {output.result}'
+    input:
+        script='softwares/scripts/bacdive-format-results.py',
+        bacdive_sample_type='corpora/dsmz/dsmz-data/sample_type.txt',
+        mapped_habitats='corpora/dsmz/mapped_habitats.txt',
+        taxids='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt',
+        bacdive_to_taxid='ancillaries/extended-microorganisms-taxonomy/bacdive-match/bacdive-to-taxid.txt'
+
+    output:
+        result='corpora/florilege/dsmz/dsmz-results.txt',
+        logfile='corpora/florilege/dsmz/dsmz-results.log'
+
+    conda:
+        'softwares/envs/obo-utils-env.yaml'
+
+    shell:
+        '''{input.script} --bacdive-sample-type {input.bacdive_sample_type} --taxids {input.taxids} --bacdive-to-taxid {input.bacdive_to_taxid} --mapped-habitats {input.mapped_habitats} >{output.result} 2>{output.logfile}'''
 
 
diff --git a/softwares/scripts/bacdive-extract-isolation.py b/softwares/scripts/bacdive-extract-isolation.py
new file mode 100755
index 0000000000000000000000000000000000000000..e5bb350814b2698c629f3904887b3a2f951ca0de
--- /dev/null
+++ b/softwares/scripts/bacdive-extract-isolation.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Extract the isolation sample types from BacDive JSON entries.
+
+Usage: bacdive-extract-isolation.py BACDIVE_ENTRIES_DIR
+
+Walks BACDIVE_ENTRIES_DIR recursively, reads every *.json file and writes one
+"BacDive-ID<TAB>sample type" line per isolation record that has a sample type
+to standard output. Counters and the set of isolation keys seen are logged.
+"""
+
+import logging
+import sys
+import json
+import os
+
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)
+Logger = logging.getLogger(__name__)
+
+
+BACDIVE_ENTRIES = sys.argv[1]
+
+
+COUNT = {
+    'files': 0,
+    'entries': 0,
+    'entries without isolation': 0,
+    'isolations': 0
+}
+KEYS = set()
+Logger.info('reading BacDive entries in directory: %s' % BACDIVE_ENTRIES)
+for dirpath, _, filenames in os.walk(BACDIVE_ENTRIES):
+    for fn in filenames:
+        if not fn.endswith('.json'):
+            continue
+        with open(os.path.join(dirpath, fn), encoding='utf-8') as f:
+            COUNT['files'] += 1
+            j = json.load(f)
+        res = j['results']
+        if not res:
+            continue
+        for entry in res.values():
+            bacdive_id = str(entry['General']['BacDive-ID'])
+            COUNT['entries'] += 1
+            try:
+                isolation = entry['Isolation, sampling and environmental information']['isolation']
+            except KeyError:
+                COUNT['entries without isolation'] += 1
+                continue
+            if isinstance(isolation, dict):
+                # a single isolation record is not wrapped in a list
+                isolation = [isolation]
+            found = False
+            for iso in isolation:
+                KEYS |= set(iso.keys())
+                # a record lacking a sample type must not discard the other records of the entry
+                sample_type = ' '.join((iso.get('sample type') or '').split())
+                if not sample_type:
+                    continue
+                sys.stdout.write('%s\t%s\n' % (bacdive_id, sample_type))
+                COUNT['isolations'] += 1
+                found = True
+            if not found:
+                COUNT['entries without isolation'] += 1
+for k, v in COUNT.items():
+    Logger.info('%s: %d' % (k, v))
+Logger.info('isolation keys: %s' % str(KEYS))
diff --git a/softwares/scripts/bacdive-format-results.py b/softwares/scripts/bacdive-format-results.py
new file mode 100755
index 0000000000000000000000000000000000000000..51437805816eedb38e6d98973c7e349671c92c68
--- /dev/null
+++ b/softwares/scripts/bacdive-format-results.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Join BacDive sample types with taxon and habitat concept information.
+
+Writes one tab-separated line to standard output for each sample type and
+each habitat concept it is mapped to. Sample types whose BacDive id, taxon
+or habitat cannot be resolved are skipped with a logged warning.
+"""
+
+import logging
+import sys
+import argparse
+import collections
+
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)
+Logger = logging.getLogger(__name__)
+
+
+class BacDiveFormatResult(argparse.ArgumentParser):
+    """Command-line tool that formats the DSMZ extraction results."""
+
+    def __init__(self):
+        # the first positional parameter of ArgumentParser is prog: pass the description by keyword
+        argparse.ArgumentParser.__init__(self, description='format DSMZ extraction results')
+        self.add_argument('--bacdive-sample-type', action='store', required=True, dest='bacdive_sample_type', help='tab-separated file with columns: bacdive_id, habitat')
+        self.add_argument('--taxids', action='store', required=True, dest='taxids', help='tab-separated file with columns: taxid, name, path, rank')
+        self.add_argument('--bacdive-to-taxid', action='store', required=True, dest='bacdive_to_taxid', help='tab-separated file with columns: bacdive_id, taxid')
+        self.add_argument('--mapped-habitats', action='store', required=True, dest='mapped_habitats', help='tab-separated file with columns: text, form, obt_id, name, path')
+
+    @staticmethod
+    def read_entries(filename, *args):
+        """Yield one dict per line of a tab-separated file.
+
+        Column values are bound to the names given in args, in order; missing
+        trailing columns are set to the empty string, extra columns are dropped.
+        """
+        with open(filename, encoding='utf-8') as f:
+            for line in f:
+                # strip() would also remove empty leading and trailing columns
+                cols = line.rstrip('\r\n').split('\t')
+                cols.extend([''] * (len(args) - len(cols)))
+                yield dict(zip(args, cols))
+
+    def run(self):
+        """Parse the command line, load the input files and write the joined lines."""
+        args = self.parse_args()
+        bacdive_sample_type = list(BacDiveFormatResult.read_entries(args.bacdive_sample_type, 'bacdive_id', 'habitat'))
+        taxid_microorganisms = dict((taxon['taxid'], taxon) for taxon in BacDiveFormatResult.read_entries(args.taxids, 'taxid', 'name', 'path', 'rank'))
+        bacdive_to_taxid = dict((bdt['bacdive_id'], bdt['taxid']) for bdt in BacDiveFormatResult.read_entries(args.bacdive_to_taxid, 'bacdive_id', 'taxid'))
+        # a habitat text may be mapped to several concepts: keep all of them
+        mapped_habitats = collections.defaultdict(list)
+        for mh in BacDiveFormatResult.read_entries(args.mapped_habitats, 'text', 'form', 'obt_id', 'name', 'path'):
+            mapped_habitats[mh['text']].append(mh)
+        for bd in bacdive_sample_type:
+            bacdive_id = bd['bacdive_id']
+            if bacdive_id not in bacdive_to_taxid:
+                Logger.warning('no taxon for %s' % bacdive_id)
+                continue
+            taxid = bacdive_to_taxid[bacdive_id]
+            if taxid not in taxid_microorganisms:
+                Logger.warning('could not find taxon info for %s' % taxid)
+                continue
+            taxinfo = taxid_microorganisms[taxid]
+            habitat = bd['habitat']
+            if habitat not in mapped_habitats:
+                Logger.warning('no habitat found for %s (%s)' % (bacdive_id, habitat))
+                continue
+            for habinfo in mapped_habitats[habitat]:
+                sys.stdout.write('\t'.join((
+                    taxinfo['name'],
+                    taxinfo['name'],
+                    taxinfo['taxid'],
+                    taxinfo['path'],
+                    habinfo['form'],
+                    habinfo['obt_id'],
+                    habinfo['name'],
+                    habinfo['path'],
+                    bacdive_id
+                )))
+                sys.stdout.write('\n')
+
+
+if __name__ == '__main__':
+    BacDiveFormatResult().run()
diff --git a/softwares/scripts/format-dsmz-results.py b/softwares/scripts/format-dsmz-results.py
deleted file mode 100644
index 44e5ba54415b301a52e6878dad396641c4cda03a..0000000000000000000000000000000000000000
--- a/softwares/scripts/format-dsmz-results.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import re
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--bacdive-taxid', action='store', default='ancillaries/ncbi-taxonomy-prefix/dsmz-match/bacdive-to-taxid.txt', help='Bacdive id to taxid mapping file')
-parser.add_argument('--dsmz-habitats', action='store', default='corpora/dsmz/dsmz-data/category=origin-key=sample_type.tsv', help='DSMZ habitat file')
-parser.add_argument('--taxonomy', action='store', default='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt', help='taxonomy file')
-parser.add_argument('--mapped-habitats', action='store', default='corpora/dsmz/mapped_habitats.txt', help='mapped habitat file')
-
-args = parser.parse_args()
-
-bacdive_taxid_file = args.bacdive_taxid
-taxonomy_file = args.taxonomy
-dsmz_habitat_file = args.dsmz_habitats
-mapped_habitat_file = args.mapped_habitats
-
-habitat_dict = {}
-hf = open(mapped_habitat_file, "r")
-for line in hf:
-    line = line.rstrip()
-    p = re.compile(r'([^\t]+)\t(\S.+)')
-    m = p.match(line)
-    if(m):
-        habitat = m.group(1)
-        habitat_info = m.group(2)
-        if habitat in habitat_dict:
-            habitat_dict[habitat].add(habitat_info)
-        else:
-            habitat_dict[habitat] = {habitat_info}
-hf.close()
-
-taxonomy = {}
-tax = open(taxonomy_file, "r")
-for num,line in enumerate(tax,1):
-    line = line.rstrip()
-    fields = line.split("\t")
-    taxid=fields[1]
-    if(not taxid in taxonomy):
-        taxonomy[taxid]={}
-    taxonomy[taxid]['name']=fields[2];
-    taxonomy[taxid]['path']=fields[3];
-tax.close()
-
-bacdiveTaxaIds = {}
-dtf = open(bacdive_taxid_file, "r")
-for num,line in enumerate(dtf,1):
-    line = line.rstrip()
-    bacdiveid,taxid = line.split("\t")
-    bacdiveTaxaIds[bacdiveid]=taxid;
-dtf.close()
-
-def add_entry(mappings, habitat, concepts, taxid, taxon, name, path, bacdiveid):
-    for concept in concepts:
-        surface_form, concept_id, concept_name, concept_path = concept.split('\t')
-        key = taxid+"-"+concept_id
-        if key in mappings:
-            mappings[key]['habitat']['surface'].add(surface_form)
-            mappings[key]['taxon']['surface'].add(taxon)
-            mappings[key]['bacdiveid'].add(bacdiveid)
-        else:
-            mappings[key] = {}
-            mappings[key]['habitat'] = {}
-            mappings[key]['taxon'] = {}
-            mappings[key]['bacdiveid'] = {bacdiveid}
-            mappings[key]['habitat']['concept_id'] = concept_id
-            mappings[key]['habitat']['concept_name'] = concept_name
-            mappings[key]['habitat']['concept_path'] = concept_path
-            mappings[key]['taxon']['taxid'] = taxid
-            mappings[key]['taxon']['canonical_name'] = name
-            mappings[key]['taxon']['path'] = path
-            mappings[key]['habitat']['surface'] = {surface_form}
-            mappings[key]['taxon']['surface'] = {taxon}
-    return mappings
-
-
-unique_mappings = {}
-dhf = open(dsmz_habitat_file, "r")
-for num, line in enumerate(dhf, 1):
-    if (num > 1):
-        line = line.rstrip()
-        fields = re.split("\t+", line)
-        if(len(fields) == 4):
-            bacdiveid = fields[0]
-            habitat = re.sub(r'\s+', ' ', fields[3]).strip()
-            if bacdiveid in bacdiveTaxaIds:
-                taxid = bacdiveTaxaIds[bacdiveid]
-                if taxid in taxonomy:
-                    name = taxonomy[taxid]['name']
-                    path = taxonomy[taxid]['path']
-                    if habitat in habitat_dict:
-                        concepts = habitat_dict[habitat]
-                        unique_mappings = add_entry(unique_mappings, habitat, concepts, taxid, name, name, path, bacdiveid)
-dhf.close()
-
-for mapping in unique_mappings.values():
-    print("\t".join(('|'.join(mapping['taxon']['surface']),
-                     mapping['taxon']['canonical_name'],
-                     mapping['taxon']['taxid'],
-                     mapping['taxon']['path'],
-                     '|'.join(mapping['habitat']['surface']),
-                     mapping['habitat']['concept_id'],
-                     mapping['habitat']['concept_name'],
-                     mapping['habitat']['concept_path'],
-                     ','.join(mapping['bacdiveid']))
-    ))