diff --git a/ancillaries/extended-microorganisms-taxonomy/README b/ancillaries/extended-microorganisms-taxonomy/README
deleted file mode 100644
index 52a8a7668aae8f96165bfd58ff4875f00df570da..0000000000000000000000000000000000000000
--- a/ancillaries/extended-microorganisms-taxonomy/README
+++ /dev/null
@@ -1 +0,0 @@
-taxonomie : 21 octobre 2021 
diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile
index 733b679a802c62963c39f6c53b9ceb1d19bf3717..34d530e68130a5cbbff5eab8ff3f8c1c4f86c87d 100644
--- a/process_DSMZ_corpus.snakefile
+++ b/process_DSMZ_corpus.snakefile
@@ -1,6 +1,5 @@
 ## Processing data sources DSMZ
 
-## config file
 configfile: 'config/config.yaml'
 
 ## variables, check values into config file
@@ -20,62 +19,87 @@ _map_bacdive_taxid = config['MAP_BACDIVE_TAXID']
 ''' all
 '''
 rule all:
-	input:
-		'corpora/florilege/dsmz/dsmz-results.txt'
-		
+    input:
+        'corpora/florilege/dsmz/dsmz-results.txt'  # written by rule format_dsmz_results
+
+
+
+rule extract_sample_type:
+    output:
+        'corpora/dsmz/dsmz-data/sample_type.txt'
+    # the extraction script is itself declared as an input and is executed directly by the shell command
+    input:
+        script='softwares/scripts/bacdive-extract-isolation.py',
+        bacdive_entries='ancillaries/extended-microorganisms-taxonomy/bacdive-strains/entries'
+    # the standard output of the script (BacDive id <TAB> sample type) becomes the output file
+    shell: '''{input.script} {input.bacdive_entries} >{output}'''
+
 
 '''
 get habitats
 '''
 rule get_dsmz_habitat:
-	input:
-		file={_dsmz_habitat_corpus}
-	output:
-		habitats='corpora/dsmz/habitats.txt'
-	shell: """cut -f4 {input.file} |sort -u > {output.habitats}"""
+    input:
+        'corpora/dsmz/dsmz-data/sample_type.txt'
+    # sample_type.txt holds the BacDive id in column 1 and the sample type in column 2
+    output:
+        'corpora/dsmz/habitats.txt'
+    # keep the distinct values of column 2, sorted
+    shell: '''cut -f2 {input} |sort -u > {output}'''
 
 
 '''
 map habitats of microorganisms
 '''
 rule map_dsmz_habitats:
-	input:
-		habitats='corpora/dsmz/habitats.txt'
-	output:
-		mapped_habitats='corpora/dsmz/mapped_habitats.txt'
-	params:
-		plan='plans/map_habitats.plan',
-		onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
-		tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
-		graylist='ancillaries/graylist_extended.heads',
-		emptywords='ancillaries/stopwords_EN.ttg',
-		outdir='corpora/dsmz',
-		outfile='mapped_habitats.txt'
-	singularity:config["SINGULARITY_IMG"]
-	shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \
-	-alias input {input.habitats} \
+    input:
+        'corpora/dsmz/habitats.txt'
+
+    output:
+        'corpora/dsmz/mapped_habitats.txt'
+
+    params:
+        plan='plans/map_habitats.plan',
+        onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo',
+        tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap',
+        graylist='ancillaries/graylist_extended.heads',
+        emptywords='ancillaries/stopwords_EN.ttg',
+        outdir='corpora/dsmz',
+        outfile='mapped_habitats.txt'
+
+    singularity:
+        config["SINGULARITY_IMG"]
+    # call the alvisnlp found on PATH (as this rule did before), not a checkout in one user's home directory
+    shell: '''alvisnlp -J-Xmx32g -cleanTmp -verbose \
+    -alias input {input} \
     -outputDir {params.outdir} \
-	-alias output {params.outfile} \
-	-alias ontobiotope {params.onto} \
-	-xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \
-	{params.plan}
-	"""
+    -feat inhibit-syntax inhibit-syntax \
+    -alias output {params.outfile} \
+    -alias ontobiotope {params.onto} \
+    -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \
+    {params.plan}'''
 
 
 '''
 format results
 '''
 rule format_dsmz_results:
-	input:
-		habitats_file={_dsmz_habitat_corpus},
-		habitats='corpora/dsmz/mapped_habitats.txt'
-	output:
-		result='corpora/florilege/dsmz/dsmz-results.txt'
-	params:
-		taxa_id_full={_ncbi_taxo_id}',
-		bacdive_taxid={_map_bacdive_taxid},
-	conda: 'softwares/envs/obo-utils-env.yaml'
-	shell: 'python softwares/scripts/format-dsmz-results.py --bacdive-taxid {params.bacdive_taxid} --taxonomy {params.taxa_id_full} --dsmz-habitats {input.habitats_file} --mapped-habitats {input.habitats} > {output.result}'
+    input:
+        script='softwares/scripts/bacdive-format-results.py',
+        bacdive_sample_type='corpora/dsmz/dsmz-data/sample_type.txt',
+        mapped_habitats='corpora/dsmz/mapped_habitats.txt',
+        taxids='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt',
+        bacdive_to_taxid='ancillaries/extended-microorganisms-taxonomy/bacdive-match/bacdive-to-taxid.txt'
+    # the formatting script is an input too; it is invoked directly in the shell command below
+    output:
+        result='corpora/florilege/dsmz/dsmz-results.txt',
+        logfile='corpora/florilege/dsmz/dsmz-results.log'
+
+    conda:
+        'softwares/envs/obo-utils-env.yaml'
+    # stdout of the script is captured as the result table and stderr as the log file
+    shell:
+        '''{input.script} --bacdive-sample-type {input.bacdive_sample_type} --taxids {input.taxids} --bacdive-to-taxid {input.bacdive_to_taxid} --mapped-habitats {input.mapped_habitats} >{output.result} 2>{output.logfile}'''
 
 
 
diff --git a/softwares/scripts/bacdive-extract-isolation.py b/softwares/scripts/bacdive-extract-isolation.py
new file mode 100755
index 0000000000000000000000000000000000000000..e5bb350814b2698c629f3904887b3a2f951ca0de
--- /dev/null
+++ b/softwares/scripts/bacdive-extract-isolation.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+import logging
+import sys
+import json
+import os
+
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)
+Logger = logging.getLogger(__name__)
+
+# Walk the directory given as first argument and write to stdout one
+# "<BacDive-ID><TAB><sample type>" line per isolation record found in its *.json files.
+BACDIVE_ENTRIES = sys.argv[1]
+
+COUNT = {
+    'files': 0,
+    'entries': 0,
+    'entries without isolation': 0,
+    'isolations': 0
+}
+KEYS = set()  # union of the keys seen in isolation records, logged at the end
+Logger.info('reading BacDive entries in directory: %s', BACDIVE_ENTRIES)
+for dirpath, _, filenames in os.walk(BACDIVE_ENTRIES):
+    for fn in filenames:
+        if not fn.endswith('.json'):
+            continue
+        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on the locale.
+        with open(os.path.join(dirpath, fn), encoding='utf-8') as f:
+            j = json.load(f)
+        COUNT['files'] += 1
+        for entry in (j['results'] or {}).values():
+            bacdive_id = str(entry['General']['BacDive-ID'])
+            COUNT['entries'] += 1
+            section = entry.get('Isolation, sampling and environmental information') or {}
+            isolation = section.get('isolation') or []
+            if isinstance(isolation, dict):  # a lone record is not wrapped in a list
+                isolation = [isolation]
+            written = COUNT['isolations']
+            for iso in isolation:
+                KEYS |= set(iso.keys())
+                if 'sample type' in iso:  # a record without it no longer hides its siblings
+                    sys.stdout.write('%s\t%s\n' % (bacdive_id, ' '.join(iso['sample type'].split())))
+                    COUNT['isolations'] += 1
+            if COUNT['isolations'] == written:  # nothing was written for this entry
+                COUNT['entries without isolation'] += 1
+for k, v in COUNT.items():
+    Logger.info('%s: %d', k, v)
+Logger.info('isolation keys: %s', KEYS)
diff --git a/softwares/scripts/bacdive-format-results.py b/softwares/scripts/bacdive-format-results.py
new file mode 100755
index 0000000000000000000000000000000000000000..51437805816eedb38e6d98973c7e349671c92c68
--- /dev/null
+++ b/softwares/scripts/bacdive-format-results.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import logging
+import sys
+import argparse
+
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)
+Logger = logging.getLogger(__name__)
+
+
+class BacDiveFormatResult(argparse.ArgumentParser):
+    """Command-line tool that joins BacDive sample types with their taxon and mapped habitat."""
+
+    def __init__(self):
+        # The first positional parameter of ArgumentParser is `prog`; pass the description by keyword.
+        super().__init__(description='format DSMZ extraction results')
+        self.add_argument('--bacdive-sample-type', action='store', required=True, dest='bacdive_sample_type', help='TSV file: BacDive id, sample type')
+        self.add_argument('--taxids', action='store', required=True, dest='taxids', help='TSV file: taxid, name, path, rank')
+        self.add_argument('--bacdive-to-taxid', action='store', required=True, dest='bacdive_to_taxid', help='TSV file: BacDive id, taxid')
+        self.add_argument('--mapped-habitats', action='store', required=True, dest='mapped_habitats', help='TSV file: text, form, OBT id, name, path')
+
+    @staticmethod
+    def read_entries(filename, *args):
+        """Yield, for each non-blank line of a TSV file, a dict pairing the names in *args* with its columns."""
+        with open(filename, encoding='utf-8') as f:
+            for line in f:
+                # Trim each column on its own: strip() on the whole line would also eat
+                # empty leading or trailing columns and shift the remaining ones.
+                if line.strip():  # blank lines carry no record
+                    yield dict(zip(args, (col.strip() for col in line.split('\t'))))
+
+    def run(self):
+        """Parse the command line and write one tab-separated line to stdout per resolved sample type."""
+        args = self.parse_args()
+        bacdive_sample_type = list(self.read_entries(args.bacdive_sample_type, 'bacdive_id', 'habitat'))
+        taxid_microorganisms = {t['taxid']: t for t in self.read_entries(args.taxids, 'taxid', 'name', 'path', 'rank')}
+        bacdive_to_taxid = {b['bacdive_id']: b['taxid'] for b in self.read_entries(args.bacdive_to_taxid, 'bacdive_id', 'taxid')}
+        # NOTE(review): a text listed on several lines keeps only its last mapping -- confirm this is intended.
+        mapped_habitats = {m['text']: m for m in self.read_entries(args.mapped_habitats, 'text', 'form', 'obt_id', 'name', 'path')}
+        for bd in bacdive_sample_type:
+            bacdive_id = bd['bacdive_id']
+            if bacdive_id not in bacdive_to_taxid:
+                Logger.warning('no taxon for %s', bacdive_id)
+                continue
+            taxid = bacdive_to_taxid[bacdive_id]
+            if taxid not in taxid_microorganisms:
+                Logger.warning('could not find taxon info for %s', taxid)
+                continue
+            taxinfo = taxid_microorganisms[taxid]
+            habitat = bd['habitat']
+            if habitat not in mapped_habitats:
+                Logger.warning('no habitat found for %s (%s)', bacdive_id, habitat)
+                continue
+            habinfo = mapped_habitats[habitat]
+            # The taxon name fills the first two columns.
+            row = (taxinfo['name'], taxinfo['name'], taxinfo['taxid'], taxinfo['path'],
+                   habinfo['form'], habinfo['obt_id'], habinfo['name'], habinfo['path'], bacdive_id)
+            sys.stdout.write('\t'.join(row) + '\n')
+
+
+BACDIVE_SAMPLE_TYPE_FILE = 'corpora/dsmz/dsmz-data/sample_type.txt'  # NOTE(review): these four path constants are not referenced anywhere in this script
+TAXID_MICROORGANISMS_FILE = 'ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt'
+BACDIVE_TO_TAXID_FILE = 'ancillaries/extended-microorganisms-taxonomy/bacdive-match/bacdive-to-taxid.txt'
+MAPPED_HABITATS_FILE = 'corpora/dsmz/mapped_habitats.txt'  # (the same paths are passed on the command line by rule format_dsmz_results)
+
+if __name__ == '__main__':  # run the tool only when executed as a script
+    BacDiveFormatResult().run()
diff --git a/softwares/scripts/format-dsmz-results.py b/softwares/scripts/format-dsmz-results.py
deleted file mode 100644
index 44e5ba54415b301a52e6878dad396641c4cda03a..0000000000000000000000000000000000000000
--- a/softwares/scripts/format-dsmz-results.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import re
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--bacdive-taxid', action='store', default='ancillaries/ncbi-taxonomy-prefix/dsmz-match/bacdive-to-taxid.txt', help='Bacdive id to taxid mapping file')
-parser.add_argument('--dsmz-habitats', action='store', default='corpora/dsmz/dsmz-data/category=origin-key=sample_type.tsv', help='DSMZ habitat file')
-parser.add_argument('--taxonomy', action='store', default='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt', help='taxonomy file')
-parser.add_argument('--mapped-habitats', action='store', default='corpora/dsmz/mapped_habitats.txt', help='mapped habitat file')
-
-args = parser.parse_args()
-
-bacdive_taxid_file = args.bacdive_taxid
-taxonomy_file = args.taxonomy
-dsmz_habitat_file = args.dsmz_habitats
-mapped_habitat_file = args.mapped_habitats
-
-habitat_dict = {}
-hf = open(mapped_habitat_file, "r")
-for line in hf:
-    line = line.rstrip()
-    p = re.compile(r'([^\t]+)\t(\S.+)')
-    m = p.match(line)
-    if(m):
-        habitat = m.group(1)
-        habitat_info = m.group(2)
-        if habitat in habitat_dict:
-            habitat_dict[habitat].add(habitat_info)
-        else:
-            habitat_dict[habitat] = {habitat_info}
-hf.close()
-
-taxonomy = {}
-tax = open(taxonomy_file, "r")
-for num,line in enumerate(tax,1):
-    line = line.rstrip()
-    fields = line.split("\t")
-    taxid=fields[1]
-    if(not taxid in taxonomy):
-        taxonomy[taxid]={}
-        taxonomy[taxid]['name']=fields[2];
-        taxonomy[taxid]['path']=fields[3];
-tax.close()
-
-bacdiveTaxaIds = {}
-dtf = open(bacdive_taxid_file, "r")
-for num,line in enumerate(dtf,1):
-    line = line.rstrip()
-    bacdiveid,taxid = line.split("\t")
-    bacdiveTaxaIds[bacdiveid]=taxid;
-dtf.close()
-
-def add_entry(mappings, habitat, concepts, taxid, taxon, name, path, bacdiveid):
-    for concept in concepts:
-        surface_form, concept_id, concept_name, concept_path = concept.split('\t')
-        key = taxid+"-"+concept_id
-        if key in mappings:
-            mappings[key]['habitat']['surface'].add(surface_form)
-            mappings[key]['taxon']['surface'].add(taxon)
-            mappings[key]['bacdiveid'].add(bacdiveid)
-        else:
-            mappings[key] = {}
-            mappings[key]['habitat'] = {}
-            mappings[key]['taxon'] = {}
-            mappings[key]['bacdiveid'] = {bacdiveid}
-            mappings[key]['habitat']['concept_id'] = concept_id
-            mappings[key]['habitat']['concept_name'] = concept_name
-            mappings[key]['habitat']['concept_path'] = concept_path
-            mappings[key]['taxon']['taxid'] = taxid
-            mappings[key]['taxon']['canonical_name'] = name
-            mappings[key]['taxon']['path'] = path
-            mappings[key]['habitat']['surface'] = {surface_form}
-            mappings[key]['taxon']['surface'] = {taxon}
-    return mappings
-
-
-unique_mappings = {}
-dhf = open(dsmz_habitat_file, "r")
-for num, line in enumerate(dhf, 1):
-    if (num > 1):
-        line = line.rstrip()
-        fields = re.split("\t+", line)
-        if(len(fields) == 4):
-            bacdiveid = fields[0] 
-            habitat = re.sub(r'\s+', ' ', fields[3]).strip()
-            if bacdiveid in bacdiveTaxaIds:
-                taxid = bacdiveTaxaIds[bacdiveid]
-                if taxid in taxonomy:
-                    name = taxonomy[taxid]['name']
-                    path = taxonomy[taxid]['path']
-                    if habitat in habitat_dict:
-                        concepts = habitat_dict[habitat]
-                        unique_mappings = add_entry(unique_mappings, habitat, concepts, taxid, name, name, path, bacdiveid)
-dhf.close()
-
-for mapping in unique_mappings.values():
-   print("\t".join(('|'.join(mapping['taxon']['surface']),
-             mapping['taxon']['canonical_name'],
-             mapping['taxon']['taxid'],
-             mapping['taxon']['path'],
-             '|'.join(mapping['habitat']['surface']),
-             mapping['habitat']['concept_id'],
-             mapping['habitat']['concept_name'],
-             mapping['habitat']['concept_path'],
-             ','.join(mapping['bacdiveid']))
-   ))