diff --git a/plans/entities.plan b/plans/entities.plan index 8523ddb14c67d45b2edc40e07b71539e55ef52cd..f98ea1c1a7e009f4d9006ca56f147057fbb3ad01 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -17,10 +17,6 @@ <param name="input-xslt"> <alias module="read.pubmed" param="xslTransform"/> </param> - - <param name="batch"> - <alias module="read.pubmed" param="constantDocumentFeatures"/> - </param> <param name="outputDir"> <alias module="output.doc-mesh" param="outDir"/> @@ -143,7 +139,8 @@ <param name="NCBI_taxa_ontobiotope"> <alias module="habitats.tomap-habitats.map-living-organisms" param="mappingFile"/> </param> - + + <!-- habitats--> <param name="ontobiotope-habitat"> <alias module="habitats.tomap-habitats.concept-names" param="oboFiles"/> @@ -198,15 +195,14 @@ <read> <pubmed class="XMLReader"> - <sourcePath>corpora/pubmed/batches/0001/batch.xml</sourcePath> - <xslTransform>ancillaries/microbes-2019-pubmed2alvisnlp.xslt</xslTransform> - <constantDocumentFeatures>batch=0001</constantDocumentFeatures> + <sourcePath>corpora/&corpus;/batches/&batch;/batch.xml</sourcePath> + <xslTransform>ancillaries/&corpus;-pubmed2alvisnlp.xslt</xslTransform> </pubmed> <bionlp-st class="BioNLPSTReader"> <active>true</active> <sectionName>abstract</sectionName> - <textDir>corpora/BioNLP-OST-2019/batches/BB19-kb+ner/bionlp-st</textDir> + <textDir>corpora/&corpus;/batches/&batch;/bionlp-st</textDir> </bionlp-st> </read> @@ -408,7 +404,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> - <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> + <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> <localeDir>ancillaries/YaTeA/locale</localeDir> @@ -419,7 +415,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> - <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> + <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <lemmaFeature>variant</lemmaFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -673,12 +669,12 @@ <output> <doc-mesh class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"doc-mesh.txt"</fileName> <lines>documents.sections:mesh</lines> <columns separator=";"> - document.@batch; + "&batch;"; document.@id; @UI; @mesh-name; @@ -687,12 +683,12 @@ </doc-mesh> <taxa class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"taxa.txt"</fileName> <lines>documents.sections.layer:taxa</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -706,12 +702,12 @@ </taxa> <microorganisms class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"microorganisms.txt"</fileName> <lines>documents.sections.layer:microorganism</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -725,12 +721,12 @@ </microorganisms> <microorganisms-short class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"microorganisms-short.txt"</fileName> <lines>documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form]</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -744,12 +740,12 @@ </microorganisms-short> <bacteria class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"bacteria.txt"</fileName> <lines>documents.sections.layer:bacteria</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -763,12 +759,12 @@ </bacteria> <habitats class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"habitats.txt"</fileName> <lines>documents.sections.layer:habitats</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -784,12 +780,12 @@ </habitats> <phenotypes class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"phenotypes.txt"</fileName> <lines>documents.sections.layer:phenotypes</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -805,12 +801,12 @@ </phenotypes> <uses class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"uses.txt"</fileName> <lines>documents.sections.layer:uses</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -826,12 +822,12 @@ </uses> <geo class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"geo.txt"</fileName> <lines>documents.sections.layer:Geographical</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; start ^ "-" ^ end; @@ -841,12 +837,12 @@ </geo> <relations class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"relations.txt"</fileName> <lines>documents.sections.relations:CooccurrenceLocalization.tuples</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; args:Bacterium.@taxid; @@ -864,12 +860,12 @@ </relations> <relations-pheno class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"phenotype-relations.txt"</fileName> <lines>documents.sections.relations:PhenotypeRelation.tuples</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -887,12 +883,12 @@ </relations-pheno> <relations-use class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"uses-relations.txt"</fileName> <lines>documents.sections.relations:UseRelation.tuples</lines> <columns separator=";"> - document.@batch; + "&batch;"; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -978,7 +974,7 @@ </index-sentences> <sentences class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"sentences.txt"</fileName> <lines>documents.sections.layer:sentences[@name != "author"]</lines> @@ -999,7 +995,7 @@ </sentences> <anaphora class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"anaphora.txt"</fileName> <lines>documents.sections.relations:coreferences.tuples[args:Ante]</lines> @@ -1032,7 +1028,7 @@ </anaphora> <dependencies class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"dependencies.txt"</fileName> <lines>documents.sections[@name != "author"].relations:dependencies.tuples</lines> @@ -1251,11 +1247,11 @@ </habitat-ancestors> <index class="AlvisDBIndexer"> - <indexDir>adb</indexDir> + <indexDir>corpora/&corpus;/batches/&batch;/adb</indexDir> <elements> <relations> <items>documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]]</items> - <id>section.document.@batch ^ id:unique</id> + <id>"&batch;_" ^ id:unique</id> <name>"Localization"</name> <type>"localization"</type> <args>args:Bacterium|args:Localization</args> @@ -1271,7 +1267,7 @@ </adb> <index class="AlvisIRIndexer"> - <indexDir>index</indexDir> + <indexDir>corpora/&corpus;/batches/&batch;/index</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1408,8 +1404,8 @@ </documents> </index> -<!-- <index-food class="AlvisIRIndexer"> - <indexDir>.</indexDir> + <index-food class="AlvisIRIndexer"> + <indexDir>corpora/&corpus;/batches/&batch;/index-food</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1542,7 +1538,7 @@ <keyword>document.@url</keyword> </fields> </documents> - </index-food>--> + </index-food> <!-- HTML visualization --> <add-feature class="Action"> @@ -1567,14 +1563,14 @@ </add-feature3> <html class="QuickHTML"> <active>false</active> - <outDir>./html</outDir> + <outDir>corpora/&corpus;/batches/&batch;/html</outDir> <classFeature>ne-type</classFeature> <layers>phenotypes,microorganism,habitats</layers> <colors>#99cc00,#ffcc99,#ffd333,#ffd666</colors> </html> <words class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"words.txt"</fileName> <lines>documents.sections[@name == "title" or @name == "abstract"].layer:words</lines> @@ -1586,7 +1582,7 @@ <bionlp-st-a2> <habitats class="TabularExport"> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats</lines> @@ -1599,7 +1595,7 @@ <phenotypes class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:phenotypes</lines> @@ -1612,7 +1608,7 @@ <microorganisms class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:microorganism</lines> @@ -1625,7 +1621,7 @@ <obt class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats|layer:phenotypes</lines> @@ -1637,7 +1633,7 @@ <taxid class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:microorganism</lines> @@ -1649,7 +1645,7 @@ <lives-in class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""]</lines> @@ -1661,7 +1657,7 @@ <exhibits class="TabularExport"> <append/> - <outDir>a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:PhenotypeRelation.tuples</lines> @@ -1674,7 +1670,7 @@ <success class="TabularExport"> - <outDir>.</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"success.txt"</fileName> <lines>documents</lines> diff --git a/plans/map_habitats.plan b/plans/map_habitats.plan index 5aff349486ac1d6145c03930300193eeb36b05e4..07777780e184fa6ede0bf8d55785a7e3a463d7f9 100644 --- a/plans/map_habitats.plan +++ b/plans/map_habitats.plan @@ -168,7 +168,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> - <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> + <xmlTermsFile>ancillaries/yatea/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> <localeDir>ancillaries/YaTeA/locale</localeDir> @@ -178,7 +178,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> - <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> + <xmlTermsFile>ancillaries/yatea-var/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <lemmaFeature>variant</lemmaFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -258,7 +258,7 @@ <setFeatures/> </add-score> - <tomap-habitats file="plans/tomap-habitats.plan"/> + <tomap-habitats file="plans/tomap-habitats-generic.plan"/> <!-- <remove-living-org-overlapping-geo class="Action"> --> <!-- <target>documents.sections.layer:habitats[@concept-path ?= "OBT:000002" and span:Geographical]</target> --> diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index da458fd1932cceace194e9cae09134bd2bf10027..46bbb98d0e1a2aadb023988465071097e7a11a72 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -3,18 +3,18 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats</target> <form>@concept-id</form> @@ -26,18 +26,18 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma2" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats2</target> <form>@concept-id</form> @@ -58,17 +58,17 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats3</target> <form>@concept-id</form> @@ -89,18 +89,18 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea-var/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> <targetLayerName>habitats4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="variant" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats4</target> <form>@concept-id</form> @@ -121,17 +121,17 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> <subject feature="form" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats5</target> <form>@concept-id</form> @@ -196,7 +196,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile output-feed="yes">yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> @@ -388,20 +388,20 @@ <removeFromLayer/> </delete-not-selected> - <!-- <rule-for-food-corpus> - <tag-overlap-food class="Action"> --> +<rule-for-food-corpus> + <tag-overlap-food class="Action"> <!-- <target>documents.sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> --> - <!-- <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> + <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> <action>set:feat:overlap-food(span:habitats[@score == target.@score and @concept-path =~ "OBT:000008/"])</action> <setFeatures/> </tag-overlap-food> <remove-overlap-food class="Action"> - <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/" and not @overlap-food == ""]</target>--> + <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/" and not @overlap-food == ""]</target> <!-- <target>documents.sections.layer:habitats[not @concept-path =~ "OBT:000008/" and not @overlap-food == ""]</target> --> - <!-- <action>remove:habitats</action> + <action>remove:habitats</action> <removeFromLayer/> </remove-overlap-food> -</rule-for-food-corpus>--> +</rule-for-food-corpus> <!-- Keep only the highest scored concepts --> <keep-highest class="Action"> @@ -714,7 +714,7 @@ <!-- Add concept-path in case some are missing --> <concept-path class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> + <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats</target> <form>@concept-id</form> diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index f55357aa56a7004280b8a5bae6cef17a501a97fe..17481643bb3b90c66ab16caa337ee600eb2f1f0f 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -2,18 +2,18 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes</target> <form>@concept-id</form> @@ -25,18 +25,18 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma2" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes2</target> <form>@concept-id</form> @@ -57,17 +57,17 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes3</target> <form>@concept-id</form> @@ -89,18 +89,18 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea-var/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> <targetLayerName>phenotypes4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="variant" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes4</target> <form>@concept-id</form> @@ -121,17 +121,17 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> <subject feature="form" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes5</target> <form>@concept-id</form> @@ -166,7 +166,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile output-feed="yes">yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> @@ -394,7 +394,7 @@ <!-- <idFeature>molecule-id</idFeature> --> <!-- <nameFeature>molecule-name</nameFeature> --> <!-- <pathFeature>molecule-path</pathFeature> --> - <!-- <oboFiles>ancillaries/BioNLP-OST+EnovFood-Molecule.obo</oboFiles> --> + <!-- <oboFiles>&ontobiotope;-Molecule.obo</oboFiles> --> <!-- <subject feature="lemma" layer="words"/> --> <!-- <targetLayerName>molecules</targetLayerName> --> <!-- </molecule-projection-on-lemmas> --> @@ -404,7 +404,7 @@ <!-- <idFeature>molecule-id</idFeature> --> <!-- <nameFeature>molecule-name</nameFeature> --> <!-- <pathFeature>molecule-path</pathFeature> --> - <!-- <oboFiles>ancillaries/BioNLP-OST+EnovFood-Molecule.obo</oboFiles> --> + <!-- <oboFiles>&ontobiotope;-Molecule.obo</oboFiles> --> <!-- <subject feature="form" layer="words"/> --> <!-- <targetLayerName>molecules2</targetLayerName> --> <!-- </molecule-projection-on-words> --> @@ -469,7 +469,7 @@ <!-- Add concept-path in case some are missing --> <concept-path class="OBOMapper"> - <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> + <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes</target> <form>@concept-id</form> diff --git a/plans/use-extraction.plan b/plans/use-extraction.plan index 87195d8d5434aaa4ac887c01765ea83e3c1d4a05..5d12302993b0490526cb682ad0934cca5820bda1 100644 --- a/plans/use-extraction.plan +++ b/plans/use-extraction.plan @@ -2,7 +2,7 @@ <alvisnlp-plan id="Use-extraction"> <exact-match class="OBOProjector"> - <oboFiles>ancillaries/Use_V2.obo</oboFiles> + <oboFiles>&ontobiotope-use;.obo</oboFiles> <targetLayerName>uses</targetLayerName> <subject feature="form" layer="words"/> <idFeature>concept-id</idFeature> @@ -12,7 +12,7 @@ </exact-match> <exact-match-2 class="OBOProjector"> - <oboFiles>ancillaries/Use_V2.obo</oboFiles> + <oboFiles>&ontobiotope-use;.obo</oboFiles> <targetLayerName>uses2</targetLayerName> <subject feature="lemma" layer="words"/> <idFeature>concept-id</idFeature> diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index 629e84b58eea01802d1b2c754c720a8783456ecc..a237c0749b04326ee1aae3462fb93dfee352bd44 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -41,14 +41,9 @@ rule run_bionlp_prediction: params: batch="{B}", corpus='BioNLP-OST-2019', - inhibitSyntax='inhibit-syntax', - onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', - tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', - onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', - tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', - graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - ontobiotopeUse='ancillaries/Use_V2.obo', + inhibitSyntax='inhibit-syntax', + onto='ancillaries/BioNLP-OST+EnovFood', + ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', dir='corpora/BioNLP-OST-2019/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -59,14 +54,14 @@ rule run_bionlp_prediction: -log {log} \ -alias format bionlp-st \ -alias input-dir {input.dir} \ - -outputDir {params.dir} \ + -alias input-xslt {input.xslt} \ + -alias outputDir {params.dir} \ -environmentEntities \ + -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -alias ontobiotope-habitat {params.onto_habitat} \ - -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ - -alias ontobiotope-phenotypes {params.onto_pheno} \ - -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ - -alias ontobiotope-use {params.ontobiotopeUse} \ + -entity ontobiotope {params.onto} \ + -entity ontobiotope-use {params.ontobiotopeUse} \ + -entity batch {params.batch} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan} diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index f9f49e74a7c963826b96a1eed27b4ebbecd48f8c..2b172cf78ac91f2f0387201244ff924c6a48a7ee 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -24,8 +24,8 @@ rule get_cirm_bia_taxa_habitats: strain_index='1', habitat_index='15' output: - taxa='corpora/cirm/bia/bia_taxa.txt', - habitats='corpora/cirm/bia/bia_habitats.txt', + taxa='corpora/cirm/bia_taxa.txt', + habitats='corpora/cirm/bia_habitats.txt', tsv='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -42,8 +42,8 @@ rule get_cirm_yeast_taxa_habitats: taxa_index='1', habitat_index='10,11' output: - taxa='corpora/cirm/levures/yeast_taxa.txt', - habitats='corpora/cirm/levures/yeast_habitats.txt', + taxa='corpora/cirm/yeast_taxa.txt', + habitats='corpora/cirm/yeast_habitats.txt', tsv='corpora/cirm/Levures_2021/Florilege_21012021.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -61,8 +61,8 @@ rule get_cirm_cfbp_taxa_habitats: strain_index='1', habitat_index='6,10,13,14' output: - taxa='corpora/cirm/cfbp/cfbp_taxa.txt', - habitats='corpora/cirm/cfbp/cfbp_habitats.txt', + taxa='corpora/cirm/cfbp_taxa.txt', + habitats='corpora/cirm/cfbp_habitats.txt', tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -75,9 +75,9 @@ map microorganisms ''' rule map_cirm_bia_microorganisms: input: - taxa='corpora/cirm/bia/bia_taxa.txt' + taxa='corpora/cirm/bia_taxa.txt' output: - mapped_taxaids='corpora/cirm/bia/mapped_bia_taxa.txt' + mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -96,9 +96,9 @@ map microorganisms (CIRM Levures) ''' rule map_cirm_yeast_microorganisms: input: - taxa='corpora/cirm/levures/yeast_taxa.txt' + taxa='corpora/cirm/yeast_taxa.txt' output: - mapped_taxaids='corpora/cirm/levures/mapped_yeast_taxa.txt' + mapped_taxaids='corpora/cirm/mapped_yeast_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -118,9 +118,9 @@ map microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_microorganisms: input: - taxa='corpora/cirm/cfbp/cfbp_taxa.txt' + taxa='corpora/cirm/cfbp_taxa.txt' output: - mapped_taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt' + mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -139,22 +139,19 @@ map habitats of microorganisms ''' rule map_cirm_habitats: input: - habitats='corpora/cirm/bia/bia_habitats.txt' + habitats='corpora/cirm/bia_habitats.txt' output: - mapped_habitats='corpora/cirm/bia/mapped_bia_habitats.txt' + mapped_habitats='corpora/cirm/mapped_bia_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - outdir='corpora/cirm/bia', - outfile='mapped_bia_habitats.txt' + emptywords='ancillaries/stopwords_EN.ttg' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -outputDir {params.outdir} \ - -alias output {params.outfile} \ + -alias output {output.mapped_habitats} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -165,22 +162,19 @@ map habitats of microorganisms (CIRM Levures) ''' rule map_cirm_yeast_habitats: input: - habitats='corpora/cirm/levures/yeast_habitats.txt' + habitats='corpora/cirm/yeast_habitats.txt' output: - mapped_habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' + mapped_habitats='corpora/cirm/mapped_yeast_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - outdir='corpora/cirm/levures', - outfile='mapped_yeast_habitats.txt' + emptywords='ancillaries/stopwords_EN.ttg' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -outputDir {params.outdir} \ - -alias output {params.outfile} \ + -alias output {output.mapped_habitats} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -191,22 +185,19 @@ map habitats of microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_habitats: input: - habitats='corpora/cirm/cfbp/cfbp_habitats.txt' + habitats='corpora/cirm/cfbp_habitats.txt' output: - mapped_habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' + mapped_habitats='corpora/cirm/mapped_cfbp_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - outdir='corpora/cirm/cfbp', - outfile='mapped_cfbp_habitats.txt' + emptywords='ancillaries/stopwords_EN.ttg' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -outputDir {params.outdir} \ - -alias output {params.outfile} \ + -alias output {output.mapped_habitats} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -218,8 +209,8 @@ format results rule format_cirm_results: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv', - taxa='corpora/cirm/bia/mapped_bia_taxa.txt', - habitats='corpora/cirm/bia/mapped_bia_habitats.txt' + taxa='corpora/cirm/mapped_bia_taxa.txt', + habitats='corpora/cirm/mapped_bia_habitats.txt' output: result='corpora/florilege/cirm/cirm-bia-results.txt' params: @@ -235,8 +226,8 @@ format results (CIRM Levures) rule format_cirm_yeast_results: input: file='corpora/cirm/Levures_2021/Florilege_21012021.tsv', - taxa='corpora/cirm/levures/mapped_yeast_taxa.txt', - habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' + taxa='corpora/cirm/mapped_yeast_taxa.txt', + habitats='corpora/cirm/mapped_yeast_habitats.txt' output: result='corpora/florilege/cirm/cirm-yeast-results.txt' params: @@ -251,8 +242,8 @@ format results (CIRM CFBP) rule format_cirm_cfbp_results: input: file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv', - taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt', - habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' + taxa='corpora/cirm/mapped_cfbp_taxa.txt', + habitats='corpora/cirm/mapped_cfbp_habitats.txt' output: result='corpora/florilege/cirm/cirm-cfbp-results.txt' params: diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile index 8fde651932d3c4255ef92b8bbbfb11f822308cf3..b4fa67fa3a0830be87fa78e37a69270333359c55 100644 --- a/process_DSMZ_corpus.snakefile +++ b/process_DSMZ_corpus.snakefile @@ -35,14 +35,11 @@ rule map_dsmz_habitats: onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - outdir='corpora/dsmz', - outfile='mapped_habitats.txt' + emptywords='ancillaries/stopwords_EN.ttg' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -outputDir {params.outdir} \ - -alias output {params.outfile} \ + -alias output {output.mapped_habitats} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} diff --git a/process_GenBank_corpus.snakefile b/process_GenBank_corpus.snakefile index 888835e413b1864c216cebcf38a65355270f103e..388f9ab1da3af0ffa392a81c44802b85b812d21b 100644 --- a/process_GenBank_corpus.snakefile +++ b/process_GenBank_corpus.snakefile @@ -79,14 +79,11 @@ rule map_genbank_habitats: tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', emptywords='ancillaries/stopwords_EN.ttg', - inhibitSyntax='inhibit-syntax', - outdir='corpora/genbank', - outfile='mapped_habitats.txt' + inhibitSyntax='inhibit-syntax' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -outputDir {params.outdir} \ - -alias output {params.outfile} \ + -alias output {output.mapped_habitats}\ -alias ontobiotope {params.onto} \ -feat inhibit-syntax {params.inhibitSyntax} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 5710034a3d3862750f5fcff9027c2a2411ed89db..7fd42f33b809b64d119f5f88c543976bf8ebc097 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -46,14 +46,9 @@ rule run_pubmed_entities: params: batch="{B}", corpus='pubmed', - inhibitSyntax='inhibit-syntax', - onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', - tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', - onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', - tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', - graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg', - ontobiotopeUse='ancillaries/Use_V2.obo', + inhibitSyntax='inhibit-syntax', + onto='ancillaries/BioNLP-OST+EnovFood', + ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -66,15 +61,13 @@ rule run_pubmed_entities: -alias format pubmed \ -alias input {input.file} \ -alias input-xslt {input.xslt} \ - -alias batch batch={params.batch} \ - -outputDir {params.dir} \ + -alias outputDir {params.dir} \ -environmentEntities \ + -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -alias ontobiotope-habitat {params.onto_habitat} \ - -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ - -alias ontobiotope-phenotypes {params.onto_pheno} \ - -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ - -alias ontobiotope-use {params.ontobiotopeUse} \ + -entity ontobiotope {params.onto} \ + -entity ontobiotope-use {params.ontobiotopeUse} \ + -entity batch {params.batch} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan}