From 033669cf324eafcdd2b542236c8913af1f578c75 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Mon, 24 Jun 2024 12:07:32 +0200 Subject: [PATCH 1/6] Added EPMC reader the main plan has to be used with parameters: input: XML file or directory containing XML files xslt: the stylesheet for EPMC XML provided in ancillaries/pmc2alvisnlp.xslt --- ancillaries/pmc2alvisnlp.xslt | 95 +++++++++++++++++++++++++++++++++++ plans/omnicrobe_main.plan | 6 +++ plans/read-epmc.plan | 27 ++++++++++ 3 files changed, 128 insertions(+) create mode 100644 ancillaries/pmc2alvisnlp.xslt create mode 100644 plans/read-epmc.plan diff --git a/ancillaries/pmc2alvisnlp.xslt b/ancillaries/pmc2alvisnlp.xslt new file mode 100644 index 00000000..0f07d52e --- /dev/null +++ b/ancillaries/pmc2alvisnlp.xslt @@ -0,0 +1,95 @@ +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:mml="http://www.w3.org/1998/Math/MathML" + xmlns:a="xalan://fr.inra.maiage.bibliome.alvisnlp.bibliomefactory.modules.xml.XMLReader2" + xmlns:inline="http://bibliome.jouy.inra.fr/alvisnlp/bibliome-module-factory/inline" + extension-element-prefixes="a inline" + > + + <xsl:template match="/"> + <xsl:apply-templates select="article-set/article|article"/> + </xsl:template> + + <xsl:template match="article"> + <a:document xpath-id="concat('PMC', front/article-meta/article-id[@pub-id-type = 'pmcid'])"> + <a:feature name="article-type" xpath-value="@article-type"/> + <xsl:apply-templates select="front/journal-meta"/> + <xsl:apply-templates select="front/article-meta"/> + <xsl:apply-templates select="body"/> + </a:document> + </xsl:template> + + <xsl:template match="journal-meta"> + <a:feature name="journal" xpath-value="journal-title"/> + <a:feature name="issn" xpath-value="issn"/> + <a:feature name="publisher" xpath-value="publisher/publisher-name"/> + </xsl:template> + + <xsl:template match="article-meta"> + <xsl:apply-templates 
select="article-id"/> + <xsl:apply-templates select="title-group/article-title"/> + <xsl:apply-templates select="contrib-group/contrib[@contrib-type = 'author' or @contrib-type = 'presenting-author']"/> + <xsl:apply-templates select="pub-date"/> + <xsl:apply-templates select="ext-link"/> + <xsl:apply-templates select="permissions"/> + <xsl:apply-templates select="abstract"/> + </xsl:template> + + <xsl:template match="article-id"> + <a:feature xpath-name="@pub-id-type" xpath-value="."/> + </xsl:template> + + <xsl:template match="article-title"> + <a:section name="title" xpath-contents="."/> + </xsl:template> + + <xsl:template match="contrib"> + <xsl:variable name="suffix"> + <xsl:choose> + <xsl:when test="name/suffix"> + <xsl:value-of select="concat(' ', name/suffix)"/> + </xsl:when> + <xsl:otherwise/> + </xsl:choose> + </xsl:variable> + <xsl:variable name="aff-id"> + <xsl:value-of select="xref[@ref-type = 'aff']/@rid"/> + </xsl:variable> + <a:section name="author" xpath-contents="concat(name/given-names, ' ', name/surname, $suffix)"> + <a:feature name="given-names" xpath-value="name/given-names"/> + <a:feature name="surname" xpath-value="name/surname"/> + <a:feature name="suffix" xpath-value="name/suffix"/> + <a:feature name="affiliation" xpath-value="../../aff[@id = $aff-id]/text()"/> + </a:section> + </xsl:template> + + <xsl:template match="pub-date"> + <a:feature name="year" xpath-value="year"/> + </xsl:template> + + <xsl:template match="ext-link"> + <a:feature name="ext-link" xpath-value="@xlink:href"/> + </xsl:template> + + <xsl:template match="permissions"> + <a:feature name="copyright-statement" xpath-value="copyright-statement"/> + <a:feature name="license-link" xpath-value="license/@xlink:href"/> + <a:feature name="license" xpath-value="license"/> + </xsl:template> + + <xsl:template match="abstract|body"> + <a:section xpath-name="name()" xpath-contents="."> + <xsl:for-each select="a:inline()[name() != 'label' and name() != 'title' and name() != 'sec' and 
name() != 'abstract' and name() != 'body']"> + <a:annotation start="@inline:start" end="@inline:end" layers="html"> + <a:feature name="tag" xpath-value="name()"/> + </a:annotation> + </xsl:for-each> + <xsl:for-each select="a:inline()[(name() = 'label' or name() = 'title') and name(parent::*) = 'sec']"> + <a:annotation start="@inline:start" end="@inline:end" layers="html"> + <a:feature name="tag" xpath-value="concat('h', count(ancestor::sec))"/> + </a:annotation> + </xsl:for-each> + </a:section> + </xsl:template> +</xsl:stylesheet> diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan index 453072cf..abdce1a5 100644 --- a/plans/omnicrobe_main.plan +++ b/plans/omnicrobe_main.plan @@ -8,6 +8,7 @@ <param name="input"> <alias module="read.pubmed" param="source"/> + <alias module="read.epmc" param="input"/> </param> <param name="input-dir"> @@ -16,6 +17,7 @@ <param name="input-xslt"> <alias module="read.pubmed" param="xslTransform"/> + <alias module="read.epmc" param="xslt"/> </param> <param name="batch"> @@ -173,6 +175,10 @@ <constantDocumentFeatures>batch=0001</constantDocumentFeatures> </pubmed> + <epmc href="plans/read-epmc.plan"> + <remove>tables</remove> + </epmc> + <bionlp-st class="BioNLPSTReader"> <active>true</active> <section>abstract</section> diff --git a/plans/read-epmc.plan b/plans/read-epmc.plan new file mode 100644 index 00000000..89bef97d --- /dev/null +++ b/plans/read-epmc.plan @@ -0,0 +1,27 @@ +<alvisnlp-plan id="read-epmc"> + <param name="input"> + <alias module="read" param="source"/> + </param> + + <param name="xslt"> + <alias module="read" param="xslTransform"/> + </param> + + <param name="remove"> + <alias module="remove" param="select"/> + </param> + + <read class="XMLReader"/> + + <remove> + <tables class="Action"> + <target>documents.sections.layer:html[@tag == "table-wrap"]</target> + <action>add:strip-layer</action> + <addToLayer/> + </tables> + </remove> + + <do-remove class="RemoveContents"> + 
<stripLayer>strip-layer</stripLayer> + </do-remove> +</alvisnlp-plan> -- GitLab From de56bcf15e400ba28273ed65e3107580b93bfe45 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inrae.fr> Date: Wed, 23 Oct 2024 15:49:37 +0200 Subject: [PATCH 2/6] process section "body" --- plans/omnicrobe_main.plan | 4 ++-- plans/output.plan | 6 +++--- plans/term-extraction.plan | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan index abdce1a5..af8f9daa 100644 --- a/plans/omnicrobe_main.plan +++ b/plans/omnicrobe_main.plan @@ -214,7 +214,7 @@ <!-- Taxa recognition --> <taxa file="plans/taxa.plan"> - <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter> </taxa> <!-- Project stopwords --> @@ -222,7 +222,7 @@ <!-- Geographical location recognition --> <geo file="plans/geo.plan"> - <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter> </geo> <segmentation file="plans/segmentation.plan"/> diff --git a/plans/output.plan b/plans/output.plan index 1f7505d8..94061051 100644 --- a/plans/output.plan +++ b/plans/output.plan @@ -219,7 +219,7 @@ <index class="AlvisIRIndexer"> <indexDir>index</indexDir> <tokenPositionGap>9216</tokenPositionGap> - <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> + <fieldNames>title,abstract,body,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> <livesin>taxon,habitat</livesin> <exhibits>taxon,phenotype</exhibits> @@ -228,7 +228,7 @@ <propertyKeys/> <documents> <fields> - <instances>sections:title | sections:abstract</instances> + <instances>sections:title | sections:abstract | sections:body</instances> <annotations> <instances>layer:microorganism</instances> <text>"{taxon}" ^ @path ^ "/"</text> @@ 
-462,4 +462,4 @@ <columns>@id</columns> </success> -</alvisnlp-plan> \ No newline at end of file +</alvisnlp-plan> diff --git a/plans/term-extraction.plan b/plans/term-extraction.plan index 00407ffa..344dc131 100644 --- a/plans/term-extraction.plan +++ b/plans/term-extraction.plan @@ -46,7 +46,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> - <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter> <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> <posFeature>pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -57,7 +57,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> - <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter> <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> <posFeature>pos</posFeature> <lemmaFeature>variant</lemmaFeature> @@ -67,4 +67,4 @@ <!--perlLib>/projet/mig/work/textemig/biotopes/software/yatea-lib</perlLib--> </yatea-var> -</alvisnlp-plan> \ No newline at end of file +</alvisnlp-plan> -- GitLab From 6831acb686f05797ec70e87c129bc1c14eb9f067 Mon Sep 17 00:00:00 2001 From: Mouhamadou Ba <mandiayba@gmail.com> Date: Fri, 25 Oct 2024 15:27:16 +0200 Subject: [PATCH 3/6] add pipeline to process epmc --- config/config.yaml | 10 ++ process_epmc.snakefile | 230 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 process_epmc.snakefile diff --git a/config/config.yaml b/config/config.yaml index 4c690578..8ba700de 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -64,6 +64,16 @@ PUBMED_DB: "/db/pubmed/current/index" PUBMED_MICROBIO_MESH_TERMS: "corpora/pubmed/microbio-mesh-terms.txt" PUBMED_XSLT_FILE: "corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt" +## epmc +EPMC_FOLDER_NAME: "epmc" +EPMC_CORPUS_HOME: 
"corpora/epmc" +EPMC_BATCHES_HOME: "corpora/epmc/batches" +EPMC_HABITAT_RESULT: "ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt" +EPMC_PHENOTYPE_RESULT: "ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt" +EPMC_DB: "/db/pubmed/current/index" +EPMC_MICROBIO_MESH_TERMS: "corpora/pubmed/microbio-mesh-terms.txt" +EPMC_XSLT_FILE: "ancillaries/pmc2alvisnlp.xslt" + ## bionlp-ost BIONLPOST_BATCHES_HOME: "corpora/BioNLP-OST-2019/batches" BIONLPOST_API: "http://bibliome.jouy.inra.fr/demo/BioNLP-OST-2019-Evaluation/api" diff --git a/process_epmc.snakefile b/process_epmc.snakefile new file mode 100644 index 00000000..434c7bb5 --- /dev/null +++ b/process_epmc.snakefile @@ -0,0 +1,230 @@ +## local rule +# localrules: all, concat_results + +## config file +configfile: "config/config.yaml" + +''' +## variables, check values into config file +_ontobiotope = config['ONTOBIOTOPE'] +_names = config['NCBI_TAXO_NAMES'] +_pubmed_batches_home = config['EPMC_BATCHES_HOME'] +_pubmed_xslt_file = config['EPMC_XSLT_FILE'] +_ncbi_taxo_microorganisms = config['NCBI_TAXO_MICROORGANISMS'] +_ncbi_taxo_id = config['NCBI_TAXO_ID'] +_ncbi_taxo_and_id_microorganisms = config['NCBI_TAXO_AND_ID_MICROORGANISMS'] +''' + + +## document batches +BATCHES = sorted(set(glob_wildcards(config["EPMC_BATCHES_HOME"] + "/{id}/PMC{pmcid}.xml").id)) + +## list of the results +RESULTS = ["relations", "phenotype-relations", "uses-relations", "microorganisms", "habitats", "phenotypes", "uses"] + + +''' +all +''' +rule all: + input: + relations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/relations.txt", B=BATCHES), + phenotypeRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotype-relations.txt", B=BATCHES), + usesRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses-relations.txt", B=BATCHES), + microorganisms=expand(config["EPMC_BATCHES_HOME"] + "/{B}/microorganisms.txt", B=BATCHES), + habitats=expand(config["EPMC_BATCHES_HOME"] + "/{B}/habitats.txt", B=BATCHES), + 
phenotypes=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotypes.txt", B=BATCHES), + uses=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses.txt", B=BATCHES), + index=expand(config["EPMC_BATCHES_HOME"] + "/{B}/index", B=BATCHES), + index_folder="corpora/epmc/index", + expander_folder="corpora/epmc/expander", + florilege_Habitat_result="corpora/florilege/epmc/EPMC-Habitat.txt", + florilege_Phenotype_result="corpora/florilege/epmc/EPMC-Phenotype.txt", + florilege_Use_result="corpora/florilege/epmc/EPMC-Use.txt", + result=expand("corpora/epmc/{R}.full.txt", R=RESULTS) + + +''' +Extract entities in different corpus +batches using the alvisnlp plan (omnicrobe_main.plan) +''' +rule run_epmc_main: + input: + file = config['EPMC_BATCHES_HOME'] + "/{B}/", + xslt = config['EPMC_XSLT_FILE'] + output: + results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS), + index=directory(config["EPMC_BATCHES_HOME"] + "/{B}/index"), + yatea_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea/candidates.xml", + yatea_var_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea-var/candidates.xml" + params: + batch="{B}", + corpus='epmc', + onto_habitat='share/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='share/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='share/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='share/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='share/graylist_extended.heads', + emptywords='share/stopwords_EN.ttg', + onto_use='share/BioNLP-OST+EnovFood-Use.obo', + plan='plans/omnicrobe_main.plan', + dir=config["EPMC_BATCHES_HOME"] + "/{B}/", + taxid_microorganisms=config['NCBI_TAXO_MICROORGANISMS'], + taxa_id_full=config['NCBI_TAXO_ID'], + dummy= config["EPMC_BATCHES_HOME"] + '/{B}/bionlp-st', + log="alvisnlp.log" + singularity:config["SINGULARITY_IMG"] + shell:""" + rm -f {output.yatea_candidates} {output.yatea_var_candidates} && mkdir -p {params.dummy} && alvisnlp -J-XX:+UseSerialGC -J-Xmx30g -cleanTmp 
-verbose \ + -log {params.log} \ + -alias format pubmed \ + -alias input {input.file} \ + -alias input-xslt {input.xslt} \ + -alias batch batch={params.batch} \ + -outputDir {params.dir} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ + -alias ontobiotope-use {params.onto_use} \ + -alias taxid_microorganisms {params.taxid_microorganisms} \ + -alias taxa+id_full {params.taxa_id_full} \ + {params.plan} + """ + + +''' +select results to concat + +rule selectToConcat: + input: + results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS), + index=config["EPMC_BATCHES_HOME"] + "/{B}/index" + output: + results=config["EPMC_BATCHES_HOME"] + "/{B}/{R}.txt" +''' + + +''' +concat the different results +for +* relations +* phenotype-relations +* uses-relations +* microorganisms +* habitats +* phenotypes +* uses +/!\ bash arguments too long if you use cat +''' +rule concat_results: + input: + expand(config["EPMC_BATCHES_HOME"] + "/{B}/{{R}}.txt", B=BATCHES) + output: + result="corpora/epmc/{R}.full.txt" + run: + with open(output.result, 'w') as out: + for fname in input: + with open(fname) as infile: + out.write(infile.read()) + +''' +select files to be formated +''' +rule select: + input: + files=expand("corpora/epmc/{R}.full.txt", R=RESULTS) + output: + relation="corpora/epmc/relations.full.txt", + phenotype_relations="corpora/epmc/phenotype-relations.full.txt", + use_relations="corpora/epmc/uses-relations.full.txt" + shell:""" + """ + + + +''' +merge indexes from the batches +''' +rule merge_epmc_index: + input: + index=expand(config["EPMC_BATCHES_HOME"] 
+"/{B}/index", B=BATCHES) + output: + index_folder=directory("corpora/epmc/index") + params: + alvisir=config["ALVISIR_HOME"] + shell: """ + java -cp {params.alvisir}/lib/lucene-core-3.6.1.jar:{params.alvisir}/lib/lucene-misc-3.6.1.jar \ + org.apache.lucene.misc.IndexMergeTool \ + {output.index_folder} {input.index} + """ + + + +''' +create the expander +''' +rule create_epmc_expander: + input: + expander="share/expander.xml", + taxa_id_microorganisms=config['NCBI_TAXO_AND_ID_MICROORGANISMS'], + onto_habitat="share/BioNLP-OST+EnovFood-Habitat.obo", + onto_phenotype="share/BioNLP-OST+EnovFood-Phenotype.obo", + onto_use="share/BioNLP-OST+EnovFood-Use.obo" + output: + expander_folder=directory("corpora/epmc/expander") + params: + alvisir=config["ALVISIR_HOME"] + shell:""" + {params.alvisir}/bin/alvisir-index-expander {output.expander_folder} {input.expander} + """ + + +''' +formatting the extracted *relations* for +integration in Florilege +''' +rule format_epmc_relations: + input: + file="corpora/epmc/relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Habitat.txt" + conda: 'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ + + +''' +formatting the extracted *phenotype relations* for +integration in Florilege +''' +rule format_epmc_phenotype_relations: + input: + file="corpora/epmc/phenotype-relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Phenotype.txt" + conda: 'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ + +''' +formatting the extracted *use relations* for +integration in Florilege +''' +rule format_epmc_use_relations: + input: + file="corpora/epmc/uses-relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Use.txt" + conda: 
'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ \ No newline at end of file -- GitLab From 2f7ff62b80be5cb173f7cc99864f508c060427f4 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Mon, 24 Jun 2024 12:07:32 +0200 Subject: [PATCH 4/6] Added EPMC reader the main plan has to be used with parameters: input: XML file or directory containing XML files xslt: the stylesheet for EPMC XML provided in ancillaries/pmc2alvisnlp.xslt --- ancillaries/pmc2alvisnlp.xslt | 95 +++++++++++++++++++++++++++++++++++ plans/omnicrobe_main.plan | 6 +++ plans/read-epmc.plan | 27 ++++++++++ 3 files changed, 128 insertions(+) create mode 100644 ancillaries/pmc2alvisnlp.xslt create mode 100644 plans/read-epmc.plan diff --git a/ancillaries/pmc2alvisnlp.xslt b/ancillaries/pmc2alvisnlp.xslt new file mode 100644 index 00000000..0f07d52e --- /dev/null +++ b/ancillaries/pmc2alvisnlp.xslt @@ -0,0 +1,95 @@ +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:mml="http://www.w3.org/1998/Math/MathML" + xmlns:a="xalan://fr.inra.maiage.bibliome.alvisnlp.bibliomefactory.modules.xml.XMLReader2" + xmlns:inline="http://bibliome.jouy.inra.fr/alvisnlp/bibliome-module-factory/inline" + extension-element-prefixes="a inline" + > + + <xsl:template match="/"> + <xsl:apply-templates select="article-set/article|article"/> + </xsl:template> + + <xsl:template match="article"> + <a:document xpath-id="concat('PMC', front/article-meta/article-id[@pub-id-type = 'pmcid'])"> + <a:feature name="article-type" xpath-value="@article-type"/> + <xsl:apply-templates select="front/journal-meta"/> + <xsl:apply-templates select="front/article-meta"/> + <xsl:apply-templates select="body"/> + </a:document> + </xsl:template> + + <xsl:template match="journal-meta"> + <a:feature name="journal" 
xpath-value="journal-title"/> + <a:feature name="issn" xpath-value="issn"/> + <a:feature name="publisher" xpath-value="publisher/publisher-name"/> + </xsl:template> + + <xsl:template match="article-meta"> + <xsl:apply-templates select="article-id"/> + <xsl:apply-templates select="title-group/article-title"/> + <xsl:apply-templates select="contrib-group/contrib[@contrib-type = 'author' or @contrib-type = 'presenting-author']"/> + <xsl:apply-templates select="pub-date"/> + <xsl:apply-templates select="ext-link"/> + <xsl:apply-templates select="permissions"/> + <xsl:apply-templates select="abstract"/> + </xsl:template> + + <xsl:template match="article-id"> + <a:feature xpath-name="@pub-id-type" xpath-value="."/> + </xsl:template> + + <xsl:template match="article-title"> + <a:section name="title" xpath-contents="."/> + </xsl:template> + + <xsl:template match="contrib"> + <xsl:variable name="suffix"> + <xsl:choose> + <xsl:when test="name/suffix"> + <xsl:value-of select="concat(' ', name/suffix)"/> + </xsl:when> + <xsl:otherwise/> + </xsl:choose> + </xsl:variable> + <xsl:variable name="aff-id"> + <xsl:value-of select="xref[@ref-type = 'aff']/@rid"/> + </xsl:variable> + <a:section name="author" xpath-contents="concat(name/given-names, ' ', name/surname, $suffix)"> + <a:feature name="given-names" xpath-value="name/given-names"/> + <a:feature name="surname" xpath-value="name/surname"/> + <a:feature name="suffix" xpath-value="name/suffix"/> + <a:feature name="affiliation" xpath-value="../../aff[@id = $aff-id]/text()"/> + </a:section> + </xsl:template> + + <xsl:template match="pub-date"> + <a:feature name="year" xpath-value="year"/> + </xsl:template> + + <xsl:template match="ext-link"> + <a:feature name="ext-link" xpath-value="@xlink:href"/> + </xsl:template> + + <xsl:template match="permissions"> + <a:feature name="copyright-statement" xpath-value="copyright-statement"/> + <a:feature name="license-link" xpath-value="license/@xlink:href"/> + <a:feature name="license" 
xpath-value="license"/> + </xsl:template> + + <xsl:template match="abstract|body"> + <a:section xpath-name="name()" xpath-contents="."> + <xsl:for-each select="a:inline()[name() != 'label' and name() != 'title' and name() != 'sec' and name() != 'abstract' and name() != 'body']"> + <a:annotation start="@inline:start" end="@inline:end" layers="html"> + <a:feature name="tag" xpath-value="name()"/> + </a:annotation> + </xsl:for-each> + <xsl:for-each select="a:inline()[(name() = 'label' or name() = 'title') and name(parent::*) = 'sec']"> + <a:annotation start="@inline:start" end="@inline:end" layers="html"> + <a:feature name="tag" xpath-value="concat('h', count(ancestor::sec))"/> + </a:annotation> + </xsl:for-each> + </a:section> + </xsl:template> +</xsl:stylesheet> diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan index 73d9ac5b..18f42cdc 100644 --- a/plans/omnicrobe_main.plan +++ b/plans/omnicrobe_main.plan @@ -8,6 +8,7 @@ <param name="input"> <alias module="read.pubmed" param="source"/> + <alias module="read.epmc" param="input"/> </param> <param name="input-dir"> @@ -16,6 +17,7 @@ <param name="input-xslt"> <alias module="read.pubmed" param="xslTransform"/> + <alias module="read.epmc" param="xslt"/> </param> <param name="batch"> @@ -173,6 +175,10 @@ <constantDocumentFeatures>batch=0001</constantDocumentFeatures> </pubmed> + <epmc href="plans/read-epmc.plan"> + <remove>tables</remove> + </epmc> + <bionlp-st class="BioNLPSTReader"> <active>true</active> <section>abstract</section> diff --git a/plans/read-epmc.plan b/plans/read-epmc.plan new file mode 100644 index 00000000..89bef97d --- /dev/null +++ b/plans/read-epmc.plan @@ -0,0 +1,27 @@ +<alvisnlp-plan id="read-epmc"> + <param name="input"> + <alias module="read" param="source"/> + </param> + + <param name="xslt"> + <alias module="read" param="xslTransform"/> + </param> + + <param name="remove"> + <alias module="remove" param="select"/> + </param> + + <read class="XMLReader"/> + + <remove> + <tables 
class="Action"> + <target>documents.sections.layer:html[@tag == "table-wrap"]</target> + <action>add:strip-layer</action> + <addToLayer/> + </tables> + </remove> + + <do-remove class="RemoveContents"> + <stripLayer>strip-layer</stripLayer> + </do-remove> +</alvisnlp-plan> -- GitLab From 3de84da83dcadfcd225025e4f6ca3862491ce7dd Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inrae.fr> Date: Wed, 23 Oct 2024 15:49:37 +0200 Subject: [PATCH 5/6] process section "body" --- plans/omnicrobe_main.plan | 4 ++-- plans/output.plan | 6 +++--- plans/term-extraction.plan | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan index 18f42cdc..1584d2db 100644 --- a/plans/omnicrobe_main.plan +++ b/plans/omnicrobe_main.plan @@ -214,7 +214,7 @@ <!-- Taxa recognition --> <taxa file="plans/taxa.plan"> - <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter> </taxa> <!-- Project stopwords --> @@ -222,7 +222,7 @@ <!-- Geographical location recognition --> <geo file="plans/geo.plan"> - <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter> </geo> <segmentation file="plans/segmentation.plan"/> diff --git a/plans/output.plan b/plans/output.plan index 1f7505d8..94061051 100644 --- a/plans/output.plan +++ b/plans/output.plan @@ -219,7 +219,7 @@ <index class="AlvisIRIndexer"> <indexDir>index</indexDir> <tokenPositionGap>9216</tokenPositionGap> - <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> + <fieldNames>title,abstract,body,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> <livesin>taxon,habitat</livesin> <exhibits>taxon,phenotype</exhibits> @@ -228,7 +228,7 @@ <propertyKeys/> <documents> <fields> - <instances>sections:title | 
sections:abstract</instances> + <instances>sections:title | sections:abstract | sections:body</instances> <annotations> <instances>layer:microorganism</instances> <text>"{taxon}" ^ @path ^ "/"</text> @@ -462,4 +462,4 @@ <columns>@id</columns> </success> -</alvisnlp-plan> \ No newline at end of file +</alvisnlp-plan> diff --git a/plans/term-extraction.plan b/plans/term-extraction.plan index 279ecd53..87c55117 100644 --- a/plans/term-extraction.plan +++ b/plans/term-extraction.plan @@ -46,7 +46,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> - <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter> <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> <posFeature>pos</posFeature> <configDir>share/YaTeA/config-habitats</configDir> @@ -57,7 +57,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> - <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> + <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter> <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> <posFeature>pos</posFeature> <lemmaFeature>variant</lemmaFeature> @@ -67,4 +67,4 @@ <!--perlLib>/projet/mig/work/textemig/biotopes/software/yatea-lib</perlLib--> </yatea-var> -</alvisnlp-plan> \ No newline at end of file +</alvisnlp-plan> -- GitLab From 59a2201f8325571abdc92c6ee7e6231cd29ae196 Mon Sep 17 00:00:00 2001 From: Mouhamadou Ba <mandiayba@gmail.com> Date: Fri, 25 Oct 2024 15:27:16 +0200 Subject: [PATCH 6/6] add pipeline to process epmc --- process_epmc.snakefile | 230 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 process_epmc.snakefile diff --git a/process_epmc.snakefile b/process_epmc.snakefile new file mode 100644 index 00000000..434c7bb5 --- /dev/null +++ b/process_epmc.snakefile @@ -0,0 +1,230 @@ +## local rule +# localrules: all, concat_results + +## 
config file +configfile: "config/config.yaml" + +''' +## variables, check values into config file +_ontobiotope = config['ONTOBIOTOPE'] +_names = config['NCBI_TAXO_NAMES'] +_pubmed_batches_home = config['EPMC_BATCHES_HOME'] +_pubmed_xslt_file = config['EPMC_XSLT_FILE'] +_ncbi_taxo_microorganisms = config['NCBI_TAXO_MICROORGANISMS'] +_ncbi_taxo_id = config['NCBI_TAXO_ID'] +_ncbi_taxo_and_id_microorganisms = config['NCBI_TAXO_AND_ID_MICROORGANISMS'] +''' + + +## document batches +BATCHES = sorted(set(glob_wildcards(config["EPMC_BATCHES_HOME"] + "/{id}/PMC{pmcid}.xml").id)) + +## list of the results +RESULTS = ["relations", "phenotype-relations", "uses-relations", "microorganisms", "habitats", "phenotypes", "uses"] + + +''' +all +''' +rule all: + input: + relations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/relations.txt", B=BATCHES), + phenotypeRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotype-relations.txt", B=BATCHES), + usesRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses-relations.txt", B=BATCHES), + microorganisms=expand(config["EPMC_BATCHES_HOME"] + "/{B}/microorganisms.txt", B=BATCHES), + habitats=expand(config["EPMC_BATCHES_HOME"] + "/{B}/habitats.txt", B=BATCHES), + phenotypes=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotypes.txt", B=BATCHES), + uses=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses.txt", B=BATCHES), + index=expand(config["EPMC_BATCHES_HOME"] + "/{B}/index", B=BATCHES), + index_folder="corpora/epmc/index", + expander_folder="corpora/epmc/expander", + florilege_Habitat_result="corpora/florilege/epmc/EPMC-Habitat.txt", + florilege_Phenotype_result="corpora/florilege/epmc/EPMC-Phenotype.txt", + florilege_Use_result="corpora/florilege/epmc/EPMC-Use.txt", + result=expand("corpora/epmc/{R}.full.txt", R=RESULTS) + + +''' +Extract entities in different corpus +batches using the alvisnlp plan (omnicrobe_main.plan) +''' +rule run_epmc_main: + input: + file = config['EPMC_BATCHES_HOME'] + "/{B}/", + xslt = 
config['EPMC_XSLT_FILE'] + output: + results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS), + index=directory(config["EPMC_BATCHES_HOME"] + "/{B}/index"), + yatea_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea/candidates.xml", + yatea_var_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea-var/candidates.xml" + params: + batch="{B}", + corpus='epmc', + onto_habitat='share/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='share/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='share/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='share/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='share/graylist_extended.heads', + emptywords='share/stopwords_EN.ttg', + onto_use='share/BioNLP-OST+EnovFood-Use.obo', + plan='plans/omnicrobe_main.plan', + dir=config["EPMC_BATCHES_HOME"] + "/{B}/", + taxid_microorganisms=config['NCBI_TAXO_MICROORGANISMS'], + taxa_id_full=config['NCBI_TAXO_ID'], + dummy= config["EPMC_BATCHES_HOME"] + '/{B}/bionlp-st', + log="alvisnlp.log" + singularity:config["SINGULARITY_IMG"] + shell:""" + rm -f {output.yatea_candidates} {output.yatea_var_candidates} && mkdir -p {params.dummy} && alvisnlp -J-XX:+UseSerialGC -J-Xmx30g -cleanTmp -verbose \ + -log {params.log} \ + -alias format pubmed \ + -alias input {input.file} \ + -alias input-xslt {input.xslt} \ + -alias batch batch={params.batch} \ + -outputDir {params.dir} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ + -alias ontobiotope-use {params.onto_use} \ + -alias taxid_microorganisms {params.taxid_microorganisms} \ + -alias taxa+id_full {params.taxa_id_full} \ + {params.plan} 
+ """ + + +''' +select results to concat + +rule selectToConcat: + input: + results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS), + index=config["EPMC_BATCHES_HOME"] + "/{B}/index" + output: + results=config["EPMC_BATCHES_HOME"] + "/{B}/{R}.txt" +''' + + +''' +concat the different results +for +* relations +* phenotype-relations +* uses-relations +* microorganisms +* habitats +* phenotypes +* uses +/!\ bash arguments too long if you use cat +''' +rule concat_results: + input: + expand(config["EPMC_BATCHES_HOME"] + "/{B}/{{R}}.txt", B=BATCHES) + output: + result="corpora/epmc/{R}.full.txt" + run: + with open(output.result, 'w') as out: + for fname in input: + with open(fname) as infile: + out.write(infile.read()) + +''' +select files to be formated +''' +rule select: + input: + files=expand("corpora/epmc/{R}.full.txt", R=RESULTS) + output: + relation="corpora/epmc/relations.full.txt", + phenotype_relations="corpora/epmc/phenotype-relations.full.txt", + use_relations="corpora/epmc/uses-relations.full.txt" + shell:""" + """ + + + +''' +merge indexes from the batches +''' +rule merge_epmc_index: + input: + index=expand(config["EPMC_BATCHES_HOME"] +"/{B}/index", B=BATCHES) + output: + index_folder=directory("corpora/epmc/index") + params: + alvisir=config["ALVISIR_HOME"] + shell: """ + java -cp {params.alvisir}/lib/lucene-core-3.6.1.jar:{params.alvisir}/lib/lucene-misc-3.6.1.jar \ + org.apache.lucene.misc.IndexMergeTool \ + {output.index_folder} {input.index} + """ + + + +''' +create the expander +''' +rule create_epmc_expander: + input: + expander="share/expander.xml", + taxa_id_microorganisms=config['NCBI_TAXO_AND_ID_MICROORGANISMS'], + onto_habitat="share/BioNLP-OST+EnovFood-Habitat.obo", + onto_phenotype="share/BioNLP-OST+EnovFood-Phenotype.obo", + onto_use="share/BioNLP-OST+EnovFood-Use.obo" + output: + expander_folder=directory("corpora/epmc/expander") + params: + alvisir=config["ALVISIR_HOME"] + shell:""" + 
{params.alvisir}/bin/alvisir-index-expander {output.expander_folder} {input.expander} + """ + + +''' +formatting the extracted *relations* for +integration in Florilege +''' +rule format_epmc_relations: + input: + file="corpora/epmc/relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Habitat.txt" + conda: 'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ + + +''' +formatting the extracted *phenotype relations* for +integration in Florilege +''' +rule format_epmc_phenotype_relations: + input: + file="corpora/epmc/phenotype-relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Phenotype.txt" + conda: 'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ + +''' +formatting the extracted *use relations* for +integration in Florilege +''' +rule format_epmc_use_relations: + input: + file="corpora/epmc/uses-relations.full.txt" + output: + florilege_result="corpora/florilege/epmc/EPMC-Use.txt" + conda: 'softwares/envs/python3_env.yaml' + shell:""" + python softwares/scripts/format-pubmed-results.py \ + --pubmed-results {input.file} \ + > {output.florilege_result} + """ \ No newline at end of file -- GitLab