From 033669cf324eafcdd2b542236c8913af1f578c75 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Mon, 24 Jun 2024 12:07:32 +0200
Subject: [PATCH 1/6] Added EPMC reader

the main plan has to be used with parameters:
input: XML file or directory containing XML files
input-xslt: the stylesheet for EPMC XML provided in ancillaries/pmc2alvisnlp.xslt
---
 ancillaries/pmc2alvisnlp.xslt | 95 +++++++++++++++++++++++++++++++++++
 plans/omnicrobe_main.plan     |  6 +++
 plans/read-epmc.plan          | 27 ++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 ancillaries/pmc2alvisnlp.xslt
 create mode 100644 plans/read-epmc.plan

diff --git a/ancillaries/pmc2alvisnlp.xslt b/ancillaries/pmc2alvisnlp.xslt
new file mode 100644
index 00000000..0f07d52e
--- /dev/null
+++ b/ancillaries/pmc2alvisnlp.xslt
@@ -0,0 +1,95 @@
+<xsl:stylesheet version="1.0"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                xmlns:xlink="http://www.w3.org/1999/xlink"
+                xmlns:mml="http://www.w3.org/1998/Math/MathML"
+                xmlns:a="xalan://fr.inra.maiage.bibliome.alvisnlp.bibliomefactory.modules.xml.XMLReader2"
+                xmlns:inline="http://bibliome.jouy.inra.fr/alvisnlp/bibliome-module-factory/inline"
+                extension-element-prefixes="a inline"
+                >
+  <!-- Entry point: process every article, whether or not it is wrapped in an article-set. -->
+  <xsl:template match="/">
+    <xsl:apply-templates select="article-set/article|article"/>
+  </xsl:template>
+  <!-- One document per article; its identifier is 'PMC' followed by the pmcid article-id. -->
+  <xsl:template match="article">
+    <a:document xpath-id="concat('PMC', front/article-meta/article-id[@pub-id-type = 'pmcid'])">
+      <a:feature name="article-type" xpath-value="@article-type"/>
+      <xsl:apply-templates select="front/journal-meta"/>
+      <xsl:apply-templates select="front/article-meta"/>
+      <xsl:apply-templates select="body"/>
+    </a:document>
+  </xsl:template>
+  <!-- Journal-level metadata, stored as document features. -->
+  <xsl:template match="journal-meta">
+    <a:feature name="journal" xpath-value="journal-title"/>
+    <a:feature name="issn" xpath-value="issn"/>
+    <a:feature name="publisher" xpath-value="publisher/publisher-name"/>
+  </xsl:template>
+  <!-- Article-level metadata: identifiers, title, authors, year, links, permissions, abstract. -->
+  <xsl:template match="article-meta">
+    <xsl:apply-templates select="article-id"/>
+    <xsl:apply-templates select="title-group/article-title"/>
+    <xsl:apply-templates select="contrib-group/contrib[@contrib-type = 'author' or @contrib-type = 'presenting-author']"/>
+    <xsl:apply-templates select="pub-date"/>
+    <xsl:apply-templates select="ext-link"/>
+    <xsl:apply-templates select="permissions"/>
+    <xsl:apply-templates select="abstract"/>
+  </xsl:template>
+  <!-- Each article-id becomes a feature named after its pub-id-type attribute. -->
+  <xsl:template match="article-id">
+    <a:feature xpath-name="@pub-id-type" xpath-value="."/>
+  </xsl:template>
+  <!-- The article title becomes the section named "title". -->
+  <xsl:template match="article-title">
+    <a:section name="title" xpath-contents="."/>
+  </xsl:template>
+  <!-- One "author" section per contributor, carrying the name parts and the referenced affiliation text. -->
+  <xsl:template match="contrib">
+    <xsl:variable name="suffix">
+      <xsl:choose>
+        <xsl:when test="name/suffix">
+          <xsl:value-of select="concat(' ', name/suffix)"/>
+        </xsl:when>
+        <xsl:otherwise/>
+      </xsl:choose>
+    </xsl:variable>
+    <xsl:variable name="aff-id">
+      <xsl:value-of select="xref[@ref-type = 'aff']/@rid"/>
+    </xsl:variable>
+    <a:section name="author" xpath-contents="concat(name/given-names, ' ', name/surname, $suffix)">
+      <a:feature name="given-names" xpath-value="name/given-names"/>
+      <a:feature name="surname" xpath-value="name/surname"/>
+      <a:feature name="suffix" xpath-value="name/suffix"/>
+      <a:feature name="affiliation" xpath-value="../../aff[@id = $aff-id]/text()"/>
+    </a:section>
+  </xsl:template>
+  <!-- Publication year. -->
+  <xsl:template match="pub-date">
+    <a:feature name="year" xpath-value="year"/>
+  </xsl:template>
+  <!-- Target of an external link. -->
+  <xsl:template match="ext-link">
+    <a:feature name="ext-link" xpath-value="@xlink:href"/>
+  </xsl:template>
+  <!-- Copyright statement and license information. -->
+  <xsl:template match="permissions">
+    <a:feature name="copyright-statement" xpath-value="copyright-statement"/>
+    <a:feature name="license-link" xpath-value="license/@xlink:href"/>
+    <a:feature name="license" xpath-value="license"/>
+  </xsl:template>
+  <!-- Text section named after the element; inline elements become "html" annotations, sec labels/titles are tagged hN (N = number of sec ancestors). -->
+  <xsl:template match="abstract|body">
+    <a:section xpath-name="name()" xpath-contents=".">
+      <xsl:for-each select="a:inline()[name() != 'label' and name() != 'title' and name() != 'sec' and name() != 'abstract' and name() != 'body']">
+        <a:annotation start="@inline:start" end="@inline:end" layers="html">
+          <a:feature name="tag" xpath-value="name()"/>
+        </a:annotation>
+      </xsl:for-each>
+      <xsl:for-each select="a:inline()[(name() = 'label' or name() = 'title') and name(parent::*) = 'sec']">
+        <a:annotation start="@inline:start" end="@inline:end" layers="html">
+          <a:feature name="tag" xpath-value="concat('h', count(ancestor::sec))"/>
+        </a:annotation>
+      </xsl:for-each>
+    </a:section>
+  </xsl:template>
+</xsl:stylesheet>
diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan
index 453072cf..abdce1a5 100644
--- a/plans/omnicrobe_main.plan
+++ b/plans/omnicrobe_main.plan
@@ -8,6 +8,7 @@
   
   <param name="input">
     <alias module="read.pubmed" param="source"/>
+    <alias module="read.epmc" param="input"/>
   </param>
   
   <param name="input-dir">
@@ -16,6 +17,7 @@
 
   <param name="input-xslt">
     <alias module="read.pubmed" param="xslTransform"/>
+    <alias module="read.epmc" param="xslt"/>
   </param>
     
   <param name="batch">
@@ -173,6 +175,10 @@
       <constantDocumentFeatures>batch=0001</constantDocumentFeatures>
     </pubmed>
 
+    <epmc href="plans/read-epmc.plan">
+      <remove>tables</remove>
+    </epmc>
+
     <bionlp-st class="BioNLPSTReader">
       <active>true</active>
       <section>abstract</section>
diff --git a/plans/read-epmc.plan b/plans/read-epmc.plan
new file mode 100644
index 00000000..89bef97d
--- /dev/null
+++ b/plans/read-epmc.plan
@@ -0,0 +1,27 @@
+<alvisnlp-plan id="read-epmc">
+  <param name="input">
+    <alias module="read" param="source"/>
+  </param>
+
+  <param name="xslt">
+    <alias module="read" param="xslTransform"/>
+  </param>
+
+  <param name="remove">
+    <alias module="remove" param="select"/>
+  </param>
+  <!-- Reads the source given by "input" through the stylesheet given by "xslt". -->
+  <read class="XMLReader"/>
+
+  <remove>
+    <tables class="Action">
+      <target>documents.sections.layer:html[@tag == "table-wrap"]</target>
+      <action>add:strip-layer</action>
+      <addToLayer/>
+    </tables>
+  </remove>
+  <!-- Presumably removes the text covered by the annotations collected in "strip-layer"; verify against RemoveContents. -->
+  <do-remove class="RemoveContents">
+    <stripLayer>strip-layer</stripLayer>
+  </do-remove>
+</alvisnlp-plan>
-- 
GitLab


From de56bcf15e400ba28273ed65e3107580b93bfe45 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inrae.fr>
Date: Wed, 23 Oct 2024 15:49:37 +0200
Subject: [PATCH 2/6] process section "body"

---
 plans/omnicrobe_main.plan  | 4 ++--
 plans/output.plan          | 6 +++---
 plans/term-extraction.plan | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan
index abdce1a5..af8f9daa 100644
--- a/plans/omnicrobe_main.plan
+++ b/plans/omnicrobe_main.plan
@@ -214,7 +214,7 @@
 
   <!-- Taxa recognition -->
   <taxa file="plans/taxa.plan">
-    <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter>
+    <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter>
   </taxa>
 
   <!-- Project stopwords -->
@@ -222,7 +222,7 @@
 
     <!-- Geographical location recognition -->
   <geo file="plans/geo.plan">
-    <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter>
+    <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter>
   </geo>
 
   <segmentation file="plans/segmentation.plan"/>
diff --git a/plans/output.plan b/plans/output.plan
index 1f7505d8..94061051 100644
--- a/plans/output.plan
+++ b/plans/output.plan
@@ -219,7 +219,7 @@
   <index class="AlvisIRIndexer">
     <indexDir>index</indexDir>
     <tokenPositionGap>9216</tokenPositionGap>
-    <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames>
+    <fieldNames>title,abstract,body,author,full-author,pmid,year,journal,mesh,url</fieldNames>
     <relations>
       <livesin>taxon,habitat</livesin>
       <exhibits>taxon,phenotype</exhibits>
@@ -228,7 +228,7 @@
     <propertyKeys/>
     <documents>
       <fields>
-	<instances>sections:title | sections:abstract</instances>
+	<instances>sections:title | sections:abstract | sections:body</instances>
 	<annotations>
 	  <instances>layer:microorganism</instances>
 	  <text>"{taxon}" ^ @path ^ "/"</text>
@@ -462,4 +462,4 @@
     <columns>@id</columns>
   </success>
 
-</alvisnlp-plan>
\ No newline at end of file
+</alvisnlp-plan>
diff --git a/plans/term-extraction.plan b/plans/term-extraction.plan
index 00407ffa..344dc131 100644
--- a/plans/term-extraction.plan
+++ b/plans/term-extraction.plan
@@ -46,7 +46,7 @@
 
     <!-- Run Yatea term extractor -->
     <yatea class="YateaExtractor">
-      <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
+      <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter>
       <xmlTermsFile>yatea/candidates.xml</xmlTermsFile>
       <posFeature>pos</posFeature>
       <configDir>ancillaries/YaTeA/config-habitats</configDir>
@@ -57,7 +57,7 @@
 
     <!-- Run Yatea term extractor on variants -->
     <yatea-var class="YateaExtractor">
-      <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
+      <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter>
       <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile>
       <posFeature>pos</posFeature>
       <lemmaFeature>variant</lemmaFeature>
@@ -67,4 +67,4 @@
       <!--perlLib>/projet/mig/work/textemig/biotopes/software/yatea-lib</perlLib-->
     </yatea-var>
 
-</alvisnlp-plan>
\ No newline at end of file
+</alvisnlp-plan>
-- 
GitLab


From 6831acb686f05797ec70e87c129bc1c14eb9f067 Mon Sep 17 00:00:00 2001
From: Mouhamadou Ba <mandiayba@gmail.com>
Date: Fri, 25 Oct 2024 15:27:16 +0200
Subject: [PATCH 3/6] add pipeline to process epmc

---
 config/config.yaml     |  10 ++
 process_epmc.snakefile | 230 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 process_epmc.snakefile

diff --git a/config/config.yaml b/config/config.yaml
index 4c690578..8ba700de 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -64,6 +64,16 @@ PUBMED_DB: "/db/pubmed/current/index"
 PUBMED_MICROBIO_MESH_TERMS: "corpora/pubmed/microbio-mesh-terms.txt"
 PUBMED_XSLT_FILE: "corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt"
 
+## epmc (process_epmc.snakefile reads EPMC_BATCHES_HOME and EPMC_XSLT_FILE)
+EPMC_FOLDER_NAME: "epmc"
+EPMC_CORPUS_HOME: "corpora/epmc"
+EPMC_BATCHES_HOME: "corpora/epmc/batches"
+EPMC_HABITAT_RESULT: "ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt"
+EPMC_PHENOTYPE_RESULT: "ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt"
+EPMC_DB: "/db/pubmed/current/index"
+EPMC_MICROBIO_MESH_TERMS: "corpora/pubmed/microbio-mesh-terms.txt"
+EPMC_XSLT_FILE: "ancillaries/pmc2alvisnlp.xslt"
+
 ## bionlp-ost
 BIONLPOST_BATCHES_HOME: "corpora/BioNLP-OST-2019/batches"
 BIONLPOST_API: "http://bibliome.jouy.inra.fr/demo/BioNLP-OST-2019-Evaluation/api"
diff --git a/process_epmc.snakefile b/process_epmc.snakefile
new file mode 100644
index 00000000..434c7bb5
--- /dev/null
+++ b/process_epmc.snakefile
@@ -0,0 +1,230 @@
+## local rule
+# localrules: all, concat_results
+
+## config file
+configfile: "config/config.yaml"
+
+'''
+## variables, check values into config file
+_ontobiotope = config['ONTOBIOTOPE']
+_names = config['NCBI_TAXO_NAMES']
+_pubmed_batches_home = config['EPMC_BATCHES_HOME']
+_pubmed_xslt_file = config['EPMC_XSLT_FILE']
+_ncbi_taxo_microorganisms = config['NCBI_TAXO_MICROORGANISMS']
+_ncbi_taxo_id = config['NCBI_TAXO_ID']
+_ncbi_taxo_and_id_microorganisms = config['NCBI_TAXO_AND_ID_MICROORGANISMS']
+'''
+
+
+## document batches
+BATCHES, = glob_wildcards(config["EPMC_BATCHES_HOME"] + "/{id}/PMC*.xml")
+
+## list of the results
+RESULTS = ["relations", "phenotype-relations", "uses-relations", "microorganisms", "habitats", "phenotypes", "uses"]
+
+
+'''
+all
+'''
+rule all:
+    input:
+        relations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/relations.txt", B=BATCHES),
+        phenotypeRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotype-relations.txt", B=BATCHES),
+        usesRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses-relations.txt", B=BATCHES),
+        microorganisms=expand(config["EPMC_BATCHES_HOME"] + "/{B}/microorganisms.txt", B=BATCHES),
+        habitats=expand(config["EPMC_BATCHES_HOME"] + "/{B}/habitats.txt", B=BATCHES),
+        phenotypes=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotypes.txt", B=BATCHES),
+        uses=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses.txt", B=BATCHES),
+        index=expand(config["EPMC_BATCHES_HOME"] + "/{B}/index", B=BATCHES),
+        index_folder="corpora/florilege/alvisir/index",
+        expander_folder="corpora/florilege/alvisir/expander",
+        florilege_Habitat_result="corpora/florilege/epmc/PubMed-Habitat.txt",
+        florilege_Phenotype_result="corpora/florilege/epmc/PubMed-Phenotype.txt",
+        florilege_Use_result="corpora/florilege/epmc/PubMed-Use.txt",
+        result=expand("corpora/florilege/epmc/{R}.full.txt", R=RESULTS)
+
+
+'''
+Extract entities in different corpus 
+batches using the alvisnlp plan (omnicrobe_main.plan)
+'''
+rule run_epmc_main:
+    input:
+        file = config['EPMC_BATCHES_HOME'] + "/{B}/",
+        xslt = config['EPMC_XSLT_FILE']
+    output:
+        results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS),
+        index=directory(config["EPMC_BATCHES_HOME"] + "/{B}/index"),
+        yatea_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea/candidates.xml",
+        yatea_var_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea-var/candidates.xml"
+    params:
+        batch="{B}",
+        corpus='empc',
+        onto_habitat='share/BioNLP-OST+EnovFood-Habitat.obo',
+        tomap_habitat='share/BioNLP-OST+EnovFood-Habitat.tomap',
+        onto_pheno='share/BioNLP-OST+EnovFood-Phenotype.obo',
+        tomap_pheno='share/BioNLP-OST+EnovFood-Phenotype.tomap',
+        graylist='share/graylist_extended.heads',
+        emptywords='share/stopwords_EN.ttg',
+        onto_use='share/BioNLP-OST+EnovFood-Use.obo',
+        plan='plans/omnicrobe_main.plan',
+        dir=config["EPMC_BATCHES_HOME"] + "/{B}/",
+        taxid_microorganisms=config['NCBI_TAXO_MICROORGANISMS'],
+        taxa_id_full=config['NCBI_TAXO_ID'],
+        dummy= config["EPMC_BATCHES_HOME"] + '/{B}/bionlp-st',
+        log="alvisnlp.log"
+    singularity:config["SINGULARITY_IMG"]
+    shell:"""
+        rm -f {output.yatea_candidates} {output.yatea_var_candidates} && mkdir -p {params.dummy} && alvisnlp -J-XX:+UseSerialGC -J-Xmx30g -cleanTmp -verbose \
+        -log {params.log} \
+        -alias format pubmed \
+        -alias input {input.file} \
+        -alias input-xslt {input.xslt} \
+        -alias batch batch={params.batch} \
+        -outputDir {params.dir} \
+        -alias ontobiotope-habitat {params.onto_habitat} \
+        -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \
+        -alias ontobiotope-phenotypes {params.onto_pheno} \
+        -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \
+        -alias ontobiotope-use {params.onto_use} \
+        -alias taxid_microorganisms {params.taxid_microorganisms} \
+        -alias taxa+id_full {params.taxa_id_full} \
+        {params.plan}
+        """
+
+
+'''
+select results to concat
+
+rule selectToConcat:
+    input:
+        results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS),
+        index=config["EPMC_BATCHES_HOME"] + "/{B}/index"
+    output:
+        results=config["EPMC_BATCHES_HOME"] + "/{B}/{R}.txt"
+'''
+
+
+'''
+concat the different results
+for 
+* relations 
+* phenotype-relations 
+* uses-relations 
+* microorganisms 
+* habitats 
+* phenotypes 
+* uses
+/!\ bash arguments too long if you use cat
+'''
+rule concat_results:
+    input: 
+        expand(config["EPMC_BATCHES_HOME"] + "/{B}/{{R}}.txt", B=BATCHES)
+    output:
+        result="corpora/epmc/{R}.full.txt"
+    run:
+        with open(output.result, 'w') as out:
+            for fname in input:
+                with open(fname) as infile:
+                    out.write(infile.read())
+
+'''
+select files to be formated
+'''
+rule select:
+    input:
+        files=expand("corpora/epmc/{R}.full.txt", R=RESULTS)
+    output:
+        relation="corpora/epmc/relations.full.txt",
+                phenotype_relations="corpora/epmc/phenotype-relations.full.txt",
+                use_relations="corpora/epmc/uses-relations.full.txt"
+    shell:"""
+        """
+            
+
+
+'''
+merge indexes from the batches
+'''
+rule merge_epmc_index:
+    input:
+        index=expand(config["EPMC_BATCHES_HOME"] +"/{B}/index", B=BATCHES)
+    output:
+        index_folder=directory("corpora/epmc/index")
+    params:
+        alvisir=config["ALVISIR_HOME"]
+    shell: """
+        java -cp {params.alvisir}/lib/lucene-core-3.6.1.jar:{params.alvisir}/lib/lucene-misc-3.6.1.jar \
+        org.apache.lucene.misc.IndexMergeTool \
+        {output.index_folder} {input.index}
+        """
+
+
+
+'''
+create the expander
+'''
+rule create_epmc_expander:
+    input:
+        expander="share/expander.xml",
+        taxa_id_microorganisms=config['NCBI_TAXO_AND_ID_MICROORGANISMS'],
+                onto_habitat="share/BioNLP-OST+EnovFood-Habitat.obo",
+        onto_phenotype="share/BioNLP-OST+EnovFood-Phenotype.obo",
+        onto_use="share/BioNLP-OST+EnovFood-Use.obo"
+    output:
+        expander_folder=directory("corpora/epmc/expander")
+    params:
+        alvisir=config["ALVISIR_HOME"]
+    shell:"""
+        {params.alvisir}/bin/alvisir-index-expander {output.expander_folder} {input.expander}
+          """
+
+
+'''
+formatting the extracted *relations* for 
+integration in Florilege
+'''
+rule format_epmc_relations:
+    input:
+        file="corpora/epmc/relations.full.txt"
+    output:
+        florilege_result="corpora/florilege/epmc/EPMC-Habitat.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.file} \
+        > {output.florilege_result}
+          """
+
+
+'''
+formatting the extracted *phenotype relations* for 
+integration in Florilege
+'''
+rule format_epmc_phenotype_relations:
+    input:
+        file="corpora/epmc/phenotype-relations.full.txt"
+    output:
+        florilege_result="corpora/florilege/epmc/EPMC-Phenotype.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.file} \
+        > {output.florilege_result}
+          """
+
+'''
+formatting the extracted *use relations* for 
+integration in Florilege
+'''
+rule format_epmc_use_relations:
+    input:
+        file="corpora/epmc/uses-relations.full.txt"
+    output:
+        florilege_result="corpora/florilege/epmc/EPMC-Use.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.file} \
+        > {output.florilege_result}
+          """
\ No newline at end of file
-- 
GitLab


From 2f7ff62b80be5cb173f7cc99864f508c060427f4 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Mon, 24 Jun 2024 12:07:32 +0200
Subject: [PATCH 4/6] Added EPMC reader

the main plan has to be used with parameters:
input: XML file or directory containing XML files
input-xslt: the stylesheet for EPMC XML provided in ancillaries/pmc2alvisnlp.xslt
---
 ancillaries/pmc2alvisnlp.xslt | 95 +++++++++++++++++++++++++++++++++++
 plans/omnicrobe_main.plan     |  6 +++
 plans/read-epmc.plan          | 27 ++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 ancillaries/pmc2alvisnlp.xslt
 create mode 100644 plans/read-epmc.plan

diff --git a/ancillaries/pmc2alvisnlp.xslt b/ancillaries/pmc2alvisnlp.xslt
new file mode 100644
index 00000000..0f07d52e
--- /dev/null
+++ b/ancillaries/pmc2alvisnlp.xslt
@@ -0,0 +1,95 @@
+<xsl:stylesheet version="1.0"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                xmlns:xlink="http://www.w3.org/1999/xlink"
+                xmlns:mml="http://www.w3.org/1998/Math/MathML"
+                xmlns:a="xalan://fr.inra.maiage.bibliome.alvisnlp.bibliomefactory.modules.xml.XMLReader2"
+                xmlns:inline="http://bibliome.jouy.inra.fr/alvisnlp/bibliome-module-factory/inline"
+                extension-element-prefixes="a inline"
+                >
+  <!-- Entry point: process every article, whether or not it is wrapped in an article-set. -->
+  <xsl:template match="/">
+    <xsl:apply-templates select="article-set/article|article"/>
+  </xsl:template>
+  <!-- One document per article; its identifier is 'PMC' followed by the pmcid article-id. -->
+  <xsl:template match="article">
+    <a:document xpath-id="concat('PMC', front/article-meta/article-id[@pub-id-type = 'pmcid'])">
+      <a:feature name="article-type" xpath-value="@article-type"/>
+      <xsl:apply-templates select="front/journal-meta"/>
+      <xsl:apply-templates select="front/article-meta"/>
+      <xsl:apply-templates select="body"/>
+    </a:document>
+  </xsl:template>
+  <!-- Journal-level metadata, stored as document features. -->
+  <xsl:template match="journal-meta">
+    <a:feature name="journal" xpath-value="journal-title"/>
+    <a:feature name="issn" xpath-value="issn"/>
+    <a:feature name="publisher" xpath-value="publisher/publisher-name"/>
+  </xsl:template>
+  <!-- Article-level metadata: identifiers, title, authors, year, links, permissions, abstract. -->
+  <xsl:template match="article-meta">
+    <xsl:apply-templates select="article-id"/>
+    <xsl:apply-templates select="title-group/article-title"/>
+    <xsl:apply-templates select="contrib-group/contrib[@contrib-type = 'author' or @contrib-type = 'presenting-author']"/>
+    <xsl:apply-templates select="pub-date"/>
+    <xsl:apply-templates select="ext-link"/>
+    <xsl:apply-templates select="permissions"/>
+    <xsl:apply-templates select="abstract"/>
+  </xsl:template>
+  <!-- Each article-id becomes a feature named after its pub-id-type attribute. -->
+  <xsl:template match="article-id">
+    <a:feature xpath-name="@pub-id-type" xpath-value="."/>
+  </xsl:template>
+  <!-- The article title becomes the section named "title". -->
+  <xsl:template match="article-title">
+    <a:section name="title" xpath-contents="."/>
+  </xsl:template>
+  <!-- One "author" section per contributor, carrying the name parts and the referenced affiliation text. -->
+  <xsl:template match="contrib">
+    <xsl:variable name="suffix">
+      <xsl:choose>
+        <xsl:when test="name/suffix">
+          <xsl:value-of select="concat(' ', name/suffix)"/>
+        </xsl:when>
+        <xsl:otherwise/>
+      </xsl:choose>
+    </xsl:variable>
+    <xsl:variable name="aff-id">
+      <xsl:value-of select="xref[@ref-type = 'aff']/@rid"/>
+    </xsl:variable>
+    <a:section name="author" xpath-contents="concat(name/given-names, ' ', name/surname, $suffix)">
+      <a:feature name="given-names" xpath-value="name/given-names"/>
+      <a:feature name="surname" xpath-value="name/surname"/>
+      <a:feature name="suffix" xpath-value="name/suffix"/>
+      <a:feature name="affiliation" xpath-value="../../aff[@id = $aff-id]/text()"/>
+    </a:section>
+  </xsl:template>
+  <!-- Publication year. -->
+  <xsl:template match="pub-date">
+    <a:feature name="year" xpath-value="year"/>
+  </xsl:template>
+  <!-- Target of an external link. -->
+  <xsl:template match="ext-link">
+    <a:feature name="ext-link" xpath-value="@xlink:href"/>
+  </xsl:template>
+  <!-- Copyright statement and license information. -->
+  <xsl:template match="permissions">
+    <a:feature name="copyright-statement" xpath-value="copyright-statement"/>
+    <a:feature name="license-link" xpath-value="license/@xlink:href"/>
+    <a:feature name="license" xpath-value="license"/>
+  </xsl:template>
+  <!-- Text section named after the element; inline elements become "html" annotations, sec labels/titles are tagged hN (N = number of sec ancestors). -->
+  <xsl:template match="abstract|body">
+    <a:section xpath-name="name()" xpath-contents=".">
+      <xsl:for-each select="a:inline()[name() != 'label' and name() != 'title' and name() != 'sec' and name() != 'abstract' and name() != 'body']">
+        <a:annotation start="@inline:start" end="@inline:end" layers="html">
+          <a:feature name="tag" xpath-value="name()"/>
+        </a:annotation>
+      </xsl:for-each>
+      <xsl:for-each select="a:inline()[(name() = 'label' or name() = 'title') and name(parent::*) = 'sec']">
+        <a:annotation start="@inline:start" end="@inline:end" layers="html">
+          <a:feature name="tag" xpath-value="concat('h', count(ancestor::sec))"/>
+        </a:annotation>
+      </xsl:for-each>
+    </a:section>
+  </xsl:template>
+</xsl:stylesheet>
diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan
index 73d9ac5b..18f42cdc 100644
--- a/plans/omnicrobe_main.plan
+++ b/plans/omnicrobe_main.plan
@@ -8,6 +8,7 @@
   
   <param name="input">
     <alias module="read.pubmed" param="source"/>
+    <alias module="read.epmc" param="input"/>
   </param>
   
   <param name="input-dir">
@@ -16,6 +17,7 @@
 
   <param name="input-xslt">
     <alias module="read.pubmed" param="xslTransform"/>
+    <alias module="read.epmc" param="xslt"/>
   </param>
     
   <param name="batch">
@@ -173,6 +175,10 @@
       <constantDocumentFeatures>batch=0001</constantDocumentFeatures>
     </pubmed>
 
+    <epmc href="plans/read-epmc.plan">
+      <remove>tables</remove>
+    </epmc>
+
     <bionlp-st class="BioNLPSTReader">
       <active>true</active>
       <section>abstract</section>
diff --git a/plans/read-epmc.plan b/plans/read-epmc.plan
new file mode 100644
index 00000000..89bef97d
--- /dev/null
+++ b/plans/read-epmc.plan
@@ -0,0 +1,27 @@
+<alvisnlp-plan id="read-epmc">
+  <param name="input">
+    <alias module="read" param="source"/>
+  </param>
+
+  <param name="xslt">
+    <alias module="read" param="xslTransform"/>
+  </param>
+
+  <param name="remove">
+    <alias module="remove" param="select"/>
+  </param>
+  <!-- Reads the source given by "input" through the stylesheet given by "xslt". -->
+  <read class="XMLReader"/>
+
+  <remove>
+    <tables class="Action">
+      <target>documents.sections.layer:html[@tag == "table-wrap"]</target>
+      <action>add:strip-layer</action>
+      <addToLayer/>
+    </tables>
+  </remove>
+  <!-- Presumably removes the text covered by the annotations collected in "strip-layer"; verify against RemoveContents. -->
+  <do-remove class="RemoveContents">
+    <stripLayer>strip-layer</stripLayer>
+  </do-remove>
+</alvisnlp-plan>
-- 
GitLab


From 3de84da83dcadfcd225025e4f6ca3862491ce7dd Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inrae.fr>
Date: Wed, 23 Oct 2024 15:49:37 +0200
Subject: [PATCH 5/6] process section "body"

---
 plans/omnicrobe_main.plan  | 4 ++--
 plans/output.plan          | 6 +++---
 plans/term-extraction.plan | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/plans/omnicrobe_main.plan b/plans/omnicrobe_main.plan
index 18f42cdc..1584d2db 100644
--- a/plans/omnicrobe_main.plan
+++ b/plans/omnicrobe_main.plan
@@ -214,7 +214,7 @@
 
   <!-- Taxa recognition -->
   <taxa file="plans/taxa.plan">
-    <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter>
+    <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter>
   </taxa>
 
   <!-- Project stopwords -->
@@ -222,7 +222,7 @@
 
     <!-- Geographical location recognition -->
   <geo file="plans/geo.plan">
-    <sectionFilter>@name == "title" or @name == "abstract" or @name == "text"</sectionFilter>
+    <sectionFilter>str:inset(@name, "title", "abstract", "body")</sectionFilter>
   </geo>
 
   <segmentation file="plans/segmentation.plan"/>
diff --git a/plans/output.plan b/plans/output.plan
index 1f7505d8..94061051 100644
--- a/plans/output.plan
+++ b/plans/output.plan
@@ -219,7 +219,7 @@
   <index class="AlvisIRIndexer">
     <indexDir>index</indexDir>
     <tokenPositionGap>9216</tokenPositionGap>
-    <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames>
+    <fieldNames>title,abstract,body,author,full-author,pmid,year,journal,mesh,url</fieldNames>
     <relations>
       <livesin>taxon,habitat</livesin>
       <exhibits>taxon,phenotype</exhibits>
@@ -228,7 +228,7 @@
     <propertyKeys/>
     <documents>
       <fields>
-	<instances>sections:title | sections:abstract</instances>
+	<instances>sections:title | sections:abstract | sections:body</instances>
 	<annotations>
 	  <instances>layer:microorganism</instances>
 	  <text>"{taxon}" ^ @path ^ "/"</text>
@@ -462,4 +462,4 @@
     <columns>@id</columns>
   </success>
 
-</alvisnlp-plan>
\ No newline at end of file
+</alvisnlp-plan>
diff --git a/plans/term-extraction.plan b/plans/term-extraction.plan
index 279ecd53..87c55117 100644
--- a/plans/term-extraction.plan
+++ b/plans/term-extraction.plan
@@ -46,7 +46,7 @@
 
     <!-- Run Yatea term extractor -->
     <yatea class="YateaExtractor">
-      <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
+      <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter>
       <xmlTermsFile>yatea/candidates.xml</xmlTermsFile>
       <posFeature>pos</posFeature>
       <configDir>share/YaTeA/config-habitats</configDir>
@@ -57,7 +57,7 @@
 
     <!-- Run Yatea term extractor on variants -->
     <yatea-var class="YateaExtractor">
-      <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
+      <sectionFilter>str:inset(@name, "title", "abstract", "text", "body")</sectionFilter>
       <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile>
       <posFeature>pos</posFeature>
       <lemmaFeature>variant</lemmaFeature>
@@ -67,4 +67,4 @@
       <!--perlLib>/projet/mig/work/textemig/biotopes/software/yatea-lib</perlLib-->
     </yatea-var>
 
-</alvisnlp-plan>
\ No newline at end of file
+</alvisnlp-plan>
-- 
GitLab


From 59a2201f8325571abdc92c6ee7e6231cd29ae196 Mon Sep 17 00:00:00 2001
From: Mouhamadou Ba <mandiayba@gmail.com>
Date: Fri, 25 Oct 2024 15:27:16 +0200
Subject: [PATCH 6/6] add pipeline to process epmc

---
 process_epmc.snakefile | 230 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 process_epmc.snakefile

diff --git a/process_epmc.snakefile b/process_epmc.snakefile
new file mode 100644
index 00000000..434c7bb5
--- /dev/null
+++ b/process_epmc.snakefile
@@ -0,0 +1,230 @@
+## local rules: the directive below is commented out, so it currently has no effect
+# localrules: all, concat_results
+
+## config file: provides the config[...] keys read below (EPMC_BATCHES_HOME, EPMC_XSLT_FILE, NCBI_TAXO_*, SINGULARITY_IMG, ALVISIR_HOME)
+configfile: "config/config.yaml"
+
+'''
+## variables, check values into config file
+_ontobiotope = config['ONTOBIOTOPE']
+_names = config['NCBI_TAXO_NAMES']
+_pubmed_batches_home = config['EPMC_BATCHES_HOME']
+_pubmed_xslt_file = config['EPMC_XSLT_FILE']
+_ncbi_taxo_microorganisms = config['NCBI_TAXO_MICROORGANISMS']
+_ncbi_taxo_id = config['NCBI_TAXO_ID']
+_ncbi_taxo_and_id_microorganisms = config['NCBI_TAXO_AND_ID_MICROORGANISMS']
+'''
+
+
+## document batches: each sub-directory of EPMC_BATCHES_HOME holding PMC*.xml files, listed once
+## (glob_wildcards treats "*" literally, so the file name is matched with a constrained wildcard)
+BATCHES = sorted(set(glob_wildcards(config["EPMC_BATCHES_HOME"] + "/{id,[^/]+}/{doc,PMC[^/]*}.xml").id))
+## list of the results
+RESULTS = ["relations", "phenotype-relations", "uses-relations", "microorganisms", "habitats", "phenotypes", "uses"]
+
+
+# Default target rule: lists every file the workflow must produce.
+# Each path has to match the output of one of the rules of this file, otherwise
+# Snakemake stops with a MissingInputException before running anything.
+rule all:
+    input:
+        relations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/relations.txt", B=BATCHES),
+        phenotypeRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotype-relations.txt", B=BATCHES),
+        usesRelations=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses-relations.txt", B=BATCHES),
+        microorganisms=expand(config["EPMC_BATCHES_HOME"] + "/{B}/microorganisms.txt", B=BATCHES),
+        habitats=expand(config["EPMC_BATCHES_HOME"] + "/{B}/habitats.txt", B=BATCHES),
+        phenotypes=expand(config["EPMC_BATCHES_HOME"] + "/{B}/phenotypes.txt", B=BATCHES),
+        uses=expand(config["EPMC_BATCHES_HOME"] + "/{B}/uses.txt", B=BATCHES),
+        index=expand(config["EPMC_BATCHES_HOME"] + "/{B}/index", B=BATCHES),
+        index_folder="corpora/epmc/index",
+        expander_folder="corpora/epmc/expander",
+        florilege_Habitat_result="corpora/florilege/epmc/EPMC-Habitat.txt",
+        florilege_Phenotype_result="corpora/florilege/epmc/EPMC-Phenotype.txt",
+        florilege_Use_result="corpora/florilege/epmc/EPMC-Use.txt",
+        result=expand("corpora/epmc/{R}.full.txt", R=RESULTS)
+
+
+# Run the AlvisNLP plan (plans/omnicrobe_main.plan) on one EPMC batch directory;
+# it writes the result tables, the index and the YaTeA candidate files of that batch.
+# NOTE(review): the command passes "-alias format pubmed" and a fixed log file name;
+# confirm both are intended for EPMC input and for batches running in parallel.
+rule run_epmc_main:
+    input:
+        file = config['EPMC_BATCHES_HOME'] + "/{B}/",
+        xslt = config['EPMC_XSLT_FILE']
+    output:
+        results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS),
+        index=directory(config["EPMC_BATCHES_HOME"] + "/{B}/index"),
+        yatea_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea/candidates.xml",
+        yatea_var_candidates=config["EPMC_BATCHES_HOME"] + "/{B}/yatea-var/candidates.xml"
+    params:
+        batch="{B}",
+        corpus='epmc',
+        onto_habitat='share/BioNLP-OST+EnovFood-Habitat.obo',
+        tomap_habitat='share/BioNLP-OST+EnovFood-Habitat.tomap',
+        onto_pheno='share/BioNLP-OST+EnovFood-Phenotype.obo',
+        tomap_pheno='share/BioNLP-OST+EnovFood-Phenotype.tomap',
+        graylist='share/graylist_extended.heads',
+        emptywords='share/stopwords_EN.ttg',
+        onto_use='share/BioNLP-OST+EnovFood-Use.obo',
+        plan='plans/omnicrobe_main.plan',
+        dir=config["EPMC_BATCHES_HOME"] + "/{B}/",
+        taxid_microorganisms=config['NCBI_TAXO_MICROORGANISMS'],
+        taxa_id_full=config['NCBI_TAXO_ID'],
+        dummy=config["EPMC_BATCHES_HOME"] + '/{B}/bionlp-st',
+        log="alvisnlp.log"
+    singularity:config["SINGULARITY_IMG"]
+    shell:"""
+        rm -f {output.yatea_candidates} {output.yatea_var_candidates} && mkdir -p {params.dummy} && alvisnlp -J-XX:+UseSerialGC -J-Xmx30g -cleanTmp -verbose \
+        -log {params.log} \
+        -alias format pubmed \
+        -alias input {input.file} \
+        -alias input-xslt {input.xslt} \
+        -alias batch batch={params.batch} \
+        -outputDir {params.dir} \
+        -alias ontobiotope-habitat {params.onto_habitat} \
+        -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \
+        -alias ontobiotope-phenotypes {params.onto_pheno} \
+        -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \
+        -alias ontobiotope-use {params.onto_use} \
+        -alias taxid_microorganisms {params.taxid_microorganisms} \
+        -alias taxa+id_full {params.taxa_id_full} \
+        {params.plan}
+        """
+
+
+'''
+select results to concat
+
+rule selectToConcat:
+    input:
+        results=expand(config["EPMC_BATCHES_HOME"] + "/{{B}}/{R}.txt", R=RESULTS),
+        index=config["EPMC_BATCHES_HOME"] + "/{B}/index"
+    output:
+        results=config["EPMC_BATCHES_HOME"] + "/{B}/{R}.txt"
+'''
+
+
+# Concatenate, for one result type {R}, the per-batch files into a single
+# corpus-wide file. {R} is one of:
+#   * relations
+#   * phenotype-relations
+#   * uses-relations
+#   * microorganisms
+#   * habitats
+#   * phenotypes
+#   * uses
+# Done in Python rather than with `cat`: with many batches the argument list
+# becomes too long for bash.
+rule concat_results:
+    input:
+        expand(config["EPMC_BATCHES_HOME"] + "/{B}/{{R}}.txt", B=BATCHES)
+    output:
+        result="corpora/epmc/{R}.full.txt"
+    run:
+        import shutil  # local import: streams each file instead of loading it whole
+        with open(output.result, 'w') as merged:
+            for batch_file in input:
+                with open(batch_file) as part:
+                    shutil.copyfileobj(part, merged)
+
+# Rule "select" is disabled and kept here for reference only.
+# Its outputs are also produced by concat_results (ambiguous rules), the same
+# files appear among its own inputs (cyclic dependency), and its shell command
+# was empty, so it could never create anything.
+#
+# rule select:
+#     input:
+#         files=expand("corpora/epmc/{R}.full.txt", R=RESULTS)
+#     output:
+#         relation="corpora/epmc/relations.full.txt",
+#         phenotype_relations="corpora/epmc/phenotype-relations.full.txt",
+#         use_relations="corpora/epmc/uses-relations.full.txt"
+            
+
+
+# Merge the per-batch indexes into a single index directory using Lucene's
+# IndexMergeTool, loaded from the jars under ALVISIR_HOME/lib. The merged
+# directory is the first argument, the batch indexes follow.
+rule merge_epmc_index:
+    input:
+        batch_indexes=expand(config["EPMC_BATCHES_HOME"] +"/{B}/index", B=BATCHES)
+    output:
+        merged_index=directory("corpora/epmc/index")
+    params:
+        alvisir_home=config["ALVISIR_HOME"]
+    shell: """
+        java -cp {params.alvisir_home}/lib/lucene-core-3.6.1.jar:{params.alvisir_home}/lib/lucene-misc-3.6.1.jar \
+        org.apache.lucene.misc.IndexMergeTool \
+        {output.merged_index} {input.batch_indexes}
+        """
+
+
+
+# Build the AlvisIR expander directory from share/expander.xml. Only that file
+# is passed on the command line; the taxonomy and ontology files are declared
+# as inputs as well (presumably referenced by expander.xml - TODO confirm).
+rule create_epmc_expander:
+    input:
+        expander_spec="share/expander.xml",
+        taxa_id_microorganisms=config['NCBI_TAXO_AND_ID_MICROORGANISMS'],
+        onto_habitat="share/BioNLP-OST+EnovFood-Habitat.obo",
+        onto_phenotype="share/BioNLP-OST+EnovFood-Phenotype.obo",
+        onto_use="share/BioNLP-OST+EnovFood-Use.obo"
+    output:
+        expander_dir=directory("corpora/epmc/expander")
+    params:
+        alvisir_home=config["ALVISIR_HOME"]
+    shell:"""
+        {params.alvisir_home}/bin/alvisir-index-expander {output.expander_dir} {input.expander_spec}
+          """
+
+
+# Format the concatenated relations (relations.full.txt) for integration in
+# Florilege. The formatting script writes to standard output, which is
+# redirected into EPMC-Habitat.txt.
+# Runs in the conda environment softwares/envs/python3_env.yaml.
+rule format_epmc_relations:
+    input:
+        relations="corpora/epmc/relations.full.txt"
+    output:
+        formatted="corpora/florilege/epmc/EPMC-Habitat.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.relations} \
+        > {output.formatted}
+          """
+
+
+# Format the concatenated phenotype relations (phenotype-relations.full.txt)
+# for integration in Florilege. The formatting script writes to standard
+# output, which is redirected into EPMC-Phenotype.txt.
+# Runs in the conda environment softwares/envs/python3_env.yaml.
+rule format_epmc_phenotype_relations:
+    input:
+        relations="corpora/epmc/phenotype-relations.full.txt"
+    output:
+        formatted="corpora/florilege/epmc/EPMC-Phenotype.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.relations} \
+        > {output.formatted}
+          """
+
+# Format the concatenated use relations (uses-relations.full.txt) for
+# integration in Florilege. The formatting script writes to standard output,
+# which is redirected into EPMC-Use.txt.
+# Runs in the conda environment softwares/envs/python3_env.yaml.
+rule format_epmc_use_relations:
+    input:
+        relations="corpora/epmc/uses-relations.full.txt"
+    output:
+        formatted="corpora/florilege/epmc/EPMC-Use.txt"
+    conda: 'softwares/envs/python3_env.yaml'
+    shell:"""
+        python softwares/scripts/format-pubmed-results.py \
+        --pubmed-results {input.relations} \
+        > {output.formatted}
+          """
\ No newline at end of file
-- 
GitLab