From b3ed57d4dbc4bc841c3df3302c15233f357872ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Thu, 11 Jul 2024 17:24:19 +0200
Subject: [PATCH 01/11] implementation of model selection for MLE using Minka's
 BIC

---
 prob_dim_red/linear_gaussian_ppca.py | 104 ++++++++++++++++++++++++++-
 prob_dim_red/utils.py                |   3 +-
 2 files changed, 103 insertions(+), 4 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 2e00029..f17d7f4 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -17,6 +17,8 @@
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
+# pylint:disable=too-many-lines
+
 """file containing the main functions and classes of the package"""
 # imports
 import itertools
@@ -140,10 +142,12 @@ class LinearGaussianPpcaMleResult(_LinearGaussianPpcaResult):
 
     # pylint:disable=invalid-name
     mean: npt.NDArray[Any]
+    W: npt.NDArray[Any]
     noise_var: np.float64
     log_likelihood: np.float64
     model_var_inv: npt.NDArray[Any]
     log_det_model_var: npt.NDArray[Any]
+    bic: np.float64
 
 
 class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
@@ -165,9 +169,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         """
         return (self.centered_data.T @ self.centered_data) / self.N
 
-    def fit(self):
+    @cached_property
+    def tr_sample_cov(self):
         """
-        Fit the Linear Gaussian PPCA model using MLE.
+        Cached sample covariance trace.
+        """
+        return np.trace(self.sample_cov)
+
+    @cached_property
+    def evd(self):
+        """
+        Cached truncated eigenvalue decomposition.
         """
         # pylint:disable=duplicate-code
         eig_val, eig_vec = scipy.linalg.eigh(
@@ -176,6 +188,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         eig_val = eig_val[::-1]
         eig_vec = eig_vec[:, ::-1]
 
+        return eig_val, eig_vec
+
+    def fit(self):
+        """
+        Fit the Linear Gaussian PPCA model using MLE.
+        """
+        if self._result is not None:
+            return
+
+        eig_val, eig_vec = self.evd
+
         noise_var_mle = (np.trace(self.sample_cov) - eig_val.sum()) / (
             self.X_dim - self.Z_dim
         )
@@ -204,6 +227,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
                 + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True)
             )
         )  # pylint:disable=duplicate-code
+
+        m_dim = (
+            self.X_dim * self.Z_dim - (self.Z_dim * (self.Z_dim + 1)) // 2
+        )  # dimension of Stiefel manifold
+
+        log_bic = (
+            -(self.N / 2) * np.log(eig_val).sum()
+            - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle)
+            - (m_dim + self.Z_dim) / 2 * np.log(self.N)
+        )
+
         self._result = LinearGaussianPpcaMleResult(
             mean=self.mean,
             W=w_mle,
@@ -211,6 +245,7 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
             log_likelihood=log_likelihood,
             model_var_inv=model_var_inv,
             log_det_model_var=log_det_model_var,
+            bic=log_bic,
         )
 
 
@@ -575,6 +610,71 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         return self
 
 
+class MLE_Model_Selection(LinearGaussianPPCAEstimMLE):
+    """
+    Class for model selection of MLE Linear Gaussian PPCA using Minka's
+    Bayesian Information Criterion (BIC) approximation.
+
+    Attributes
+    ----------
+    n_components_max: int | None
+        Maximum number of components specified by the user
+    """
+
+    n_components_max: int | None
+
+    def __init__(self, data, n_components_max=None):
+        super().__init__(data, n_components=None)
+        self.n_components_max = (
+            n_components_max if n_components_max is not None else self.X_dim - 1
+        )
+        self.n_components_max = min(self.n_components_max, self.X_dim - 1)
+
+    def fit(self):
+        if self.Z_dim is not None:
+            return super().fit()
+
+        lambdas, eig_vecs = np.empty(0), np.empty((self.X_dim, 0))
+
+        # compute BICs in power-of-two blocks until the maximum falls inside a block
+        for upper, lower in itertools.pairwise(
+            2**k if k > 0 else 0 for k in itertools.count()
+        ):
+            lower = min(lower, self.n_components_max)
+
+            # compute eigenvalues in ascending-index blocks covering
+            # indices X_dim - lower .. X_dim - upper - 1
+            eig_val, eig_vec = scipy.linalg.eigh(
+                self.sample_cov,
+                subset_by_index=(self.X_dim - lower, self.X_dim - upper - 1),
+            )
+
+            lambdas = np.append(lambdas, eig_val[::-1])
+            eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+            vec_k = np.arange(1, lower + 1)
+
+            bic = (
+                -self.N / 2 * np.cumsum(np.log(lambdas))
+                - self.N
+                * (self.X_dim - vec_k)
+                / 2
+                * np.log(
+                    (self.tr_sample_cov - np.cumsum(lambdas)) / (self.X_dim - vec_k)
+                )
+                - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                / 2
+                * np.log(self.N)
+            )
+
+            if (bic.max() != bic[-1]) or (lower >= self.n_components_max):
+                break
+
+        self.Z_dim = bic.argmax() + 1
+        self.evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
+        return super().fit()
+
+
 def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index d35e321..c5f9d16 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -18,8 +18,7 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """file containing functions and classes that may be useful"""
-from typing import NamedTuple
-from typing import Any
+from typing import NamedTuple, Any
 import numpy as np
 import numpy.typing as npt
 
-- 
GitLab


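For reference, the score added in this patch can be reproduced standalone. The following is a minimal sketch of Minka's BIC approximation as implemented in `fit` above, assuming only numpy/scipy; `bic_scores`, `sample_cov`, `n` and `k_max` are illustrative names, not part of the package, and `k_max` is assumed strictly smaller than the data dimension:

import numpy as np
import scipy.linalg

def bic_scores(sample_cov, n, k_max):
    """Minka-style BIC for PPCA with k = 1..k_max components (k_max < d)."""
    d = sample_cov.shape[0]
    # top k_max eigenvalues of the sample covariance, in descending order
    eig = scipy.linalg.eigh(
        sample_cov, eigvals_only=True, subset_by_index=(d - k_max, d - 1)
    )[::-1]
    ks = np.arange(1, k_max + 1)
    # MLE of the noise variance for each k: mean of the discarded eigenvalues
    noise = (np.trace(sample_cov) - np.cumsum(eig)) / (d - ks)
    # dimension of the Stiefel manifold of k orthonormal directions in R^d
    m = d * ks - ks * (ks + 1) // 2
    return (
        -(n / 2) * np.cumsum(np.log(eig))
        - (n * (d - ks) / 2) * np.log(noise)
        - (m + ks) / 2 * np.log(n)
    )

The selected latent dimension is then `bic_scores(sample_cov, n, k_max).argmax() + 1`, matching the `bic.argmax() + 1` assignment in `MLE_Model_Selection.fit`; the class merely builds the same cumulative sums block by block so that small optimal dimensions are found without a full eigendecomposition.
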
From 17a4bf7d04b83e4689c1a414499a2b0980615785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Thu, 8 Aug 2024 11:59:32 +0200
Subject: [PATCH 02/11] Dealing with zeros and inf safely, computing BIC the
 same way in MLE and model selection

---
 prob_dim_red/linear_gaussian_ppca.py | 231 ++++++++++++++++++++++++---
 prob_dim_red/utils.py                |  77 +++++++++
 2 files changed, 283 insertions(+), 25 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index f17d7f4..9b8ab39 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -24,7 +24,8 @@
 import itertools
 from typing import Any, Optional, Generator
 from dataclasses import dataclass
-from functools import cached_property
+from functools import cached_property, reduce
+import operator
 
 import numpy as np
 import numpy.typing as npt
@@ -35,7 +36,7 @@ from prob_dim_red import utils
 _ARD_ALPHA_INV_LOWER_BOUND = 1e-200
 
 
-class _NotFittedModelError(Exception):
+class NotFittedModelError(Exception):
     """
     Custom exception raised when a model has not been fitted yet.
     """
@@ -113,11 +114,11 @@ class _LinearGaussianPPCAEstim:
 
         Raises
         ------
-        _NotFittedModelError
+        NotFittedModelError
             If the model has not been fitted yet.
         """
         if self._result is None:
-            raise _NotFittedModelError
+            raise NotFittedModelError
         return self._result
 
 
@@ -177,7 +178,7 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         return np.trace(self.sample_cov)
 
     @cached_property
-    def evd(self):
+    def _evd(self):
         """
         Cached truncated eigenvalue decomposition.
         """
@@ -197,35 +198,46 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         if self._result is not None:
             return
 
-        eig_val, eig_vec = self.evd
+        eig_val, eig_vec = self._evd
 
-        noise_var_mle = (np.trace(self.sample_cov) - eig_val.sum()) / (
+        # for consistency with model selection these sums must be associative and
+        # commutative, so that numerical errors do not depend on the order of summation
+        eig_val_sum = np.float64(
+            reduce(operator.add, (utils.FractionOrInf(x) for x in eig_val))
+        )
+        log_eig_val_sum = np.float64(
+            reduce(operator.add, (utils.FractionOrInf(x) for x in np.log(eig_val)))
+        )
+
+        noise_var_mle = utils.positive_part(self.tr_sample_cov - eig_val_sum) / (
             self.X_dim - self.Z_dim
         )
 
-        w_mle = eig_vec * np.sqrt(eig_val - noise_var_mle)[None, :]
+        w_mle = eig_vec * np.sqrt(utils.positive_part(eig_val - noise_var_mle))[None, :]
 
         posterior_precision = w_mle.T @ w_mle + noise_var_mle * np.eye(self.Z_dim)
         posterior_variance = np.linalg.inv(posterior_precision)
 
-        model_var_inv = (
-            1
-            / (noise_var_mle)
-            * (np.eye(self.X_dim) - w_mle @ posterior_variance @ w_mle.T)
+        model_var_inv = (1 / (noise_var_mle) if noise_var_mle > 0 else np.inf) * (
+            np.eye(self.X_dim) - w_mle @ posterior_variance @ w_mle.T
         )
 
-        log_det_model_var = (self.X_dim - self.Z_dim) * np.log(noise_var_mle) + np.log(
-            eig_val
-        ).sum()
+        log_det_model_var = (self.X_dim - self.Z_dim) * (
+            np.log(noise_var_mle) if noise_var_mle > 0 else -np.inf
+        ) + log_eig_val_sum
         # pylint:disable=duplicate-code
         log_likelihood = (
-            -self.N
-            / 2
-            * (
-                self.X_dim * np.log(2 * np.pi)
-                + log_det_model_var
-                + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True)
+            (
+                -self.N
+                / 2
+                * (
+                    self.X_dim * np.log(2 * np.pi)
+                    + log_det_model_var
+                    + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True)
+                )
             )
+            if noise_var_mle > 0
+            else -np.inf
         )  # pylint:disable=duplicate-code
 
         m_dim = (
@@ -233,9 +245,13 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         )  # dimension of Stiefel manifold
 
         log_bic = (
-            -(self.N / 2) * np.log(eig_val).sum()
-            - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle)
-            - (m_dim + self.Z_dim) / 2 * np.log(self.N)
+            (
+                -(self.N / 2) * log_eig_val_sum
+                - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle)
+                - (m_dim + self.Z_dim) / 2 * np.log(self.N)
+            )
+            if noise_var_mle > 0
+            else -np.inf
         )
 
         self._result = LinearGaussianPpcaMleResult(
@@ -671,10 +687,175 @@ class MLE_Model_Selection(LinearGaussianPPCAEstimMLE):
                 break
 
         self.Z_dim = bic.argmax() + 1
-        self.evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
+        self._evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
         return super().fit()
 
 
+class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE):
+    """
+    Class for model selection of MLE Linear Gaussian PPCA using
+    Minka's Bayesian Information Criterion (BIC) approximation.
+
+    Attributes
+    ----------
+    n_components_max: int | None
+        Maximum number of components specified by the user
+    strategy: str
+        Strategy to compute BICs, either 'grow' or 'all'
+    """
+
+    n_components_max: int | None
+    strategy: str
+    _bics: np.ndarray
+
+    def __init__(
+        self,
+        data: np.ndarray,
+        n_components_max: Optional[int] = None,
+        strategy: str = "grow",
+    ):
+        super().__init__(data, n_components=None)
+        self.n_components_max = (
+            n_components_max if n_components_max is not None else self.X_dim - 1
+        )
+        self.n_components_max = min(self.n_components_max, self.X_dim - 1)
+        self.strategy = strategy
+        self._bics = np.empty(0)
+
+    def fit(self):
+        if self.Z_dim is not None:
+            return super().fit()
+
+        lambdas, eig_vecs = np.empty(0), np.empty((self.X_dim, 0))
+
+        if self.strategy == "grow":
+            # compute BICs in power-of-two blocks until the maximum falls inside a block
+            for upper, lower in itertools.pairwise(
+                2**k if k > 0 else 0 for k in itertools.count()
+            ):
+                lower = min(lower, self.n_components_max)
+
+                # compute eigenvalues in ascending-index blocks covering
+                # indices X_dim - lower .. X_dim - upper - 1
+                eig_val, eig_vec = scipy.linalg.eigh(
+                    self.sample_cov,
+                    subset_by_index=(self.X_dim - lower, self.X_dim - upper - 1),
+                )
+
+                lambdas = utils.positive_part(np.append(lambdas, eig_val[::-1]))
+                eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+                # for consistency with the plain MLE fit these sums must be associative and
+                # commutative, so that numerical errors do not depend on the order of summation
+                lambdas_cumsum = np.array(
+                    list(
+                        itertools.accumulate(
+                            (utils.FractionOrInf(x) for x in lambdas),
+                            operator.add,
+                        )
+                    ),
+                    dtype=np.float64,
+                )
+                log_lambdas_cumsum = np.array(
+                    list(
+                        itertools.accumulate(
+                            (
+                                utils.FractionOrInf(x)
+                                for x in utils.log_with_zeros(lambdas)
+                            ),
+                            operator.add,
+                        )
+                    ),
+                    dtype=np.float64,
+                )
+
+                vec_k = np.arange(1, lower + 1)
+                bics = (
+                    -self.N / 2 * log_lambdas_cumsum
+                    - self.N
+                    * (self.X_dim - vec_k)
+                    / 2
+                    * utils.log_with_zeros(
+                        utils.positive_part(self.tr_sample_cov - lambdas_cumsum)
+                        / (self.X_dim - vec_k)
+                    )
+                    - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                    / 2
+                    * np.log(self.N)
+                )
+
+                if (bics.max() != bics[-1]) or (lower >= self.n_components_max):
+                    break
+
+        elif self.strategy == "all":
+
+            lower = self.n_components_max
+
+            eig_val, eig_vec = scipy.linalg.eigh(
+                self.sample_cov,
+                subset_by_index=(self.X_dim - lower, self.X_dim - 1),
+            )
+            lambdas = utils.positive_part(np.append(lambdas, eig_val[::-1]))
+            eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+            vec_k = np.arange(1, lower + 1)
+
+            # for consistency with the plain MLE fit these sums must be associative and
+            # commutative, so that numerical errors do not depend on the order of summation
+            lambdas_cumsum = np.array(
+                list(
+                    itertools.accumulate(
+                        (utils.FractionOrInf(x) for x in lambdas),
+                        operator.add,
+                    )
+                ),
+                dtype=np.float64,
+            )
+            log_lambdas_cumsum = np.array(
+                list(
+                    itertools.accumulate(
+                        (utils.FractionOrInf(x) for x in utils.log_with_zeros(lambdas)),
+                        operator.add,
+                    )
+                ),
+                dtype=np.float64,
+            )
+
+            bics = (
+                -self.N / 2 * log_lambdas_cumsum
+                - self.N
+                * (self.X_dim - vec_k)
+                / 2
+                * utils.log_with_zeros(
+                    utils.positive_part(self.tr_sample_cov - lambdas_cumsum)
+                    / (self.X_dim - vec_k)
+                )
+                - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                / 2
+                * np.log(self.N)
+            )
+
+        self._bics = bics
+        self.Z_dim = self._bics.argmax() + 1
+        self._evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
+        return super().fit()
+
+    @property
+    def bics(self) -> np.ndarray:
+        """
+        Get the BICs computed during the model selection.
+
+        Returns
+        -------
+        np.ndarray
+            Array of BIC values.
+        """
+        if self.Z_dim is None:
+            raise NotFittedModelError
+
+        return self._bics
+
+
 def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index c5f9d16..b12f432 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -18,7 +18,10 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """file containing functions and classes that may be useful"""
+from dataclasses import dataclass
 from typing import NamedTuple, Any
+from fractions import Fraction
+
 import numpy as np
 import numpy.typing as npt
 
@@ -167,3 +170,77 @@ def trace_matmul(
         return np.einsum("ij,ij->", mat_a, mat_b)
 
     return np.einsum("ij,ji->", mat_a, mat_b)
+
+
+def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
+    """
+    Compute the positive part of an array element-wise.
+
+    This function returns an array where each element is the maximum of the input
+    element and zero, effectively replacing all negative values with zero.
+
+    Parameters
+    ----------
+    x : array-like
+        Input array or scalar to compute the positive part.
+
+    Returns
+    -------
+    ndarray
+        An array with the same shape as `x`, where each element is the positive
+        part of the corresponding element in `x`.
+    """
+    return np.maximum(x, 0)
+
+
+def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[Any]:
+    """
+    Compute the natural logarithm of an array, treating zeros safely.
+
+    This function computes the natural logarithm of each element in the input array.
+    Elements equal to zero are mapped to positive infinity rather than `-inf`, and
+    negative elements raise an AssertionError, since the logarithm is undefined there.
+
+    Parameters
+    ----------
+    x : array-like
+        Input array or scalar for which the logarithm is computed. All elements must be
+        non-negative.
+
+    Returns
+    -------
+    ndarray
+        An array with the same shape as `x`, where each element is the natural logarithm
+        of the corresponding element in `x`. Elements that are zero are mapped to positive infinity.
+    """
+    assert (np.asarray(x) >= 0).all()
+
+    return np.log(x, out=np.full_like(x, np.inf), where=x > 0)
+
+
+@dataclass
+class FractionOrInf:
+    """
+    Exact summand: a `Fraction` for finite values, a raw float for infinities,
+    so that accumulation is associative and independent of summation order.
+
+    Attributes
+    ----------
+    value : Fraction | np.float64
+    """
+
+    value: Fraction | np.float64
+
+    def __init__(self, value: Fraction | np.float64):
+        if isinstance(value, Fraction):
+            self.value = value
+            return
+        if np.isinf(value):
+            self.value = value
+            return
+        self.value = Fraction(value)
+
+    def __add__(self, oth: "FractionOrInf"):
+        if isinstance(self.value, Fraction) and isinstance(oth.value, Fraction):
+            return FractionOrInf(self.value + oth.value)
+        return FractionOrInf(np.float64(self.value) + np.float64(oth.value))
+
+    def __float__(self):
+        return float(self.value)
-- 
GitLab


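The `FractionOrInf` accumulator exists because float addition is not associative, so summing the same eigenvalues in one pass (plain MLE) and block by block (model selection) can give different totals. A small demonstration of the failure mode and of why exact rationals avoid it; this is an illustration, not package code:

from fractions import Fraction

a, b, c = 0.1, 0.2, 0.3

# float addition depends on grouping:
print((a + b) + c)  # 0.6000000000000001
print(a + (b + c))  # 0.6

# Fractions convert each float exactly, so grouping no longer matters:
lhs = float((Fraction(a) + Fraction(b)) + Fraction(c))
rhs = float(Fraction(a) + (Fraction(b) + Fraction(c)))
assert lhs == rhs

`FractionOrInf` adds one wrinkle on top of `Fraction`: since `Fraction` cannot represent infinities, any `inf` produced by the degenerate-noise guards is kept as a raw float and propagated through `__add__`, which is why the patch reduces with `utils.FractionOrInf` and only converts the result back with `np.float64` at the end.
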
From 2a20d47b8a84b715fb7545c35735cf3264dba5a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Fri, 9 Aug 2024 15:15:32 +0200
Subject: [PATCH 03/11] modification of the complete log-likelihood

---
 prob_dim_red/linear_gaussian_ppca.py | 71 +++++++++++++++++++---------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 9b8ab39..910667e 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -30,6 +30,7 @@ import operator
 import numpy as np
 import numpy.typing as npt
 import scipy
+from tqdm import tqdm
 
 from prob_dim_red import utils
 
@@ -276,6 +277,8 @@ class _DataDescFixed:
         Number of data points.
     X_dim : int
         Dimension of the data.
+    Z_dim : int | None
+        Dimension of the latent space.
     sample_cov : npt.NDArray[Any]
         Sample covariance of the data.
     """
@@ -283,6 +286,7 @@ class _DataDescFixed:
     # pylint:disable=invalid-name
     N: int
     X_dim: int
+    Z_dim: int | None
     centered_data: npt.NDArray[Any]
 
     @cached_property
@@ -398,28 +402,25 @@ class _EM_state:
         Complete log-likelihood of the model.
         """
         # pylint:disable=line-too-long
+        if self.noise_var == 0:
+            return -np.inf
+
         return (
-            (
-                -self.fixed.N
-                * self.fixed.X_dim
-                / 2
-                * np.log(2 * np.pi * self.noise_var)
-                - self.fixed.N
-                / 2
-                * self.noise_var
-                * (
-                    np.trace(self.posterior_variance)
-                    + np.trace(
-                        self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
-                    )
+            -self.fixed.N * self.fixed.Z_dim / 2 * np.log(2 * np.pi)
+            - self.fixed.N * self.fixed.X_dim / 2 * np.log(2 * np.pi * self.noise_var)
+            - self.fixed.N
+            / 2
+            * (
+                self.noise_var * np.trace(self.posterior_variance)
+                + np.trace(
+                    self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
                 )
-                - 1 / 2 * self.fixed.N * self.fixed.tr_sample_cov
             )
-            + 1
+            - self.fixed.N / 2 * self.fixed.tr_sample_cov
+            + self.fixed.N
             / self.noise_var
             * np.trace(
-                self.fixed.N
-                * self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
+                self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
             )
             - self.fixed.N
             / 2
@@ -440,6 +441,8 @@ class _EM_state:
         """
         Log-likelihood of the model.
         """
+        if self.noise_var == 0:
+            return -np.inf
         return (
             -self.fixed.N
             / 2
@@ -512,7 +515,10 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         """
         super().__init__(*args, **kwargs)
         self.fixed = _DataDescFixed(
-            N=self.N, X_dim=self.X_dim, centered_data=self.centered_data
+            N=self.N,
+            X_dim=self.X_dim,
+            Z_dim=self.Z_dim,
+            centered_data=self.centered_data,
         )
         self.state = None
 
@@ -543,7 +549,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
 
         # W_new, _r = np.linalg.qr(W_new)
 
-        noise_var_new = (
+        noise_var_new = utils.positive_part(
             1
             / self.X_dim
             * (
@@ -598,6 +604,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         max_iterations=None,
         error_tolerance=1e-6,
         trace=False,
+        progress=True,
     ) -> Optional[list[_EM_state]]:
         """
         Fit the Linear Gaussian PPCA model using the EM algorithm.
@@ -610,6 +617,8 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             Tolerance for the convergence of the EM algorithm.
         trace : bool
             Whether to store the intermediate states of the EM algorithm.
+        progress : bool
+            Show progress with tqdm (not compatible with trace).
 
         Returns
         -------
@@ -620,8 +629,23 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         if trace:
             return states
 
-        for _ in states:
-            pass
+        if progress:
+            last_comp_lik = 0
+            scale = 0
+            for state in (progress_bar := tqdm(states, unit_scale=True)):
+                gap, last_comp_lik = (
+                    state.complete_log_likelihood - last_comp_lik,
+                    state.complete_log_likelihood,
+                )
+                scale += 0.1 * (np.log10(np.abs(gap)) - scale)
+                dscale = max(int(-np.floor(scale - 0.5)), 0)
+                fmt = "comp-log-lik={" + f":.{dscale:d}f" + "}"
+                progress_bar.set_postfix_str(
+                    fmt.format(state.complete_log_likelihood), refresh=False
+                )
+        else:
+            for _ in states:
+                pass
 
         return self
 
@@ -1105,7 +1129,10 @@ class LinearGaussianPPCAEstimARD(_LinearGaussianPPCAEstim):
         """
         super().__init__(*args, **(kwargs | {"n_components": None}))
         self.fixed = _DataDescFixed(
-            N=self.N, X_dim=self.X_dim, centered_data=self.centered_data
+            N=self.N,
+            X_dim=self.X_dim,
+            Z_dim=self.Z_dim,
+            centered_data=self.centered_data,
         )
         self.state = None
 
-- 
GitLab


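The progress-bar logic added here keeps an exponential moving average of log10 of the likelihood gap and turns it into a number of decimals for the tqdm postfix, so the display gains precision as the EM increments shrink. A standalone sketch of that formatting rule with made-up gap values; `postfix_decimals` is an illustrative name, not package code:

import numpy as np

def postfix_decimals(gaps, smoothing=0.1):
    """Yield the decimal count used to format each successive gap."""
    scale = 0.0
    for gap in gaps:
        # exponential moving average of log10(|gap|), as in the patch
        scale += smoothing * (np.log10(np.abs(gap)) - scale)
        # larger gaps -> fewer decimals, never fewer than zero
        yield max(int(-np.floor(scale - 0.5)), 0)

print(list(postfix_decimals([123.0, 10.0, 1.0, 0.05, 0.001])))

Note that the very first gap is measured against an initial `last_comp_lik` of 0, so the first displayed value may carry an odd number of decimals until the moving average settles.
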
From 3c949b40400a342c4e44b929d05df1112d3d9472 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 16:26:38 +0200
Subject: [PATCH 04/11] fix and rewriting of Q in Gaussian pPCA EM

---
 prob_dim_red/linear_gaussian_ppca.py | 43 ++++++++++++----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 910667e..385dc9a 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -405,35 +405,29 @@ class _EM_state:
         if self.noise_var == 0:
             return -np.inf
 
-        return (
-            -self.fixed.N * self.fixed.Z_dim / 2 * np.log(2 * np.pi)
-            - self.fixed.N * self.fixed.X_dim / 2 * np.log(2 * np.pi * self.noise_var)
-            - self.fixed.N
-            / 2
+        return (-self.fixed.N / 2) * (
+            self.fixed.Z_dim * np.log(2 * np.pi)
+            + self.fixed.X_dim * np.log(2 * np.pi * self.noise_var)
+            + self.noise_var * np.trace(self.posterior_variance)
+            + np.trace(
+                self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
+            )
+            + (1 / self.noise_var)
             * (
-                self.noise_var * np.trace(self.posterior_variance)
-                + np.trace(
-                    self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
+                self.fixed.tr_sample_cov
+                - 2
+                * np.trace(
+                    self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
+                )
+                + utils.trace_matmul(
+                    self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
+                    self.noiseless_posterior_precision,
+                    sym=True,
                 )
             )
-            - self.fixed.N / 2 * self.fixed.tr_sample_cov
-            + self.fixed.N
-            / self.noise_var
-            * np.trace(
-                self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
-            )
-            - self.fixed.N
-            / 2
-            * utils.trace_matmul(
+            + utils.trace_matmul(
                 self.posterior_variance, self.noiseless_posterior_precision, sym=True
             )
-            - self.fixed.N
-            / (2 * self.noise_var)
-            * utils.trace_matmul(
-                self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
-                self.noiseless_posterior_precision,
-                sym=True,
-            )
         )
 
     @cached_property
@@ -812,7 +806,6 @@ class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE):
                     break
 
         elif self.strategy == "all":
-
             lower = self.n_components_max
 
             eig_val, eig_vec = scipy.linalg.eigh(
-- 
GitLab


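The rewrite above groups the E-step objective Q into a single (-N/2) times a sum of traces and leans on `utils.trace_matmul`, whose `sym=True` branch replaces tr(AB) = Σᵢⱼ AᵢⱼBⱼᵢ with Σᵢⱼ AᵢⱼBᵢⱼ, valid when B is symmetric, as the posterior-variance and Wᵀ S W factors here are. A quick numerical check of those identities (an illustration, not package code):

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 4))
B = rng.standard_normal((4, 4))
B = B + B.T  # make B symmetric

tr_general = np.einsum("ij,ji->", A, B)  # tr(A @ B) for any A, B
tr_sym = np.einsum("ij,ij->", A, B)      # equal because B is symmetric

assert np.isclose(tr_general, np.trace(A @ B))
assert np.isclose(tr_sym, tr_general)

Both einsum forms avoid materializing the d-by-d product, which is the point of `trace_matmul` in these likelihood evaluations.
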
From adc5f7b58038b8edc8fdccc10abe1d2cf7e837c6 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 17:10:14 +0200
Subject: [PATCH 05/11] added tqdm to package dependencies

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d12ad9d..557b1fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
   "Intended Audience :: Science/Research",
   "Topic :: Scientific/Engineering",
 ]
-dependencies = ["numpy", "scipy", "matplotlib"]
+dependencies = ["numpy", "scipy", "matplotlib", "tqdm"]
 
 [project.optional-dependencies]
 dev = ["build",]
-- 
GitLab


From d430b4fb6c5cb538abb53e92b88d05efc16975db Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 18:52:02 +0200
Subject: [PATCH 06/11] added missing trace term in noise_var_new for EM in
 Gaussian pPCA

---
 prob_dim_red/linear_gaussian_ppca.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 385dc9a..853337d 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -548,10 +548,17 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             / self.X_dim
             * (
                 self.fixed.tr_sample_cov
-                - utils.trace_matmul(
+                - 2
+                * utils.trace_matmul(
                     self.state.sample_cov_factor_load_mat_prod,
                     (self.state.posterior_variance @ W_new.T),
                 )
+                + utils.trace_matmul(
+                    self.state.noise_var * self.state.posterior_variance
+                    + self.state.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
+                    W_new.T @ W_new,
+                    sym=True,
+                )
             )
         )
 
-- 
GitLab


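The extra terms restore the full expected-complete-data update for σ²: tr(S) - 2 tr(S W M⁻¹ W_newᵀ) + tr(E[zzᵀ] W_newᵀ W_new), with E[zzᵀ] = σ² M⁻¹ + M⁻¹ Wᵀ S W M⁻¹. Because W_new solves the M-step equation W_new (σ² I + M⁻¹ Wᵀ S W) = S W, the last term collapses onto the middle one, which is why a later patch in this series can simplify the update back to a single trace. A numerical check of that identity over made-up data (not package code):

import numpy as np

rng = np.random.default_rng(1)
d, k = 6, 2
X = rng.standard_normal((500, d))
S = (X - X.mean(0)).T @ (X - X.mean(0)) / len(X)

W = rng.standard_normal((d, k))
sigma2 = 0.5
Minv = np.linalg.inv(W.T @ W + sigma2 * np.eye(k))

# M-step factor loadings: W_new (sigma2 I + M^-1 W^T S W) = S W
W_new = S @ W @ np.linalg.inv(sigma2 * np.eye(k) + Minv @ W.T @ S @ W)

# per-sample posterior second moment E[z z^T]
Ezz = sigma2 * Minv + Minv @ W.T @ S @ W @ Minv

full = (np.trace(S)
        - 2 * np.trace(S @ W @ Minv @ W_new.T)
        + np.trace(Ezz @ W_new.T @ W_new)) / d
compact = (np.trace(S) - np.trace(S @ W @ Minv @ W_new.T)) / d

assert np.isclose(full, compact)
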
From ff016f2b9fa8f019f819ea9413ecc7c1073ec078 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 19:09:06 +0200
Subject: [PATCH 07/11] added loglik in progress bar for pPCA

---
 prob_dim_red/linear_gaussian_ppca.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 853337d..3cf5f47 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -634,15 +634,20 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             last_comp_lik = 0
             scale = 0
             for state in (progress_bar := tqdm(states, unit_scale=True)):
-                gap, last_comp_lik = (
+                gap, last_comp_lik, last_lik = (
                     state.complete_log_likelihood - last_comp_lik,
                     state.complete_log_likelihood,
+                    state.log_likelihood,
                 )
                 scale += 0.1 * (np.log10(np.abs(gap)) - scale)
                 dscale = max(int(-np.floor(scale - 0.5)), 0)
-                fmt = "comp-log-lik={" + f":.{dscale:d}f" + "}"
+                fmt_comp_lik = "comp-log-lik={" + f":.{dscale:d}f" + "}"
+                fmt_marg_lik = "marg-log-lik={" + f":.{dscale:d}f" + "}"
                 progress_bar.set_postfix_str(
-                    fmt.format(state.complete_log_likelihood), refresh=False
+                    fmt_comp_lik.format(last_comp_lik)
+                    + " - "
+                    + fmt_marg_lik.format(last_lik),
+                    refresh=False,
                 )
         else:
             for _ in states:
-- 
GitLab


From 09fb275e5f7425ba39f98fa2a1c7c1238b4efa7b Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 19:18:13 +0200
Subject: [PATCH 08/11] disable line too long

---
 prob_dim_red/linear_gaussian_ppca.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 3cf5f47..d011ae9 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -536,6 +536,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         Perform one step of the EM algorithm.
         """
         # pylint:disable=invalid-name
+        # pylint:disable=line-too-long
         W_new = self.state.sample_cov_factor_load_mat_prod @ np.linalg.inv(
             self.state.noise_var * np.eye(self.Z_dim)
             + self.state.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
-- 
GitLab


From d9d365c8f37b47496a5e5446dc192358edfc131e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Wed, 14 Aug 2024 10:19:35 +0200
Subject: [PATCH 09/11] proper typing of numpy arrays & removing np.abs on EM
 stopping criterion

---
 prob_dim_red/linear_gaussian_ppca.py | 126 ++++++++++++++-------------
 prob_dim_red/utils.py                |   8 +-
 2 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index d011ae9..6f52fc7 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -22,7 +22,7 @@
 """file containing the main functions and classes of the package"""
 # imports
 import itertools
-from typing import Any, Optional, Generator
+from typing import Optional, Generator
 from dataclasses import dataclass
 from functools import cached_property, reduce
 import operator
@@ -50,11 +50,11 @@ class _LinearGaussianPpcaResult:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     """
 
-    W: npt.NDArray[Any]  # pylint:disable=invalid-name
+    W: npt.NDArray[np.float64]  # pylint:disable=invalid-name
 
 
 # pylint:disable=too-few-public-methods
@@ -64,9 +64,9 @@ class _LinearGaussianPPCAEstim:
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
-    centered_data : npt.NDArray[Any]
+    centered_data : npt.NDArray[np.float64]
         Centered data.
     X_dim : int
         Dimension corresponding to the number of variables.
@@ -78,20 +78,20 @@ class _LinearGaussianPPCAEstim:
         Result of the estimation.
     """
 
-    mean: npt.NDArray[Any]
-    centered_data: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    centered_data: npt.NDArray[np.float64]
     X_dim: int
     Z_dim: int
     N: int
     _result: None | _LinearGaussianPpcaResult
 
-    def __init__(self, data: npt.NDArray[Any], n_components: int):
+    def __init__(self, data: npt.NDArray[np.float64], n_components: int):
         """
         Initialize the Linear Gaussian PPCA estimation object.
 
         Parameters
         ----------
-        data : npt.NDArray[Any]
+        data : npt.NDArray[np.float64]
             Data to be modeled.
         n_components : int
             Number of latent dimensions.
@@ -130,25 +130,25 @@ class LinearGaussianPpcaMleResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
     noise_var : np.float64
         Noise variance.
     log_likelihood : np.float64
         Log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
-    W: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
     bic: np.float64
 
 
@@ -279,7 +279,7 @@ class _DataDescFixed:
         Dimension of the data.
     Z_dim : int | None
         Dimension of the latent.
-    sample_cov : npt.NDArray[Any]
+    sample_cov : npt.NDArray[np.float64]
         Sample covariance of the data.
     """
 
@@ -287,7 +287,7 @@ class _DataDescFixed:
     N: int
     X_dim: int
     Z_dim: int | None
-    centered_data: npt.NDArray[Any]
+    centered_data: npt.NDArray[np.float64]
 
     @cached_property
     def tr_sample_cov(self):
@@ -320,7 +320,7 @@ class _EM_state:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     noise_var : np.float64
         Noise variance σ².
@@ -329,41 +329,41 @@ class _EM_state:
     """
 
     # pylint:disable=invalid-name
-    W: npt.NDArray[Any]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     fixed: _DataDescFixed
 
     @cached_property
-    def noiseless_posterior_precision(self) -> npt.NDArray[Any]:
+    def noiseless_posterior_precision(self) -> npt.NDArray[np.float64]:
         """Noiseless posterior precision matrix: Wáµ€ W"""
         return self.W.T @ self.W
 
     @cached_property
-    def posterior_precision(self) -> npt.NDArray[Any]:
+    def posterior_precision(self) -> npt.NDArray[np.float64]:
         """Posterior precision matrix: M = Wᵀ W + σ² I"""
         return self.noiseless_posterior_precision + self.noise_var * np.eye(
             self.W.shape[1]
         )
 
     @cached_property
-    def posterior_variance(self) -> npt.NDArray[Any]:
+    def posterior_variance(self) -> npt.NDArray[np.float64]:
         """Posterior variance matrix: (Wᵀ W + σ²I)⁻¹ = M⁻¹"""
         return np.linalg.inv(self.posterior_precision)
 
     @cached_property
-    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Sample covariance and factor loading matrix product: (Wáµ€ Sáµ€)áµ€= SW"""
         return self.fixed.sample_cov_matmul_smth(self.W)
 
     @cached_property
-    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Factor loading, sample covariance, and factor loading matrix product: Wáµ€(Wáµ€Sáµ€)áµ€= Wáµ€ SW"""
         return self.W.T @ self.sample_cov_factor_load_mat_prod
 
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance
         and factor loading matrix product: M⁻¹ Wᵀ S W"""
         return (
@@ -373,7 +373,7 @@ class _EM_state:
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance,
         factor loading, and posterior variance matrix product: M⁻¹ Wᵀ S W M⁻¹"""
         return (
@@ -382,17 +382,17 @@ class _EM_state:
         )  # M⁻¹ Wᵀ S W M⁻¹
 
     @cached_property
-    def model_var(self) -> npt.NDArray[Any]:
+    def model_var(self) -> npt.NDArray[np.float64]:
         """Model covariance matrix: W Wᵀ + σ²I = C"""
         return self.W @ self.W.T + self.noise_var * np.eye(self.fixed.X_dim)
 
     @cached_property
-    def log_det_model_var(self) -> npt.NDArray[Any]:
+    def log_det_model_var(self) -> npt.NDArray[np.float64]:
         """Log-determinant of the model covariance matrix: log |C|"""
         return np.linalg.slogdet(self.model_var)[1]
 
     @cached_property
-    def model_var_inv(self) -> npt.NDArray[Any]:
+    def model_var_inv(self) -> npt.NDArray[np.float64]:
         """Inverse of the model covariance matrix: C⁻¹ = (W Wᵀ + σ²I)⁻¹"""
         return np.linalg.inv(self.model_var)
 
@@ -462,7 +462,7 @@ class LinearGaussianPpcaEMResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
     noise_var : np.float64
         Noise variance.
@@ -470,19 +470,19 @@ class LinearGaussianPpcaEMResult(_LinearGaussianPpcaResult):
         Complete log-likelihood of the model.
     log_likelihood : np.float64
         Marginal log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
     noise_var: np.float64
     complete_log_likelihood: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
 
 
 class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
@@ -585,7 +585,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
 
             yield self.state
             likelihood, old_likelihood = self.state.complete_log_likelihood, likelihood
-            if np.abs(likelihood - old_likelihood) < error_tolerance:
+            if (likelihood - old_likelihood) < error_tolerance:
                 break
 
         U, S, _ = np.linalg.svd(self.state.W, full_matrices=False)
@@ -886,13 +886,15 @@ class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE):
         return self._bics
 
 
-def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
+def ppca_mle(
+    data: npt.NDArray[np.float64], n_components: int
+) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
 
     Parameters
     ----------
-    data : npt.NDArray[Any]
+    data : npt.NDArray[np.float64]
         Data to be modeled.
     n_components : int
         Number of latent dimensions.
@@ -915,7 +917,7 @@ class _ARD_state:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     noise_var : np.float64
         Noise variance σ².
@@ -924,14 +926,14 @@ class _ARD_state:
     """
 
     # pylint:disable=invalid-name
-    W: npt.NDArray[Any]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     fixed: _DataDescFixed
-    alpha: npt.NDArray[Any]
+    alpha: npt.NDArray[np.float64]
 
     @classmethod
     def from_raw_parameters(
-        cls, W: npt.NDArray[Any], noise_var: np.float64, fixed: _DataDescFixed
+        cls, W: npt.NDArray[np.float64], noise_var: np.float64, fixed: _DataDescFixed
     ) -> "_ARD_state":
         """
         Classmethod constructor of _ARD_state that takes in input parameters and
@@ -944,41 +946,41 @@ class _ARD_state:
         return cls(W[:, mask], noise_var, fixed, 1 / alpha_inv[mask])
 
     @cached_property
-    def noiseless_posterior_precision(self) -> npt.NDArray[Any]:
+    def noiseless_posterior_precision(self) -> npt.NDArray[np.float64]:
         """Noiseless posterior precision matrix: Wáµ€ W"""
         return self.W.T @ self.W
 
     @cached_property
-    def posterior_precision(self) -> npt.NDArray[Any]:
+    def posterior_precision(self) -> npt.NDArray[np.float64]:
         """Posterior precision matrix: M = Wᵀ W + σ² I"""
         return self.noiseless_posterior_precision + self.noise_var * np.eye(
             self.W.shape[1]
         )
 
     @cached_property
-    def posterior_variance(self) -> npt.NDArray[Any]:
+    def posterior_variance(self) -> npt.NDArray[np.float64]:
         """Posterior variance matrix: (Wᵀ W + σ²I)⁻¹ = M⁻¹"""
         return np.linalg.inv(self.posterior_precision)
 
     @cached_property
-    def alpha_posterior_precision_prod(self) -> npt.NDArray[Any]:
+    def alpha_posterior_precision_prod(self) -> npt.NDArray[np.float64]:
         """ARD matrix and posterior precision matrix product : AM"""
         return self.alpha[:, None] * self.posterior_precision
 
     @cached_property
-    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Sample covariance and factor loading matrix product: (Wáµ€ Sáµ€)áµ€= SW"""
         return self.fixed.sample_cov_matmul_smth(self.W)
 
     @cached_property
-    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Factor loading, sample covariance, and factor loading matrix product: Wáµ€(Wáµ€Sáµ€)áµ€= Wáµ€ SW"""
         return self.W.T @ self.sample_cov_factor_load_mat_prod
 
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance
         and factor loading matrix product: M⁻¹ Wᵀ S W"""
         return (
@@ -988,7 +990,7 @@ class _ARD_state:
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance,
         factor loading, and posterior variance matrix product: M⁻¹ Wᵀ S W M⁻¹"""
         return (
@@ -997,17 +999,17 @@ class _ARD_state:
         )  # M⁻¹ Wᵀ S W M⁻¹
 
     @cached_property
-    def model_var(self) -> npt.NDArray[Any]:
+    def model_var(self) -> npt.NDArray[np.float64]:
         """Model covariance matrix: W Wᵀ + σ²I = C"""
         return self.W @ self.W.T + self.noise_var * np.eye(self.fixed.X_dim)
 
     @cached_property
-    def log_det_model_var(self) -> npt.NDArray[Any]:
+    def log_det_model_var(self) -> npt.NDArray[np.float64]:
         """Log-determinant of the model covariance matrix: log |C|"""
         return np.linalg.slogdet(self.model_var)[1]
 
     @cached_property
-    def model_var_inv(self) -> npt.NDArray[Any]:
+    def model_var_inv(self) -> npt.NDArray[np.float64]:
         """Inverse of the model covariance matrix: C⁻¹ = (W Wᵀ + σ²I)⁻¹"""
         return np.linalg.inv(self.model_var)
 
@@ -1085,9 +1087,9 @@ class LinearGaussianPpcaARDResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
-    A : npt.NDArray[Any]
+    A : npt.NDArray[np.float64]
         automatic relevance determination matrix.
     noise_var : np.float64
         Noise variance.
@@ -1095,20 +1097,20 @@ class LinearGaussianPpcaARDResult(_LinearGaussianPpcaResult):
         Complete log-likelihood of the model.
     log_likelihood : np.float64
         Marginal log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
-    A: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    A: npt.NDArray[np.float64]
     noise_var: np.float64
     complete_log_likelihood: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
 
 
 class LinearGaussianPPCAEstimARD(_LinearGaussianPPCAEstim):
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index b12f432..65dc5f0 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -19,7 +19,7 @@
 
 """file containing functions and classes that may be useful"""
 from dataclasses import dataclass
-from typing import NamedTuple, Any
+from typing import NamedTuple
 from fractions import Fraction
 
 import numpy as np
@@ -99,7 +99,7 @@ def _vecto(A):
     return _VectorizedArray(is_order_c, is_order_f, vec)
 
 
-def data_load(data: npt.ArrayLike) -> npt.NDArray[Any]:
+def data_load(data: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Load data from an array-like object and return a NumPy array.
 
@@ -172,7 +172,7 @@ def trace_matmul(
     return np.einsum("ij,ji->", mat_a, mat_b)
 
 
-def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
+def positive_part(x: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Compute the positive part of an array element-wise.
 
@@ -193,7 +193,7 @@ def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
     return np.maximum(x, 0)
 
 
-def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[Any]:
+def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Compute the natural logarithm of an array, treating zeros safely.
 
-- 
GitLab


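Narrowing `npt.NDArray[Any]` to `npt.NDArray[np.float64]` changes nothing at runtime; the payoff is that a static checker such as mypy or pyright can track the dtype through the estimation pipeline. A minimal sketch of the pattern, with an illustrative function name:

import numpy as np
import numpy.typing as npt

def center(data: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """Subtract the per-column mean; the float64 dtype is preserved."""
    return data - data.mean(axis=0)

x: npt.NDArray[np.float64] = np.zeros((5, 3), dtype=np.float64)
centered = center(x)  # the checker knows this is float64, not Any
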
From 865c9908a586a243ab2468628d585ab668646a4b Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Wed, 14 Aug 2024 12:22:25 +0200
Subject: [PATCH 10/11] assess convergence on marginal loglikelihood in EM +
 simplification of noise-var

---
 prob_dim_red/linear_gaussian_ppca.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 6f52fc7..b3e69b3 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -542,23 +542,14 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             + self.state.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
         )
 
-        # W_new, _r = np.linalg.qr(W_new)
-
-        noise_var_new = utils.positive_part(
+        noise_var_new = (
             1
             / self.X_dim
             * (
                 self.fixed.tr_sample_cov
-                - 2
-                * utils.trace_matmul(
+                - utils.trace_matmul(
                     self.state.sample_cov_factor_load_mat_prod,
-                    (self.state.posterior_variance @ W_new.T),
-                )
-                + utils.trace_matmul(
-                    self.state.noise_var * self.state.posterior_variance
-                    + self.state.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
-                    W_new.T @ W_new,
-                    sym=True,
+                    self.state.posterior_variance @ W_new.T,
                 )
             )
         )
@@ -573,7 +564,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         """
         self._em_init()
         yield self.state
-        likelihood = self.state.complete_log_likelihood
+        likelihood = self.state.log_likelihood
         for _ in (
             range(max_iterations) if max_iterations is not None else itertools.count()
         ):
@@ -584,7 +575,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             #     self.state.W = U * S[None,:]
 
             yield self.state
-            likelihood, old_likelihood = self.state.complete_log_likelihood, likelihood
+            likelihood, old_likelihood = self.state.log_likelihood, likelihood
             if (likelihood - old_likelihood) < error_tolerance:
                 break
 
-- 
GitLab


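With `np.abs` already dropped from the stopping test and the criterion now taken on the marginal log-likelihood, the loop stops as soon as the signed improvement falls below the tolerance; for a correct EM step the marginal log-likelihood is non-decreasing, so a negative gap can only be numerical noise and is a valid reason to stop. A standalone sketch of this stopping rule over a made-up likelihood sequence (`run_until_converged` is illustrative, not the package API):

import itertools

def run_until_converged(logliks, tol=1e-6, max_iterations=None):
    """Consume successive log-likelihood values, stopping on a signed gap."""
    it = iter(logliks)
    likelihood = next(it)  # value after initialization
    steps = range(max_iterations) if max_iterations is not None else itertools.count()
    for _, new in zip(steps, it):
        likelihood, old = new, likelihood
        if likelihood - old < tol:  # signed: tiny or negative step stops
            break
    return likelihood

print(run_until_converged([-1000.0, -500.0, -400.0, -399.9999999]))
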
From 9fc4e964d85a8a881d833e7ad3c5b0ad897b2cc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Thu, 29 Aug 2024 17:36:31 +0200
Subject: [PATCH 11/11] adding minimal test set and modif test parameters

---
 .gitlab-ci.yml                     |  6 +++---
 pyproject.toml                     |  5 +++++
 tests/test_linear_gaussian_ppca.py | 21 ++++++---------------
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 58fe857..6c9043c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,7 +31,7 @@ run_tests_py310:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal
 
 run_tests_py311:
   stage: tests
@@ -40,7 +40,7 @@ run_tests_py311:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal
 
 run_tests_py312:
   stage: tests
@@ -49,7 +49,7 @@ run_tests_py312:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal
 
 build_package:
   stage: build
diff --git a/pyproject.toml b/pyproject.toml
index 557b1fb..249ebf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,3 +39,8 @@ prob_dim_red = "prob_dim_red._cli:main"
 [project.urls]
 homepage = "https://forgemia.inra.fr/mia-paris/reduction-dimension/prob_dim_red"
 repository = "https://forgemia.inra.fr/mia-paris/reduction-dimension/prob_dim_red"
+
+[tool.pytest.ini_options]
+markers = [
+    "minimal: minimal test set (select with '-m \"minimal\"')",
+]
diff --git a/tests/test_linear_gaussian_ppca.py b/tests/test_linear_gaussian_ppca.py
index 43bf5e0..0134c86 100644
--- a/tests/test_linear_gaussian_ppca.py
+++ b/tests/test_linear_gaussian_ppca.py
@@ -6,8 +6,10 @@ import numpy as np
 
 from prob_dim_red import linear_gaussian_ppca
 
-ALL_PARAMS_MLE = [
-    (n, xd, zd, st, sd)
+ALL_PARAMS_MLE_EM = [
+    pytest.param(
+        (n, xd, zd, st, sd), marks=(pytest.mark.minimal if (n, xd) == (100, 10) else ())
+    )
     for n in (100, 1000, 10000)
     for xd in (10, 50, 100)
     for zd in (1, 2, 5, 50)
@@ -17,7 +19,7 @@ ALL_PARAMS_MLE = [
 ]
 
 
-@pytest.mark.parametrize("param", ALL_PARAMS_MLE)
+@pytest.mark.parametrize("param", ALL_PARAMS_MLE_EM)
 def test_mle(param):
     N, X_DIM, Z_DIM, sigma_true, seed = param
     random = np.random.RandomState(seed)  # pylint: disable=no-member
@@ -44,19 +46,8 @@ def test_mle(param):
     ).mean() ** 0.5 < 1000 * sigma_true * (Z_DIM / N) ** 0.5
 
 
-ALL_PARAMS_EM = [
-    (n, xd, zd, st, sd)
-    for n in (1000,)
-    for xd in (10, 20, 50)
-    for zd in (1, 2, 5, 8)
-    for st in (1e-1, 0.1, 5)
-    for sd in (1, 2, 3)
-    if zd < xd
-]
-
-
 # pylint: disable=too-many-locals
-@pytest.mark.parametrize("param", ALL_PARAMS_EM)
+@pytest.mark.parametrize("param", ALL_PARAMS_MLE_EM)
 def test_em(param):
     N, X_DIM, Z_DIM, sigma_true, seed = param
     random = np.random.RandomState(seed)  # pylint: disable=no-member
-- 
GitLab
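
The `minimal` marker must be declared under `[tool.pytest.ini_options]` (as done above) or pytest warns about an unknown mark; attaching it through `pytest.param(..., marks=...)` lets one parameter grid serve both the full run and the CI subset selected with `pytest -m minimal`. A reduced sketch of the same pattern, with illustrative names and values:

import pytest

PARAMS = [
    pytest.param((n, d), marks=(pytest.mark.minimal if (n, d) == (100, 10) else ()))
    for n in (100, 1000)
    for d in (10, 50)
]

@pytest.mark.parametrize("param", PARAMS)
def test_dims(param):
    n, d = param
    assert n > d

Running `pytest` executes all four cases, while `pytest -m minimal` keeps only the (100, 10) case, mirroring the `.gitlab-ci.yml` change in this patch.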