From b3ed57d4dbc4bc841c3df3302c15233f357872ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Thu, 11 Jul 2024 17:24:19 +0200
Subject: [PATCH 01/11] implementation of model selection for MLE using
 Minka's BIC

---
 prob_dim_red/linear_gaussian_ppca.py | 104 ++++++++++++++++++++++++++-
 prob_dim_red/utils.py                |   3 +-
 2 files changed, 103 insertions(+), 4 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 2e00029..f17d7f4 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -17,6 +17,8 @@
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
+# pylint:disable=too-many-lines
+
 """file containing the main functions and classes of the package"""
 # imports
 import itertools
@@ -140,10 +142,12 @@ class LinearGaussianPpcaMleResult(_LinearGaussianPpcaResult):
 
     # pylint:disable=invalid-name
     mean: npt.NDArray[Any]
+    W: npt.NDArray[Any]
     noise_var: np.float64
     log_likelihood: np.float64
     model_var_inv: npt.NDArray[Any]
     log_det_model_var: npt.NDArray[Any]
+    bic: np.float64
 
 
 class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
@@ -165,9 +169,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         """
         return (self.centered_data.T @ self.centered_data) / self.N
 
-    def fit(self):
+    @cached_property
+    def tr_sample_cov(self):
         """
-        Fit the Linear Gaussian PPCA model using MLE.
+        Cached sample covariance trace.
+        """
+        return np.trace(self.sample_cov)
+
+    @cached_property
+    def evd(self):
+        """
+        Cached truncated eigenvalue decomposition.
         """
         # pylint:disable=duplicate-code
         eig_val, eig_vec = scipy.linalg.eigh(
@@ -176,6 +188,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
         eig_val = eig_val[::-1]
         eig_vec = eig_vec[:, ::-1]
 
+        return eig_val, eig_vec
+
+    def fit(self):
+        """
+        Fit the Linear Gaussian PPCA model using MLE.
+        """
+        if self._result is not None:
+            return
+
+        eig_val, eig_vec = self.evd
+
         noise_var_mle = (np.trace(self.sample_cov) - eig_val.sum()) / (
             self.X_dim - self.Z_dim
         )
@@ -204,6 +227,17 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
                 + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True)
             )
         )  # pylint:disable=duplicate-code
+
+        m_dim = (
+            self.X_dim * self.Z_dim - (self.Z_dim * (self.Z_dim + 1)) // 2
+        )  # dimension of Stiefel manifold
+
+        log_bic = (
+            -(self.N / 2) * np.log(eig_val).sum()
+            - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle)
+            - (m_dim + self.Z_dim) / 2 * np.log(self.N)
+        )
+
         self._result = LinearGaussianPpcaMleResult(
             mean=self.mean,
             W=w_mle,
@@ -211,6 +245,7 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim):
             log_likelihood=log_likelihood,
             model_var_inv=model_var_inv,
             log_det_model_var=log_det_model_var,
+            bic=log_bic,
         )
 
 
@@ -575,6 +610,71 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         return self
 
 
+class MLE_Model_Selection(LinearGaussianPPCAEstimMLE):
+    """
+    Class for model selection of MLE Linear Gaussian PPCA using Minka's
+    Bayesian Information Criterion (BIC) approximation.
+
+    Attributes
+    ----------
+    n_components_max: int | None
+        Maximum number of components specified by the user
+    """
+
+    n_components_max: int | None
+
+    def __init__(self, data, n_components_max=None):
+        super().__init__(data, n_components=None)
+        self.n_components_max = (
+            n_components_max if n_components_max is not None else self.X_dim - 1
+        )
+        self.n_components_max = min(self.n_components_max, self.X_dim - 1)
+
+    def fit(self):
+        if self.Z_dim is not None:
+            return super().fit()
+
+        lambdas, eig_vecs = np.empty(0), np.empty((self.X_dim, 0))
+
+        # computing bics by blocks of powers of 2 until max bic
+        for upper, lower in itertools.pairwise(
+            2**k if k > 0 else 0 for k in itertools.count()
+        ):
+            lower = min(lower, self.n_components_max)
+
+            # computing eigvals by blocks bounded by lower and upper
+            # s.t. eigval[n - lower] < eigval[n - upper]
+            eig_val, eig_vec = scipy.linalg.eigh(
+                self.sample_cov,
+                subset_by_index=(self.X_dim - lower, self.X_dim - upper - 1),
+            )
+
+            lambdas = np.append(lambdas, eig_val[::-1])
+            eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+            vec_k = np.arange(1, lower + 1)
+
+            bic = (
+                -self.N / 2 * np.cumsum(np.log(lambdas))
+                - self.N
+                * (self.X_dim - vec_k)
+                / 2
+                * np.log(
+                    (self.tr_sample_cov - np.cumsum(lambdas)) / (self.X_dim - vec_k)
+                )
+                - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                / 2
+                * np.log(self.N)
+            )
+
+            if (bic.max() != bic[-1]) or (lower >= self.n_components_max):
+                break
+
+        self.Z_dim = bic.argmax() + 1
+        self.evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
+        return super().fit()
+
+
 def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index d35e321..c5f9d16 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -18,8 +18,7 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """file containing functions and classes that may be useful"""
-from typing import NamedTuple
-from typing import Any
+from typing import NamedTuple, Any
 
 import numpy as np
 import numpy.typing as npt
-- 
GitLab
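For intuition, the score this first patch maximises is Minka's BIC approximation evaluated on the leading eigenvalues of the sample covariance. A self-contained sketch of the same curve (the function name and inputs here are illustrative, not part of the package; it assumes a non-degenerate covariance with strictly positive eigenvalues):

    import numpy as np

    def minka_bic_curve(sample_cov, n_samples):
        # BIC(k) for k = 1..d-1, mirroring the formula added in this patch
        d = sample_cov.shape[0]
        lam = np.linalg.eigvalsh(sample_cov)[::-1]  # eigenvalues, descending
        k = np.arange(1, d)
        # MLE of the noise variance when k components are retained
        noise = (lam.sum() - np.cumsum(lam)[:-1]) / (d - k)
        m = d * k - k * (k + 1) / 2  # dimension of the Stiefel manifold
        return (
            -n_samples / 2 * np.cumsum(np.log(lam))[:-1]
            - n_samples * (d - k) / 2 * np.log(noise)
            - (m + k) / 2 * np.log(n_samples)
        )

    # selected dimension: argmax of the curve, e.g.
    # z_dim = minka_bic_curve(S, N).argmax() + 1

The powers-of-two loop in the patch evaluates this curve block by block, stopping as soon as the running maximum falls strictly inside the prefix computed so far.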
""" if self._result is None: - raise _NotFittedModelError + raise NotFittedModelError return self._result @@ -177,7 +178,7 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim): return np.trace(self.sample_cov) @cached_property - def evd(self): + def _evd(self): """ Cached truncated eigen value decomposition """ @@ -197,35 +198,46 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim): if self._result is not None: return - eig_val, eig_vec = self.evd + eig_val, eig_vec = self._evd - noise_var_mle = (np.trace(self.sample_cov) - eig_val.sum()) / ( + # for consistancy with model selection these sums must be associative and commutative, + # numerical errors must not depend on order of the summation + eig_val_sum = np.float64( + reduce(operator.add, (utils.FractionOrInf(x) for x in eig_val)) + ) + log_eig_val_sum = np.float64( + reduce(operator.add, (utils.FractionOrInf(x) for x in np.log(eig_val))) + ) + + noise_var_mle = utils.positive_part(self.tr_sample_cov - eig_val_sum) / ( self.X_dim - self.Z_dim ) - w_mle = eig_vec * np.sqrt(eig_val - noise_var_mle)[None, :] + w_mle = eig_vec * np.sqrt(utils.positive_part(eig_val - noise_var_mle))[None, :] posterior_precision = w_mle.T @ w_mle + noise_var_mle * np.eye(self.Z_dim) posterior_variance = np.linalg.inv(posterior_precision) - model_var_inv = ( - 1 - / (noise_var_mle) - * (np.eye(self.X_dim) - w_mle @ posterior_variance @ w_mle.T) + model_var_inv = (1 / (noise_var_mle) if noise_var_mle > 0 else np.inf) * ( + np.eye(self.X_dim) - w_mle @ posterior_variance @ w_mle.T ) - log_det_model_var = (self.X_dim - self.Z_dim) * np.log(noise_var_mle) + np.log( - eig_val - ).sum() + log_det_model_var = (self.X_dim - self.Z_dim) * ( + np.log(noise_var_mle) if noise_var_mle > 0 else -np.inf + ) + log_eig_val_sum # pylint:disable=duplicate-code log_likelihood = ( - -self.N - / 2 - * ( - self.X_dim * np.log(2 * np.pi) - + log_det_model_var - + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True) + ( + -self.N + / 2 + * ( + self.X_dim * np.log(2 * np.pi) + + log_det_model_var + + utils.trace_matmul(model_var_inv, self.sample_cov, sym=True) + ) ) + if noise_var_mle > 0 + else -np.inf ) # pylint:disable=duplicate-code m_dim = ( @@ -233,9 +245,13 @@ class LinearGaussianPPCAEstimMLE(_LinearGaussianPPCAEstim): ) # dimension of Stiefel manifold log_bic = ( - -(self.N / 2) * np.log(eig_val).sum() - - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle) - - (m_dim + self.Z_dim) / 2 * np.log(self.N) + ( + -(self.N / 2) * log_eig_val_sum + - (self.N * (self.X_dim - self.Z_dim) / 2) * np.log(noise_var_mle) + - (m_dim + self.Z_dim) / 2 * np.log(self.N) + ) + if noise_var_mle > 0 + else -np.inf ) self._result = LinearGaussianPpcaMleResult( @@ -671,10 +687,175 @@ class MLE_Model_Selection(LinearGaussianPPCAEstimMLE): break self.Z_dim = bic.argmax() + 1 - self.evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim] + self._evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim] return super().fit() +class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE): + """ + Class for model selection of MLE Linear Gaussian PPCA using + Minka's Bayesian Entropy Criterion approximation. 
+
+    Attributes
+    ----------
+    n_components_max: int | None
+        Maximum number of components specified by the user
+    strategy: str
+        Strategy to compute BICs, either 'grow' or 'all'
+    """
+
+    n_components_max: int | None
+    strategy: str
+    _bics: np.ndarray
+
+    def __init__(
+        self,
+        data: np.ndarray,
+        n_components_max: Optional[int] = None,
+        strategy: str = "grow",
+    ):
+        super().__init__(data, n_components=None)
+        self.n_components_max = (
+            n_components_max if n_components_max is not None else self.X_dim - 1
+        )
+        self.n_components_max = min(self.n_components_max, self.X_dim - 1)
+        self.strategy = strategy
+        self._bics = np.empty(0)
+
+    def fit(self):
+        if self.Z_dim is not None:
+            return super().fit()
+
+        lambdas, eig_vecs = np.empty(0), np.empty((self.X_dim, 0))
+
+        if self.strategy == "grow":
+            # computing BICs by blocks of powers of 2 until the maximal BIC
+            # lies strictly inside the computed prefix
+            for upper, lower in itertools.pairwise(
+                2**k if k > 0 else 0 for k in itertools.count()
+            ):
+                lower = min(lower, self.n_components_max)
+
+                # computing eigvals by blocks bounded by lower and upper
+                # s.t. eigval[n - lower] < eigval[n - upper]
+                eig_val, eig_vec = scipy.linalg.eigh(
+                    self.sample_cov,
+                    subset_by_index=(self.X_dim - lower, self.X_dim - upper - 1),
+                )
+
+                lambdas = utils.positive_part(np.append(lambdas, eig_val[::-1]))
+                eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+                # for consistency with the MLE fit these sums must be associative and
+                # commutative: numerical errors must not depend on the order of summation
+                lambdas_cumsum = np.array(
+                    list(
+                        itertools.accumulate(
+                            (utils.FractionOrInf(x) for x in lambdas),
+                            operator.add,
+                        )
+                    ),
+                    dtype=np.float64,
+                )
+                log_lambdas_cumsum = np.array(
+                    list(
+                        itertools.accumulate(
+                            (
+                                utils.FractionOrInf(x)
+                                for x in utils.log_with_zeros(lambdas)
+                            ),
+                            operator.add,
+                        )
+                    ),
+                    dtype=np.float64,
+                )
+
+                vec_k = np.arange(1, lower + 1)
+                bics = (
+                    -self.N / 2 * log_lambdas_cumsum
+                    - self.N
+                    * (self.X_dim - vec_k)
+                    / 2
+                    * utils.log_with_zeros(
+                        utils.positive_part(self.tr_sample_cov - lambdas_cumsum)
+                        / (self.X_dim - vec_k)
+                    )
+                    - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                    / 2
+                    * np.log(self.N)
+                )
+
+                if (bics.max() != bics[-1]) or (lower >= self.n_components_max):
+                    break
+
+        elif self.strategy == "all":
+
+            lower = self.n_components_max
+
+            eig_val, eig_vec = scipy.linalg.eigh(
+                self.sample_cov,
+                subset_by_index=(self.X_dim - lower, self.X_dim - 1),
+            )
+            lambdas = utils.positive_part(np.append(lambdas, eig_val[::-1]))
+            eig_vecs = np.append(eig_vecs, eig_vec[:, ::-1], axis=1)
+
+            vec_k = np.arange(1, lower + 1)
+
+            # for consistency with the MLE fit these sums must be associative and
+            # commutative: numerical errors must not depend on the order of summation
+            lambdas_cumsum = np.array(
+                list(
+                    itertools.accumulate(
+                        (utils.FractionOrInf(x) for x in lambdas),
+                        operator.add,
+                    )
+                ),
+                dtype=np.float64,
+            )
+            log_lambdas_cumsum = np.array(
+                list(
+                    itertools.accumulate(
+                        (utils.FractionOrInf(x) for x in utils.log_with_zeros(lambdas)),
+                        operator.add,
+                    )
+                ),
+                dtype=np.float64,
+            )
+
+            bics = (
+                -self.N / 2 * log_lambdas_cumsum
+                - self.N
+                * (self.X_dim - vec_k)
+                / 2
+                * utils.log_with_zeros(
+                    utils.positive_part(self.tr_sample_cov - lambdas_cumsum)
+                    / (self.X_dim - vec_k)
+                )
+                - (self.X_dim * vec_k - vec_k * (vec_k + 1) / 2 + vec_k)
+                / 2
+                * np.log(self.N)
+            )
+
+        self._bics = bics
+        self.Z_dim = self._bics.argmax() + 1
+        self._evd = lambdas[: self.Z_dim], eig_vecs[:, : self.Z_dim]
+        return super().fit()
+
+    @property
+    def bics(self) -> np.ndarray:
+        """
+        Get the BICs computed during the model selection.
+
+        Returns
+        -------
+        np.ndarray
+            Array of BIC values.
+        """
+        if self.Z_dim is None:
+            raise NotFittedModelError
+
+        return self._bics
+
 
 def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index c5f9d16..b12f432 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -18,7 +18,10 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """file containing functions and classes that may be useful"""
+from dataclasses import dataclass
 from typing import NamedTuple, Any
+from fractions import Fraction
+
 
 import numpy as np
 import numpy.typing as npt
@@ -167,3 +170,77 @@ def trace_matmul(
         return np.einsum("ij,ij->", mat_a, mat_b)
 
     return np.einsum("ij,ji->", mat_a, mat_b)
+
+
+def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
+    """
+    Compute the positive part of an array element-wise.
+
+    This function returns an array where each element is the maximum of the input
+    element and zero, effectively replacing all negative values with zero.
+
+    Parameters
+    ----------
+    x : array-like
+        Input array or scalar to compute the positive part.
+
+    Returns
+    -------
+    ndarray
+        An array with the same shape as `x`, where each element is the positive
+        part of the corresponding element in `x`.
+    """
+    return np.maximum(x, 0)
+
+
+def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[Any]:
+    """
+    Compute the natural logarithm of an array, treating zeros safely.
+
+    This function computes the natural logarithm of each element in the input array.
+    Elements equal to zero are mapped to positive infinity rather than `-inf`, so
+    that the BIC terms built from these logarithms evaluate to `-inf` for such
+    degenerate models; negative elements raise an AssertionError, as the logarithm
+    is undefined for them.
+
+    Parameters
+    ----------
+    x : array-like
+        Input array or scalar for which the logarithm is computed. All elements must be
+        non-negative.
+
+    Returns
+    -------
+    ndarray
+        An array with the same shape as `x`, where each element is the natural logarithm
+        of the corresponding element in `x`. Elements that are zero are mapped to
+        positive infinity.
+ """ + assert (np.asarray(x) >= 0).all() + + return np.log(x, out=np.full_like(x, np.inf), where=x > 0) + + +@dataclass +class FractionOrInf: + """ + Attributes + ---------- + value + """ + + value: Fraction | np.float64 + + def __init__(self, value: Fraction | np.float64): + if isinstance(value, Fraction): + self.value = value + return + if np.isinf(value): + self.value = value + return + self.value = Fraction(value) + + def __add__(self, oth: "FractionOrInf"): + if isinstance(self.value, Fraction) and isinstance(oth.value, Fraction): + return FractionOrInf(self.value + oth.value) + return FractionOrInf(np.float64(self.value) + np.float64(oth.value)) + + def __float__(self): + return float(self.value) -- GitLab From 2a20d47b8a84b715fb7545c35735cf3264dba5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr> Date: Fri, 9 Aug 2024 15:15:32 +0200 Subject: [PATCH 03/11] modif complete likelihood --- prob_dim_red/linear_gaussian_ppca.py | 71 +++++++++++++++++++--------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py index 9b8ab39..910667e 100644 --- a/prob_dim_red/linear_gaussian_ppca.py +++ b/prob_dim_red/linear_gaussian_ppca.py @@ -30,6 +30,7 @@ import operator import numpy as np import numpy.typing as npt import scipy +from tqdm import tqdm from prob_dim_red import utils @@ -276,6 +277,8 @@ class _DataDescFixed: Number of data points. X_dim : int Dimension of the data. + Z_dim : int | None + Dimension of the latent. sample_cov : npt.NDArray[Any] Sample covariance of the data. """ @@ -283,6 +286,7 @@ class _DataDescFixed: # pylint:disable=invalid-name N: int X_dim: int + Z_dim: int | None centered_data: npt.NDArray[Any] @cached_property @@ -398,28 +402,25 @@ class _EM_state: Complete log-likelihood of the model. """ # pylint:disable=line-too-long + if self.noise_var == 0: + return -np.inf + return ( - ( - -self.fixed.N - * self.fixed.X_dim - / 2 - * np.log(2 * np.pi * self.noise_var) - - self.fixed.N - / 2 - * self.noise_var - * ( - np.trace(self.posterior_variance) - + np.trace( - self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod - ) + -self.fixed.N * self.fixed.Z_dim / 2 * np.log(2 * np.pi) + - self.fixed.N * self.fixed.X_dim / 2 * np.log(2 * np.pi * self.noise_var) + - self.fixed.N + / 2 + * ( + self.noise_var * np.trace(self.posterior_variance) + + np.trace( + self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod ) - - 1 / 2 * self.fixed.N * self.fixed.tr_sample_cov ) - + 1 + - self.fixed.N / 2 * self.fixed.tr_sample_cov + + self.fixed.N / self.noise_var * np.trace( - self.fixed.N - * self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod + self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod ) - self.fixed.N / 2 @@ -440,6 +441,8 @@ class _EM_state: """ Log-likelihood of the model. 
""" + if self.noise_var == 0: + return -np.inf return ( -self.fixed.N / 2 @@ -512,7 +515,10 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim): """ super().__init__(*args, **kwargs) self.fixed = _DataDescFixed( - N=self.N, X_dim=self.X_dim, centered_data=self.centered_data + N=self.N, + X_dim=self.X_dim, + Z_dim=self.Z_dim, + centered_data=self.centered_data, ) self.state = None @@ -543,7 +549,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim): # W_new, _r = np.linalg.qr(W_new) - noise_var_new = ( + noise_var_new = utils.positive_part( 1 / self.X_dim * ( @@ -598,6 +604,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim): max_iterations=None, error_tolerance=1e-6, trace=False, + progress=True, ) -> Optional[list[_EM_state]]: """ Fit the Linear Gaussian PPCA model using the EM algorithm. @@ -610,6 +617,8 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim): Tolerance for the convergence of the EM algorithm. trace : bool Whether to store the intermediate states of the EM algorithm. + progress : bool + Show progress with tqdm (not compatible with trace). Returns ------- @@ -620,8 +629,23 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim): if trace: return states - for _ in states: - pass + if progress: + last_comp_lik = 0 + scale = 0 + for state in (progress_bar := tqdm(states, unit_scale=True)): + gap, last_comp_lik = ( + state.complete_log_likelihood - last_comp_lik, + state.complete_log_likelihood, + ) + scale += 0.1 * (np.log10(np.abs(gap)) - scale) + dscale = max(int(-np.floor(scale - 0.5)), 0) + fmt = "comp-log-lik={" + f":.{dscale:d}f" + "}" + progress_bar.set_postfix_str( + fmt.format(state.complete_log_likelihood), refresh=False + ) + else: + for _ in states: + pass return self @@ -1105,7 +1129,10 @@ class LinearGaussianPPCAEstimARD(_LinearGaussianPPCAEstim): """ super().__init__(*args, **(kwargs | {"n_components": None})) self.fixed = _DataDescFixed( - N=self.N, X_dim=self.X_dim, centered_data=self.centered_data + N=self.N, + X_dim=self.X_dim, + Z_dim=self.fixed.Z_dim, + centered_data=self.centered_data, ) self.state = None -- GitLab From 3c949b40400a342c4e44b929d05df1112d3d9472 Mon Sep 17 00:00:00 2001 From: Julien Chiquet <julien.chiquet@inrae.fr> Date: Fri, 9 Aug 2024 16:26:38 +0200 Subject: [PATCH 04/11] fix and rewriting of Q in Gaussian pPCA EM --- prob_dim_red/linear_gaussian_ppca.py | 43 ++++++++++++---------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py index 910667e..385dc9a 100644 --- a/prob_dim_red/linear_gaussian_ppca.py +++ b/prob_dim_red/linear_gaussian_ppca.py @@ -405,35 +405,29 @@ class _EM_state: if self.noise_var == 0: return -np.inf - return ( - -self.fixed.N * self.fixed.Z_dim / 2 * np.log(2 * np.pi) - - self.fixed.N * self.fixed.X_dim / 2 * np.log(2 * np.pi * self.noise_var) - - self.fixed.N - / 2 + return (-self.fixed.N / 2) * ( + self.fixed.Z_dim * np.log(2 * np.pi) + + self.fixed.X_dim * np.log(2 * np.pi * self.noise_var) + + self.noise_var * np.trace(self.posterior_variance) + + np.trace( + self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod + ) + + (1 / self.noise_var) * ( - self.noise_var * np.trace(self.posterior_variance) - + np.trace( - self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod + self.fixed.tr_sample_cov + - 2 + * np.trace( + self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod + ) + + utils.trace_matmul( + 
From 3c949b40400a342c4e44b929d05df1112d3d9472 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 16:26:38 +0200
Subject: [PATCH 04/11] fix and rewrite of Q in Gaussian pPCA EM

---
 prob_dim_red/linear_gaussian_ppca.py | 43 ++++++++++++----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 910667e..385dc9a 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -405,35 +405,29 @@ class _EM_state:
         if self.noise_var == 0:
             return -np.inf
 
-        return (
-            -self.fixed.N * self.fixed.Z_dim / 2 * np.log(2 * np.pi)
-            - self.fixed.N * self.fixed.X_dim / 2 * np.log(2 * np.pi * self.noise_var)
-            - self.fixed.N
-            / 2
+        return (-self.fixed.N / 2) * (
+            self.fixed.Z_dim * np.log(2 * np.pi)
+            + self.fixed.X_dim * np.log(2 * np.pi * self.noise_var)
+            + self.noise_var * np.trace(self.posterior_variance)
+            + np.trace(
+                self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
+            )
+            + (1 / self.noise_var)
             * (
-                self.noise_var * np.trace(self.posterior_variance)
-                + np.trace(
-                    self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod
+                self.fixed.tr_sample_cov
+                - 2
+                * np.trace(
+                    self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
+                )
+                + utils.trace_matmul(
+                    self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
+                    self.noiseless_posterior_precision,
+                    sym=True,
                 )
             )
-            - self.fixed.N / 2 * self.fixed.tr_sample_cov
-            + self.fixed.N
-            / self.noise_var
-            * np.trace(
-                self.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
-            )
-            - self.fixed.N
-            / 2
-            * utils.trace_matmul(
+            + utils.trace_matmul(
                 self.posterior_variance, self.noiseless_posterior_precision, sym=True
             )
-            - self.fixed.N
-            / (2 * self.noise_var)
-            * utils.trace_matmul(
-                self.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
-                self.noiseless_posterior_precision,
-                sym=True,
-            )
         )
 
     @cached_property
@@ -812,7 +806,6 @@ class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE):
             break
 
         elif self.strategy == "all":
-
             lower = self.n_components_max
 
             eig_val, eig_vec = scipy.linalg.eigh(
-- 
GitLab
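In the notation of the class docstrings (S the sample covariance, M = Wᵀ W + σ²I, d = X_dim, q = Z_dim), the rewritten property evaluates the expected complete-data log-likelihood of the pPCA model, with the E-step moments taken at the current parameters. Written out, this is a reconstruction from the code rather than a quotation from a reference:

    Q = -(N/2) * [ q ln(2π) + d ln(2πσ²) + tr⟨zzᵀ⟩
                   + (1/σ²) ( tr S - 2 tr(M⁻¹ Wᵀ S W) + tr(⟨zzᵀ⟩ Wᵀ W) ) ]

    where  ⟨zzᵀ⟩ = σ² M⁻¹ + M⁻¹ Wᵀ S W M⁻¹

Grouping every term under a single -N/2 factor, as the patch does, makes each summand of this expression directly recognisable in the code.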
From adc5f7b58038b8edc8fdccc10abe1d2cf7e837c6 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 17:10:14 +0200
Subject: [PATCH 05/11] added tqdm to package dependencies

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d12ad9d..557b1fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
   "Intended Audience :: Science/Research",
   "Topic :: Scientific/Engineering",
 ]
-dependencies = ["numpy", "scipy", "matplotlib"]
+dependencies = ["numpy", "scipy", "matplotlib", "tqdm"]
 
 [project.optional-dependencies]
 dev = ["build",]
-- 
GitLab

From d430b4fb6c5cb538abb53e92b88d05efc16975db Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 18:52:02 +0200
Subject: [PATCH 06/11] added missing trace term in noise_var_new for EM in
 Gaussian pPCA

---
 prob_dim_red/linear_gaussian_ppca.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 385dc9a..853337d 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -548,10 +548,17 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             / self.X_dim
             * (
                 self.fixed.tr_sample_cov
-                - utils.trace_matmul(
+                - 2
+                * utils.trace_matmul(
                     self.state.sample_cov_factor_load_mat_prod,
                     (self.state.posterior_variance @ W_new.T),
                 )
+                + utils.trace_matmul(
+                    self.state.noise_var * self.state.posterior_variance
+                    + self.state.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
+                    W_new.T @ W_new,
+                    sym=True,
+                )
             )
         )
-- 
GitLab

From ff016f2b9fa8f019f819ea9413ecc7c1073ec078 Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 19:09:06 +0200
Subject: [PATCH 07/11] added loglik in progress bar for pPCA

---
 prob_dim_red/linear_gaussian_ppca.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 853337d..3cf5f47 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -634,15 +634,20 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             last_comp_lik = 0
             scale = 0
             for state in (progress_bar := tqdm(states, unit_scale=True)):
-                gap, last_comp_lik = (
+                gap, last_comp_lik, last_lik = (
                     state.complete_log_likelihood - last_comp_lik,
                     state.complete_log_likelihood,
+                    state.log_likelihood,
                 )
                 scale += 0.1 * (np.log10(np.abs(gap)) - scale)
                 dscale = max(int(-np.floor(scale - 0.5)), 0)
-                fmt = "comp-log-lik={" + f":.{dscale:d}f" + "}"
+                fmt_comp_lik = "comp-log-lik={" + f":.{dscale:d}f" + "}"
+                fmt_marg_lik = "marg-log-lik={" + f":.{dscale:d}f" + "}"
                 progress_bar.set_postfix_str(
-                    fmt.format(state.complete_log_likelihood), refresh=False
+                    fmt_comp_lik.format(last_comp_lik)
+                    + " - "
+                    + fmt_marg_lik.format(last_lik),
+                    refresh=False,
                 )
         else:
             for _ in states:
-- 
GitLab

From 09fb275e5f7425ba39f98fa2a1c7c1238b4efa7b Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Fri, 9 Aug 2024 19:18:13 +0200
Subject: [PATCH 08/11] disable line too long

---
 prob_dim_red/linear_gaussian_ppca.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 3cf5f47..d011ae9 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -536,6 +536,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         Perform one step of the EM algorithm.
         """
         # pylint:disable=invalid-name
+        # pylint:disable=line-too-long
         W_new = self.state.sample_cov_factor_load_mat_prod @ np.linalg.inv(
             self.state.noise_var * np.eye(self.Z_dim)
             + self.state.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
-- 
GitLab

From d9d365c8f37b47496a5e5446dc192358edfc131e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Wed, 14 Aug 2024 10:19:35 +0200
Subject: [PATCH 09/11] proper typing of numpy arrays & removing np.abs on EM
 stopping criterion.

---
 prob_dim_red/linear_gaussian_ppca.py | 126 ++++++++++++++------------
 prob_dim_red/utils.py                |   8 +-
 2 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index d011ae9..6f52fc7 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -22,7 +22,7 @@
 """file containing the main functions and classes of the package"""
 # imports
 import itertools
-from typing import Any, Optional, Generator
+from typing import Optional, Generator
 from dataclasses import dataclass
 from functools import cached_property, reduce
 import operator
@@ -50,11 +50,11 @@ class _LinearGaussianPpcaResult:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     """
 
-    W: npt.NDArray[Any]  # pylint:disable=invalid-name
+    W: npt.NDArray[np.float64]  # pylint:disable=invalid-name
 
 
 # pylint:disable=too-few-public-methods
@@ -64,9 +64,9 @@ class _LinearGaussianPPCAEstim:
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
-    centered_data : npt.NDArray[Any]
+    centered_data : npt.NDArray[np.float64]
         Centered data.
     X_dim : int
         Dimension corresponding to the number of variables.
@@ -78,20 +78,20 @@ class _LinearGaussianPPCAEstim:
         Result of the estimation.
     """
 
-    mean: npt.NDArray[Any]
-    centered_data: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    centered_data: npt.NDArray[np.float64]
     X_dim: int
     Z_dim: int
     N: int
     _result: None | _LinearGaussianPpcaResult
 
-    def __init__(self, data: npt.NDArray[Any], n_components: int):
+    def __init__(self, data: npt.NDArray[np.float64], n_components: int):
         """
         Initialize the Linear Gaussian PPCA estimation object.
 
         Parameters
         ----------
-        data : npt.NDArray[Any]
+        data : npt.NDArray[np.float64]
             Data to be modeled.
         n_components : int
             Number of latent dimensions.
@@ -130,25 +130,25 @@ class LinearGaussianPpcaMleResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
     noise_var : np.float64
         Noise variance.
     log_likelihood : np.float64
         Log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
-    W: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
     bic: np.float64
 
 
@@ -279,7 +279,7 @@ class _DataDescFixed:
         Dimension of the data.
     Z_dim : int | None
         Dimension of the latent space.
-    sample_cov : npt.NDArray[Any]
+    sample_cov : npt.NDArray[np.float64]
         Sample covariance of the data.
     """
 
@@ -287,7 +287,7 @@ class _DataDescFixed:
     N: int
     X_dim: int
     Z_dim: int | None
-    centered_data: npt.NDArray[Any]
+    centered_data: npt.NDArray[np.float64]
 
     @cached_property
     def tr_sample_cov(self):
@@ -320,7 +320,7 @@ class _EM_state:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     noise_var : np.float64
         Noise variance σ².
@@ -329,41 +329,41 @@ class _EM_state:
     """
 
     # pylint:disable=invalid-name
-    W: npt.NDArray[Any]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     fixed: _DataDescFixed
 
     @cached_property
-    def noiseless_posterior_precision(self) -> npt.NDArray[Any]:
+    def noiseless_posterior_precision(self) -> npt.NDArray[np.float64]:
         """Noiseless posterior precision matrix: Wᵀ W"""
         return self.W.T @ self.W
 
     @cached_property
-    def posterior_precision(self) -> npt.NDArray[Any]:
+    def posterior_precision(self) -> npt.NDArray[np.float64]:
         """Posterior precision matrix: M = Wᵀ W + σ² I"""
         return self.noiseless_posterior_precision + self.noise_var * np.eye(
             self.W.shape[1]
         )
 
     @cached_property
-    def posterior_variance(self) -> npt.NDArray[Any]:
+    def posterior_variance(self) -> npt.NDArray[np.float64]:
         """Posterior variance matrix: (Wᵀ W + σ²I)⁻¹ = M⁻¹"""
         return np.linalg.inv(self.posterior_precision)
 
     @cached_property
-    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Sample covariance and factor loading matrix product: (Wᵀ Sᵀ)ᵀ = SW"""
         return self.fixed.sample_cov_matmul_smth(self.W)
 
     @cached_property
-    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Factor loading, sample covariance, and factor loading matrix product:
         Wᵀ(Wᵀ Sᵀ)ᵀ = Wᵀ S W"""
         return self.W.T @ self.sample_cov_factor_load_mat_prod
 
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance and factor loading
         matrix product: M⁻¹ Wᵀ S W"""
         return (
@@ -373,7 +373,7 @@ class _EM_state:
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance, factor loading,
         and posterior variance matrix product: M⁻¹ Wᵀ S W M⁻¹"""
         return (
@@ -382,17 +382,17 @@ class _EM_state:
         )  # M⁻¹ Wᵀ S W M⁻¹
 
     @cached_property
-    def model_var(self) -> npt.NDArray[Any]:
+    def model_var(self) -> npt.NDArray[np.float64]:
         """Model covariance matrix: W Wᵀ + σ²I = C"""
         return self.W @ self.W.T + self.noise_var * np.eye(self.fixed.X_dim)
 
     @cached_property
-    def log_det_model_var(self) -> npt.NDArray[Any]:
+    def log_det_model_var(self) -> npt.NDArray[np.float64]:
         """Log-determinant of the model covariance matrix: log |C|"""
         return np.linalg.slogdet(self.model_var)[1]
 
     @cached_property
-    def model_var_inv(self) -> npt.NDArray[Any]:
+    def model_var_inv(self) -> npt.NDArray[np.float64]:
         """Inverse of the model covariance matrix: C⁻¹ = (W Wᵀ + σ²I)⁻¹"""
         return np.linalg.inv(self.model_var)
 
@@ -462,7 +462,7 @@ class LinearGaussianPpcaEMResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
     noise_var : np.float64
         Noise variance.
@@ -470,19 +470,19 @@ class LinearGaussianPpcaEMResult(_LinearGaussianPpcaResult):
         Complete log-likelihood of the model.
     log_likelihood : np.float64
         Marginal log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
     noise_var: np.float64
     complete_log_likelihood: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
 
 
 class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
@@ -585,7 +585,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             yield self.state
 
             likelihood, old_likelihood = self.state.complete_log_likelihood, likelihood
-            if np.abs(likelihood - old_likelihood) < error_tolerance:
+            if (likelihood - old_likelihood) < error_tolerance:
                 break
 
             U, S, _ = np.linalg.svd(self.state.W, full_matrices=False)
@@ -886,13 +886,15 @@ class MLE_Model_Selection_v2(LinearGaussianPPCAEstimMLE):
         return self._bics
 
 
-def ppca_mle(data: npt.NDArray[Any], n_components: int) -> LinearGaussianPpcaMleResult:
+def ppca_mle(
+    data: npt.NDArray[np.float64], n_components: int
+) -> LinearGaussianPpcaMleResult:
     """
     Perform Linear Gaussian PPCA estimation using Maximum Likelihood Estimation (MLE).
 
     Parameters
     ----------
-    data : npt.NDArray[Any]
+    data : npt.NDArray[np.float64]
         Data to be modeled.
     n_components : int
         Number of latent dimensions.
@@ -915,7 +917,7 @@ class _ARD_state:
 
     Attributes
     ----------
-    W : npt.NDArray[Any]
+    W : npt.NDArray[np.float64]
         Factor-loading matrix.
     noise_var : np.float64
         Noise variance σ².
@@ -924,14 +926,14 @@ class _ARD_state:
     """
 
     # pylint:disable=invalid-name
-    W: npt.NDArray[Any]
+    W: npt.NDArray[np.float64]
     noise_var: np.float64
     fixed: _DataDescFixed
-    alpha: npt.NDArray[Any]
+    alpha: npt.NDArray[np.float64]
 
     @classmethod
     def from_raw_parameters(
-        cls, W: npt.NDArray[Any], noise_var: np.float64, fixed: _DataDescFixed
+        cls, W: npt.NDArray[np.float64], noise_var: np.float64, fixed: _DataDescFixed
     ) -> "_ARD_state":
         """
         Classmethod constructor of _ARD_state that takes in input parameters and
@@ -944,41 +946,41 @@ class _ARD_state:
         return cls(W[:, mask], noise_var, fixed, 1 / alpha_inv[mask])
 
     @cached_property
-    def noiseless_posterior_precision(self) -> npt.NDArray[Any]:
+    def noiseless_posterior_precision(self) -> npt.NDArray[np.float64]:
         """Noiseless posterior precision matrix: Wᵀ W"""
         return self.W.T @ self.W
 
     @cached_property
-    def posterior_precision(self) -> npt.NDArray[Any]:
+    def posterior_precision(self) -> npt.NDArray[np.float64]:
         """Posterior precision matrix: M = Wᵀ W + σ² I"""
         return self.noiseless_posterior_precision + self.noise_var * np.eye(
             self.W.shape[1]
         )
 
     @cached_property
-    def posterior_variance(self) -> npt.NDArray[Any]:
+    def posterior_variance(self) -> npt.NDArray[np.float64]:
         """Posterior variance matrix: (Wᵀ W + σ²I)⁻¹ = M⁻¹"""
         return np.linalg.inv(self.posterior_precision)
 
     @cached_property
-    def alpha_posterior_precision_prod(self) -> npt.NDArray[Any]:
+    def alpha_posterior_precision_prod(self) -> npt.NDArray[np.float64]:
         """ARD matrix and posterior precision matrix product: AM"""
         return self.alpha[:, None] * self.posterior_precision
 
     @cached_property
-    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Sample covariance and factor loading matrix product: (Wᵀ Sᵀ)ᵀ = SW"""
         return self.fixed.sample_cov_matmul_smth(self.W)
 
     @cached_property
-    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[Any]:
+    def factor_load_sample_cov_factor_load_mat_prod(self) -> npt.NDArray[np.float64]:
         """Factor loading, sample covariance, and factor loading matrix product:
         Wᵀ(Wᵀ Sᵀ)ᵀ = Wᵀ S W"""
         return self.W.T @ self.sample_cov_factor_load_mat_prod
 
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance and factor loading
         matrix product: M⁻¹ Wᵀ S W"""
         return (
@@ -988,7 +990,7 @@ class _ARD_state:
     @cached_property
     def posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod(
         self,
-    ) -> npt.NDArray[Any]:
+    ) -> npt.NDArray[np.float64]:
         """Posterior variance, factor loading, sample covariance, factor loading,
         and posterior variance matrix product: M⁻¹ Wᵀ S W M⁻¹"""
         return (
@@ -997,17 +999,17 @@ class _ARD_state:
         )  # M⁻¹ Wᵀ S W M⁻¹
 
     @cached_property
-    def model_var(self) -> npt.NDArray[Any]:
+    def model_var(self) -> npt.NDArray[np.float64]:
         """Model covariance matrix: W Wᵀ + σ²I = C"""
         return self.W @ self.W.T + self.noise_var * np.eye(self.fixed.X_dim)
 
     @cached_property
-    def log_det_model_var(self) -> npt.NDArray[Any]:
+    def log_det_model_var(self) -> npt.NDArray[np.float64]:
         """Log-determinant of the model covariance matrix: log |C|"""
         return np.linalg.slogdet(self.model_var)[1]
 
     @cached_property
-    def model_var_inv(self) -> npt.NDArray[Any]:
+    def model_var_inv(self) -> npt.NDArray[np.float64]:
         """Inverse of the model covariance matrix: C⁻¹ = (W Wᵀ + σ²I)⁻¹"""
         return np.linalg.inv(self.model_var)
 
@@ -1085,9 +1087,9 @@ class LinearGaussianPpcaARDResult(_LinearGaussianPpcaResult):
 
     Attributes
     ----------
-    mean : npt.NDArray[Any]
+    mean : npt.NDArray[np.float64]
         Mean of the data.
-    A : npt.NDArray[Any]
+    A : npt.NDArray[np.float64]
         Automatic relevance determination matrix.
     noise_var : np.float64
         Noise variance.
@@ -1095,20 +1097,20 @@ class LinearGaussianPpcaARDResult(_LinearGaussianPpcaResult):
         Complete log-likelihood of the model.
     log_likelihood : np.float64
         Marginal log-likelihood of the model.
-    model_var_inv : npt.NDArray[Any]
+    model_var_inv : npt.NDArray[np.float64]
         Inverse of the model covariance.
-    log_det_model_var : npt.NDArray[Any]
+    log_det_model_var : npt.NDArray[np.float64]
         Log-determinant of the model covariance.
     """
 
     # pylint:disable=invalid-name
-    mean: npt.NDArray[Any]
-    A: npt.NDArray[Any]
+    mean: npt.NDArray[np.float64]
+    A: npt.NDArray[np.float64]
     noise_var: np.float64
     complete_log_likelihood: np.float64
     log_likelihood: np.float64
-    model_var_inv: npt.NDArray[Any]
-    log_det_model_var: npt.NDArray[Any]
+    model_var_inv: npt.NDArray[np.float64]
+    log_det_model_var: npt.NDArray[np.float64]
 
 
 class LinearGaussianPPCAEstimARD(_LinearGaussianPPCAEstim):
diff --git a/prob_dim_red/utils.py b/prob_dim_red/utils.py
index b12f432..65dc5f0 100644
--- a/prob_dim_red/utils.py
+++ b/prob_dim_red/utils.py
@@ -19,7 +19,7 @@
 
 """file containing functions and classes that may be useful"""
 from dataclasses import dataclass
-from typing import NamedTuple, Any
+from typing import NamedTuple
 from fractions import Fraction
 
 import numpy as np
@@ -99,7 +99,7 @@ def _vecto(A):
     return _VectorizedArray(is_order_c, is_order_f, vec)
 
 
-def data_load(data: npt.ArrayLike) -> npt.NDArray[Any]:
+def data_load(data: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Load data from an array-like object and return a NumPy array.
 
@@ -172,7 +172,7 @@ def trace_matmul(
     return np.einsum("ij,ji->", mat_a, mat_b)
 
 
-def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
+def positive_part(x: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Compute the positive part of an array element-wise.
 
@@ -193,7 +193,7 @@ def positive_part(x: npt.ArrayLike) -> npt.NDArray[Any]:
     return np.maximum(x, 0)
 
 
-def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[Any]:
+def log_with_zeros(x: npt.ArrayLike) -> npt.NDArray[np.float64]:
     """
     Compute the natural logarithm of an array, treating zeros safely.
-- 
GitLab

From 865c9908a586a243ab2468628d585ab668646a4b Mon Sep 17 00:00:00 2001
From: Julien Chiquet <julien.chiquet@inrae.fr>
Date: Wed, 14 Aug 2024 12:22:25 +0200
Subject: [PATCH 10/11] assess convergence on marginal log-likelihood in EM +
 simplification in noise-var

---
 prob_dim_red/linear_gaussian_ppca.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/prob_dim_red/linear_gaussian_ppca.py b/prob_dim_red/linear_gaussian_ppca.py
index 6f52fc7..b3e69b3 100644
--- a/prob_dim_red/linear_gaussian_ppca.py
+++ b/prob_dim_red/linear_gaussian_ppca.py
@@ -542,23 +542,14 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             + self.state.posterior_variance_factor_load_sample_cov_factor_load_mat_prod
         )
 
-        # W_new, _r = np.linalg.qr(W_new)
-
-        noise_var_new = utils.positive_part(
+        noise_var_new = (
             1
             / self.X_dim
             * (
                 self.fixed.tr_sample_cov
-                - 2
-                * utils.trace_matmul(
+                - utils.trace_matmul(
                     self.state.sample_cov_factor_load_mat_prod,
-                    (self.state.posterior_variance @ W_new.T),
-                )
-                + utils.trace_matmul(
-                    self.state.noise_var * self.state.posterior_variance
-                    + self.state.posterior_variance_factor_load_sample_cov_factor_load_posterior_variance_mat_prod,
-                    W_new.T @ W_new,
-                    sym=True,
+                    self.state.posterior_variance @ W_new.T,
                 )
             )
         )
@@ -573,7 +564,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
         """
         self._em_init()
         yield self.state
-        likelihood = self.state.complete_log_likelihood
+        likelihood = self.state.log_likelihood
         for _ in (
             range(max_iterations) if max_iterations is not None else itertools.count()
         ):
@@ -584,7 +575,7 @@ class LinearGaussianPPCAEstimEM(_LinearGaussianPPCAEstim):
             # self.state.W = U * S[None,:]
             yield self.state
 
-            likelihood, old_likelihood = self.state.complete_log_likelihood, likelihood
+            likelihood, old_likelihood = self.state.log_likelihood, likelihood
             if (likelihood - old_likelihood) < error_tolerance:
                 break
-- 
GitLab
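A note on why dropping np.abs is sound once convergence is assessed on the marginal log-likelihood: EM never decreases the marginal likelihood, so the gap between successive iterates is non-negative up to round-off, and a signed comparison both detects convergence and halts on numerical regressions. Schematically (a sketch of the loop's contract, not package code):

    prev = -float("inf")
    for state in em_states:                # one E-step + M-step per state
        gap = state.log_likelihood - prev  # >= 0 in exact arithmetic
        if gap < error_tolerance:          # signed test: also stops if
            break                          # round-off makes the likelihood dip
        prev = state.log_likelihood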
From 9fc4e964d85a8a881d833e7ad3c5b0ad897b2cc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Victor?= <francois.victor@inrae.fr>
Date: Thu, 29 Aug 2024 17:36:31 +0200
Subject: [PATCH 11/11] adding minimal test set and modifying test parameters

---
 .gitlab-ci.yml                     |  6 +++---
 pyproject.toml                     |  5 +++++
 tests/test_linear_gaussian_ppca.py | 21 ++++++---------------
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 58fe857..6c9043c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,7 +31,7 @@ run_tests_py310:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal

run_tests_py311:
  stage: tests
@@ -40,7 +40,7 @@ run_tests_py311:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal

run_tests_py312:
  stage: tests
@@ -49,7 +49,7 @@ run_tests_py312:
     - pip install pytest
   script:
     - pip install .
-    - pytest
+    - pytest -m minimal

build_package:
  stage: build
diff --git a/pyproject.toml b/pyproject.toml
index 557b1fb..249ebf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,3 +39,8 @@ prob_dim_red = "prob_dim_red._cli:main"
 [project.urls]
 homepage = "https://forgemia.inra.fr/mia-paris/reduction-dimension/prob_dim_red"
 repository = "https://forgemia.inra.fr/mia-paris/reduction-dimension/prob_dim_red"
+
+[tool.pytest.ini_options]
+markers = [
+    "minimal: minimal test set (select with '-m \"minimal\"')",
+]
diff --git a/tests/test_linear_gaussian_ppca.py b/tests/test_linear_gaussian_ppca.py
index 43bf5e0..0134c86 100644
--- a/tests/test_linear_gaussian_ppca.py
+++ b/tests/test_linear_gaussian_ppca.py
@@ -6,8 +6,10 @@ import numpy as np
 
 from prob_dim_red import linear_gaussian_ppca
 
-ALL_PARAMS_MLE = [
-    (n, xd, zd, st, sd)
+ALL_PARAMS_MLE_EM = [
+    pytest.param(
+        (n, xd, zd, st, sd), marks=(pytest.mark.minimal if (n, xd) == (100, 10) else ())
+    )
     for n in (100, 1000, 10000)
     for xd in (10, 50, 100)
     for zd in (1, 2, 5, 50)
@@ -17,7 +19,7 @@ ALL_PARAMS_MLE = [
 ]
 
 
-@pytest.mark.parametrize("param", ALL_PARAMS_MLE)
+@pytest.mark.parametrize("param", ALL_PARAMS_MLE_EM)
 def test_mle(param):
     N, X_DIM, Z_DIM, sigma_true, seed = param
     random = np.random.RandomState(seed)  # pylint: disable=no-member
@@ -44,19 +46,8 @@ def test_mle(param):
     ).mean() ** 0.5 < 1000 * sigma_true * (Z_DIM / N) ** 0.5
 
 
-ALL_PARAMS_EM = [
-    (n, xd, zd, st, sd)
-    for n in (1000,)
-    for xd in (10, 20, 50)
-    for zd in (1, 2, 5, 8)
-    for st in (1e-1, 0.1, 5)
-    for sd in (1, 2, 3)
-    if zd < xd
-]
-
-
 # pylint: disable=too-many-locals
-@pytest.mark.parametrize("param", ALL_PARAMS_EM)
+@pytest.mark.parametrize("param", ALL_PARAMS_MLE_EM)
 def test_em(param):
     N, X_DIM, Z_DIM, sigma_true, seed = param
     random = np.random.RandomState(seed)  # pylint: disable=no-member
-- 
GitLab
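The new marker can be exercised outside CI in the same way; for instance, from a Python session (an illustrative invocation equivalent to the `pytest -m minimal` command used in the pipeline above):

    import pytest

    # run only the tests marked as part of the minimal set
    raise SystemExit(pytest.main(["-m", "minimal"]))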