Source code for pychemelt.utils.svd

"""
Module containing functions to perform Singular Value Decomposition (SVD) and Principal Component Analysis (PCA)
on spectral data, along with utilities for manipulating basis spectra and coefficients.

Author: Osvaldo Burastero
"""

import numpy as np


[docs] def apply_svd(X): """ Perform Singular Value Decomposition (SVD) on the input data matrix X. Parameters ---------- X : numpy array of shape (n_wavelengths, n_measurements) The input data matrix to decompose. Returns ------- explained_variance : numpy array The cumulative explained variance for each component. basis_spectra : numpy array The left singular vectors (U matrix) representing the basis spectra. coefficients : numpy array The coefficients associated with each basis spectrum. """ U, S, _ = np.linalg.svd(X) # Calculate the total variance or correlation total_variance = np.sum(S ** 2) cumulative_variance = np.cumsum(S ** 2) # The matrix V contains the variation of each component against the temperature / measurement dimension a_is = [] for i in range(U.shape[1]): def coefficients_bi(column): return U[:, i].dot(column) a_i = np.apply_along_axis(coefficients_bi, axis=0, arr=X) a_is.append(a_i) coefficients = np.array(a_is) # Basis spectra basis_spectra = U # Cumulated explained variance of the components explained_variance = cumulative_variance / total_variance * 100 return explained_variance, basis_spectra, coefficients
[docs] def filter_basis_spectra(explained_variance, basis_spectra_all, coefficients_all, explained_variance_threshold=99): """ Filter the basis spectra and coefficients based on the explained variance threshold Parameters ---------- explained_variance : numpy array The cumulative explained variance for each component. basis_spectra_all : numpy array The left singular vectors (U matrix) representing the basis spectra. coefficients_all : numpy array The coefficients associated with each basis spectrum. explained_variance_threshold : float, optional The threshold for explained variance to filter components. Default is 99. Returns ------- basis_spectra : numpy array The filtered basis spectra. coefficients : numpy array The filtered coefficients. k : int The number of components that meet the explained variance threshold. """ # Find the number of components (k) that capture at least threshold*100 percent of the variance or correlation k = np.sum(explained_variance < explained_variance_threshold) + 1 basis_spectra = basis_spectra_all[:, :k] coefficients = coefficients_all[:k, :] return basis_spectra, coefficients, k
[docs] def align_basis_spectra_and_coefficients(X, basis_spectra, coefficients): """ Align the basis spectra peaks to the original data Parameters ---------- X : numpy array of shape (n_samples, n_features) The input data matrix. basis_spectra : numpy array The basis spectra obtained from SVD. coefficients : numpy array The coefficients associated with each basis spectrum. Returns ------- basis_spectra : numpy array The aligned basis spectra. coefficients : numpy array The adjusted coefficients. """ # Align basis spectra peaks to the original data # In other words, we want that if the original spectra has a peak with positive values of the CD signal, # so does our basis spectra # Fix the n_cutoff to remove the first n and last n rows of X before finding the peak n_cutoff = 5 maxV_abs = np.abs(np.max(X[n_cutoff:-n_cutoff, :])) minV_abs = np.abs(np.min(X[n_cutoff:-n_cutoff, :])) positive_peak = maxV_abs > minV_abs k = basis_spectra.shape[1] for i in range(k): prcomp_i = basis_spectra[:, i] maxV_abs_prcomp_i = np.abs(np.max(prcomp_i[n_cutoff:-n_cutoff])) minV_abs_prcomp_i = np.abs(np.min(prcomp_i[n_cutoff:-n_cutoff])) positive_peak_prcomp_i = maxV_abs_prcomp_i > minV_abs_prcomp_i if positive_peak_prcomp_i != positive_peak: coeff_i = coefficients[i, :] basis_spectra[:, i] = - prcomp_i coefficients[i, :] = - coeff_i return basis_spectra, coefficients
[docs] def angle_from_cathets(adjacent_leg, opposite_leg): """ Calculate the angle between the hypotenuse and the adjacent leg of a right triangle. Parameters ---------- adjacent_leg : float Length of the adjacent leg. opposite_leg : float Length of the opposite leg. Returns ------- angle_in_radians : float Angle in radians between the hypotenuse and the adjacent leg. """ hypotenuse = np.sqrt(adjacent_leg ** 2 + opposite_leg ** 2) return np.arccos(adjacent_leg / hypotenuse)
[docs] def get_2d_counterclockwise_rot_matrix(angle_in_radians): """ Obtain the rotation matrix for a 2d coordinates system using a counterclockwise direction Parameters ---------- angle_in_radians : float Angle in radians for the rotation. Returns ------- rotM : numpy array 2x2 rotation matrix. """ rotM = np.array([[np.cos(angle_in_radians), np.sin(angle_in_radians)], [-np.sin(angle_in_radians), np.cos(angle_in_radians)]]) return rotM
[docs] def get_3d_counterclockwise_rot_matrix_around_z_axis(angle_in_radians): """ Obtain the rotation matrix for a 3d coordinates system around the z axis using a counterclockwise direction Parameters ---------- angle_in_radians : float Angle in radians for the rotation. Returns ------- rotM : numpy array 3x3 rotation matrix. """ rotM = np.array([[np.cos(angle_in_radians), np.sin(angle_in_radians), 0], [-np.sin(angle_in_radians), np.cos(angle_in_radians), 0], [0, 0, 1]]) return rotM
[docs] def get_3d_clockwise_rot_matrix_around_y_axis(angle_in_radians): """ Obtain the rotation matrix for a 3d coordinates system around the y axis using a clockwise direction Parameters ---------- angle_in_radians : float Angle in radians for the rotation. Returns ------- rotM : numpy array 3x3 rotation matrix. """ rotM = np.array([[np.cos(angle_in_radians), 0, np.sin(angle_in_radians)], [0, 1, 0], [-np.sin(angle_in_radians), 0, np.cos(angle_in_radians)]]) return rotM
[docs] def rotate_two_basis_spectra(X, basis_spectra, pca_based=False): """ Create a new basis spectra using a linear combination of the first and second basis spectra Parameters ---------- X : numpy array The raw data matrix of size n*m, where 'n' is the number of measured wavelengths and 'm' is the number of acquired spectra. basis_spectra : numpy array The matrix containing the set of basis spectra. pca_based : bool, optional Boolean to decide if we need to center the matrix X. Default is False. Returns ------- basis_spectra_new : numpy array The new set of basis spectra. coefficients : numpy array The new set of associated coefficients. """ if pca_based: X_mean = np.mean(X, axis=1, keepdims=True) X = X - X_mean first_spectrum = X[:, 0] c1 = first_spectrum.dot(basis_spectra[:, 0]) c2 = first_spectrum.dot(basis_spectra[:, 1]) rotAngle = angle_from_cathets(c1, c2) rotM = get_2d_counterclockwise_rot_matrix(rotAngle) basis_spectra_new = np.dot(basis_spectra[:, :2], rotM) coefficients = np.dot(basis_spectra_new.T, X) return basis_spectra_new, coefficients
[docs] def rotate_three_basis_spectra(X, basis_spectra, pca_based=False): """ Create a new basis spectra using a linear combination from the first, second and third basis spectra Parameters ---------- X : numpy array The raw data matrix of size n*m, where 'n' is the number of measured wavelengths and 'm' is the number of acquired spectra. basis_spectra : numpy array The matrix containing the set of basis spectra. pca_based : bool, optional Boolean to decide if we need to center the matrix X. Default is False. Returns ------- basis_spectra_new : numpy array The new set of basis spectra. coefficients_subset : numpy array The new set of associated coefficients. """ if pca_based: X_mean = np.mean(X, axis=1, keepdims=True) X = X - X_mean first_spectrum = X[:, 0] c1 = first_spectrum.dot(basis_spectra[:, 0]) c2 = first_spectrum.dot(basis_spectra[:, 1]) c3 = first_spectrum.dot(basis_spectra[:, 2]) zAngle = angle_from_cathets(c1, c2) yAngle = angle_from_cathets(np.sqrt(c1 ** 2 + c2 ** 2), c3) rotZaxis = get_3d_counterclockwise_rot_matrix_around_z_axis(zAngle) rotYaxis = get_3d_clockwise_rot_matrix_around_y_axis(yAngle) basis_z_rot = np.dot(basis_spectra[:, :3], rotZaxis) basis_spectra_new = np.dot(basis_z_rot, rotYaxis) coefficients = np.dot(basis_spectra_new.T, X) return basis_spectra_new, coefficients
[docs] def reconstruct_spectra(basis_spectra, coefficients, X=None, pca_based=False): """ Reconstruct the original spectra based on the set of basis spectra and the associated coefficients Parameters ---------- basis_spectra : numpy array The matrix containing the set of basis spectra. coefficients : numpy array The associated coefficients of each basis spectrum. X : numpy array, optional Only used if pca_based equals TRUE! X is the raw data matrix of size n*m, where 'n' is the number of measured wavelengths and 'm' is the number of acquired spectra. pca_based : bool, optional Boolean to decide if we need to extract the mean from the the X raw data matrix. Default is False. Returns ------- fitted : numpy array The reconstructed matrix which should be close the original raw data. """ fitted = (basis_spectra @ coefficients) # Add the mean, if needed if pca_based: X_mean = np.mean(X, axis=1, keepdims=True) fitted = fitted + X_mean return fitted
[docs] def explained_variance_from_orthogonal_vectors(vectors, coefficients, total_variance): """ Useful to get the percentage of variance, not in the coordinate space provided by PCA/SVD, but against a different set of (rotated) vectors. Parameters ---------- vectors : numpy array The set of orthogonal vectors. coefficients : numpy array The associated coefficients of each orthogonal vector. total_variance : float The total variance of the original data (mean subtracted if we performed PCA...). Returns ------- explained_variance : list The amount of explained variance by each orthogonal vector. """ explained_variance = [] for i in range(vectors.shape[1]): a = np.linalg.norm(coefficients[i, :]) ** 2 b = np.linalg.norm(vectors[:, i]) ** 2 explained_variance.append(a / b) return 100 * np.cumsum(explained_variance) / total_variance
[docs] def apply_pca(X): """ Perform Principal Component Analysis (PCA) on the input data matrix X. Parameters ---------- X : numpy array of shape (n_wavelengths, n_measurements) The input data matrix to decompose. Returns ------- cum_sum_eigenvalues : numpy array The cumulative explained variance for each principal component. principal_components : numpy array The principal components (eigenvectors) representing the basis spectra. coefficients : numpy array The coefficients associated with each principal component. """ X = X.T # We need to transpose X to have samples as rows and features as columns X_mean = np.mean(X, axis=0) X = X - X_mean # compute the covariance matrix cov_mat = np.cov(X, rowvar=False) # find the eigen vectors and associated eigen values eigen_values, eigen_vectors = np.linalg.eigh(cov_mat) # sort the eigenvalues in descending order sorted_index = np.argsort(eigen_values)[::-1] sorted_eigenvalue = eigen_values[sorted_index] # similarly sort the eigenvectors sorted_eigenvectors = eigen_vectors[:, sorted_index] # compute the total variance total_eigenvalues = np.sum(sorted_eigenvalue) # compute the explained variance exp_var_pca = (sorted_eigenvalue / total_eigenvalues * 100) # compute the cumulative explained variance cum_sum_eigenvalues = np.cumsum(exp_var_pca) principal_components = sorted_eigenvectors a_is = [] for i in range(principal_components.shape[1]): def coefficients_bi(column): # Your custom logic here return principal_components[:, i].dot(column) a_i = np.apply_along_axis(coefficients_bi, axis=1, arr=X) a_is.append(a_i) coefficients = np.array(a_is) return cum_sum_eigenvalues, principal_components, coefficients
[docs] def recalc_explained_variance(basis_spectra, coefficients, X, pca_based=False): """ Recalculate the explained variance of a set of basis spectra and associated coefficients Parameters ---------- basis_spectra : numpy array The basis spectra. coefficients : numpy array The associated coefficients of each basis spectrum. X : numpy array The raw data matrix of size n*m, where 'n' is the number of measured wavelengths and 'm' is the number of acquired spectra. pca_based : bool, optional Boolean to decide if we need to center the matrix X. Default is False. Returns ------- explained_variance : numpy array The cumulative explained variance for each component. """ if pca_based: X_mean = np.mean(X, axis=1, keepdims=True) X = X - X_mean total_variance = np.linalg.norm(X) ** 2 # Total variance in the data explained_variance = explained_variance_from_orthogonal_vectors( basis_spectra, coefficients, total_variance) return explained_variance