# Source code for pychemelt.thermal_oligomer

"""
Main class to handle thermal denaturation data of mono- and oligomers up to tetramers
The current model assumes the proteins' unfolding is reversible
"""

import pandas as pd
import numpy as np
from warnings import warn

from itertools import chain
from copy import deepcopy

from .main import Sample

from .utils.signals import (
    map_two_state_model_to_signal_fx,
)

from .utils.math import (
    temperature_to_kelvin,
    relative_errors,
    constant_baseline,
    linear_baseline,
    quadratic_baseline,
    exponential_baseline,
)

from .utils.processing import (
    guess_Tm_from_derivative,
    set_param_bounds,
    adjust_value_to_interval,
    re_arrange_params,
    re_arrange_predictions,
    subset_data,
    estimate_signal_baseline_params,
    oligomer_number,
    transform_to_list
)

from .utils.fitting import (
    fit_line_robust,
    fit_oligomer_unfolding_single_slopes,
    fit_oligomer_unfolding_shared_slopes_many_signals,
    fit_oligomer_unfolding_many_signals,
    evaluate_fitting_and_refit,
    baseline_fx_name_to_req_params
)



class ThermalOligomer(Sample):
    """
    Class to hold the data of a DSF experiment of thermal unfolding with
    different concentrations of an oligomer.
    """
[docs] def __init__(self, name='Test'): super().__init__(name) self.nr_olig = 0 # Number of oligomer concentrations in data self.model = None # Oligomer model type self.oligomeric = True # Flag for oligomer or denaturant
[docs] def set_model(self, model_name): """ Set subunit number of the oligomer used for the analysis. Currently supported are two state models of monomers, dimers, trimers and tetramers Parameters ---------- model_name : str name of the used model. Can be: "Monomer", "Dimer", "Trimer", "Tetramer". Case insensitive Raises ------ ValueError If the provided model name is not in the supported list. Notes ----- This method creates/updates the following attributes on the instance: - self.model: oligomeric model used for analysis """ allowed_models = ["monomer", "dimer", "trimer", "tetramer"] model = model_name.lower() if model not in allowed_models: raise ValueError( f"Invalid model '{model_name}'. " f"Allowed models are: {', '.join(m.capitalize() for m in allowed_models)}." ) # Save model with first letter uppercase self.model = model.capitalize() return None
[docs] def set_concentrations(self, concentrations=None): """ Set the oligomeric concentrations for the sample Parameters ---------- concentrations : list, optional List of oligomer concentrations. If None, use the sample conditions Notes ----- Creates/updates attribute `oligomer_concentrations_pre` (numpy.ndarray) """ if concentrations is None: concentrations = self.conditions concentrations = transform_to_list(concentrations) self.oligomer_concentrations_pre = np.array(concentrations) return None
    def select_conditions(self, boolean_lst=None, normalise_to_global_max=True):
        """
        For each signal, select the conditions to be used for the analysis.

        Parameters
        ----------
        boolean_lst : list of bool, optional
            List of booleans selecting which conditions to keep. If None,
            keep all.
        normalise_to_global_max : bool, optional
            If True, normalise the signal to the global maximum - per
            signal type.

        Notes
        -----
        Creates/updates several attributes used by downstream fitting:
        - signal_lst_multiple, temp_lst_multiple : lists of lists with
          selected data
        - oligomer_concentrations : selected oligomer concentrations
        - oligomer_concentrations_expanded : flattened numpy array matching
          expanded signals
        - boolean_lst, normalise_to_global_max, nr_olig : control
          flags/values
        """
        # If boolean_lst is a boolean, convert it to a list of one boolean
        # NOTE(review): transform_to_list runs before the None check below —
        # presumably it returns None unchanged for a None input; confirm.
        boolean_lst = transform_to_list(boolean_lst)
        if boolean_lst is None:
            # Keep everything: reuse the pre-selection containers directly
            self.signal_lst_multiple = self.signal_lst_pre_multiple
            self.temp_lst_multiple = self.temp_lst_pre_multiple
            self.oligomer_concentrations = self.oligomer_concentrations_pre
        else:
            # Filter each signal type's conditions by the boolean mask
            self.signal_lst_multiple = [None for _ in range(len(self.signal_lst_pre_multiple))]
            self.temp_lst_multiple = [None for _ in range(len(self.temp_lst_pre_multiple))]
            for i in range(len(self.signal_lst_pre_multiple)):
                self.signal_lst_multiple[i] = [x for j, x in enumerate(self.signal_lst_pre_multiple[i]) if boolean_lst[j]]
                self.temp_lst_multiple[i] = [x for j, x in enumerate(self.temp_lst_pre_multiple[i]) if boolean_lst[j]]
            self.oligomer_concentrations = [x for i, x in enumerate(self.oligomer_concentrations_pre) if boolean_lst[i]]
        if normalise_to_global_max:
            # Flatten the nested (signal type -> condition -> points) lists
            flat = list(chain.from_iterable(chain.from_iterable(self.signal_lst_multiple)))
            global_max = np.max(flat)  # Global maximum across all signals
            self.data_global_max = global_max
            # Scale every curve so the global maximum maps to 100
            for i in range(len(self.signal_lst_multiple)):
                self.signal_lst_multiple[i] = [x / global_max * 100 for x in self.signal_lst_multiple[i]]
        self.nr_olig = len(self.oligomer_concentrations)
        # For compatibility
        self.nr_den = self.nr_olig
        # Expand the number of oligomer concentrations to match the number of signals
        oligomer_concentrations = [self.oligomer_concentrations for _ in range(self.nr_signals)]
        self.oligomer_concentrations_expanded = np.concatenate(oligomer_concentrations, axis=0)
        self.boolean_lst = boolean_lst
        self.normalise_to_global_max = normalise_to_global_max
        self.oligomer_concentrations = np.array(self.oligomer_concentrations)
        # Needed for compatibility
        self.denaturant_concentrations = self.oligomer_concentrations
        return None
[docs] def guess_Cp(self): """ Guess the Cp of the assembled oligomer by the number of residues. Raises ------ ValueError If `self.n_residues` is not set. Notes ----- The number of residues represent the total number of residues in the oligomer This method creates/updates attributes used later in fitting: - Cp0 assigned to self.Cp0 """ # If the number of residues is still zero, raise an error if self.n_residues == 0: raise ValueError('The number of residues is still zero. Please set n_residues before calling guess_Cp') Cp0 = self.n_residues * 0.0148 - 0.1267 # Cp0 needs to be positive Cp0 = max(Cp0, 0) self.Cp0 = Cp0 return None
    def estimate_baseline_parameters(self, native_baseline_type, unfolded_baseline_type, window_range_native=12, window_range_unfolded=12):
        """
        Estimate the baseline parameters for multiple signals of the oligomer.

        The native baseline represents the curve for the assembled oligomer
        while the unfolded baseline represents the curve for the unfolded and
        disassembled oligomer.

        Parameters
        ----------
        native_baseline_type : str
            One of 'constant', 'linear', 'quadratic', 'exponential'.
        unfolded_baseline_type : str
            One of 'constant', 'linear', 'quadratic', 'exponential'.
        window_range_native : int, optional
            Range of the window (in degrees) to estimate the baselines and
            slopes of the native state.
        window_range_unfolded : int, optional
            Range of the window (in degrees) to estimate the baselines and
            slopes of the unfolded state.

        Notes
        -----
        This method sets or updates these attributes:
        - first/second/third_param_Ns_per_signal and _Us_per_signal
        - baseline_N_fx, baseline_U_fx
        - native_baseline_type, unfolded_baseline_type
        """
        self.first_param_Ns_per_signal = []
        self.first_param_Us_per_signal = []
        self.second_param_Ns_per_signal = []
        self.second_param_Us_per_signal = []
        self.third_param_Ns_per_signal = []
        self.third_param_Us_per_signal = []
        # If the sample is oligomeric, we need to correct the signal for the
        # concentrations of the oligomer
        if self.oligomeric:
            # Repeat each concentration once per data point, then regroup per
            # condition so the division below broadcasts over the curves
            oligomer_concentrations = np.repeat(self.oligomer_concentrations, np.array(self.signal_lst_multiple).shape[-1])
            oligomer_concentrations = np.split(oligomer_concentrations, len(self.oligomer_concentrations))
        for i in range(len(self.signal_lst_multiple)):
            if self.oligomeric:
                # Concentration-corrected copy of the i-th signal type
                adjusted_signal_lst_multiple = list(np.array(self.signal_lst_multiple[i]) / np.array(oligomer_concentrations))
            p1Ns, p1Us, p2Ns, p2Us, p3Ns, p3Us = estimate_signal_baseline_params(
                self.signal_lst_multiple[i] if not self.oligomeric else adjusted_signal_lst_multiple,
                self.temp_lst_multiple[i],
                native_baseline_type,
                unfolded_baseline_type,
                window_range_native,
                window_range_unfolded,
                oligomer_number(self.model)
            )
            self.first_param_Ns_per_signal.append(p1Ns)
            self.first_param_Us_per_signal.append(p1Us)
            self.second_param_Ns_per_signal.append(p2Ns)
            self.second_param_Us_per_signal.append(p2Us)
            self.third_param_Ns_per_signal.append(p3Ns)
            self.third_param_Us_per_signal.append(p3Us)
        # Map the requested baseline names to their model functions
        baseline_fx_dic = {
            'constant': constant_baseline,
            'linear': linear_baseline,
            'quadratic': quadratic_baseline,
            'exponential': exponential_baseline
        }
        self.baseline_N_fx = baseline_fx_dic[native_baseline_type]
        self.baseline_U_fx = baseline_fx_dic[unfolded_baseline_type]
        self.native_baseline_type = native_baseline_type
        self.unfolded_baseline_type = unfolded_baseline_type
        return None
[docs] def create_dg_df(self): """ Create a dataframe of the dg values versus temperature """ # Create a dataframe of the parameters Tm, DHm, Cp0 = self.global_fit_params[:3] T_c = np.arange(0, 150, 0.5) T = temperature_to_kelvin(T_c) Tm = temperature_to_kelvin(Tm) DG = DHm * (1 - T / Tm) + Cp0 * (T - Tm - T * np.log(T / Tm)) dg_df = pd.DataFrame({ 'DG (kcal/mol)': DG, 'Temperature (°C)': T_c }) self.dg_df = dg_df return None
[docs] def fit_thermal_unfolding_global( self, cp_limits=None, dh_limits=None, tm_limits=None, cp_value=None): """ Fit the thermal unfolding of the sample using the signal and temperature data We fit all the curves at once, with global thermodynamic parameters but local slopes and local baselines) Multiple signals can be fitted at the same time, such as 350nm and 330nm Parameters ---------- cp_limits : list, optional List of two values, the lower and upper bounds for the Cp value. If None, bounds set automatically dh_limits : list, optional List of two values, the lower and upper bounds for the dH value. If None, bounds set automatically tm_limits : list, optional List of two values, the lower and upper bounds for the Tm value. If None, bounds set automatically cp_value : float, optional If provided, the Cp value is fixed to this value, the bounds are ignored Notes ----- This is a heavy routine that creates/updates many fitting-related attributes, including: - bNs_expanded, bUs_expanded, kNs_expanded, kUs_expanded, qNs_expanded, qUs_expanded - p0, low_bounds, high_bounds, global_fit_params, rel_errors - predicted_lst_multiple, params_names, params_df, dg_df - flags: global_fit_done, limited_tm, limited_dh, limited_cp, fixed_cp """ # Requires Cp0 if self.Cp0 <= 0: raise ValueError('Cp0 must be positive. 
Please run guess_Cp before fitting globally.') # Get Guess of Tm: tm_lst = [] x1 = 6 x2 = 11 if not hasattr(self, "deriv_lst_multiple"): self.estimate_derivative() for i in range(len(self.signal_lst_multiple)): tm_lst.append(guess_Tm_from_derivative( self.temp_deriv_lst_multiple[i], self.deriv_lst_multiple[i], x1, x2 )) Tm = np.average(tm_lst) dh = 100 if self.model == 'Dimer': dh = 120 elif self.model == 'Trimer': dh = 150 elif self.model == 'Tetramer': dh = 180 p0 = [Tm, dh, self.Cp0] params_names = [ 'Tm (°C)', 'ΔH (kcal/mol)', 'Cp (kcal/mol/°C)'] self.first_param_Ns_expanded = np.concatenate(self.first_param_Ns_per_signal, axis=0) self.first_param_Us_expanded = np.concatenate(self.first_param_Us_per_signal, axis=0) self.second_param_Ns_expanded = np.concatenate(self.second_param_Ns_per_signal, axis=0) self.second_param_Us_expanded = np.concatenate(self.second_param_Us_per_signal, axis=0) self.third_param_Ns_expanded = np.concatenate(self.third_param_Ns_per_signal, axis=0) self.third_param_Us_expanded = np.concatenate(self.third_param_Us_per_signal, axis=0) p0 = np.concatenate([p0, self.first_param_Ns_expanded, self.first_param_Us_expanded]) # We need to append as many bN and bU as the number of oligomer concentrations # times the number of signal types for signal in self.signal_names: params_names += (['intercept_native - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in range(self.nr_olig)]) for signal in self.signal_names: params_names += (['intercept_unfolded - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in range(self.nr_olig)]) if self.native_baseline_type in ['linear', 'quadratic','exponential']: param_name = 'pre_exponential_factor_native' if self.native_baseline_type == 'exponential' else 'slope_term_native' p0 = np.concatenate([p0, self.second_param_Ns_expanded]) for signal in self.signal_names: params_names += ([param_name + ' - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in 
range(self.nr_olig)]) if self.unfolded_baseline_type in ['linear', 'quadratic','exponential']: param_name = 'pre_exponential_factor_unfolded' if self.unfolded_baseline_type == 'exponential' else 'slope_term_unfolded' p0 = np.concatenate([p0, self.second_param_Us_expanded]) for signal in self.signal_names: params_names += ([param_name + ' - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in range(self.nr_olig)]) if self.native_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_native' if self.native_baseline_type == 'exponential' else 'quadratic_term_native' p0 = np.concatenate([p0, self.third_param_Ns_expanded]) for signal in self.signal_names: params_names += ([param_name + ' - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in range(self.nr_olig)]) if self.unfolded_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_unfolded' if self.unfolded_baseline_type == 'exponential' else 'quadratic_term_unfolded' p0 = np.concatenate([p0, self.third_param_Us_expanded]) for signal in self.signal_names: params_names += ([param_name + ' - ' + str(self.oligomer_concentrations[i]) + ' - ' + str(signal) for i in range(self.nr_olig)]) low_bounds = (p0.copy()) high_bounds = (p0.copy()) low_bounds[3:], high_bounds[3:] = set_param_bounds(p0[3:],params_names[3:]) # Adjusting boundaries based on the size of the oligomer self.limited_tm = tm_limits is not None if self.limited_tm: tm_lower, tm_upper = tm_limits else: tm_lower = p0[0] - 12 if self.model == 'Monomer' else p0[0] - 20 tm_upper = p0[0] + 20 if self.model == 'Monomer' else p0[0] + 30 if self.model in ['Trimer', 'Tetramer']: tm_lower, tm_upper, p0[0] = tm_lower + 10, tm_upper + 10, p0[0] + 10 if self.model == 'Tetramer': tm_lower, tm_upper, p0[0] = tm_lower + 10, tm_upper + 20, p0[0] + 20 low_bounds[0] = tm_lower high_bounds[0] = tm_upper # Verify that the initial guess is within the user-defined limits p0[0] = 
adjust_value_to_interval(p0[0], tm_lower, tm_upper,1) self.limited_dh = dh_limits is not None if self.limited_dh: dh_lower, dh_upper = dh_limits p0[1] = adjust_value_to_interval(p0[1], dh_lower, dh_upper, 1) else: dh_lower = 10 dh_upper = 500 low_bounds[1] = dh_lower high_bounds[1] = dh_upper self.cp_value = cp_value self.fixed_cp = cp_value is not None self.limited_cp = cp_limits is not None and not self.fixed_cp if self.limited_cp: cp_lower, cp_upper = cp_limits else: cp_lower, cp_upper = 0.1, 5 if self.fixed_cp: # Remove the Cp from p0, low_bounds and high_bounds # Remove Cp0 from the parameter names p0 = np.delete(p0, 2) low_bounds = np.delete(low_bounds, 2) high_bounds = np.delete(high_bounds, 2) params_names.pop(2) else: low_bounds[2] = cp_lower high_bounds[2] = cp_upper # Verify that the Cp initial guess is within the user-defined limits p0[2] = adjust_value_to_interval(p0[2], cp_lower, cp_upper, 0.5) # Populate the expanded signal and temperature lists self.expand_multiple_signal() signal_fx = map_two_state_model_to_signal_fx(self.model) kwargs = { 'oligomer_concentrations' : self.oligomer_concentrations_expanded, 'initial_parameters': p0, 'low_bounds' : low_bounds, 'high_bounds' : high_bounds, 'cp_value' : cp_value, 'baseline_native_fx' : self.baseline_N_fx, 'baseline_unfolded_fx' : self.baseline_U_fx, 'signal_fx' : signal_fx, } fit_fx = fit_oligomer_unfolding_single_slopes # Do a quick prefit with a reduced data set if self.pre_fit: kwargs['list_of_temperatures'] = self.temp_lst_expanded_subset kwargs['list_of_signals'] = self.signal_lst_expanded_subset global_fit_params, cov, predicted = fit_fx(**kwargs) p0 = global_fit_params # Now use the whole dataset kwargs['list_of_temperatures'] = self.temp_lst_expanded kwargs['list_of_signals'] = self.signal_lst_expanded global_fit_params, cov, predicted = fit_fx(**kwargs) global_fit_params, cov, predicted, p0, low_bounds, high_bounds = evaluate_fitting_and_refit( global_fit_params, cov, predicted, high_bounds, 
low_bounds, p0, False, self.limited_cp, self.limited_dh, self.limited_tm, self.fixed_cp, kwargs, fit_fx, fit_m_value=False, ) # If the fitting returns a large error it is recommended to turn off the CP0 value fitting error = np.nansum((np.array(predicted) - np.array(self.signal_lst_expanded)) ** 2) if self.normalise_to_global_max: if error > 10000: warn('The fitted signal deviates heavily from the experimental data in the global fit. ' 'Consider not fitting the CP0 value by setting "cp_value=0" in the fit_thermal_unfolding_global() function') else: flat = list(chain.from_iterable(chain.from_iterable(self.signal_lst_multiple))) global_max = np.max(flat) # Global maximum across all signals error = error / global_max * 100 if not self.normalise_to_global_max else error if error > 10: warn('The fitted signal deviates heavily from the experimental data in the global fit. ' 'Consider not fitting the CP0 value by setting "cp_value=0" in the fit_thermal_unfolding_global() function') rel_errors = relative_errors(global_fit_params, cov) self.p0 = p0 self.low_bounds = low_bounds self.high_bounds = high_bounds self.global_fit_params = global_fit_params self.rel_errors = rel_errors self.predicted_lst_multiple = re_arrange_predictions(predicted, self.nr_signals, self.nr_olig) self.global_fit_done = True self.params_names = params_names self.create_params_df() self.create_dg_df() return None
[docs] def fit_thermal_unfolding_global_global(self): """ Fit the thermal unfolding of the sample using the signal and temperature data We fit all the curves at once, with global thermodynamic parameters and global slopes (but local baselines) Multiple refers to the fact that we fit many signals at the same time, such as 350nm and 330nm Must be run after fit_thermal_unfolding_global_multiple Notes ----- Updates global fitting attributes and sets `global_global_fit_done` when complete. """ # Requires global fit done if not self.global_fit_done: self.fit_thermal_unfolding_global() if self.signal_ids is None: self.set_signal_id() param_init = 2 + (self.cp_value is None) p0 = self.global_fit_params[:param_init] low_bounds = self.low_bounds[:param_init] high_bounds = self.high_bounds[:param_init] n_datasets = self.nr_olig * self.nr_signals p1Ns = self.global_fit_params[param_init:param_init + n_datasets] p1Us = self.global_fit_params[param_init + n_datasets:param_init + 2 * n_datasets] low_bounds_p1Ns = self.low_bounds[param_init:param_init + n_datasets] low_bounds_p1Us = self.low_bounds[param_init + n_datasets:param_init + 2 * n_datasets] high_bounds_p1Ns = self.high_bounds[param_init:param_init + n_datasets] high_bounds_p1Us = self.high_bounds[param_init + n_datasets:param_init + 2 * n_datasets] id_start = param_init + 2 * n_datasets params_names = self.params_names[:id_start] if self.native_baseline_type in ['linear', 'quadratic','exponential']: param_name = 'pre_exponential_factor_native' if self.native_baseline_type == 'exponential' else 'slope_term_native' p2Ns = self.global_fit_params[id_start:id_start + n_datasets] params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] low_bounds_p2Ns = self.low_bounds[id_start:id_start + n_datasets] high_bounds_p2Ns = self.high_bounds[id_start:id_start + n_datasets] id_start += n_datasets if self.unfolded_baseline_type in ['linear', 'quadratic','exponential']: param_name = 
'pre_exponential_factor_unfolded' if self.unfolded_baseline_type == 'exponential' else 'slope_term_unfolded' p2Us = self.global_fit_params[id_start:id_start + n_datasets] params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] low_bounds_p2Us = self.low_bounds[id_start:id_start + n_datasets] high_bounds_p2Us = self.high_bounds[id_start:id_start + n_datasets] id_start += n_datasets if self.native_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_native' if self.native_baseline_type == 'exponential' else 'quadratic_term_native' p3Ns = self.global_fit_params[id_start:id_start + n_datasets] params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] low_bounds_p3Ns = self.low_bounds[id_start:id_start + n_datasets] high_bounds_p3Ns = self.high_bounds[id_start:id_start + n_datasets] id_start += n_datasets if self.unfolded_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_unfolded' if self.unfolded_baseline_type == 'exponential' else 'quadratic_term_unfolded' p3Us = self.global_fit_params[id_start:id_start + n_datasets] params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] low_bounds_p3Us = self.low_bounds[id_start:id_start + n_datasets] high_bounds_p3Us = self.high_bounds[id_start:id_start + n_datasets] p0 = np.concatenate([p0, p1Ns, p1Us]) low_bounds = np.concatenate([low_bounds, low_bounds_p1Ns, low_bounds_p1Us]) high_bounds = np.concatenate([high_bounds, high_bounds_p1Ns, high_bounds_p1Us]) # Baselines are still independent for each signal and oligomer concentration # Slopes and quadratic terms are shared - per signal only if self.native_baseline_type in ['linear', 'quadratic','exponential']: p2Ns = re_arrange_params(p2Ns, self.nr_signals) low_bounds_p2Ns = re_arrange_params(low_bounds_p2Ns, self.nr_signals) high_bounds_p2Ns = re_arrange_params(high_bounds_p2Ns, self.nr_signals) for kNs_i, low_bounds_kNs_i, 
high_bounds_kNs_i in zip(p2Ns, low_bounds_p2Ns, high_bounds_p2Ns): p0 = np.append(p0, np.median(kNs_i)) low_bounds = np.append(low_bounds, np.min(low_bounds_kNs_i)) high_bounds = np.append(high_bounds, np.max(high_bounds_kNs_i)) if self.unfolded_baseline_type in ['linear', 'quadratic','exponential']: p2Us = re_arrange_params(p2Us, self.nr_signals) low_bounds_p2Us = re_arrange_params(low_bounds_p2Us, self.nr_signals) high_bounds_p2Us = re_arrange_params(high_bounds_p2Us, self.nr_signals) for kUs_i, low_bounds_kUs_i, high_bounds_kUs_i in zip(p2Us, low_bounds_p2Us, high_bounds_p2Us): p0 = np.append(p0, np.median(kUs_i)) low_bounds = np.append(low_bounds, np.min(low_bounds_kUs_i)) high_bounds = np.append(high_bounds, np.max(high_bounds_kUs_i)) if self.native_baseline_type in ['quadratic', 'exponential']: p3Ns = re_arrange_params(p3Ns, self.nr_signals) low_bounds_p3Ns = re_arrange_params(low_bounds_p3Ns, self.nr_signals) high_bounds_p3Ns = re_arrange_params(high_bounds_p3Ns, self.nr_signals) for qNs_i, low_bounds_qNs_i, high_bounds_qNs_i in zip(p3Ns, low_bounds_p3Ns, high_bounds_p3Ns): p0 = np.append(p0, np.median(qNs_i)) low_bounds = np.append(low_bounds, np.min(low_bounds_qNs_i)) high_bounds = np.append(high_bounds, np.max(high_bounds_qNs_i)) if self.unfolded_baseline_type in ['quadratic', 'exponential']: p3Us = re_arrange_params(p3Us, self.nr_signals) low_bounds_p3Us = re_arrange_params(low_bounds_p3Us, self.nr_signals) high_bounds_p3Us = re_arrange_params(high_bounds_p3Us, self.nr_signals) for qUs_i, low_bounds_qUs_i, high_bounds_qUs_i in zip(p3Us, low_bounds_p3Us, high_bounds_p3Us): p0 = np.append(p0, np.median(qUs_i)) low_bounds = np.append(low_bounds, np.min(low_bounds_qUs_i)) high_bounds = np.append(high_bounds, np.max(high_bounds_qUs_i)) signal_fx = map_two_state_model_to_signal_fx(self.model) kwargs = { 'oligomer_concentrations': self.oligomer_concentrations_expanded, 'list_of_temperatures': self.temp_lst_expanded_subset, 'list_of_signals': 
self.signal_lst_expanded_subset, 'initial_parameters': p0, 'low_bounds': low_bounds, 'high_bounds': high_bounds, 'cp_value': self.cp_value, 'signal_ids':self.signal_ids, 'baseline_native_fx': self.baseline_N_fx, 'baseline_unfolded_fx': self.baseline_U_fx, 'signal_fx' : signal_fx, } fit_fx = fit_oligomer_unfolding_shared_slopes_many_signals if self.pre_fit: # Do a pre-fit with a reduced data set global_fit_params, cov, predicted = fit_fx(**kwargs) p0 = global_fit_params # End of pre-fit # Use whole dataset kwargs['list_of_temperatures'] = self.temp_lst_expanded kwargs['list_of_signals'] = self.signal_lst_expanded global_fit_params, cov, predicted = fit_fx(**kwargs) global_fit_params, cov, predicted, p0, low_bounds, high_bounds = evaluate_fitting_and_refit( global_fit_params, cov, predicted, high_bounds, low_bounds, p0, False, self.limited_cp, self.limited_dh, self.limited_tm, self.fixed_cp, kwargs, fit_fx, fit_m_value=False, ) rel_errors = relative_errors(global_fit_params, cov) self.p0 = p0 self.low_bounds = low_bounds self.high_bounds = high_bounds self.global_fit_params = global_fit_params self.rel_errors = rel_errors self.predicted_lst_multiple = re_arrange_predictions( predicted, self.nr_signals, self.nr_olig) self.params_names = params_names self.create_params_df() self.create_dg_df() self.global_global_fit_done = True return None
[docs] def fit_thermal_unfolding_global_global_global( self, model_scale_factor=True): """ Fit the thermal unfolding of the sample using the signal and temperature data We fit all the curves at once, with global thermodynamic parameters, global slopes and global baselines Must be run after fit_thermal_unfolding_global_global Parameters ---------- model_scale_factor : bool, optional If True, model a scale factor for each oligomer concentration Notes ----- Updates many global fitting attributes and sets `global_global_global_fit_done` when complete. If `model_scale_factor` is True the method also creates scaled signal attributes: - signal_lst_multiple_scaled, predicted_lst_multiple_scaled """ # Requires global global fit done if not self.global_global_fit_done: self.fit_thermal_unfolding_global_global() param_init = 2 + (self.cp_value is None) params_names = self.params_names[:param_init] p0 = self.global_fit_params[:param_init] low_bounds = self.low_bounds[:param_init] high_bounds = self.high_bounds[:param_init] n_datasets = self.nr_olig * self.nr_signals p1Ns = self.global_fit_params[param_init:param_init + n_datasets] p1Us = self.global_fit_params[param_init + n_datasets:param_init + 2 * n_datasets] p1Ns_per_signal = re_arrange_params(p1Ns, self.nr_signals) p1Us_per_signal = re_arrange_params(p1Us, self.nr_signals) m1s, b1s, m1s_low, b1s_low, m1s_high, b1s_high = [], [], [], [], [], [] m2s, b2s, m2s_low, b2s_low, m2s_high, b2s_high = [], [], [], [], [], [] for p1Ns, p1Us in zip(p1Ns_per_signal, p1Us_per_signal): # Estimate the slope of bNs versus oligomer concentration m1, b1 = fit_line_robust(self.oligomer_concentrations, p1Ns) m1_low = m1 / 100 if m1 > 0 else 100 * m1 m1_high = 100 * m1 if m1 > 0 else m1 / 100 b1_low = b1 / 100 if b1 > 0 else 100 * b1 b1_high = 100 * b1 if b1 > 0 else b1 / 100 # Estimate the slope of bUs versus oligomer concentration m2, b2 = fit_line_robust(self.oligomer_concentrations, p1Us) m2_low = m2 / 100 if m2 > 0 else 100 * m2 m2_high = 
100 * m2 if m2 > 0 else m2 / 100 b2_low = b2 / 100 if b2 > 0 else 100 * b2 b2_high = 100 * b2 if b2 > 0 else b2 / 100 m1s.append(m1) b1s.append(b1) m1s_low.append(m1_low) b1s_low.append(b1_low) m1s_high.append(m1_high) b1s_high.append(b1_high) m2s.append(m2) b2s.append(b2) m2s_low.append(m2_low) b2s_low.append(b2_low) m2s_high.append(m2_high) b2s_high.append(b2_high) idx = param_init + 2 * n_datasets params_names += ['intercept_native - ' + signal_name for signal_name in self.signal_names] params_names += ['intercept_unfolded - ' + signal_name for signal_name in self.signal_names] if self.native_baseline_type in ['linear', 'quadratic','exponential']: param_name = 'pre_exponential_factor_native' if self.native_baseline_type == 'exponential' else 'slope_term_native' kNs = self.global_fit_params[idx:idx + self.nr_signals] low_bounds_kNs = self.low_bounds[idx:idx + self.nr_signals] high_bounds_kNs = self.high_bounds[idx:idx + self.nr_signals] idx += self.nr_signals params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] if self.unfolded_baseline_type in ['linear', 'quadratic','exponential']: param_name = 'pre_exponential_factor_unfolded' if self.unfolded_baseline_type == 'exponential' else 'slope_term_unfolded' kUs = self.global_fit_params[idx:idx + self.nr_signals] low_bounds_kUs = self.low_bounds[idx:idx + self.nr_signals] high_bounds_kUs = self.high_bounds[idx:idx + self.nr_signals] idx += self.nr_signals params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] params_names += ['oligomer_slope_term_native - ' + signal_name for signal_name in self.signal_names] params_names += ['oligomer_slope_term_unfolded - ' + signal_name for signal_name in self.signal_names] if self.native_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_native' if self.native_baseline_type == 'exponential' else 'quadratic_term_native' qNs = self.global_fit_params[idx:idx + self.nr_signals] 
low_bounds_qNs = self.low_bounds[idx:idx + self.nr_signals] high_bounds_qNs = self.high_bounds[idx:idx + self.nr_signals] idx += self.nr_signals params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] if self.unfolded_baseline_type in ['quadratic', 'exponential']: param_name = 'exponential_coefficient_unfolded' if self.unfolded_baseline_type == 'exponential' else 'quadratic_term_unfolded' qUs = self.global_fit_params[idx:idx + self.nr_signals] low_bounds_qUs = self.low_bounds[idx:idx + self.nr_signals] high_bounds_qUs = self.high_bounds[idx:idx + self.nr_signals] idx += self.nr_signals params_names += [param_name + ' - ' + signal_name for signal_name in self.signal_names] p0 = np.concatenate([p0, b1s, b2s]) low_bounds = np.concatenate([low_bounds, b1s_low, b2s_low]) high_bounds = np.concatenate([high_bounds, b1s_high, b2s_high]) if self.native_baseline_type in ['linear', 'quadratic','exponential']: p0 = np.concatenate([p0, kNs]) low_bounds = np.concatenate([low_bounds, low_bounds_kNs]) high_bounds = np.concatenate([high_bounds, high_bounds_kNs]) if self.unfolded_baseline_type in ['linear', 'quadratic','exponential']: p0 = np.concatenate([p0, kUs]) low_bounds = np.concatenate([low_bounds, low_bounds_kUs]) high_bounds = np.concatenate([high_bounds, high_bounds_kUs]) p0 = np.concatenate([p0, m1s, m2s]) low_bounds = np.concatenate([low_bounds, m1s_low, m2s_low]) high_bounds = np.concatenate([high_bounds, m1s_high, m2s_high]) if self.native_baseline_type in ['quadratic', 'exponential']: p0 = np.concatenate([p0, qNs]) low_bounds = np.concatenate([low_bounds, low_bounds_qNs]) high_bounds = np.concatenate([high_bounds, high_bounds_qNs]) if self.unfolded_baseline_type in ['quadratic', 'exponential']: p0 = np.concatenate([p0, qUs]) low_bounds = np.concatenate([low_bounds, low_bounds_qUs]) high_bounds = np.concatenate([high_bounds, high_bounds_qUs]) # Increase the bounds for c_N and c_U # Find index in the param names for signal_name in 
self.signal_names: c_N_name = 'oligomer_slope_term_native - ' + signal_name c_U_name = 'oligomer_slope_term_unfolded - ' + signal_name c_N_idx = params_names.index(c_N_name) c_U_idx = params_names.index(c_U_name) low_bounds[c_N_idx] -= 5 high_bounds[c_N_idx] += 5 low_bounds[c_U_idx] -= 5 high_bounds[c_U_idx] += 5 # If required, include a scale factor for each oligomer concentration if model_scale_factor: # The last oligomer concentration is fixed to 1, the rest are fitted scale_factors = [1 for _ in range(self.nr_olig - 1)] scale_factors_low = [0.5882 for _ in range(self.nr_olig - 1)] scale_factors_high = [1.7 for _ in range(self.nr_olig - 1)] p0 = np.concatenate([p0, scale_factors]) low_bounds = np.concatenate([low_bounds, scale_factors_low]) high_bounds = np.concatenate([high_bounds, scale_factors_high]) params_names += ['Scale factor - ' + str(d) + ' (M). ID: ' + str(i) for i, d in enumerate(self.oligomer_concentrations)] params_names.pop() # Remove the last one, as it is fixed to 1 scale_factor_exclude_ids = [self.nr_olig - 1] if model_scale_factor else [] signal_fx = map_two_state_model_to_signal_fx(self.model) # Do a prefit with a reduced dataset kwargs = { 'list_of_temperatures' : self.temp_lst_expanded_subset, 'list_of_signals' : self.signal_lst_expanded_subset, 'signal_ids' : self.signal_ids, 'oligomer_concentrations': self.oligomer_concentrations_expanded, 'initial_parameters': p0, 'low_bounds': low_bounds, 'high_bounds': high_bounds, 'model_scale_factor':model_scale_factor, 'cp_value' : self.cp_value, 'scale_factor_exclude_ids':scale_factor_exclude_ids, 'signal_fx' : signal_fx, 'baseline_native_fx' : self.baseline_N_fx, 'baseline_unfolded_fx' : self.baseline_U_fx, 'fit_native_olig_slope' : True, 'fit_unfolded_olig_slope' : True } fit_fx = fit_oligomer_unfolding_many_signals if self.pre_fit: global_fit_params, cov, predicted = fit_fx(**kwargs) # Assign the fitted parameters to the initial guess for the full dataset p0 = global_fit_params # End of prefit 
with reduced dataset # Use the whole dataset kwargs['list_of_signals'] = self.signal_lst_expanded kwargs['list_of_temperatures'] = self.temp_lst_expanded global_fit_params, cov, predicted = fit_fx(**kwargs) # Remove scale factors that are not significant if model_scale_factor: # 2 parameters corresponding to Tm, dH # plus Cp if fitted idx_start = 2 + (self.cp_value is None) native_factor = 2+np.sum(baseline_fx_name_to_req_params(self.baseline_N_fx)) unfolded_factor = 2+np.sum(baseline_fx_name_to_req_params(self.baseline_U_fx)) # Add index according to the native baseline polynomial order idx_start += native_factor * self.nr_signals # Add index according to the unfolded baseline polynomial order idx_start += unfolded_factor * self.nr_signals for _ in range(5): # Sort in ascending order the IDs to exclude scale_factor_exclude_ids = sorted(scale_factor_exclude_ids) n_fixed_factors = len(scale_factor_exclude_ids) n_fit_factors = self.nr_olig - n_fixed_factors if n_fit_factors == 0: break sf_params = global_fit_params[idx_start:(idx_start + n_fit_factors)] idxs_to_remove = [] re_fit = False # Add dummy variable where we need to skip the index for id in scale_factor_exclude_ids: sf_params = np.insert(sf_params, id, np.nan) for i, sf in enumerate(sf_params): if i in scale_factor_exclude_ids: continue if 0.995 <= sf <= 1.015: # Exclude the scale factor from the fit scale_factor_exclude_ids.append(i) re_fit = True j1 = np.sum(np.array(scale_factor_exclude_ids) < i) j2 = len(idxs_to_remove) idxs_to_remove.append(idx_start + i - j1 + j2) if not re_fit: break else: for idx in reversed(idxs_to_remove): global_fit_params = np.delete(global_fit_params, idx) low_bounds = np.delete(low_bounds, idx) high_bounds = np.delete(high_bounds, idx) del params_names[idx] kwargs['initial_parameters'] = global_fit_params kwargs['low_bounds'] = low_bounds kwargs['high_bounds'] = high_bounds kwargs['scale_factor_exclude_ids'] = scale_factor_exclude_ids global_fit_params, cov, predicted = 
fit_fx(**kwargs) rel_errors = relative_errors(global_fit_params, cov) self.params_names = params_names self.p0 = p0 self.low_bounds = low_bounds self.high_bounds = high_bounds self.global_fit_params = global_fit_params self.rel_errors = rel_errors self.predicted_lst_multiple = re_arrange_predictions( predicted, self.nr_signals, self.nr_olig) self.create_params_df() self.create_dg_df() self.global_global_global_fit_done = True # Obtained the scaled signal too if model_scale_factor: # signal scaled hos one sublist per selected signal type signal_scaled = deepcopy(self.signal_lst_multiple) predicted_scaled = deepcopy(self.predicted_lst_multiple) for value, param in zip(self.global_fit_params, self.params_names): if 'Scale factor' in param: id = int(param.split('(M). ID: ')[-1]) for i in range(len(signal_scaled)): signal_scaled[i][id] /= value predicted_scaled[i][id] /= value self.signal_lst_multiple_scaled = signal_scaled self.predicted_lst_multiple_scaled = predicted_scaled return None
[docs] def signal_to_df(self, signal_type='raw', scaled=False): """ Create a dataframe with three columns: Temperature, Signal, and oligomer. Optimized for speed by avoiding per-curve DataFrame creation. Parameters ---------- signal_type : {'raw', 'fitted', 'derivative'}, optional Which signal to include in the dataframe. 'raw' uses experimental data, 'fitted' uses model predictions, 'derivative' uses the estimated derivative signal. scaled : bool, optional If True and signal_type == 'fitted' or 'raw', use the scaled versions if available. Returns ------- pd.DataFrame A DataFrame with columns: ['Temperature', 'Signal', 'Oligomer', 'ID']. """ # Flatten all arrays and repeat oligomer values accordingly if signal_type == 'derivative': deriv_lst = self.deriv_lst_multiple[0] temp_lst = self.temp_deriv_lst_multiple[0] signal_all = np.concatenate(deriv_lst) temp_all = np.concatenate(temp_lst) else: # temperature is shared for the experimental and fitted signals temp_lst = self.temp_lst_multiple[0] if self.max_points is not None: temp_lst = [subset_data(x, self.max_points) for x in temp_lst] temp_all = np.concatenate(temp_lst) # fitted data signal does not need subset! 
if signal_type == 'fitted': if not scaled: predicted_lst = self.predicted_lst_multiple[0] else: predicted_lst = self.predicted_lst_multiple_scaled[0] signal_all = np.concatenate(predicted_lst) temp_all = np.concatenate(temp_lst) # Signal_type set to 'raw' else: if not scaled: signal_lst = self.signal_lst_multiple[0] else: signal_lst = self.signal_lst_multiple_scaled[0] if self.max_points is not None: signal_lst = [subset_data(x, self.max_points) for x in signal_lst] signal_all = np.concatenate(signal_lst) oligomer_all = np.concatenate([ np.full_like(temp_lst[i], self.oligomer_concentrations[i], dtype=np.float64) for i in range(len(temp_lst)) ]) # Add an ID column, so we can identify the curves, even with the same oligomer concentration id_all = np.concatenate([ np.full_like(temp_lst[i], i, dtype=np.int32) for i in range(len(temp_lst)) ]) signal_df = pd.DataFrame({ 'Temperature': temp_all, 'Signal': signal_all, 'Oligomer': oligomer_all, 'ID': id_all }) return signal_df