"""Auto tuning."""
import cyclum.models
import sklearn.decomposition
import sklearn.metrics
import math
class CyclumAutoTune(cyclum.models.AutoEncoder):
"""Circular autoencoder with automatically decided number of linear components
We first perform PCA on the data, and record the MSE of having first 1, 2, ..., max_linear_dims + 1 components.
We then try to train a circular autoencoder with 0, 1, ..., max_linear_dims linear components.
We compare circular autoencoder with i linear components with PCA with (i + 1) components, for i = 0, 1, ...
We record the first i where the difference of loss compared with PCA is greater than both (i - 1) and (i + 1),
or just (i + 1) if i == 0.
At the end, this class will be a UNTRAINED model, which has optimal numbers of linear components.
You can train it will all your data, more epochs, and better learning rate.
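For illustration only (the numbers below are made up): if the PCA losses for 1, 2, 3 components are
0.90, 0.70, 0.62 and the autoencoder losses for 0, 1, 2 linear components are 0.74, 0.50, 0.48, the
improvements over PCA are 0.16, 0.20, 0.14; the improvement peaks at 1 linear component, so 1 is chosen.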
:param data: The data used to decide number of linear components. For a large dataset, you may use a representative portion of it.
:param max_linear_dims: maximum number of linear dimensions.
:param epochs: number of epochs for each test
:param verbose: report the loss, time consumption, etc. every this many epochs.
:param rate: learning rate used for each test run.
:param early_stop: whether to stop checking further linear components once the result is decided. This ONLY affects the elbow plot and has NO influence on the result.
:param encoder_depth: depth of encoder, i.e., number of *hidden* layers
:param encoder_width: width of encoder, one of the following:
* An integer: all hidden layers have this number of nodes.
* A list of integers of length `encoder_depth`: the number of nodes in each hidden layer.
:param dropout_rate: rate for dropout.
:param nonlinear_reg: strength of regularization on the nonlinear encoder.
:param linear_reg: strength of regularization on the linear encoder.
Examples:
>>> from cyclum.hdfrw import hdf2mat, mat2hdf
>>> df = hdf2mat('path_to_hdf.h5')
>>> m = CyclumAutoTune(df.values, max_linear_dims=5)
>>> m.train(df.values)
>>> pseudotime = m.predict_pseudotime(df.values)
>>> mat2hdf(pseudotime, 'path_to_pseudotime.h5')
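>>> # Optional diagnostics of the tuning step (requires matplotlib); both plots use
>>> # the PCA and autoencoder losses recorded during construction:
>>> m.show_elbow()
>>> m.show_bar()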
"""
def __init__(self, data, max_linear_dims=3, epochs=500, verbose=100, rate=5e-4, early_stop=False,
encoder_depth=2, encoder_width=50, dropout_rate=0.1, nonlinear_reg=1e-4, linear_reg=1e-4):
print("Auto tuning number of linear components...")
self.max_linear_dims = max_linear_dims
print("Performing PCA...")
pca_model = sklearn.decomposition.PCA(n_components=self.max_linear_dims + 2)
pca_load = pca_model.fit_transform(data)
pca_comp = pca_model.components_
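# pca_loss[i] is the MSE of reconstructing the data from the first (i + 1) principal components.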
self.pca_loss = [sklearn.metrics.mean_squared_error(data, pca_load[:, 0:(i + 1)] @ pca_comp[0:(i + 1), :]) for i
in range(self.max_linear_dims + 2)]
print("Training Autoencoder with...")
self.ae_loss = []
best_n_linear_dims = None
for i in range(self.max_linear_dims + 1):
print(f" {i} linear dimensions...")
model = cyclum.models.AutoEncoder(input_width=data.shape[1],
encoder_depth=encoder_depth,
encoder_width=encoder_width,
n_circular_unit=1,
n_logistic_unit=0,
n_linear_unit=0,
n_linear_bypass=i,
dropout_rate=dropout_rate,
nonlinear_reg=nonlinear_reg,
linear_reg=linear_reg)
history = model.train(data, epochs=epochs, verbose=verbose, rate=rate)
self.ae_loss.append(history.history['loss'][-1])
print(self.ae_loss)
print(self.pca_loss)
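# Elbow rule: the first time the improvement over PCA (pca_loss - ae_loss) drops below that of
# the previous model, keep (i - 1) linear components.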
if i > 0 and best_n_linear_dims is None and (self.pca_loss[i] - self.ae_loss[i] <
self.pca_loss[i - 1] - self.ae_loss[i - 1]):
best_n_linear_dims = i - 1
print(f"Found! Use {best_n_linear_dims} linear components...")
if early_stop:
break
else:
print("Early stop disabled, continue to check all cases...")
if best_n_linear_dims is None:
best_n_linear_dims = max_linear_dims
print(f"Have not found one. Suggest raise max_linear_dims. Use {max_linear_dims} linear components...")
super().__init__(input_width=data.shape[1],
encoder_depth=encoder_depth,
encoder_width=encoder_width,
n_circular_unit=1,
n_logistic_unit=0,
n_linear_unit=0,
n_linear_bypass=best_n_linear_dims,
dropout_rate=dropout_rate,
nonlinear_reg=nonlinear_reg,
linear_reg=linear_reg)
def show_elbow(self):
"""Show an elbow plot of both PCA and autoencoder
You will observe the time when autoencoder become to have a higher loss than PCA.
The previous time is considered as the best model.
:return: figure object
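Example (assuming m is a constructed CyclumAutoTune; the file name is illustrative):
>>> fig = m.show_elbow()
>>> fig.savefig('elbow.png')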
"""
import matplotlib.pyplot as plt
fig = plt.figure()
plt.plot(list(range(len(self.pca_loss))), self.pca_loss, "+-")
plt.plot(list(range(len(self.ae_loss))), self.ae_loss, "x-")
plt.legend(['PCA', 'AE'])
plt.xticks(range(max(map(len, [self.pca_loss, self.ae_loss]))))
plt.xlabel('X: X linear components or (X+1) PCs')
plt.ylabel('Mean squared error')
return fig
def show_bar(self, root=False):
"""Show a bar plot for what percentage of more loss is handled by the circular component
:return: figure object
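Example (assuming m is a constructed CyclumAutoTune):
>>> fig = m.show_bar()
>>> fig = m.show_bar(root=True)  # compare RMSE instead of MSE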
"""
import matplotlib.pyplot as plt
import numpy
fig = plt.figure()
if root:
linear_handled = numpy.sqrt(self.pca_loss[:-1]) - numpy.sqrt(self.pca_loss[1:])
circular_handled = numpy.sqrt(self.pca_loss[:-1]) - numpy.sqrt(self.ae_loss)
else:
linear_handled = numpy.array(self.pca_loss[:-1]) - numpy.array(self.pca_loss[1:])
circular_handled = numpy.array(self.pca_loss[:-1]) - numpy.array(self.ae_loss)
plt.bar(list(range(len(self.ae_loss))), circular_handled / linear_handled)
plt.xticks(list(range(len(self.ae_loss))))
plt.xlabel('X: X linear components')
plt.ylabel('Relative loss decrease')
return fig