Following the procedure described in this SO question, I am trying to transform my (well-performing) convolutional autoencoder into a variational version of the same autoencoder. As explained in the cited post, this essentially boils down to adding the KL divergence to the loss and adding a layer for the sampling.
My data is composed of 1D time series.
This is my code:
from __future__ import division import numpy as np import matplotlib.pyplot as plt from keras.layers import Input, Dense, Lambda, Reshape, Conv1D, MaxPooling1D, UpSampling1D, Flatten from keras.models import Model from keras import backend as K from keras import losses from keras.callbacks import Callback X_train = np.loadtxt(..)# here I load my data X_test = np.loadtxt(..) # dimension of input n_x = X_train.shape X_train = np.reshape(X_train, [1500, n_x, 1]) X_test = np.reshape(X_test, [500, n_x, 1]) # dimension of latent space (batch size by latent dim) batch_size = 100 latent_dim = 2 x = Input(shape=(n_x, 1)) conv_1 = Conv1D(8, kernel_size=16, padding='valid', activation='tanh')(x) maxp1 = MaxPooling1D(2)(conv_1) conv_2 = Conv1D(16, kernel_size=64, padding='valid', activation='tanh')(maxp1) maxp2 = MaxPooling1D(2)(conv_2) flatten = Flatten()(maxp2) hidden = Dense(700, activation='tanh')(flatten) hidden2 = Dense(400, activation='tanh')(hidden) z_mean = Dense(latent_dim)(hidden2) z_log_var = Dense(latent_dim)(hidden2) epsilon_std = 1.0 def sampling(args): z_mean, z_log_var = args epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0., stddev=epsilon_std) return(z_mean + K.exp(z_log_var/2) * epsilon) z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) decoder = Dense(700, activation='tanh')(z) decoder = Reshape((700, 1))(decoder) de_conv_1 = Conv1D(16, kernel_size=64, padding='valid', activation='tanh')(decoder) upsamp = UpSampling1D(2)(de_conv_1) de_conv_2 = Conv1D(8, kernel_size=16, padding='valid', activation='tanh')(upsamp) upsamp = UpSampling1D(2)(de_conv_2) flatten = Flatten()(upsamp) x_decoded_mean = Dense(n_x)(flatten) x_decoded_mean = Reshape([n_x, 1])(x_decoded_mean) epochs = 200 vae = Model(x, x_decoded_mean) vae.summary() ## define loss (sum of reconstruction and KL divergence) def vae_loss(y_true, y_pred): # E[log P(X|z)] recon = K.sum(K.square(y_true - y_pred), axis=[1, 2]) # D_KL(Q(z|X) || P(z|X)) kl = K.exp(z_log_var) + 
K.square(z_mean) - 1. - z_log_var kl = 0.5 * K.sum(kl, axis=-1) return K.mean(recon + kl) def KL_loss(y_true, y_pred): return(0.5 * K.sum(K.exp(z_log_var) + K.square(z_mean) - 1. - z_log_var, axis=-1)) def recon_loss(y_true, y_pred): return K.sum(K.square(y_true - y_pred), axis=[1, 2]) # compile and fit vae.compile(optimizer='adam', loss=vae_loss, metrics = [KL_loss, recon_loss]) vae_hist = vae.fit(X_train, X_train, batch_size=batch_size, epochs=epochs, validation_data = (X_test, X_test))
This results in horrible reconstructions. I am wondering whether this has to do with the balance between the KL and reconstruction losses, which is a notorious problem in variational autoencoders. However, even after trying different variations of using mean, sum, etc. in the definitions of the KL and reconstruction losses, the performance remains very poor. I have also tried KL divergence annealing, as explained e.g. here, but the problem I face in that case is that once the KL loss kicks in, the total loss stops decreasing.