From 55b19a83d7d149bf5b8db164079fca0db809cfa1 Mon Sep 17 00:00:00 2001 From: anshuln Date: Sat, 13 Jul 2019 07:56:24 +0530 Subject: [PATCH 01/11] Adds gradient computations for layers --- invtf/layers.py | 115 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 25 deletions(-) diff --git a/invtf/layers.py b/invtf/layers.py index 9c02286..71fccb6 100644 --- a/invtf/layers.py +++ b/invtf/layers.py @@ -10,8 +10,48 @@ The log-det computations normalizes wrt full dimension. """ - -class Linear(keras.layers.Layer): +#TODO Write unit tests +class LayerWithGrads(keras.layers.Layer): + ''' + This is a virtual class from which all layer classes need to inherit + It has the function `compute gradients` which is used for constant + memory backprop. + ''' + def __init__(self,**kwargs): + super(LayerWithGrads,self).__init__(**kwargs) + + def call(self,X): + raise NotImplementedError + + def call_inv(self,X): + raise NotImplementedError + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads + +class Linear(LayerWithGrads): def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) @@ -39,7 +79,7 @@ def compute_output_shape(self, input_shape): return input_shape -class Affine(keras.layers.Layer): +class Affine(LayerWithGrads): """ The exp parameter allows the scaling to be exp(s) \odot X. @@ -154,6 +194,27 @@ def log_det(self): return 0. def compute_output_shape(self, input_shape): return input_shape + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Since the coupling layers do not inherit from `LayerWithGrads`, this + function is re-written. 
See TODO of AffineCoupling for further info + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + return dy,grads + """ @@ -168,7 +229,7 @@ def compute_output_shape(self, input_shape): return input_shape For now assumes the use of convolutions """ -class AffineCoupling(keras.Sequential): +class AffineCoupling(keras.Sequential): #TODO Check gradient computations with and without reg unique_id = 1 @@ -268,7 +329,25 @@ def compute_output_shape(self, input_shape): return input_shape def summary(self, line_length=None, positions=None, print_fn=None): print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. - + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer needs a regularizer. + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + return dy,grads """ @@ -278,7 +357,7 @@ def summary(self, line_length=None, positions=None, print_fn=None): - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. - ... """ -class Squeeze(keras.layers.Layer): +class Squeeze(LayerWithGrads): def call(self, X): n, self.w, self.h, self.c = X.shape @@ -292,7 +371,7 @@ def log_det(self): return 0. # TODO: for now assumes target is +-1, refactor to support any target. # Refactor 127.5 -class Normalize(keras.layers.Layer): # normalizes data after dequantization. +class Normalize(LayerWithGrads): # normalizes data after dequantization. def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): super(Normalize, self).__init__(input_shape=input_shape) @@ -333,7 +412,7 @@ def log_det(self): return 0. class ActNorm(keras.layers.Layer): pass -class Inv1x1Conv(keras.layers.Layer): +class Inv1x1Conv(LayerWithGrads): """ Based on Glow page 11 appendix B. @@ -375,12 +454,13 @@ def call_inv(self, Z): return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") def log_det(self): # TODO: Fix this issue!!! + print(self.h, self.w, tf.linalg.det(self.W)) return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) def compute_output_shape(self, input_shape): return input_shape -class Glow1x1Conv(keras.layers.Layer): +class Glow1x1Conv(LayerWithGrads): # Could be speed up parameterizing in LU decomposition. 
def build(self, input_shape): @@ -426,7 +506,7 @@ def compute_output_shape(self, input_shape): return input_shape -class Conv3DCirc(keras.layers.Layer): +class Conv3DCirc(LayerWithGrads): def __init__(self,trainable=True): self.built = False @@ -485,21 +565,6 @@ def compute_output_shape(self, input_shape): return tf.TensorShape(input_shape[1:]) -class Reshape(keras.layers.Layer): - def __init__(self, shape): - self.shape = shape - super(Reshape, self).__init__() - - def call(self, X): - self.prev_shape = X.shape - return tf.reshape(X, (-1, ) + self.shape) - - def log_det(self): return .0 - - def call_inv(self, X): return tf.reshape(X, self.input_shape) - - - class InvResNet(keras.layers.Layer): pass # model should automatically use gradient checkpointing if this is used. From 1a8d7c87e54317a76a4ec6915cb3ae6f31fdf230 Mon Sep 17 00:00:00 2001 From: anshuln Date: Sat, 13 Jul 2019 17:14:24 +0530 Subject: [PATCH 02/11] Adds const memory backprop computations for layers --- invtf/layers_const_backprop.py | 596 +++++++++++++++++++++++++++++++++ 1 file changed, 596 insertions(+) create mode 100644 invtf/layers_const_backprop.py diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py new file mode 100644 index 0000000..d1f7637 --- /dev/null +++ b/invtf/layers_const_backprop.py @@ -0,0 +1,596 @@ +import tensorflow as tf +import tensorflow.keras as keras +import numpy as np +from tensorflow.keras.layers import ReLU +from invtf.override import print_summary +from invtf.coupling_strategy import * + +""" + Known issue with multi-scale architecture. + The log-det computations normalizes wrt full dimension. + +""" +#TODO Write unit tests +class LayerWithGrads(keras.layers.Layer): + ''' + This is a virtual class from which all layer classes need to inherit + It has the function `compute gradients` which is used for constant + memory backprop. + ''' + def __init__(self,**kwargs): + super(LayerWithGrads,self).__init__(**kwargs) + + def call(self,X): + raise NotImplementedError + + def call_inv(self,X): + raise NotImplementedError + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. 
+ with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads + +class Linear(LayerWithGrads): + + def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) + + def build(self, input_shape): + + assert len(input_shape) == 2 + _, d = input_shape + + self.W = self.add_weight(shape=(d, d), initializer='identity', name="linear_weight") + self.b = self.add_weight(shape=(d), initializer='zero', name="linear_bias") + + super(Linear, self).build(input_shape) + self.built = True + + def call(self, X): return X @ self.W + self.b + + def call_inv(self, Z): return (Z - self.b) @ tf.linalg.inv(self.W) + + def jacobian(self): return self.W + + def log_det(self): return tf.math.log(tf.abs(tf.linalg.det(self.jacobian()))) + + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape + + +class Affine(LayerWithGrads): + + """ + The exp parameter allows the scaling to be exp(s) \odot X. + This cancels out the log in the log_det computations. + """ + + def __init__(self, exp=False, **kwargs): + self.exp = exp + super(Affine, self).__init__(**kwargs) + + def build(self, input_shape): + + #assert len(input_shape) == 2 + d = input_shape[1:] + + self.w = self.add_weight(shape=d, initializer='ones', name="affine_scale") + self.b = self.add_weight(shape=d, initializer='zero', name="affine_bias") + + super(Affine, self).build(input_shape) + self.built = True + + def call(self, X): + if self.exp: return X * tf.exp(self.w) + self.b + else: return X * self.w + self.b + + def call_inv(self, Z): + if self.exp: return (Z - self.b) / tf.exp(self.w) + else: return (Z - self.b) / self.w + + def jacobian(self): return self.w + + def eigenvalues(self): return self.w + + def log_det(self): + if self.exp: return tf.reduce_sum(tf.abs(self.eigenvalues())) + else: return tf.reduce_sum(tf.math.log(tf.abs(self.eigenvalues()))) + + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape + + + +""" + For simplicity we vectorize input and apply coupling to even/odd entries. + Could also use upper/lower. Refactor this to support specifying the pattern as a parameter. + + TODO: + Potentially refactor so we can add directly to AdditiveCoupling instead of creating 'm' + by (potentially adding to Sequential) and passing this on to AdditiveCoupling. + The main issue is AdditiveCoupling is R^2-> R^2 while m:R^1->R^1, so if we + add directly to AdditiveCoupling we run into issues with miss matching dimensions. + +""" +class AdditiveCoupling(keras.Sequential): + + unique_id = 1 + + def __init__(self, part=0, strategy=SplitOnHalfStrategy()): # strategy: alternate / split ;; alternate does odd/even, split has upper/lower. 
+ super(AdditiveCoupling, self).__init__(name="add_coupling_%i"%AdditiveCoupling.unique_id) + AdditiveCoupling.unique_id += 1 + self.part = part + self.strategy = strategy + + + def build(self, input_shape): + + self.layers[0].build(input_shape=(None, 28**2/2)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, 28**2/2)) + + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) + + def call_(self, X): + for layer in self.layers: + X = layer.call(X) + return X + + def call(self, X): + shape = tf.shape(X) + d = tf.reduce_prod(shape[1:]) + X = tf.reshape(X, (shape[0], d)) + + x0, x1 = self.strategy.split(X) + + if self.part == 0: x0 = x0 + self.call_(x1) + if self.part == 1: x1 = x1 + self.call_(x0) + + X = self.strategy.combine(x0, x1) + + X = tf.reshape(X, shape) + return X + + def call_inv(self, Z): + shape = tf.shape(Z) + d = tf.reduce_prod(shape[1:]) + Z = tf.reshape(Z, (shape[0], d)) + + z0, z1 = self.strategy.split(Z) + + if self.part == 0: z0 = z0 - self.call_(z1) + if self.part == 1: z1 = z1 - self.call_(z0) + + Z = self.strategy.combine(z0, z1) + + Z = tf.reshape(Z, shape) + return Z + + + def log_det(self): return 0. + + def compute_output_shape(self, input_shape): return input_shape + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Since the coupling layers do not inherit from `LayerWithGrads`, this + function is re-written. See TODO of AffineCoupling for further info + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads + + + +""" + The affine coupling layer is described in NICE, REALNVP and GLOW. + The description in Glow use a single network to output scale s and transform t, + it seems the description in REALNVP is a bit more general refering to s and t as + different functions. From this perspective Glow change the affine layer to have + weight sharing between s and t. + Specifying a single function is a lot simpler code-wise, we thus use that approach. + + + For now assumes the use of convolutions + +""" +class AffineCoupling(keras.Sequential): #TODO Check gradient computations with and without reg + + unique_id = 1 + + def __init__(self, part=0, strategy=SplitChannelsStrategy()): + super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) + AffineCoupling.unique_id += 1 + self.part = part + self.strategy = strategy + + def build(self, input_shape): + + # handle the issue with each network output something larger. 
+ _, h, w, c = input_shape + + + h, w, c = self.strategy.coupling_shape(input_shape=(h,w,c)) + + self.layers[0].build(input_shape=(None, h, w, c)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, h, w, c)) + self.layers[0].output_shape_ = out_dim + + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) + layer.output_shape_ = out_dim + + def call_(self, X): + + in_shape = tf.shape(X) + n, h, w, c = X.shape + + for layer in self.layers: + X = layer.call(X) # residual + + # TODO: Could have a part of network learned specifically for s,t to not ONLY have wegith sharing? + + X = tf.reshape(X, (-1, h, w, c*2)) + s = X[:, :, w//2:, :] + t = X[:, :, :w//2, :] + + s = tf.reshape(s, in_shape) + t = tf.reshape(t, in_shape) + + return s, t + + def call(self, X): + + x0, x1 = self.strategy.split(X) + + if self.part == 0: + s, t = self.call_(x1) + x0 = x0*s + t + + if self.part == 1: + s, t = self.call_(x0) + x1 = x1*s + t + + X = self.strategy.combine(x0, x1) + return X + + def call_inv(self, Z): + z0, z1 = self.strategy.split(Z) + + if self.part == 0: + s, t = self.call_(z1) + z0 = (z0 - t)/s + if self.part == 1: + s, t = self.call_(z0) + z1 = (z1 - t)/s + + Z = self.strategy.combine(z0, z1) + return Z + + + def log_det(self): + + # TODO: save 's' instead of recomputing. + + X = self.input + n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) + + x0, x1 = self.strategy.split(X) + + if self.part == 0: + s, t = self.call_(x1) + if self.part == 1: + s, t = self.call_(x0) + + # there is an issue with 's' being divided by dimension 'd' later: + # If we used MultiScale it will be lower dimensional, in this case + # we should not divide by d but d//2. + + return tf.reduce_sum(tf.math.log(tf.abs(s))) / n + + def compute_output_shape(self, input_shape): return input_shape + + def summary(self, line_length=None, positions=None, print_fn=None): + print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer needs a regularizer. -- DONE, it does + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads + + +""" + Try different techniques: I'm implementing the simplest case, just reshape to desired shape. + TODO: Implement the following Squeeze strategies: + - RealNVP + - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. + - ... +""" +class Squeeze(LayerWithGrads): + + def call(self, X): + n, self.w, self.h, self.c = X.shape + return tf.reshape(X, [-1, self.w//2, self.h//2, self.c*4]) + + def call_inv(self, X): + return tf.reshape(X, [-1, self.w, self.h, self.c]) + + def log_det(self): return 0. 
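Editor's note: the `compute_gradients` methods added above are the building block for the constant-memory backprop this patch series is after. Because every layer also exposes `call_inv`, a layer's input can be recomputed from its output instead of being kept alive by a gradient tape. A rough sketch of how a stack of such layers could be trained this way is given below; the `layers` list, `loss_fn` and `optimizer` are hypothetical stand-ins for illustration, not the API added by these commits.

# Illustrative sketch only: assumes each `layer` implements call / call_inv / compute_gradients
# with the signatures defined above; `loss_fn` and `optimizer` are placeholders.
import tensorflow as tf

def train_step_constant_memory(layers, x, loss_fn, optimizer):
	z = x
	for layer in layers:                      # forward pass, intermediate activations are discarded
		z = layer.call(z)

	with tf.GradientTape() as tape:
		tape.watch(z)
		loss = loss_fn(z)
	dy = tape.gradient(loss, z)               # gradient w.r.t. the final output only

	grads, variables = [], []
	for layer in reversed(layers):
		x_in = layer.call_inv(z)              # reconstruct the layer input instead of storing it
		dy, layer_grads = layer.compute_gradients(x_in, dy)
		grads += layer_grads
		variables += layer.trainable_variables
		z = x_in                              # this is the output of the previous layer
	optimizer.apply_gradients(zip(grads, variables))

Memory use stays constant in depth because only the current `z` and `dy` are kept between layers.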
+ + +# TODO: for now assumes target is +-1, refactor to support any target. +# Refactor 127.5 +class Normalize(LayerWithGrads): # normalizes data after dequantization. + + def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): + super(Normalize, self).__init__(input_shape=input_shape) + self.target = target + self.d = np.prod(input_shape) + self.scale = 1/127.5 + + def call(self, X): + X = X * self.scale - 1 + return X + + def call_inv(self, Z): + Z = Z + 1 + Z = Z / self.scale + return Z + + def log_det(self): return self.d * tf.math.log(self.scale) + + +class MultiScale(keras.layers.Layer): + + def call(self, X): # TODO: have different strategies here, and combine it with how coupling layer works? + n, w, h, c = X.shape + Z = X[:, :, :, c//2:] + X = X[:, :, :, :c//2] + return X, Z + + def call_inv(self, X, Z): + return tf.concat((X, Z), axis=-1) + + def compute_output_shape(self, input_shape): + n, h, w, c = input_shape + return (n, h, w, c//2) + + def log_det(self): return 0. + + +class ActNorm(keras.layers.Layer): pass + + +class Inv1x1Conv(LayerWithGrads): + + """ + Based on Glow page 11 appendix B. + It is possible to speed up determinant computation by using PLU or QR decomposition + as proposed in Glow and Emerging Conv papers respectively. + + Add bias to this operation? Try to see if it makes any difference. + + Try to compare speed / numerical stability etc for different implementations: + + 1. PLU decomposition + 2. QR + 3. Normal determinant O(c^3) + 4. tensordot vs conv2d. + """ + + def __init__(self, **kwargs): super(Inv1x1Conv, self).__init__(**kwargs) + + def build(self, input_shape): + + _, h, w, c = input_shape + self.c = c + self.h = h + self.w = w + + #w_init = np.linalg.qr(np.random.randn(c,c))[0] + self.W = self.add_weight(shape=(c, c), initializer=keras.initializers.Orthogonal(gain=1.0, seed=None), name="inv_1x1_conv") + self.W_inv = tf.linalg.inv(self.W) + + super(Inv1x1Conv, self).build(input_shape) + self.built = True + + def call(self, X): + _W = tf.reshape(self.W, (1,1, self.c, self.c)) + return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") + + def call_inv(self, Z): + _W = tf.reshape(self.W_inv, (1,1, self.c, self.c)) + return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") + + def log_det(self): # TODO: Fix this issue!!! + print(self.h, self.w, tf.linalg.det(self.W)) + return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) + + def compute_output_shape(self, input_shape): return input_shape + + +class Glow1x1Conv(LayerWithGrads): + + # Could be speed up parameterizing in LU decomposition. + def build(self, input_shape): + _, h, w, c = input_shape + + self.h, self.w = h, w + + # make L and U lower and upper triangular by masking with zeros. + self.L = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.U = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.eigenvals = self.add_weight(shape=(c, 1), initializer="ones", name="weights") + + identity = tf.constant(np.identity(c), dtype=tf.float32) + + """ + >>> # Creating masks. 
+ >>> np.triu(np.ones((4,4)), k=+1) + array([[0., 1., 1., 1.], + [0., 0., 1., 1.], + [0., 0., 0., 1.], + [0., 0., 0., 0.]]) + >>> np.tril(np.ones((4,4)), k=-1) + array([[0., 0., 0., 0.], + [1., 0., 0., 0.], + [1., 1., 0., 0.], + [1., 1., 1., 0.]]) + """ + + upper_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) + lower_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) + + self.L = lower_mask * self.L + identity + self.U = upper_mask * self.U + identity + + self.kernel = self.L @ (self.eigenvals * self.U) + self.kernel_inv = tf.linalg.inv(self.kernel) + + def call(self, inputs): return tf.tensordot(inputs, self.kernel, axes=((-1), (0))) + def call_inv(self, inputs): return tf.tensordot(inputs, self.kernel_inv, axes=((-1), (0))) + def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.eigenvals))) + + def compute_output_shape(self, input_shape): return input_shape + + + +class Conv3DCirc(LayerWithGrads): + + def __init__(self,trainable=True): + self.built = False + super(Conv3DCirc, self).__init__() + + def call(self, X): + if self.built == False: #For some reason the layer is not being built without this line + self.build(X.get_shape().as_list()) + + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + #EagerTensor is required for backprop to work... + #Further, updating w_real will automatically trigger an update on self.w, so it is better to not store w at all + #TODO - figure out a way to avoid, or open an issue with tf... + self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) + + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X / self.scale) + X = X * self.w + X = tf.signal.ifft3d(X * self.scale ) + X = tf.math.real(X) + return X + + + def call_inv(self, X): + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X * self.scale ) # self.scale correctly + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) + + X = X / self.w + + X = tf.signal.ifft3d(X / self.scale) + X = tf.math.real(X) + return X + + def log_det(self): return tf.math.reduce_sum(tf.math.log(tf.math.abs(tf.signal.fft3d(tf.cast(self.w_real/self.scale,dtype=tf.complex64))))) #Need to return EagerTensor + + + def build(self, input_shape): + self.scale = np.sqrt(np.prod(input_shape[1:])) # np.sqrt(np.prod([a.value for a in input_shape[1:]])) + + # todo; change to [[[1, 0000],[0000], [000]] + + def identitiy_initializer_real(shape, dtype=None): + return (tf.math.real(tf.signal.ifft3d(tf.ones(shape, dtype=tf.complex64)*self.scale))) + + self.w_real = self.add_variable(name="w_real",shape=input_shape[1:], initializer=identitiy_initializer_real, trainable=True) + # self.w = tf.cast(self.w_real, dtype=tf.complex64) #hacky way to initialize real w and actual w, since tf does weird stuff if 'variable' is modified + # self.w = tf.signal.fft3d(self.w / self.scale) + self.built = True + + + def compute_output_shape(self, input_shape): + return tf.TensorShape(input_shape[1:]) + + + +class InvResNet(keras.layers.Layer): pass # model should automatically use gradient checkpointing if this is used. + + +# the 3D case, refactor to make it into the general case. 
+# make experiment with nD case, maybe put reshape into it? +# Theoretically time is the same? +class CircularConv(keras.layers.Layer): + + def __init__(self, dim=3): # + self.dim = dim + + def call(self, X): pass + + def call_inv(self, X): pass + + def log_det(self): pass + + From fb1bb54211834992ed086a5ec438783572153b50 Mon Sep 17 00:00:00 2001 From: anshuln Date: Sat, 13 Jul 2019 18:17:26 +0530 Subject: [PATCH 03/11] Merge --- invtf/layers_const_backprop.py | 812 ++++++++++++++++----------------- 1 file changed, 406 insertions(+), 406 deletions(-) diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index d1f7637..9b3d6e8 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -6,417 +6,417 @@ from invtf.coupling_strategy import * """ - Known issue with multi-scale architecture. - The log-det computations normalizes wrt full dimension. + Known issue with multi-scale architecture. + The log-det computations normalizes wrt full dimension. """ #TODO Write unit tests class LayerWithGrads(keras.layers.Layer): - ''' - This is a virtual class from which all layer classes need to inherit - It has the function `compute gradients` which is used for constant - memory backprop. - ''' - def __init__(self,**kwargs): - super(LayerWithGrads,self).__init__(**kwargs) - - def call(self,X): - raise NotImplementedError - - def call_inv(self,X): - raise NotImplementedError - - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] - - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + ''' + This is a virtual class from which all layer classes need to inherit + It has the function `compute gradients` which is used for constant + memory backprop. + ''' + def __init__(self,**kwargs): + super(LayerWithGrads,self).__init__(**kwargs) + + def call(self,X): + raise NotImplementedError + + def call_inv(self,X): + raise NotImplementedError + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. 
+ with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads class Linear(LayerWithGrads): - def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) + def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - assert len(input_shape) == 2 - _, d = input_shape + assert len(input_shape) == 2 + _, d = input_shape - self.W = self.add_weight(shape=(d, d), initializer='identity', name="linear_weight") - self.b = self.add_weight(shape=(d), initializer='zero', name="linear_bias") - - super(Linear, self).build(input_shape) - self.built = True + self.W = self.add_weight(shape=(d, d), initializer='identity', name="linear_weight") + self.b = self.add_weight(shape=(d), initializer='zero', name="linear_bias") + + super(Linear, self).build(input_shape) + self.built = True - def call(self, X): return X @ self.W + self.b + def call(self, X): return X @ self.W + self.b - def call_inv(self, Z): return (Z - self.b) @ tf.linalg.inv(self.W) + def call_inv(self, Z): return (Z - self.b) @ tf.linalg.inv(self.W) - def jacobian(self): return self.W + def jacobian(self): return self.W - def log_det(self): return tf.math.log(tf.abs(tf.linalg.det(self.jacobian()))) + def log_det(self): return tf.math.log(tf.abs(tf.linalg.det(self.jacobian()))) - def compute_output_shape(self, input_shape): - self.output_shape = input_shape - return input_shape + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape class Affine(LayerWithGrads): - """ - The exp parameter allows the scaling to be exp(s) \odot X. - This cancels out the log in the log_det computations. - """ + """ + The exp parameter allows the scaling to be exp(s) \odot X. + This cancels out the log in the log_det computations. 
+ """ - def __init__(self, exp=False, **kwargs): - self.exp = exp - super(Affine, self).__init__(**kwargs) + def __init__(self, exp=False, **kwargs): + self.exp = exp + super(Affine, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - #assert len(input_shape) == 2 - d = input_shape[1:] + #assert len(input_shape) == 2 + d = input_shape[1:] - self.w = self.add_weight(shape=d, initializer='ones', name="affine_scale") - self.b = self.add_weight(shape=d, initializer='zero', name="affine_bias") + self.w = self.add_weight(shape=d, initializer='ones', name="affine_scale") + self.b = self.add_weight(shape=d, initializer='zero', name="affine_bias") - super(Affine, self).build(input_shape) - self.built = True + super(Affine, self).build(input_shape) + self.built = True - def call(self, X): - if self.exp: return X * tf.exp(self.w) + self.b - else: return X * self.w + self.b + def call(self, X): + if self.exp: return X * tf.exp(self.w) + self.b + else: return X * self.w + self.b - def call_inv(self, Z): - if self.exp: return (Z - self.b) / tf.exp(self.w) - else: return (Z - self.b) / self.w + def call_inv(self, Z): + if self.exp: return (Z - self.b) / tf.exp(self.w) + else: return (Z - self.b) / self.w - def jacobian(self): return self.w + def jacobian(self): return self.w - def eigenvalues(self): return self.w + def eigenvalues(self): return self.w - def log_det(self): - if self.exp: return tf.reduce_sum(tf.abs(self.eigenvalues())) - else: return tf.reduce_sum(tf.math.log(tf.abs(self.eigenvalues()))) + def log_det(self): + if self.exp: return tf.reduce_sum(tf.abs(self.eigenvalues())) + else: return tf.reduce_sum(tf.math.log(tf.abs(self.eigenvalues()))) - def compute_output_shape(self, input_shape): - self.output_shape = input_shape - return input_shape + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape """ - For simplicity we vectorize input and apply coupling to even/odd entries. - Could also use upper/lower. Refactor this to support specifying the pattern as a parameter. - - TODO: - Potentially refactor so we can add directly to AdditiveCoupling instead of creating 'm' - by (potentially adding to Sequential) and passing this on to AdditiveCoupling. - The main issue is AdditiveCoupling is R^2-> R^2 while m:R^1->R^1, so if we - add directly to AdditiveCoupling we run into issues with miss matching dimensions. - + For simplicity we vectorize input and apply coupling to even/odd entries. + Could also use upper/lower. Refactor this to support specifying the pattern as a parameter. + + TODO: + Potentially refactor so we can add directly to AdditiveCoupling instead of creating 'm' + by (potentially adding to Sequential) and passing this on to AdditiveCoupling. + The main issue is AdditiveCoupling is R^2-> R^2 while m:R^1->R^1, so if we + add directly to AdditiveCoupling we run into issues with miss matching dimensions. + """ class AdditiveCoupling(keras.Sequential): - unique_id = 1 + unique_id = 1 - def __init__(self, part=0, strategy=SplitOnHalfStrategy()): # strategy: alternate / split ;; alternate does odd/even, split has upper/lower. - super(AdditiveCoupling, self).__init__(name="add_coupling_%i"%AdditiveCoupling.unique_id) - AdditiveCoupling.unique_id += 1 - self.part = part - self.strategy = strategy + def __init__(self, part=0, strategy=SplitOnHalfStrategy()): # strategy: alternate / split ;; alternate does odd/even, split has upper/lower. 
+ super(AdditiveCoupling, self).__init__(name="add_coupling_%i"%AdditiveCoupling.unique_id) + AdditiveCoupling.unique_id += 1 + self.part = part + self.strategy = strategy - def build(self, input_shape): + def build(self, input_shape): - self.layers[0].build(input_shape=(None, 28**2/2)) - out_dim = self.layers[0].compute_output_shape(input_shape=(None, 28**2/2)) + self.layers[0].build(input_shape=(None, 28**2/2)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, 28**2/2)) - for layer in self.layers[1:]: - layer.build(input_shape=out_dim) - out_dim = layer.compute_output_shape(input_shape=out_dim) + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) - def call_(self, X): - for layer in self.layers: - X = layer.call(X) - return X + def call_(self, X): + for layer in self.layers: + X = layer.call(X) + return X - def call(self, X): - shape = tf.shape(X) - d = tf.reduce_prod(shape[1:]) - X = tf.reshape(X, (shape[0], d)) + def call(self, X): + shape = tf.shape(X) + d = tf.reduce_prod(shape[1:]) + X = tf.reshape(X, (shape[0], d)) - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: x0 = x0 + self.call_(x1) - if self.part == 1: x1 = x1 + self.call_(x0) + if self.part == 0: x0 = x0 + self.call_(x1) + if self.part == 1: x1 = x1 + self.call_(x0) - X = self.strategy.combine(x0, x1) + X = self.strategy.combine(x0, x1) - X = tf.reshape(X, shape) - return X + X = tf.reshape(X, shape) + return X - def call_inv(self, Z): - shape = tf.shape(Z) - d = tf.reduce_prod(shape[1:]) - Z = tf.reshape(Z, (shape[0], d)) + def call_inv(self, Z): + shape = tf.shape(Z) + d = tf.reduce_prod(shape[1:]) + Z = tf.reshape(Z, (shape[0], d)) - z0, z1 = self.strategy.split(Z) - - if self.part == 0: z0 = z0 - self.call_(z1) - if self.part == 1: z1 = z1 - self.call_(z0) + z0, z1 = self.strategy.split(Z) + + if self.part == 0: z0 = z0 - self.call_(z1) + if self.part == 1: z1 = z1 - self.call_(z0) - Z = self.strategy.combine(z0, z1) + Z = self.strategy.combine(z0, z1) - Z = tf.reshape(Z, shape) - return Z + Z = tf.reshape(Z, shape) + return Z - def log_det(self): return 0. + def log_det(self): return 0. - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Since the coupling layers do not inherit from `LayerWithGrads`, this - function is re-written. See TODO of AffineCoupling for further info - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Since the coupling layers do not inherit from `LayerWithGrads`, this + function is re-written. 
See TODO of AffineCoupling for further info + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads """ - The affine coupling layer is described in NICE, REALNVP and GLOW. - The description in Glow use a single network to output scale s and transform t, - it seems the description in REALNVP is a bit more general refering to s and t as - different functions. From this perspective Glow change the affine layer to have - weight sharing between s and t. - Specifying a single function is a lot simpler code-wise, we thus use that approach. + The affine coupling layer is described in NICE, REALNVP and GLOW. + The description in Glow use a single network to output scale s and transform t, + it seems the description in REALNVP is a bit more general refering to s and t as + different functions. From this perspective Glow change the affine layer to have + weight sharing between s and t. + Specifying a single function is a lot simpler code-wise, we thus use that approach. - For now assumes the use of convolutions + For now assumes the use of convolutions """ -class AffineCoupling(keras.Sequential): #TODO Check gradient computations with and without reg +class AffineCoupling(keras.Sequential): #TODO Check gradient computations with and without reg - unique_id = 1 + unique_id = 1 - def __init__(self, part=0, strategy=SplitChannelsStrategy()): - super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) - AffineCoupling.unique_id += 1 - self.part = part - self.strategy = strategy + def __init__(self, part=0, strategy=SplitChannelsStrategy()): + super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) + AffineCoupling.unique_id += 1 + self.part = part + self.strategy = strategy - def build(self, input_shape): + def build(self, input_shape): - # handle the issue with each network output something larger. - _, h, w, c = input_shape + # handle the issue with each network output something larger. 
+ _, h, w, c = input_shape - h, w, c = self.strategy.coupling_shape(input_shape=(h,w,c)) + h, w, c = self.strategy.coupling_shape(input_shape=(h,w,c)) - self.layers[0].build(input_shape=(None, h, w, c)) - out_dim = self.layers[0].compute_output_shape(input_shape=(None, h, w, c)) - self.layers[0].output_shape_ = out_dim + self.layers[0].build(input_shape=(None, h, w, c)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, h, w, c)) + self.layers[0].output_shape_ = out_dim - for layer in self.layers[1:]: - layer.build(input_shape=out_dim) - out_dim = layer.compute_output_shape(input_shape=out_dim) - layer.output_shape_ = out_dim + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) + layer.output_shape_ = out_dim - def call_(self, X): + def call_(self, X): - in_shape = tf.shape(X) - n, h, w, c = X.shape + in_shape = tf.shape(X) + n, h, w, c = X.shape - for layer in self.layers: - X = layer.call(X) # residual + for layer in self.layers: + X = layer.call(X) # residual - # TODO: Could have a part of network learned specifically for s,t to not ONLY have wegith sharing? - - X = tf.reshape(X, (-1, h, w, c*2)) - s = X[:, :, w//2:, :] - t = X[:, :, :w//2, :] + # TODO: Could have a part of network learned specifically for s,t to not ONLY have wegith sharing? + + X = tf.reshape(X, (-1, h, w, c*2)) + s = X[:, :, w//2:, :] + t = X[:, :, :w//2, :] - s = tf.reshape(s, in_shape) - t = tf.reshape(t, in_shape) + s = tf.reshape(s, in_shape) + t = tf.reshape(t, in_shape) - return s, t + return s, t - def call(self, X): + def call(self, X): - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: - s, t = self.call_(x1) - x0 = x0*s + t + if self.part == 0: + s, t = self.call_(x1) + x0 = x0*s + t - if self.part == 1: - s, t = self.call_(x0) - x1 = x1*s + t + if self.part == 1: + s, t = self.call_(x0) + x1 = x1*s + t - X = self.strategy.combine(x0, x1) - return X + X = self.strategy.combine(x0, x1) + return X - def call_inv(self, Z): - z0, z1 = self.strategy.split(Z) - - if self.part == 0: - s, t = self.call_(z1) - z0 = (z0 - t)/s - if self.part == 1: - s, t = self.call_(z0) - z1 = (z1 - t)/s + def call_inv(self, Z): + z0, z1 = self.strategy.split(Z) + + if self.part == 0: + s, t = self.call_(z1) + z0 = (z0 - t)/s + if self.part == 1: + s, t = self.call_(z0) + z1 = (z1 - t)/s - Z = self.strategy.combine(z0, z1) - return Z + Z = self.strategy.combine(z0, z1) + return Z - def log_det(self): + def log_det(self): - # TODO: save 's' instead of recomputing. + # TODO: save 's' instead of recomputing. - X = self.input - n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) + X = self.input + n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: - s, t = self.call_(x1) - if self.part == 1: - s, t = self.call_(x0) + if self.part == 0: + s, t = self.call_(x1) + if self.part == 1: + s, t = self.call_(x0) - # there is an issue with 's' being divided by dimension 'd' later: - # If we used MultiScale it will be lower dimensional, in this case - # we should not divide by d but d//2. + # there is an issue with 's' being divided by dimension 'd' later: + # If we used MultiScale it will be lower dimensional, in this case + # we should not divide by d but d//2. 
- return tf.reduce_sum(tf.math.log(tf.abs(s))) / n + return tf.reduce_sum(tf.math.log(tf.abs(s))) / n - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape - def summary(self, line_length=None, positions=None, print_fn=None): - print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. + def summary(self, line_length=None, positions=None, print_fn=None): + print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - #TODO check if log_det of AffineCouplingLayer needs a regularizer. -- DONE, it does - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer needs a regularizer. -- DONE, it does + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads """ - Try different techniques: I'm implementing the simplest case, just reshape to desired shape. - TODO: Implement the following Squeeze strategies: - - RealNVP - - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. - - ... + Try different techniques: I'm implementing the simplest case, just reshape to desired shape. + TODO: Implement the following Squeeze strategies: + - RealNVP + - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. + - ... """ class Squeeze(LayerWithGrads): - def call(self, X): - n, self.w, self.h, self.c = X.shape - return tf.reshape(X, [-1, self.w//2, self.h//2, self.c*4]) + def call(self, X): + n, self.w, self.h, self.c = X.shape + return tf.reshape(X, [-1, self.w//2, self.h//2, self.c*4]) - def call_inv(self, X): - return tf.reshape(X, [-1, self.w, self.h, self.c]) - - def log_det(self): return 0. + def call_inv(self, X): + return tf.reshape(X, [-1, self.w, self.h, self.c]) + + def log_det(self): return 0. 
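Editor's note: given the `#TODO Write unit tests` marker at the top of this file, one way to exercise `compute_gradients` is to compare it against an ordinary `tf.GradientTape` over the same layer. The sketch below is only an illustration; the choice of the `Affine` layer, the input shape and the tolerance are arbitrary assumptions, not part of the patch.

# Gradient-check sketch -- not part of the commit; layer and shape choices are arbitrary.
import numpy as np
import tensorflow as tf

def check_compute_gradients(layer, x):
	# Reference gradients of a scalar loss under a normal tape.
	with tf.GradientTape() as tape:
		tape.watch(x)
		loss = tf.reduce_sum(layer.call(x))
	ref = tape.gradient(loss, [x] + layer.trainable_variables)

	# Same gradients via the constant-memory interface: d(sum)/dy is a tensor of ones.
	dx, grads = layer.compute_gradients(x, tf.ones_like(layer.call(x)))

	np.testing.assert_allclose(dx.numpy(), ref[0].numpy(), rtol=1e-5)
	for g, r in zip(grads, ref[1:]):
		np.testing.assert_allclose(g.numpy(), r.numpy(), rtol=1e-5)

x = tf.random.normal((4, 8))
layer = Affine()
layer(x)                                      # builds the layer's weights
check_compute_gradients(layer, x)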
# TODO: for now assumes target is +-1, refactor to support any target. # Refactor 127.5 class Normalize(LayerWithGrads): # normalizes data after dequantization. - def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): - super(Normalize, self).__init__(input_shape=input_shape) - self.target = target - self.d = np.prod(input_shape) - self.scale = 1/127.5 + def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): + super(Normalize, self).__init__(input_shape=input_shape) + self.target = target + self.d = np.prod(input_shape) + self.scale = 1/127.5 - def call(self, X): - X = X * self.scale - 1 - return X + def call(self, X): + X = X * self.scale - 1 + return X - def call_inv(self, Z): - Z = Z + 1 - Z = Z / self.scale - return Z + def call_inv(self, Z): + Z = Z + 1 + Z = Z / self.scale + return Z - def log_det(self): return self.d * tf.math.log(self.scale) + def log_det(self): return self.d * tf.math.log(self.scale) class MultiScale(keras.layers.Layer): - def call(self, X): # TODO: have different strategies here, and combine it with how coupling layer works? - n, w, h, c = X.shape - Z = X[:, :, :, c//2:] - X = X[:, :, :, :c//2] - return X, Z - - def call_inv(self, X, Z): - return tf.concat((X, Z), axis=-1) + def call(self, X): # TODO: have different strategies here, and combine it with how coupling layer works? + n, w, h, c = X.shape + Z = X[:, :, :, c//2:] + X = X[:, :, :, :c//2] + return X, Z + + def call_inv(self, X, Z): + return tf.concat((X, Z), axis=-1) - def compute_output_shape(self, input_shape): - n, h, w, c = input_shape - return (n, h, w, c//2) + def compute_output_shape(self, input_shape): + n, h, w, c = input_shape + return (n, h, w, c//2) - def log_det(self): return 0. + def log_det(self): return 0. class ActNorm(keras.layers.Layer): pass @@ -424,159 +424,159 @@ class ActNorm(keras.layers.Layer): pass class Inv1x1Conv(LayerWithGrads): - """ - Based on Glow page 11 appendix B. - It is possible to speed up determinant computation by using PLU or QR decomposition - as proposed in Glow and Emerging Conv papers respectively. + """ + Based on Glow page 11 appendix B. + It is possible to speed up determinant computation by using PLU or QR decomposition + as proposed in Glow and Emerging Conv papers respectively. - Add bias to this operation? Try to see if it makes any difference. + Add bias to this operation? Try to see if it makes any difference. - Try to compare speed / numerical stability etc for different implementations: + Try to compare speed / numerical stability etc for different implementations: - 1. PLU decomposition - 2. QR - 3. Normal determinant O(c^3) - 4. tensordot vs conv2d. - """ + 1. PLU decomposition + 2. QR + 3. Normal determinant O(c^3) + 4. tensordot vs conv2d. 
+ """ - def __init__(self, **kwargs): super(Inv1x1Conv, self).__init__(**kwargs) + def __init__(self, **kwargs): super(Inv1x1Conv, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - _, h, w, c = input_shape - self.c = c - self.h = h - self.w = w + _, h, w, c = input_shape + self.c = c + self.h = h + self.w = w - #w_init = np.linalg.qr(np.random.randn(c,c))[0] - self.W = self.add_weight(shape=(c, c), initializer=keras.initializers.Orthogonal(gain=1.0, seed=None), name="inv_1x1_conv") - self.W_inv = tf.linalg.inv(self.W) - - super(Inv1x1Conv, self).build(input_shape) - self.built = True + #w_init = np.linalg.qr(np.random.randn(c,c))[0] + self.W = self.add_weight(shape=(c, c), initializer=keras.initializers.Orthogonal(gain=1.0, seed=None), name="inv_1x1_conv") + self.W_inv = tf.linalg.inv(self.W) + + super(Inv1x1Conv, self).build(input_shape) + self.built = True - def call(self, X): - _W = tf.reshape(self.W, (1,1, self.c, self.c)) - return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") + def call(self, X): + _W = tf.reshape(self.W, (1,1, self.c, self.c)) + return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") - def call_inv(self, Z): - _W = tf.reshape(self.W_inv, (1,1, self.c, self.c)) - return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") + def call_inv(self, Z): + _W = tf.reshape(self.W_inv, (1,1, self.c, self.c)) + return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") - def log_det(self): # TODO: Fix this issue!!! - print(self.h, self.w, tf.linalg.det(self.W)) - return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) + def log_det(self): # TODO: Fix this issue!!! + print(self.h, self.w, tf.linalg.det(self.W)) + return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape class Glow1x1Conv(LayerWithGrads): - # Could be speed up parameterizing in LU decomposition. - def build(self, input_shape): - _, h, w, c = input_shape + # Could be speed up parameterizing in LU decomposition. + def build(self, input_shape): + _, h, w, c = input_shape - self.h, self.w = h, w - - # make L and U lower and upper triangular by masking with zeros. - self.L = self.add_weight(shape=(c, c), initializer="zeros", name="weights") - self.U = self.add_weight(shape=(c, c), initializer="zeros", name="weights") - self.eigenvals = self.add_weight(shape=(c, 1), initializer="ones", name="weights") + self.h, self.w = h, w + + # make L and U lower and upper triangular by masking with zeros. + self.L = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.U = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.eigenvals = self.add_weight(shape=(c, 1), initializer="ones", name="weights") - identity = tf.constant(np.identity(c), dtype=tf.float32) + identity = tf.constant(np.identity(c), dtype=tf.float32) - """ - >>> # Creating masks. - >>> np.triu(np.ones((4,4)), k=+1) - array([[0., 1., 1., 1.], - [0., 0., 1., 1.], - [0., 0., 0., 1.], - [0., 0., 0., 0.]]) - >>> np.tril(np.ones((4,4)), k=-1) - array([[0., 0., 0., 0.], - [1., 0., 0., 0.], - [1., 1., 0., 0.], - [1., 1., 1., 0.]]) - """ + """ + >>> # Creating masks. 
+ >>> np.triu(np.ones((4,4)), k=+1) + array([[0., 1., 1., 1.], + [0., 0., 1., 1.], + [0., 0., 0., 1.], + [0., 0., 0., 0.]]) + >>> np.tril(np.ones((4,4)), k=-1) + array([[0., 0., 0., 0.], + [1., 0., 0., 0.], + [1., 1., 0., 0.], + [1., 1., 1., 0.]]) + """ - upper_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) - lower_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) + upper_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) + lower_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) - self.L = lower_mask * self.L + identity - self.U = upper_mask * self.U + identity - - self.kernel = self.L @ (self.eigenvals * self.U) - self.kernel_inv = tf.linalg.inv(self.kernel) + self.L = lower_mask * self.L + identity + self.U = upper_mask * self.U + identity + + self.kernel = self.L @ (self.eigenvals * self.U) + self.kernel_inv = tf.linalg.inv(self.kernel) - def call(self, inputs): return tf.tensordot(inputs, self.kernel, axes=((-1), (0))) - def call_inv(self, inputs): return tf.tensordot(inputs, self.kernel_inv, axes=((-1), (0))) - def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.eigenvals))) + def call(self, inputs): return tf.tensordot(inputs, self.kernel, axes=((-1), (0))) + def call_inv(self, inputs): return tf.tensordot(inputs, self.kernel_inv, axes=((-1), (0))) + def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.eigenvals))) - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape class Conv3DCirc(LayerWithGrads): - def __init__(self,trainable=True): - self.built = False - super(Conv3DCirc, self).__init__() + def __init__(self,trainable=True): + self.built = False + super(Conv3DCirc, self).__init__() - def call(self, X): - if self.built == False: #For some reason the layer is not being built without this line - self.build(X.get_shape().as_list()) + def call(self, X): + if self.built == False: #For some reason the layer is not being built without this line + self.build(X.get_shape().as_list()) - #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier - #EagerTensor is required for backprop to work... - #Further, updating w_real will automatically trigger an update on self.w, so it is better to not store w at all - #TODO - figure out a way to avoid, or open an issue with tf... - self.w = tf.cast(self.w_real, dtype=tf.complex64) - self.w = tf.signal.fft3d(self.w / self.scale) + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + #EagerTensor is required for backprop to work... + #Further, updating w_real will automatically trigger an update on self.w, so it is better to not store w at all + #TODO - figure out a way to avoid, or open an issue with tf... 
+ self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) - X = tf.cast(X, dtype=tf.complex64) - X = tf.signal.fft3d(X / self.scale) - X = X * self.w - X = tf.signal.ifft3d(X * self.scale ) - X = tf.math.real(X) - return X + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X / self.scale) + X = X * self.w + X = tf.signal.ifft3d(X * self.scale ) + X = tf.math.real(X) + return X - def call_inv(self, X): - X = tf.cast(X, dtype=tf.complex64) - X = tf.signal.fft3d(X * self.scale ) # self.scale correctly - #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier - self.w = tf.cast(self.w_real, dtype=tf.complex64) - self.w = tf.signal.fft3d(self.w / self.scale) + def call_inv(self, X): + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X * self.scale ) # self.scale correctly + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) - X = X / self.w + X = X / self.w - X = tf.signal.ifft3d(X / self.scale) - X = tf.math.real(X) - return X + X = tf.signal.ifft3d(X / self.scale) + X = tf.math.real(X) + return X - def log_det(self): return tf.math.reduce_sum(tf.math.log(tf.math.abs(tf.signal.fft3d(tf.cast(self.w_real/self.scale,dtype=tf.complex64))))) #Need to return EagerTensor + def log_det(self): return tf.math.reduce_sum(tf.math.log(tf.math.abs(tf.signal.fft3d(tf.cast(self.w_real/self.scale,dtype=tf.complex64))))) #Need to return EagerTensor - def build(self, input_shape): - self.scale = np.sqrt(np.prod(input_shape[1:])) # np.sqrt(np.prod([a.value for a in input_shape[1:]])) + def build(self, input_shape): + self.scale = np.sqrt(np.prod(input_shape[1:])) # np.sqrt(np.prod([a.value for a in input_shape[1:]])) - # todo; change to [[[1, 0000],[0000], [000]] + # todo; change to [[[1, 0000],[0000], [000]] - def identitiy_initializer_real(shape, dtype=None): - return (tf.math.real(tf.signal.ifft3d(tf.ones(shape, dtype=tf.complex64)*self.scale))) + def identitiy_initializer_real(shape, dtype=None): + return (tf.math.real(tf.signal.ifft3d(tf.ones(shape, dtype=tf.complex64)*self.scale))) - self.w_real = self.add_variable(name="w_real",shape=input_shape[1:], initializer=identitiy_initializer_real, trainable=True) - # self.w = tf.cast(self.w_real, dtype=tf.complex64) #hacky way to initialize real w and actual w, since tf does weird stuff if 'variable' is modified - # self.w = tf.signal.fft3d(self.w / self.scale) - self.built = True - + self.w_real = self.add_variable(name="w_real",shape=input_shape[1:], initializer=identitiy_initializer_real, trainable=True) + # self.w = tf.cast(self.w_real, dtype=tf.complex64) #hacky way to initialize real w and actual w, since tf does weird stuff if 'variable' is modified + # self.w = tf.signal.fft3d(self.w / self.scale) + self.built = True + - def compute_output_shape(self, input_shape): - return tf.TensorShape(input_shape[1:]) + def compute_output_shape(self, input_shape): + return tf.TensorShape(input_shape[1:]) -class InvResNet(keras.layers.Layer): pass # model should automatically use gradient checkpointing if this is used. +class InvResNet(keras.layers.Layer): pass # model should automatically use gradient checkpointing if this is used. # the 3D case, refactor to make it into the general case. 
@@ -584,13 +584,13 @@ class InvResNet(keras.layers.Layer): pass # model should automatically use gr # Theoretically time is the same? class CircularConv(keras.layers.Layer): - def __init__(self, dim=3): # - self.dim = dim + def __init__(self, dim=3): # + self.dim = dim - def call(self, X): pass - - def call_inv(self, X): pass + def call(self, X): pass + + def call_inv(self, X): pass - def log_det(self): pass + def log_det(self): pass From 97bf377ed6f571158d0a0689a74ff801c463aeba Mon Sep 17 00:00:00 2001 From: anshuln Date: Sat, 13 Jul 2019 20:18:56 +0530 Subject: [PATCH 04/11] Adds fit function, needs some testing --- invtf/generator_const_backprop.py | 318 ++++++++++++ invtf/layers_const_backprop.py | 808 +++++++++++++++--------------- 2 files changed, 722 insertions(+), 404 deletions(-) create mode 100644 invtf/generator_const_backprop.py diff --git a/invtf/generator_const_backprop.py b/invtf/generator_const_backprop.py new file mode 100644 index 0000000..fc61601 --- /dev/null +++ b/invtf/generator_const_backprop.py @@ -0,0 +1,318 @@ +""" + Contains the generator class with constant memory depth backprop + +""" + +import os + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_CPP_MIN_VLOG_LEVEL']='3' + +import tqdm +import tensorflow as tf +import invtf.grow_memory +import tensorflow.keras as keras +import numpy as np +import invtf.latent +import matplotlib.pyplot as plt +from invtf.dequantize import * +from invtf.layers import * +from invtf import latent + + + +""" + + TODO: + + - Support specifying different latent distributions, see e.g. NICE. + + - The fit currently uses a dummy 'y=X'. It is not used, but removing it causes an error with 'total_loss'. + Removing might speed up. + + Comments: + We are miss-using the Sequential thing as it is normally just a linear stack of layers. + If we use the multi-scale architecture this is not the case, as it has multiple outputs. + +""" +class Generator(keras.Sequential): + + def __init__(self, latent=latent.Normal(28**2)): + self.latent = latent + + super(Generator, self).__init__() + + + + # Sequential is normally only for linear stack, however, the multiple outputs in multi-scale architecture + # is fairly straight forward, so we change Sequential slightly to allow multiple outputs just for the + # case of the MultiScale layer. Refactor this to make a new variant MutliSqualeSequential which + # Generator inherents from. + + def add(self, layer): + from tensorflow.python.keras.utils import tf_utils + from tensorflow.python.keras.engine import training_utils + from tensorflow.python.util import nest + from tensorflow.python.keras.utils import layer_utils + from tensorflow.python.util import tf_inspect + + + # If we are passed a Keras tensor created by keras.Input(), we can extract + # the input layer from its keras history and use that without any loss of + # generality. + if hasattr(layer, '_keras_history'): + origin_layer = layer._keras_history[0] + if isinstance(origin_layer, keras.layers.InputLayer): + layer = origin_layer + + if not isinstance(layer, keras.layers.Layer): + raise TypeError('The added layer must be ' + 'an instance of class Layer. ' + 'Found: ' + str(layer)) + + tf_utils.assert_no_legacy_layers([layer]) + + self.built = False + set_inputs = False + if not self._layers: + if isinstance(layer, keras.layers.InputLayer): + # Corner case where the user passes an InputLayer layer via `add`. 
+ assert len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) == 1 + set_inputs = True + else: + batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer) + if batch_shape: + # Instantiate an input layer. + x = keras.layers.Input( + batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input') + # This will build the current layer + # and create the node connecting the current layer + # to the input layer we just created. + layer(x) + set_inputs = True + + if set_inputs: + # If an input layer (placeholder) is available. + if len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) != 1: + raise ValueError('All layers in a Sequential model ' + 'should have a single output tensor. ' + 'For multi-output layers, ' + 'use the functional API.') + self.outputs = [ + nest.flatten(layer._inbound_nodes[-1].output_tensors)[0] + ] + self.inputs = layer_utils.get_source_inputs(self.outputs[0]) + + elif self.outputs: + # If the model is being built continuously on top of an input layer: + # refresh its output. + output_tensor = layer(self.outputs[0]) + if len(nest.flatten(output_tensor)) != 1 and not isinstance(layer, MultiScale): + raise TypeError('All layers in a Sequential model ' + 'should have a single output tensor. ' + 'For multi-output layers, ' + 'use the functional API.') + self.outputs = [output_tensor] + + if self.outputs: + # True if set_inputs or self._is_graph_network or if adding a layer + # to an already built deferred seq model. + self.built = True + + if set_inputs or self._is_graph_network: + self._init_graph_network(self.inputs, self.outputs, name=self.name) + else: + self._layers.append(layer) + if self._layers: + self._track_layers(self._layers) + + self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) + + + + def predict(self, X, dequantize=True): + + Zs = [] + + for layer in self.layers: + + # allow deactivating dequenatize + # refactor to just look into name of layer and skip if it has dequantize in name or something like that. + if not dequantize and isinstance(layer, UniformDequantize): continue + if not dequantize and isinstance(layer, VariationalDequantize): continue + + # if isinstance(layer, MultiScale): + # X, Z = layer.call(X) + # Zs.append(Z) + # continue + + X = layer.call(X) + + # TODO: make sure this does not break case without multiscale architecture. + # append Zs to X;; do by vectorize and then concat. + + return X, Zs + + def predict_inv(self, X, Z=None): + n = X.shape[0] + + for layer in self.layers[::-1]: + + if isinstance(layer, MultiScale): + X = layer.call_inv(X, Z.pop()) + + else: + X = layer.call_inv(X) + + return np.array(X, dtype=np.int32) # makes it easier on matplotlib. + + def log_det(self): + logdet = 0. + + for layer in self.layers: + if isinstance(layer, tf.keras.layers.InputLayer): continue + logdet += layer.log_det() + return logdet + + + def loss(self, y_pred): + # computes negative log likelihood in bits per dimension. + # We are overriding the fit function, so we do not need to conform to tf.keras's pointless args. + return self.loss_log_det( y_pred) + self.loss_log_latent_density( y_pred) + + def loss_log_det(self, y_pred): + # divide by /d to get per dimension and divide by log(2) to get from log base E to log base 2. + d = tf.cast(tf.reduce_prod(y_pred.shape[1:]), tf.float32) + norm = d * np.log(2.) + log_det = self.log_det() / norm + + return - log_det + + + def loss_log_latent_density(self, y_pred): + # divide by /d to get per dimension and divide by log(2) to get from log base E to log base 2. 
+ batch_size = tf.cast(tf.shape(y_pred)[0], tf.float32) + d = tf.cast(tf.reduce_prod(y_pred.shape[1:]), tf.float32) + norm = d * np.log(2.) + normal = self.latent.log_density(y_pred) / (norm * batch_size) + + return - normal + + def compile(self, **kwargs): + # overrides what'ever loss the user specifieds; change to complain with exception if they specify it with + #TODO remove this function, since we are overriding fit, we don't need this + kwargs['loss'] = self.loss + + def lg_det(y_true, y_pred): return self.loss_log_det(y_true, y_pred) + def lg_latent(y_true, y_pred): return self.loss_log_latent_density(y_true, y_pred) + def lg_perfect(y_true, y_pred): return self.loss_log_latent_density(y_true, self.latent.sample(n=1000)) + + kwargs['metrics'] = [lg_det, lg_latent, lg_perfect] + + super(Generator, self).compile(**kwargs) + + def compute_and_apply_gradients(self,X,optimizer=None): + ''' + Computes gradients efficiently and updates weights + Returns - Loss on the batch + ''' + x = self.call(X) #I think putting this in context records all operations onto the tape, thereby destroying purpose of checkpointing... + last_layer = self.layers[-1] + #Computing gradients of loss function wrt the last acticvation + with tf.GradientTape() as tape: + tape.watch(x) + loss = self.loss(x) #May have to change + grads_combined = tape.gradient(loss,[x]) + dy = grads_combined[0] + y = x + #Computing gradients for each layer + for layer in self.layers[::-1]: + x = layer.call_inv(y) + dy,grads = layer.compute_gradients(x,dy,layer.log_det) #TODO implement scaling here... + optimizer.apply_gradients(zip(grads,layer.trainable_variables)) + y = x + return loss + + def fit(self, X, batch_size=32,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs): + ''' + Fits the model on dataset `X + ''' + # TODO add all other args from tf.keras.Model.fit + # TODO return a history object instead of array of losses + all_losses = [] + for j in range(epochs): + num_batches = X.shape[0] // batch_size + X = np.random.permutation(X) + #Minibatch gradient descent + for i in range(0,X.shape[0]-X.shape[0]%batch_size,batch_size): + # grads,loss = model.compute_gradients(X[i:(i+batch_size)]) + losses = [] + loss = self.compute_and_apply_gradients(X[i:(i+batch_size)],optimizer) + losses.append(loss.numpy()) + loss = np.mean(losses) + print(loss) + all_losses+=losses + return all_losses + + def fit_generator(self, generator,batches_per_epoch,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs): + ''' + Fits model on the data generator `generator + ''' + #TODO add all other args from tf.keras.Model.fit_generator + all_losses = [] + for j in range(epochs): + for i in tqdm(range(num_batches)): + losses = [] + loss = self.compute_and_apply_gradients(next(X),optimizer) + losses.append(loss.numpy()) + loss = np.mean(losses) + print(loss) + all_losses+=losses + return all_losses + + def rec(self, X): + + X, Zs = self.predict(X, dequantize=False) # TODO: deactivate dequantize. 
+ rec = self.predict_inv(X, Zs) + return rec + + def check_inv(self, X, precision=10**(-5)): + img_shape = X.shape[1:] + + rec = self.rec(X) + + if not np.allclose(X, rec, atol=precision): + fig, ax = plt.subplots(5, 3) + for i in range(5): + ax[i, 0].imshow(X[i].reshape(img_shape).astype(np.int32)) + ax[i, 0].set_title("Image") + ax[i, 1].imshow(rec[i].reshape(img_shape)) + ax[i, 1].set_title("Reconstruction") + ax[i, 2].imshow((X[i]-rec[i]).reshape(img_shape)) + ax[i, 2].set_title("Difference") + plt.show() + + + def sample(self, n=1000, fix_latent=True): + #Z = self.latent.sample(n=n, fix_latent=fix_latent) + + # Figure out how to handle shape of Z. If no multi-scale arch we want to do reshape below. + # If multi-scale arch we don't want to, predict_inv handles it. Figure out who has the responsibility. + + output_shape = self.layers[-1].output_shape[1:] + + X = self.latent.sample(shape=(n, ) + output_shape) + + for layer in self.layers[::-1]: + + if isinstance(layer, MultiScale): + Z = self.latent.sample(shape=X.shape) + X = layer.call_inv(X, Z) + else: + X = layer.call_inv(X) + + return np.array(X, dtype=np.int32) # makes it easier on matplotlib. + + return fakes + + diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index 9b3d6e8..92b5aff 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -6,417 +6,417 @@ from invtf.coupling_strategy import * """ - Known issue with multi-scale architecture. - The log-det computations normalizes wrt full dimension. + Known issue with multi-scale architecture. + The log-det computations normalizes wrt full dimension. """ #TODO Write unit tests class LayerWithGrads(keras.layers.Layer): - ''' - This is a virtual class from which all layer classes need to inherit - It has the function `compute gradients` which is used for constant - memory backprop. - ''' - def __init__(self,**kwargs): - super(LayerWithGrads,self).__init__(**kwargs) - - def call(self,X): - raise NotImplementedError - - def call_inv(self,X): - raise NotImplementedError - - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] - - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + ''' + This is a virtual class from which all layer classes need to inherit + It has the function `compute gradients` which is used for constant + memory backprop. 
+ ''' + def __init__(self,**kwargs): + super(LayerWithGrads,self).__init__(**kwargs) + + def call(self,X): + raise NotImplementedError + + def call_inv(self,X): + raise NotImplementedError + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] + + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads class Linear(LayerWithGrads): - def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) + def __init__(self, **kwargs): super(Linear, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - assert len(input_shape) == 2 - _, d = input_shape + assert len(input_shape) == 2 + _, d = input_shape - self.W = self.add_weight(shape=(d, d), initializer='identity', name="linear_weight") - self.b = self.add_weight(shape=(d), initializer='zero', name="linear_bias") - - super(Linear, self).build(input_shape) - self.built = True + self.W = self.add_weight(shape=(d, d), initializer='identity', name="linear_weight") + self.b = self.add_weight(shape=(d), initializer='zero', name="linear_bias") + + super(Linear, self).build(input_shape) + self.built = True - def call(self, X): return X @ self.W + self.b + def call(self, X): return X @ self.W + self.b - def call_inv(self, Z): return (Z - self.b) @ tf.linalg.inv(self.W) + def call_inv(self, Z): return (Z - self.b) @ tf.linalg.inv(self.W) - def jacobian(self): return self.W + def jacobian(self): return self.W - def log_det(self): return tf.math.log(tf.abs(tf.linalg.det(self.jacobian()))) + def log_det(self): return tf.math.log(tf.abs(tf.linalg.det(self.jacobian()))) - def compute_output_shape(self, input_shape): - self.output_shape = input_shape - return input_shape + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape class Affine(LayerWithGrads): - """ - The exp parameter allows the scaling to be exp(s) \odot X. - This cancels out the log in the log_det computations. - """ + """ + The exp parameter allows the scaling to be exp(s) \odot X. + This cancels out the log in the log_det computations. 
+ """ - def __init__(self, exp=False, **kwargs): - self.exp = exp - super(Affine, self).__init__(**kwargs) + def __init__(self, exp=False, **kwargs): + self.exp = exp + super(Affine, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - #assert len(input_shape) == 2 - d = input_shape[1:] + #assert len(input_shape) == 2 + d = input_shape[1:] - self.w = self.add_weight(shape=d, initializer='ones', name="affine_scale") - self.b = self.add_weight(shape=d, initializer='zero', name="affine_bias") + self.w = self.add_weight(shape=d, initializer='ones', name="affine_scale") + self.b = self.add_weight(shape=d, initializer='zero', name="affine_bias") - super(Affine, self).build(input_shape) - self.built = True + super(Affine, self).build(input_shape) + self.built = True - def call(self, X): - if self.exp: return X * tf.exp(self.w) + self.b - else: return X * self.w + self.b + def call(self, X): + if self.exp: return X * tf.exp(self.w) + self.b + else: return X * self.w + self.b - def call_inv(self, Z): - if self.exp: return (Z - self.b) / tf.exp(self.w) - else: return (Z - self.b) / self.w + def call_inv(self, Z): + if self.exp: return (Z - self.b) / tf.exp(self.w) + else: return (Z - self.b) / self.w - def jacobian(self): return self.w + def jacobian(self): return self.w - def eigenvalues(self): return self.w + def eigenvalues(self): return self.w - def log_det(self): - if self.exp: return tf.reduce_sum(tf.abs(self.eigenvalues())) - else: return tf.reduce_sum(tf.math.log(tf.abs(self.eigenvalues()))) + def log_det(self): + if self.exp: return tf.reduce_sum(tf.abs(self.eigenvalues())) + else: return tf.reduce_sum(tf.math.log(tf.abs(self.eigenvalues()))) - def compute_output_shape(self, input_shape): - self.output_shape = input_shape - return input_shape + def compute_output_shape(self, input_shape): + self.output_shape = input_shape + return input_shape """ - For simplicity we vectorize input and apply coupling to even/odd entries. - Could also use upper/lower. Refactor this to support specifying the pattern as a parameter. - - TODO: - Potentially refactor so we can add directly to AdditiveCoupling instead of creating 'm' - by (potentially adding to Sequential) and passing this on to AdditiveCoupling. - The main issue is AdditiveCoupling is R^2-> R^2 while m:R^1->R^1, so if we - add directly to AdditiveCoupling we run into issues with miss matching dimensions. - + For simplicity we vectorize input and apply coupling to even/odd entries. + Could also use upper/lower. Refactor this to support specifying the pattern as a parameter. + + TODO: + Potentially refactor so we can add directly to AdditiveCoupling instead of creating 'm' + by (potentially adding to Sequential) and passing this on to AdditiveCoupling. + The main issue is AdditiveCoupling is R^2-> R^2 while m:R^1->R^1, so if we + add directly to AdditiveCoupling we run into issues with miss matching dimensions. + """ class AdditiveCoupling(keras.Sequential): - unique_id = 1 + unique_id = 1 - def __init__(self, part=0, strategy=SplitOnHalfStrategy()): # strategy: alternate / split ;; alternate does odd/even, split has upper/lower. - super(AdditiveCoupling, self).__init__(name="add_coupling_%i"%AdditiveCoupling.unique_id) - AdditiveCoupling.unique_id += 1 - self.part = part - self.strategy = strategy + def __init__(self, part=0, strategy=SplitOnHalfStrategy()): # strategy: alternate / split ;; alternate does odd/even, split has upper/lower. 
+ super(AdditiveCoupling, self).__init__(name="add_coupling_%i"%AdditiveCoupling.unique_id) + AdditiveCoupling.unique_id += 1 + self.part = part + self.strategy = strategy - def build(self, input_shape): + def build(self, input_shape): - self.layers[0].build(input_shape=(None, 28**2/2)) - out_dim = self.layers[0].compute_output_shape(input_shape=(None, 28**2/2)) + self.layers[0].build(input_shape=(None, 28**2/2)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, 28**2/2)) - for layer in self.layers[1:]: - layer.build(input_shape=out_dim) - out_dim = layer.compute_output_shape(input_shape=out_dim) + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) - def call_(self, X): - for layer in self.layers: - X = layer.call(X) - return X + def call_(self, X): + for layer in self.layers: + X = layer.call(X) + return X - def call(self, X): - shape = tf.shape(X) - d = tf.reduce_prod(shape[1:]) - X = tf.reshape(X, (shape[0], d)) + def call(self, X): + shape = tf.shape(X) + d = tf.reduce_prod(shape[1:]) + X = tf.reshape(X, (shape[0], d)) - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: x0 = x0 + self.call_(x1) - if self.part == 1: x1 = x1 + self.call_(x0) + if self.part == 0: x0 = x0 + self.call_(x1) + if self.part == 1: x1 = x1 + self.call_(x0) - X = self.strategy.combine(x0, x1) + X = self.strategy.combine(x0, x1) - X = tf.reshape(X, shape) - return X + X = tf.reshape(X, shape) + return X - def call_inv(self, Z): - shape = tf.shape(Z) - d = tf.reduce_prod(shape[1:]) - Z = tf.reshape(Z, (shape[0], d)) + def call_inv(self, Z): + shape = tf.shape(Z) + d = tf.reduce_prod(shape[1:]) + Z = tf.reshape(Z, (shape[0], d)) - z0, z1 = self.strategy.split(Z) - - if self.part == 0: z0 = z0 - self.call_(z1) - if self.part == 1: z1 = z1 - self.call_(z0) + z0, z1 = self.strategy.split(Z) + + if self.part == 0: z0 = z0 - self.call_(z1) + if self.part == 1: z1 = z1 - self.call_(z0) - Z = self.strategy.combine(z0, z1) + Z = self.strategy.combine(z0, z1) - Z = tf.reshape(Z, shape) - return Z + Z = tf.reshape(Z, shape) + return Z - def log_det(self): return 0. + def log_det(self): return 0. - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Since the coupling layers do not inherit from `LayerWithGrads`, this - function is re-written. See TODO of AffineCoupling for further info - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Since the coupling layers do not inherit from `LayerWithGrads`, this + function is re-written. 
See TODO of AffineCoupling for further info + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads """ - The affine coupling layer is described in NICE, REALNVP and GLOW. - The description in Glow use a single network to output scale s and transform t, - it seems the description in REALNVP is a bit more general refering to s and t as - different functions. From this perspective Glow change the affine layer to have - weight sharing between s and t. - Specifying a single function is a lot simpler code-wise, we thus use that approach. + The affine coupling layer is described in NICE, REALNVP and GLOW. + The description in Glow use a single network to output scale s and transform t, + it seems the description in REALNVP is a bit more general refering to s and t as + different functions. From this perspective Glow change the affine layer to have + weight sharing between s and t. + Specifying a single function is a lot simpler code-wise, we thus use that approach. - For now assumes the use of convolutions + For now assumes the use of convolutions """ class AffineCoupling(keras.Sequential): #TODO Check gradient computations with and without reg - unique_id = 1 + unique_id = 1 - def __init__(self, part=0, strategy=SplitChannelsStrategy()): - super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) - AffineCoupling.unique_id += 1 - self.part = part - self.strategy = strategy + def __init__(self, part=0, strategy=SplitChannelsStrategy()): + super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) + AffineCoupling.unique_id += 1 + self.part = part + self.strategy = strategy - def build(self, input_shape): + def build(self, input_shape): - # handle the issue with each network output something larger. - _, h, w, c = input_shape + # handle the issue with each network output something larger. 
+ _, h, w, c = input_shape - h, w, c = self.strategy.coupling_shape(input_shape=(h,w,c)) + h, w, c = self.strategy.coupling_shape(input_shape=(h,w,c)) - self.layers[0].build(input_shape=(None, h, w, c)) - out_dim = self.layers[0].compute_output_shape(input_shape=(None, h, w, c)) - self.layers[0].output_shape_ = out_dim + self.layers[0].build(input_shape=(None, h, w, c)) + out_dim = self.layers[0].compute_output_shape(input_shape=(None, h, w, c)) + self.layers[0].output_shape_ = out_dim - for layer in self.layers[1:]: - layer.build(input_shape=out_dim) - out_dim = layer.compute_output_shape(input_shape=out_dim) - layer.output_shape_ = out_dim + for layer in self.layers[1:]: + layer.build(input_shape=out_dim) + out_dim = layer.compute_output_shape(input_shape=out_dim) + layer.output_shape_ = out_dim - def call_(self, X): + def call_(self, X): - in_shape = tf.shape(X) - n, h, w, c = X.shape + in_shape = tf.shape(X) + n, h, w, c = X.shape - for layer in self.layers: - X = layer.call(X) # residual + for layer in self.layers: + X = layer.call(X) # residual - # TODO: Could have a part of network learned specifically for s,t to not ONLY have wegith sharing? - - X = tf.reshape(X, (-1, h, w, c*2)) - s = X[:, :, w//2:, :] - t = X[:, :, :w//2, :] + # TODO: Could have a part of network learned specifically for s,t to not ONLY have wegith sharing? + + X = tf.reshape(X, (-1, h, w, c*2)) + s = X[:, :, w//2:, :] + t = X[:, :, :w//2, :] - s = tf.reshape(s, in_shape) - t = tf.reshape(t, in_shape) + s = tf.reshape(s, in_shape) + t = tf.reshape(t, in_shape) - return s, t + return s, t - def call(self, X): + def call(self, X): - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: - s, t = self.call_(x1) - x0 = x0*s + t + if self.part == 0: + s, t = self.call_(x1) + x0 = x0*s + t - if self.part == 1: - s, t = self.call_(x0) - x1 = x1*s + t + if self.part == 1: + s, t = self.call_(x0) + x1 = x1*s + t - X = self.strategy.combine(x0, x1) - return X + X = self.strategy.combine(x0, x1) + return X - def call_inv(self, Z): - z0, z1 = self.strategy.split(Z) - - if self.part == 0: - s, t = self.call_(z1) - z0 = (z0 - t)/s - if self.part == 1: - s, t = self.call_(z0) - z1 = (z1 - t)/s + def call_inv(self, Z): + z0, z1 = self.strategy.split(Z) + + if self.part == 0: + s, t = self.call_(z1) + z0 = (z0 - t)/s + if self.part == 1: + s, t = self.call_(z0) + z1 = (z1 - t)/s - Z = self.strategy.combine(z0, z1) - return Z + Z = self.strategy.combine(z0, z1) + return Z - def log_det(self): + def log_det(self): - # TODO: save 's' instead of recomputing. + # TODO: save 's' instead of recomputing. - X = self.input - n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) + X = self.input + n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) - x0, x1 = self.strategy.split(X) + x0, x1 = self.strategy.split(X) - if self.part == 0: - s, t = self.call_(x1) - if self.part == 1: - s, t = self.call_(x0) + if self.part == 0: + s, t = self.call_(x1) + if self.part == 1: + s, t = self.call_(x0) - # there is an issue with 's' being divided by dimension 'd' later: - # If we used MultiScale it will be lower dimensional, in this case - # we should not divide by d but d//2. + # there is an issue with 's' being divided by dimension 'd' later: + # If we used MultiScale it will be lower dimensional, in this case + # we should not divide by d but d//2. 
- return tf.reduce_sum(tf.math.log(tf.abs(s))) / n + return tf.reduce_sum(tf.math.log(tf.abs(s))) / n - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape - def summary(self, line_length=None, positions=None, print_fn=None): - print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. + def summary(self, line_length=None, positions=None, print_fn=None): + print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. - def compute_gradients(self,x,dy,regularizer=None): - ''' - Computes gradients for backward pass - Args: - x - tensor compatible with forward pass, input to the layer - dy - incoming gradient from backprop - regularizer - function, indicates dependence of loss on weights of layer - Returns - dy - gradients wrt input, to be backpropagated - grads - gradients wrt weights - ''' - #TODO check if log_det of AffineCouplingLayer needs a regularizer. -- DONE, it does - with tf.GradientTape() as tape: - tape.watch(x) - y_ = self.call(x) #Required to register the operation onto the gradient tape - grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) - dy,grads = grads_combined[0],grads_combined[1:] + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer needs a regularizer. -- DONE, it does + with tf.GradientTape() as tape: + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + dy,grads = grads_combined[0],grads_combined[1:] - if regularizer is not None: - with tf.GradientTape() as tape: - reg = -regularizer() - grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - return dy,grads + if regularizer is not None: + with tf.GradientTape() as tape: + reg = -regularizer() + grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + return dy,grads """ - Try different techniques: I'm implementing the simplest case, just reshape to desired shape. - TODO: Implement the following Squeeze strategies: - - RealNVP - - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. - - ... + Try different techniques: I'm implementing the simplest case, just reshape to desired shape. + TODO: Implement the following Squeeze strategies: + - RealNVP + - Downscale images, e.g. alternate pixels and have 4 lower dim images and stack them. + - ... """ class Squeeze(LayerWithGrads): - def call(self, X): - n, self.w, self.h, self.c = X.shape - return tf.reshape(X, [-1, self.w//2, self.h//2, self.c*4]) + def call(self, X): + n, self.w, self.h, self.c = X.shape + return tf.reshape(X, [-1, self.w//2, self.h//2, self.c*4]) - def call_inv(self, X): - return tf.reshape(X, [-1, self.w, self.h, self.c]) - - def log_det(self): return 0. + def call_inv(self, X): + return tf.reshape(X, [-1, self.w, self.h, self.c]) + + def log_det(self): return 0. 
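To see how the per-layer compute_gradients contract above composes into the constant-memory backward pass of compute_and_apply_gradients in the generator, here is a minimal standalone sketch. ToyScale is a stand-in layer invented for the example (it is not part of invtf); it only implements the same call / call_inv / log_det / compute_gradients interface, and the backward loop mirrors the generator's, minus the optimizer update.

import tensorflow as tf

class ToyScale(tf.keras.layers.Layer):
    # Stand-in invertible layer for illustration only; not a class from invtf.
    def build(self, input_shape):
        self.s = self.add_weight(name="s", shape=(), initializer="ones")

    def call(self, X):
        return X * self.s

    def call_inv(self, Z):
        return Z / self.s

    def log_det(self):
        return tf.math.log(tf.abs(self.s))

    def compute_gradients(self, x, dy, regularizer=None):
        # same recipe as LayerWithGrads.compute_gradients above
        with tf.GradientTape() as tape:
            tape.watch(x)
            y_ = self.call(x)
        grads_combined = tape.gradient(y_, [x] + self.trainable_variables, output_gradients=dy)
        dy, grads = grads_combined[0], grads_combined[1:]
        if regularizer is not None:
            with tf.GradientTape() as tape:
                reg = -regularizer()
            reg_grads = tape.gradient(reg, self.trainable_variables)
            grads = [g + r for g, r in zip(grads, reg_grads)]
        return dy, grads

stack = [ToyScale(), ToyScale()]
x = tf.random.normal((4, 3))

y = x
for layer in stack:                  # forward pass; intermediate activations are not kept
    y = layer(y)

with tf.GradientTape() as tape:      # gradient of a dummy loss wrt the final output only
    tape.watch(y)
    loss = tf.reduce_sum(y ** 2)
dy = tape.gradient(loss, y)

for layer in stack[::-1]:            # walk backwards: invert, then differentiate one layer
    x_rec = layer.call_inv(y)
    dy, grads = layer.compute_gradients(x_rec, dy, layer.log_det)
    # compute_and_apply_gradients would hand `grads` to the optimizer here
    y = x_rec                        # only this layer's activations are alive at this point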
# TODO: for now assumes target is +-1, refactor to support any target. # Refactor 127.5 class Normalize(LayerWithGrads): # normalizes data after dequantization. - def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): - super(Normalize, self).__init__(input_shape=input_shape) - self.target = target - self.d = np.prod(input_shape) - self.scale = 1/127.5 + def __init__(self, target=[-1,+1], scale=127.5, input_shape=None): + super(Normalize, self).__init__(input_shape=input_shape) + self.target = target + self.d = np.prod(input_shape) + self.scale = 1/127.5 - def call(self, X): - X = X * self.scale - 1 - return X + def call(self, X): + X = X * self.scale - 1 + return X - def call_inv(self, Z): - Z = Z + 1 - Z = Z / self.scale - return Z + def call_inv(self, Z): + Z = Z + 1 + Z = Z / self.scale + return Z - def log_det(self): return self.d * tf.math.log(self.scale) + def log_det(self): return self.d * tf.math.log(self.scale) class MultiScale(keras.layers.Layer): - def call(self, X): # TODO: have different strategies here, and combine it with how coupling layer works? - n, w, h, c = X.shape - Z = X[:, :, :, c//2:] - X = X[:, :, :, :c//2] - return X, Z - - def call_inv(self, X, Z): - return tf.concat((X, Z), axis=-1) + def call(self, X): # TODO: have different strategies here, and combine it with how coupling layer works? + n, w, h, c = X.shape + Z = X[:, :, :, c//2:] + X = X[:, :, :, :c//2] + return X, Z + + def call_inv(self, X, Z): + return tf.concat((X, Z), axis=-1) - def compute_output_shape(self, input_shape): - n, h, w, c = input_shape - return (n, h, w, c//2) + def compute_output_shape(self, input_shape): + n, h, w, c = input_shape + return (n, h, w, c//2) - def log_det(self): return 0. + def log_det(self): return 0. class ActNorm(keras.layers.Layer): pass @@ -424,155 +424,155 @@ class ActNorm(keras.layers.Layer): pass class Inv1x1Conv(LayerWithGrads): - """ - Based on Glow page 11 appendix B. - It is possible to speed up determinant computation by using PLU or QR decomposition - as proposed in Glow and Emerging Conv papers respectively. + """ + Based on Glow page 11 appendix B. + It is possible to speed up determinant computation by using PLU or QR decomposition + as proposed in Glow and Emerging Conv papers respectively. - Add bias to this operation? Try to see if it makes any difference. + Add bias to this operation? Try to see if it makes any difference. - Try to compare speed / numerical stability etc for different implementations: + Try to compare speed / numerical stability etc for different implementations: - 1. PLU decomposition - 2. QR - 3. Normal determinant O(c^3) - 4. tensordot vs conv2d. - """ + 1. PLU decomposition + 2. QR + 3. Normal determinant O(c^3) + 4. tensordot vs conv2d. 
+ """ - def __init__(self, **kwargs): super(Inv1x1Conv, self).__init__(**kwargs) + def __init__(self, **kwargs): super(Inv1x1Conv, self).__init__(**kwargs) - def build(self, input_shape): + def build(self, input_shape): - _, h, w, c = input_shape - self.c = c - self.h = h - self.w = w + _, h, w, c = input_shape + self.c = c + self.h = h + self.w = w - #w_init = np.linalg.qr(np.random.randn(c,c))[0] - self.W = self.add_weight(shape=(c, c), initializer=keras.initializers.Orthogonal(gain=1.0, seed=None), name="inv_1x1_conv") - self.W_inv = tf.linalg.inv(self.W) - - super(Inv1x1Conv, self).build(input_shape) - self.built = True + #w_init = np.linalg.qr(np.random.randn(c,c))[0] + self.W = self.add_weight(shape=(c, c), initializer=keras.initializers.Orthogonal(gain=1.0, seed=None), name="inv_1x1_conv") + self.W_inv = tf.linalg.inv(self.W) + + super(Inv1x1Conv, self).build(input_shape) + self.built = True - def call(self, X): - _W = tf.reshape(self.W, (1,1, self.c, self.c)) - return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") + def call(self, X): + _W = tf.reshape(self.W, (1,1, self.c, self.c)) + return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") - def call_inv(self, Z): - _W = tf.reshape(self.W_inv, (1,1, self.c, self.c)) - return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") + def call_inv(self, Z): + _W = tf.reshape(self.W_inv, (1,1, self.c, self.c)) + return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") - def log_det(self): # TODO: Fix this issue!!! - print(self.h, self.w, tf.linalg.det(self.W)) - return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) + def log_det(self): # TODO: Fix this issue!!! + print(self.h, self.w, tf.linalg.det(self.W)) + return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.W) )) - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape class Glow1x1Conv(LayerWithGrads): - # Could be speed up parameterizing in LU decomposition. - def build(self, input_shape): - _, h, w, c = input_shape + # Could be speed up parameterizing in LU decomposition. + def build(self, input_shape): + _, h, w, c = input_shape - self.h, self.w = h, w - - # make L and U lower and upper triangular by masking with zeros. - self.L = self.add_weight(shape=(c, c), initializer="zeros", name="weights") - self.U = self.add_weight(shape=(c, c), initializer="zeros", name="weights") - self.eigenvals = self.add_weight(shape=(c, 1), initializer="ones", name="weights") + self.h, self.w = h, w + + # make L and U lower and upper triangular by masking with zeros. + self.L = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.U = self.add_weight(shape=(c, c), initializer="zeros", name="weights") + self.eigenvals = self.add_weight(shape=(c, 1), initializer="ones", name="weights") - identity = tf.constant(np.identity(c), dtype=tf.float32) + identity = tf.constant(np.identity(c), dtype=tf.float32) - """ - >>> # Creating masks. - >>> np.triu(np.ones((4,4)), k=+1) - array([[0., 1., 1., 1.], - [0., 0., 1., 1.], - [0., 0., 0., 1.], - [0., 0., 0., 0.]]) - >>> np.tril(np.ones((4,4)), k=-1) - array([[0., 0., 0., 0.], - [1., 0., 0., 0.], - [1., 1., 0., 0.], - [1., 1., 1., 0.]]) - """ + """ + >>> # Creating masks. 
+ >>> np.triu(np.ones((4,4)), k=+1) + array([[0., 1., 1., 1.], + [0., 0., 1., 1.], + [0., 0., 0., 1.], + [0., 0., 0., 0.]]) + >>> np.tril(np.ones((4,4)), k=-1) + array([[0., 0., 0., 0.], + [1., 0., 0., 0.], + [1., 1., 0., 0.], + [1., 1., 1., 0.]]) + """ - upper_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) - lower_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) + upper_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) + lower_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) - self.L = lower_mask * self.L + identity - self.U = upper_mask * self.U + identity - - self.kernel = self.L @ (self.eigenvals * self.U) - self.kernel_inv = tf.linalg.inv(self.kernel) + self.L = lower_mask * self.L + identity + self.U = upper_mask * self.U + identity + + self.kernel = self.L @ (self.eigenvals * self.U) + self.kernel_inv = tf.linalg.inv(self.kernel) - def call(self, inputs): return tf.tensordot(inputs, self.kernel, axes=((-1), (0))) - def call_inv(self, inputs): return tf.tensordot(inputs, self.kernel_inv, axes=((-1), (0))) - def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.eigenvals))) + def call(self, inputs): return tf.tensordot(inputs, self.kernel, axes=((-1), (0))) + def call_inv(self, inputs): return tf.tensordot(inputs, self.kernel_inv, axes=((-1), (0))) + def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.eigenvals))) - def compute_output_shape(self, input_shape): return input_shape + def compute_output_shape(self, input_shape): return input_shape class Conv3DCirc(LayerWithGrads): - def __init__(self,trainable=True): - self.built = False - super(Conv3DCirc, self).__init__() + def __init__(self,trainable=True): + self.built = False + super(Conv3DCirc, self).__init__() - def call(self, X): - if self.built == False: #For some reason the layer is not being built without this line - self.build(X.get_shape().as_list()) + def call(self, X): + if self.built == False: #For some reason the layer is not being built without this line + self.build(X.get_shape().as_list()) - #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier - #EagerTensor is required for backprop to work... - #Further, updating w_real will automatically trigger an update on self.w, so it is better to not store w at all - #TODO - figure out a way to avoid, or open an issue with tf... - self.w = tf.cast(self.w_real, dtype=tf.complex64) - self.w = tf.signal.fft3d(self.w / self.scale) + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + #EagerTensor is required for backprop to work... + #Further, updating w_real will automatically trigger an update on self.w, so it is better to not store w at all + #TODO - figure out a way to avoid, or open an issue with tf... 
+ self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) - X = tf.cast(X, dtype=tf.complex64) - X = tf.signal.fft3d(X / self.scale) - X = X * self.w - X = tf.signal.ifft3d(X * self.scale ) - X = tf.math.real(X) - return X + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X / self.scale) + X = X * self.w + X = tf.signal.ifft3d(X * self.scale ) + X = tf.math.real(X) + return X - def call_inv(self, X): - X = tf.cast(X, dtype=tf.complex64) - X = tf.signal.fft3d(X * self.scale ) # self.scale correctly - #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier - self.w = tf.cast(self.w_real, dtype=tf.complex64) - self.w = tf.signal.fft3d(self.w / self.scale) + def call_inv(self, X): + X = tf.cast(X, dtype=tf.complex64) + X = tf.signal.fft3d(X * self.scale ) # self.scale correctly + #The next 2 lines are a redundant computation necessary because w needs to be an EagerTensor for the output to be eagerly executed, and that was not the case earlier + self.w = tf.cast(self.w_real, dtype=tf.complex64) + self.w = tf.signal.fft3d(self.w / self.scale) - X = X / self.w + X = X / self.w - X = tf.signal.ifft3d(X / self.scale) - X = tf.math.real(X) - return X + X = tf.signal.ifft3d(X / self.scale) + X = tf.math.real(X) + return X - def log_det(self): return tf.math.reduce_sum(tf.math.log(tf.math.abs(tf.signal.fft3d(tf.cast(self.w_real/self.scale,dtype=tf.complex64))))) #Need to return EagerTensor + def log_det(self): return tf.math.reduce_sum(tf.math.log(tf.math.abs(tf.signal.fft3d(tf.cast(self.w_real/self.scale,dtype=tf.complex64))))) #Need to return EagerTensor - def build(self, input_shape): - self.scale = np.sqrt(np.prod(input_shape[1:])) # np.sqrt(np.prod([a.value for a in input_shape[1:]])) + def build(self, input_shape): + self.scale = np.sqrt(np.prod(input_shape[1:])) # np.sqrt(np.prod([a.value for a in input_shape[1:]])) - # todo; change to [[[1, 0000],[0000], [000]] + # todo; change to [[[1, 0000],[0000], [000]] - def identitiy_initializer_real(shape, dtype=None): - return (tf.math.real(tf.signal.ifft3d(tf.ones(shape, dtype=tf.complex64)*self.scale))) + def identitiy_initializer_real(shape, dtype=None): + return (tf.math.real(tf.signal.ifft3d(tf.ones(shape, dtype=tf.complex64)*self.scale))) - self.w_real = self.add_variable(name="w_real",shape=input_shape[1:], initializer=identitiy_initializer_real, trainable=True) - # self.w = tf.cast(self.w_real, dtype=tf.complex64) #hacky way to initialize real w and actual w, since tf does weird stuff if 'variable' is modified - # self.w = tf.signal.fft3d(self.w / self.scale) - self.built = True - + self.w_real = self.add_variable(name="w_real",shape=input_shape[1:], initializer=identitiy_initializer_real, trainable=True) + # self.w = tf.cast(self.w_real, dtype=tf.complex64) #hacky way to initialize real w and actual w, since tf does weird stuff if 'variable' is modified + # self.w = tf.signal.fft3d(self.w / self.scale) + self.built = True + - def compute_output_shape(self, input_shape): - return tf.TensorShape(input_shape[1:]) + def compute_output_shape(self, input_shape): + return tf.TensorShape(input_shape[1:]) @@ -584,13 +584,13 @@ class InvResNet(keras.layers.Layer): pass # model should automaticall # Theoretically time is the same? 
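A quick check of identitiy_initializer_real above (standalone numpy, illustrative shape, not part of the patch): initializing w_real to real(ifft3d(ones * scale)) makes the frequency response fft3d(w_real / scale) the all-ones tensor, so Conv3DCirc starts out as the identity map and its log_det starts at zero.

import numpy as np

shape = (4, 4, 3)                                   # illustrative input shape
scale = np.sqrt(np.prod(shape))
w_real = np.real(np.fft.ifftn(np.ones(shape, dtype=np.complex64) * scale))
W = np.fft.fftn(w_real / scale)                     # frequency response at initialization

print(np.allclose(W, 1.0, atol=1e-3))               # all-ones response: the layer is the identity
print(abs(np.sum(np.log(np.abs(W)))) < 1e-3)        # and log_det starts at zero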
class CircularConv(keras.layers.Layer): - def __init__(self, dim=3): # - self.dim = dim + def __init__(self, dim=3): # + self.dim = dim - def call(self, X): pass - - def call_inv(self, X): pass + def call(self, X): pass + + def call_inv(self, X): pass - def log_det(self): pass + def log_det(self): pass From 85fe1713cf7452a1379ec632cb75ea9930105ce3 Mon Sep 17 00:00:00 2001 From: anshuln Date: Mon, 15 Jul 2019 21:16:05 +0530 Subject: [PATCH 05/11] Adds const memory backprop for AffineCoupling and other layers. However, in some tests, the computed gradient is NaN for AffineCoupling, and this bug remains even if O(n) gradient calculations are used. --- invtf/generator_const_backprop.py | 49 ++++++++++++++++--------------- invtf/layers_const_backprop.py | 40 ++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 28 deletions(-) diff --git a/invtf/generator_const_backprop.py b/invtf/generator_const_backprop.py index fc61601..96782ff 100644 --- a/invtf/generator_const_backprop.py +++ b/invtf/generator_const_backprop.py @@ -8,7 +8,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_CPP_MIN_VLOG_LEVEL']='3' -import tqdm +from tqdm import tqdm import tensorflow as tf import invtf.grow_memory import tensorflow.keras as keras @@ -130,28 +130,28 @@ def add(self, layer): - def predict(self, X, dequantize=True): + # def predict(self, X, dequantize=True): - Zs = [] + # Zs = [] - for layer in self.layers: + # for layer in self.layers: - # allow deactivating dequenatize - # refactor to just look into name of layer and skip if it has dequantize in name or something like that. - if not dequantize and isinstance(layer, UniformDequantize): continue - if not dequantize and isinstance(layer, VariationalDequantize): continue + # # allow deactivating dequenatize + # # refactor to just look into name of layer and skip if it has dequantize in name or something like that. + # if not dequantize and isinstance(layer, UniformDequantize): continue + # if not dequantize and isinstance(layer, VariationalDequantize): continue - # if isinstance(layer, MultiScale): - # X, Z = layer.call(X) - # Zs.append(Z) - # continue + # # if isinstance(layer, MultiScale): + # # X, Z = layer.call(X) + # # Zs.append(Z) + # # continue - X = layer.call(X) + # X = layer.call(X) - # TODO: make sure this does not break case without multiscale architecture. - # append Zs to X;; do by vectorize and then concat. + # # TODO: make sure this does not break case without multiscale architecture. + # # append Zs to X;; do by vectorize and then concat. - return X, Zs + # return X, Zs def predict_inv(self, X, Z=None): n = X.shape[0] @@ -220,17 +220,17 @@ def compute_and_apply_gradients(self,X,optimizer=None): last_layer = self.layers[-1] #Computing gradients of loss function wrt the last acticvation with tf.GradientTape() as tape: - tape.watch(x) - loss = self.loss(x) #May have to change + tape.watch(x) + loss = self.loss(x) #May have to change grads_combined = tape.gradient(loss,[x]) dy = grads_combined[0] y = x #Computing gradients for each layer for layer in self.layers[::-1]: - x = layer.call_inv(y) - dy,grads = layer.compute_gradients(x,dy,layer.log_det) #TODO implement scaling here... - optimizer.apply_gradients(zip(grads,layer.trainable_variables)) - y = x + x = layer.call_inv(y) + dy,grads = layer.compute_gradients(x,dy,layer.log_det) #TODO implement scaling here... 
+ optimizer.apply_gradients(zip(grads,layer.trainable_variables)) + y = x return loss def fit(self, X, batch_size=32,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs): @@ -244,13 +244,14 @@ def fit(self, X, batch_size=32,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs) num_batches = X.shape[0] // batch_size X = np.random.permutation(X) #Minibatch gradient descent - for i in range(0,X.shape[0]-X.shape[0]%batch_size,batch_size): + for i in tqdm(range(0,X.shape[0]-X.shape[0]%batch_size,batch_size)): + print("Minibatch: ",i) # grads,loss = model.compute_gradients(X[i:(i+batch_size)]) losses = [] loss = self.compute_and_apply_gradients(X[i:(i+batch_size)],optimizer) losses.append(loss.numpy()) loss = np.mean(losses) - print(loss) + print('Epoch: {}, loss: {}'.format(j,loss)) all_losses+=losses return all_losses diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index 8e1d9b4..113784f 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -364,7 +364,7 @@ def call(self, X): def call_inv(self, X): return tf.reshape(X, [-1, self.w, self.h, self.c]) - def log_det(self): return 0. + def log_det(self): return tf.zeros((1,)) # TODO: for now assumes target is +-1, refactor to support any target. @@ -405,7 +405,7 @@ def compute_output_shape(self, input_shape): n, h, w, c = input_shape return (n, h, w, c//2) - def log_det(self): return 0. + def log_det(self): return tf.zeros((1,)) @@ -488,7 +488,7 @@ def call_inv(self, X): pass def log_det(self): pass -class ActNorm(keras.layers.Layer): +class ActNorm(LayerWithGrads): """ The exp parameter allows the scaling to be exp(s) \odot X. @@ -610,8 +610,10 @@ def call(self, X): x1 = x1*s + t self.precompute_log_det(s, X) - + # print("s",np.isnan(s),np.isnan(t)) X = self.strategy.combine(x0, x1) + print("s",np.isnan(s).all(),"t",np.isnan(t).all()) + print("X0",np.isnan(x0).all(),"X1",np.isnan(x1).all()) return X def call_inv(self, Z): @@ -637,3 +639,33 @@ def compute_output_shape(self, input_shape): return input_shape def summary(self, line_length=None, positions=None, print_fn=None): print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. + + def compute_gradients(self,x,dy,regularizer=None): + ''' + Computes gradients for backward pass + Args: + x - tensor compatible with forward pass, input to the layer + dy - incoming gradient from backprop + regularizer - function, indicates dependence of loss on weights of layer + Returns + dy - gradients wrt input, to be backpropagated + grads - gradients wrt weights + ''' + #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. -- It does + with tf.GradientTape(persistent=False) as tape: #Since log_det is computed within call + tape.watch(x) + y_ = self.call(x) #Required to register the operation onto the gradient tape + reg = self._log_det + #TODO known issue, gradient goes to nan in some cases... 
+ grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) + grads_wrt_reg = tape.gradient(reg,self.trainable_variables) + dy,grads = grads_combined[0],grads_combined[1:] + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + del tape #Since tape was persistent, we need this + + + # if regularizer is not None: + # with tf.GradientTape() as tape: + # reg = -regularizer() + # grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + return dy,grads From 1c95d3867cbe04e6a07eee8021739c82d30b330a Mon Sep 17 00:00:00 2001 From: anshuln Date: Mon, 15 Jul 2019 22:49:25 +0530 Subject: [PATCH 06/11] Adds custom fit funtion using ndarray or tf Tensor --- invtf/generator_const_backprop.py | 67 +++++++++++++++++++++++++------ invtf/layers_const_backprop.py | 7 ++-- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/invtf/generator_const_backprop.py b/invtf/generator_const_backprop.py index 96782ff..6ac49cf 100644 --- a/invtf/generator_const_backprop.py +++ b/invtf/generator_const_backprop.py @@ -233,26 +233,67 @@ def compute_and_apply_gradients(self,X,optimizer=None): y = x return loss - def fit(self, X, batch_size=32,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs): + def fit(self, X, batch_size=32,epochs=1,verbose=1,validation_split=0.0, + validation_data=None, + shuffle=True, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_freq=1, + optimizer=tf.optimizers.Adam(),**kwargs): ''' - Fits the model on dataset `X + Fits the model on dataset `X (not a generator) + Note - for very big datasets, the function will give OOM, + consider using a generator + Args- + X - Data to be fitted. Maybe one of the following- + tf.EagerTensor + np.ndarray + #TODO add support for tf.data.Dataset and tf.keras.Sequence + batch_size - Number of elements in each minibatch + verbose - Logging level + validation_split - Amount of data to be used for validation in each epoch + For tensors or arrays, data is extracted from initial part of dataset. + shuffle - Should training data be shuffled before mini-batches are extracted + steps_per_epoch - Number of training steps per epoch. Used mainly for generators. + validation_steps - Number of validation steps per epoch. Used mainly for generators. 
+ ''' - # TODO add all other args from tf.keras.Model.fit + # TODO add all callbacks from tf.keras.Model.fit # TODO return a history object instead of array of losses all_losses = [] - for j in range(epochs): - num_batches = X.shape[0] // batch_size - X = np.random.permutation(X) + if validation_split > 0 and validation_data is None: + validation_data = X[:int(len(X)*validation_split)] + X = X[int(len(X)*validation_split):] + + epoch_gen = range(initial_epoch,epochs) + if verbose == 1: + epoch_gen = tqdm(epoch_gen) + batch_size = min(batch_size,X.shape[0]) #Sanity check + num_batches = X.shape[0] // batch_size + if steps_per_epoch == None: + steps_per_epoch = num_batches + + for j in epoch_gen: + if shuffle == True: + X = np.random.permutation(X) #Works for np.ndarray and tf.EagerTensor, however, turns everything to numpy #Minibatch gradient descent - for i in tqdm(range(0,X.shape[0]-X.shape[0]%batch_size,batch_size)): - print("Minibatch: ",i) - # grads,loss = model.compute_gradients(X[i:(i+batch_size)]) + range_gen = range(steps_per_epoch) + if verbose == 2: + range_gen = tqdm(range_gen) + for i in range_gen: losses = [] - loss = self.compute_and_apply_gradients(X[i:(i+batch_size)],optimizer) + loss = self.compute_and_apply_gradients(X[i*batch_size:(i+1)*(batch_size)],optimizer) losses.append(loss.numpy()) - loss = np.mean(losses) - print('Epoch: {}, loss: {}'.format(j,loss)) + loss = np.mean(losses) all_losses+=losses + to_print = 'Epoch: {}/{}, training_loss: {}'.format(j,epochs,loss) + if validation_data is not None: + val_loss = self.loss(validation_data) + to_print += ', val_loss: {}'.format(val_loss.numpy()) #TODO return val_loss somehow + if verbose == 2: + print(to_print) + return all_losses def fit_generator(self, generator,batches_per_epoch,epochs=1,optimizer=tf.optimizers.Adam(),**kwargs): @@ -267,7 +308,7 @@ def fit_generator(self, generator,batches_per_epoch,epochs=1,optimizer=tf.optimi loss = self.compute_and_apply_gradients(next(X),optimizer) losses.append(loss.numpy()) loss = np.mean(losses) - print(loss) + print('Epoch: {}, loss: {}'.format(j,loss)) all_losses+=losses return all_losses diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index 113784f..2ce1315 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -612,8 +612,9 @@ def call(self, X): self.precompute_log_det(s, X) # print("s",np.isnan(s),np.isnan(t)) X = self.strategy.combine(x0, x1) - print("s",np.isnan(s).all(),"t",np.isnan(t).all()) - print("X0",np.isnan(x0).all(),"X1",np.isnan(x1).all()) + #Diagnostic statements for testing NaN gradient + # print("s",np.isnan(s).all(),"t",np.isnan(t).all()) + # print("X0",np.isnan(x0).all(),"X1",np.isnan(x1).all()) return X def call_inv(self, Z): @@ -652,7 +653,7 @@ def compute_gradients(self,x,dy,regularizer=None): grads - gradients wrt weights ''' #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. 
-- It does - with tf.GradientTape(persistent=False) as tape: #Since log_det is computed within call + with tf.GradientTape(persistent=True) as tape: #Since log_det is computed within call tape.watch(x) y_ = self.call(x) #Required to register the operation onto the gradient tape reg = self._log_det From eefd903efa6ca8222002015946059f67dc689719 Mon Sep 17 00:00:00 2001 From: anshuln Date: Tue, 16 Jul 2019 14:36:36 +0530 Subject: [PATCH 07/11] Adds unit tests for gradient descent --- test/gradients.py | 90 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 test/gradients.py diff --git a/test/gradients.py b/test/gradients.py new file mode 100644 index 0000000..7c8aac3 --- /dev/null +++ b/test/gradients.py @@ -0,0 +1,90 @@ +import unittest +import sys +sys.path.append("../") +import invtf.latent +# import invtf.layers +#from tensorflow.python.ops.parallel_for.gradients import jacobian +import tensorflow as tf +import tensorflow.keras as keras +import numpy as np +from invtf.generator_const_backprop import Generator as GenConst +from invtf.layers_const_backprop import * +from tensorflow.keras.layers import ReLU, Dense, Flatten, Conv2D + +class GeneratorGradTest(GenConst): + def compute_gradients(self,X): + x = self.call(X) #I think putting this in context records all operations onto the tape, thereby destroying purpose of checkpointing... + last_layer = self.layers[-1] + d = np.prod(X.shape[1:]) + #Computing gradients of loss function wrt the last acticvation + with tf.GradientTape() as tape: + tape.watch(x) + loss = self.loss(x) #May have to change + grads_combined = tape.gradient(loss,[x]) + dy = grads_combined[0] + y = x + #Computing gradients for each layer + gradients = [] + for layer in self.layers[::-1]: + x = layer.call_inv(y) + dy,grads = layer.compute_gradients(x,dy,layer.log_det,d*np.log(2.)) #TODO implement scaling here -- DONE + gradients+=(grads) + y = x + if len(gradients) > 1: #TODO better fix for issue with only single gradient + return [gradients[-2]]+[gradients[-1]]+gradients[0:-2][::-1] + else: + return gradients + + def actual_gradients(self,X): + with tf.GradientTape() as tape: + loss = self.loss(self.call(X)) + grads = tape.gradient(loss,self.trainable_variables) + return grads + +class TestGradients(unittest.TestCase): + X = keras.datasets.cifar10.load_data()[0][0][:5].astype('f') # a single cifar image batch. 
+
+	def assertGrad(self,g,X):
+		computed_grads = g.compute_gradients(X)
+		actual_grads = g.actual_gradients(X)
+		A = [np.allclose(np.abs(x[0]-x[1]),0,atol=1, rtol=0.1) for x in zip(computed_grads,actual_grads) if x[0] is not None]
+		# print("computed",computed_grads,"actual_grads",actual_grads)
+		print("Max discrepancy in gradients",np.max(np.array([(np.max(np.abs(x[0]-x[1]))) for x in zip(computed_grads,actual_grads) if x[0] is not None])))
+		self.assertTrue(np.array(A).all())
+
+	def test_circ_conv(self):
+		X = TestGradients.X
+		d = 32*32*3
+		g = GeneratorGradTest(invtf.latent.Normal(d))
+		g.add(Conv3DCirc())
+		g.predict(X[:1])
+		self.assertGrad(g,X)
+
+	def test_inv_conv(self):
+		X = TestGradients.X
+		d = 32*32*3
+		g = GeneratorGradTest(invtf.latent.Normal(d))
+		g.add(Inv1x1ConvPLU())
+		g.predict(X[:1])
+		self.assertGrad(g,X)
+
+	def test_act_norm(self):
+		X = TestGradients.X
+		d = 32*32*3
+		g = GeneratorGradTest(invtf.latent.Normal(d))
+		g.add(ActNorm())
+		g.predict(X[:1])
+		self.assertGrad(g,X)
+
+	def test_affine_coupling(self):
+		X = TestGradients.X
+		d = 32*32*3
+		g = GeneratorGradTest(invtf.latent.Normal(d))
+		b = AffineCoupling()
+		b.add(Flatten())
+		b.add(Dense(d,activation='relu'))
+		g.add(Squeeze())
+		g.add(b)
+		# g.predict(X[:1])
+		self.assertGrad(g,X)
+

From 990b30275345d28b927d73421102f98b61ea6ce0 Mon Sep 17 00:00:00 2001
From: anshuln
Date: Tue, 16 Jul 2019 17:34:27 +0530
Subject: [PATCH 08/11] Fixes incorrect gradient for AffineCoupling

---
 invtf/generator_const_backprop.py |  2 +-
 invtf/layers_const_backprop.py    | 84 +++++++++++++++----------------
 test.py                           |  6 +--
 test/gradients.py                 | 58 ++++++++++-----------
 4 files changed, 76 insertions(+), 74 deletions(-)

diff --git a/invtf/generator_const_backprop.py b/invtf/generator_const_backprop.py
index 6ac49cf..3ff077d 100644
--- a/invtf/generator_const_backprop.py
+++ b/invtf/generator_const_backprop.py
@@ -229,7 +229,7 @@ def compute_and_apply_gradients(self,X,optimizer=None):
 		for layer in self.layers[::-1]:
 			x = layer.call_inv(y)
 			dy,grads = layer.compute_gradients(x,dy,layer.log_det) #TODO implement scaling here...
-			optimizer.apply_gradients(zip(grads,layer.trainable_variables))
+			optimizer.apply_gradients(zip(grads,layer.trainable_variables))
 			y = x
 		return loss
 
diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py
index 9c8a930..93dab11 100644
--- a/invtf/layers_const_backprop.py
+++ b/invtf/layers_const_backprop.py
@@ -27,7 +27,7 @@ def call(self,X):
 	def call_inv(self,X):
 		raise NotImplementedError
 
-	def compute_gradients(self,x,dy,regularizer=None):
+	def compute_gradients(self,x,dy,regularizer=None,scaling=1):
 		'''
 		Computes gradients for backward pass
 		Args:
@@ -47,7 +47,7 @@ def compute_gradients(self,x,dy,regularizer=None):
 
 		if regularizer is not None:
 			with tf.GradientTape() as tape:
-				reg = -regularizer()
+				reg = -regularizer()/scaling
 				grads_wrt_reg = tape.gradient(reg, self.trainable_variables)
 			grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)]
 		return dy,grads
 
 class Linear(LayerWithGrads):
 
@@ -148,12 +148,12 @@ def build(self, input_shape):
 
 		# random orthogonal matrix
 		# check if tf.linalg.qr and tf.linalg.lu are more stable than scipy.
- self.kernel = self.add_weight(initializer=keras.initializers.Orthogonal(), shape=(c, c), name="inv_1x1_conv_P") + self.kernel = self.add_weight(initializer=keras.initializers.Orthogonal(), shape=(c, c), name="inv_1x1_conv_P") super(Inv1x1Conv, self).build(input_shape) self.built = True - def call(self, X): + def call(self, X): _W = tf.reshape(self.kernel, (1,1, self.c, self.c)) return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") @@ -163,7 +163,7 @@ def call_inv(self, Z): _W = tf.reshape(self.kernel_inv, (1,1, self.c, self.c)) return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") - def log_det(self): # det computations are way too instable here.. + def log_det(self): # det computations are way too instable here.. return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.kernel) )) def compute_output_shape(self, input_shape): return input_shape @@ -196,7 +196,7 @@ def build(self, input_shape): # random orthogonal matrix # check if tf.linalg.qr and tf.linalg.lu are more stable than scipy. import scipy - w = scipy.linalg.qr(np.random.normal(0, 1, (self.c, self.c)))[0].astype(np.float32) + w = scipy.linalg.qr(np.random.normal(0, 1, (self.c, self.c)))[0].astype(np.float32) P, L, U = scipy.linalg.lu(w) def init_P(self, shape=None, dtype=None): return P @@ -212,7 +212,7 @@ def init_U(self, shape=None, dtype=None): return U L_mask = tf.constant(np.triu(np.ones((c,c)), k=+1), dtype=tf.float32) P_mask = tf.constant(np.tril(np.ones((c,c)), k=-1), dtype=tf.float32) - I = tf.constant(np.identity(c), dtype=tf.float32) + I = tf.constant(np.identity(c), dtype=tf.float32) self.P = self.P * P_mask + I self.L = self.L * L_mask + I @@ -223,15 +223,15 @@ def init_U(self, shape=None, dtype=None): return U self.L_inv = tf.linalg.inv(tf.dtypes.cast(L, dtype=tf.float64)) self.U_inv = tf.linalg.inv(tf.dtypes.cast(U, dtype=tf.float64)) - self.kernel_inv = tf.linalg.inv(self.kernel) # tf.dtypes.cast(self.U_inv @ self.L_inv @ self.P_inv, dtype=tf.float32) + self.kernel_inv = tf.linalg.inv(self.kernel) # tf.dtypes.cast(self.U_inv @ self.L_inv @ self.P_inv, dtype=tf.float32) - #self.I_ = self.kernel @ tf.linalg.inv(self.kernel) - #self.I = self.kernel @ self.kernel_inv + #self.I_ = self.kernel @ tf.linalg.inv(self.kernel) + #self.I = self.kernel @ self.kernel_inv super(Inv1x1Conv, self).build(input_shape) self.built = True - def call(self, X): + def call(self, X): _W = tf.reshape(self.kernel, (1,1, self.c, self.c)) return tf.nn.conv2d(X, _W, [1,1,1,1], "SAME") @@ -239,7 +239,7 @@ def call_inv(self, Z): _W = tf.reshape(self.kernel_inv, (1,1, self.c, self.c)) return tf.nn.conv2d(Z, _W, [1,1,1,1], "SAME") - def log_det(self): # det computations are way too instable here.. + def log_det(self): # det computations are way too instable here.. return self.h * self.w * tf.math.log(tf.abs( tf.linalg.det(self.kernel) )) # Looks fine? def compute_output_shape(self, input_shape): return input_shape @@ -320,7 +320,7 @@ def log_det(self): return 0. 
def compute_output_shape(self, input_shape): return input_shape - def compute_gradients(self,x,dy,regularizer=None): + def compute_gradients(self,x,dy,regularizer=None,scaling=1): ''' Computes gradients for backward pass Since the coupling layers do not inherit from `LayerWithGrads`, this @@ -524,16 +524,16 @@ def build(self, input_shape): self.h = h self.w = w - self.s = self.add_weight(shape=c, initializer='ones', name="affine_scale") - self.b = self.add_weight(shape=c, initializer='zero', name="affine_bias") + self.s = self.add_weight(shape=c, initializer='ones', name="affine_scale") + self.b = self.add_weight(shape=c, initializer='zero', name="affine_bias") super(ActNorm, self).build(input_shape) self.built = True - def call(self, X): return X * self.s + self.b + def call(self, X): return X * self.s + self.b def call_inv(self, Z): return (Z - self.b) / self.s - def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.s))) + def log_det(self): return self.h * self.w * tf.reduce_sum(tf.math.log(tf.abs(self.s))) def compute_output_shape(self, input_shape): self.output_shape = input_shape @@ -554,7 +554,7 @@ def compute_output_shape(self, input_shape): For now assumes the use of convolutions """ -class AffineCoupling(LayerWithGrads): # Sequential): +class AffineCoupling(LayerWithGrads): # Sequential): def add(self, layer): self.layers.append(layer) @@ -563,8 +563,8 @@ def add(self, layer): self.layers.append(layer) def __init__(self, part=0, strategy=SplitChannelsStrategy()): super(AffineCoupling, self).__init__(name="aff_coupling_%i"%AffineCoupling.unique_id) AffineCoupling.unique_id += 1 - self.part = part - self.strategy = strategy + self.part = part + self.strategy = strategy self.layers = [] self._is_graph_network = False @@ -616,51 +616,51 @@ def call_(self, X): return s, t - def call(self, X): + def call(self, X): x0, x1 = self.strategy.split(X) if self.part == 0: - s, t = self.call_(x1) - x0 = x0*s + t # glow changed order of this? i.e. translate then scale. + s, t = self.call_(x1) + x0 = x0*s + t # glow changed order of this? i.e. translate then scale. if self.part == 1: - s, t = self.call_(x0) - x1 = x1*s + t + s, t = self.call_(x0) + x1 = x1*s + t self.precompute_log_det(s, X) # print("s",np.isnan(s),np.isnan(t)) - X = self.strategy.combine(x0, x1) + X = self.strategy.combine(x0, x1) #Diagnostic statements for testing NaN gradient # print("s",np.isnan(s).all(),"t",np.isnan(t).all()) # print("X0",np.isnan(x0).all(),"X1",np.isnan(x1).all()) return X - def call_inv(self, Z): + def call_inv(self, Z): z0, z1 = self.strategy.split(Z) if self.part == 0: - s, t = self.call_(z1) - z0 = (z0 - t)/s + s, t = self.call_(z1) + z0 = (z0 - t)/s if self.part == 1: - s, t = self.call_(z0) - z1 = (z1 - t)/s + s, t = self.call_(z0) + z1 = (z1 - t)/s - Z = self.strategy.combine(z0, z1) + Z = self.strategy.combine(z0, z1) return Z def precompute_log_det(self, s, X): - n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) + n = tf.dtypes.cast(tf.shape(X)[0], tf.float32) self._log_det = tf.reduce_sum(tf.math.log(tf.abs(s))) / n - def log_det(self): return self._log_det + def log_det(self): return self._log_det def compute_output_shape(self, input_shape): return input_shape def summary(self, line_length=None, positions=None, print_fn=None): print_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) # fixes stupid issue. 
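	# The `scaling` argument introduced below divides the log-det term before it is
	# differentiated, so its gradient is on the same scale as a loss reported in bits
	# per dimension. A minimal sketch, assuming the caller passes
	# scaling = d*np.log(2.) with d = np.prod(X.shape[1:]), as the gradient tests do:
	#
	#   with tf.GradientTape(persistent=True) as tape:
	#       tape.watch(x)
	#       y_ = self.call(x)                  # also populates self._log_det
	#       reg = -self._log_det / scaling     # same units as the bits/dim loss
	#   # gradients of y_ (with output_gradients=dy) and of reg are then summed
	#
	# Because the coupling log-det depends on the input as well as the weights, the
	# returned dy should also pick up d(reg)/dx; that correction is added in PATCH 10.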
- def compute_gradients(self,x,dy,regularizer=None): + def compute_gradients(self,x,dy,regularizer=None,scaling=1): ''' Computes gradients for backward pass Args: @@ -672,20 +672,20 @@ def compute_gradients(self,x,dy,regularizer=None): grads - gradients wrt weights ''' #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. -- It does - with tf.GradientTape(persistent=True) as tape: #Since log_det is computed within call + with tf.GradientTape(persistent=True) as tape: #Since log_det is computed within call tape.watch(x) y_ = self.call(x) #Required to register the operation onto the gradient tape - reg = self._log_det - #TODO known issue, gradient goes to nan in some cases... + reg = -self._log_det/scaling grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) grads_wrt_reg = tape.gradient(reg,self.trainable_variables) dy,grads = grads_combined[0],grads_combined[1:] grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] - del tape #Since tape was persistent, we need this + del tape #Since tape was persistent, we need this # if regularizer is not None: - # with tf.GradientTape() as tape: - # reg = -regularizer() - # grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + # with tf.GradientTape() as tape: + # reg = -regularizer() + # grads_wrt_reg = tape.gradient(reg, self.trainable_variables) + #TODO fix bug of incorrect dy return dy,grads diff --git a/test.py b/test.py index e72b3f5..10329b0 100644 --- a/test.py +++ b/test.py @@ -51,10 +51,10 @@ """ import unittest #from test.shape import * -from test.jacobian import * +# from test.jacobian import * #from test.optimality import * -from test.inverse import * - +# from test.inverse import * +from test.gradients import * if __name__ == "__main__": unittest.main() diff --git a/test/gradients.py b/test/gradients.py index 7c8aac3..ee14810 100644 --- a/test/gradients.py +++ b/test/gradients.py @@ -31,7 +31,7 @@ def compute_gradients(self,X): gradients+=(grads) y = x if len(gradients) > 1: #TODO better fix for issue with only single gradient - return [gradients[-2]]+[gradients[-1]]+gradients[0:-2][::-1] + return [gradients[-1]]+[gradients[-2]]+gradients[0:-2][::-1] else: return gradients @@ -47,43 +47,45 @@ class TestGradients(unittest.TestCase): def assertGrad(self,g,X): computed_grads = g.compute_gradients(X) actual_grads = g.actual_gradients(X) - A = [np.allclose(np.abs(x[0]-x[1]),0,atol=1, rtol=0.1) for x in zip(computed_grads,actual_grads) if x[0] is not None] - # print("computed",computed_grads,"actual_grads",actual_grads) - print("Max discrepancy in gradients",np.max(np.array([(np.max(np.abs(x[0]-x[1]))) for x in zip(computed_grads,actual_grads) if x[0] is not None]))) + A = [np.allclose(np.abs(x[0]-x[1]),0,atol=2, rtol=0.1) for x in zip(computed_grads,actual_grads) if x[0] is not None] + print("computed",computed_grads,"actual_grads",actual_grads) + print("Max discrepancy in gradients",np.max(np.array([np.max((np.abs(x[0]-x[1]))) for x in zip(computed_grads,actual_grads) if x[0] is not None]))) self.assertTrue(np.array(A).all()) - def test_circ_conv(self): - X = TestGradients.X - d = 32*32*3 - g = GeneratorGradTest(invtf.latent.Normal(d)) - g.add(Conv3DCirc()) - g.predict(X[:1]) - self.assertGrad(g,X) + # def test_circ_conv(self): + # X = TestGradients.X + # d = 32*32*3 + # g = GeneratorGradTest(invtf.latent.Normal(d)) + # g.add(Conv3DCirc()) + # g.predict(X[:1]) + # self.assertGrad(g,X) - def test_inv_conv(self): - X = TestGradients.X - d = 32*32*3 - g = 
GeneratorGradTest(invtf.latent.Normal(d)) - g.add(Inv1x1ConvPLU()) - g.predict(X[:1]) - self.assertGrad(g,X) + # def test_inv_conv(self): + # X = TestGradients.X + # d = 32*32*3 + # g = GeneratorGradTest(invtf.latent.Normal(d)) + # g.add(Inv1x1ConvPLU()) + # g.predict(X[:1]) + # self.assertGrad(g,X) - def test_act_norm(self): - X = TestGradients.X - d = 32*32*3 - g = GeneratorGradTest(invtf.latent.Normal(d)) - g.add(ActNorm()) - g.predict(X[:1]) - self.assertGrad(g,X) + # def test_act_norm(self): + # X = TestGradients.X + # d = 32*32*3 + # g = GeneratorGradTest(invtf.latent.Normal(d)) + # g.add(ActNorm()) + # g.predict(X[:1]) + # self.assertGrad(g,X) def test_affine_coupling(self): - X = TestGradients.X - d = 32*32*3 + X = np.random.normal(0,1,(5,2,2,2)).astype('f') + print(X.shape) + d = 2*2*2 g = GeneratorGradTest(invtf.latent.Normal(d)) b = AffineCoupling() b.add(Flatten()) - b.add(Dense(d,activation='relu')) + b.add(Dense(d,activation='sigmoid')) g.add(Squeeze()) + g.add(Conv3DCirc()) g.add(b) # g.predict(X[:1]) self.assertGrad(g,X) From 70dfb2399d963b8a87a16f28b5615eda1eb61689 Mon Sep 17 00:00:00 2001 From: anshuln Date: Thu, 18 Jul 2019 10:46:14 +0530 Subject: [PATCH 09/11] Minor bug fixes --- invtf/generator_const_backprop.py | 8 +++++--- invtf/layers_const_backprop.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/invtf/generator_const_backprop.py b/invtf/generator_const_backprop.py index b15cc12..1871065 100644 --- a/invtf/generator_const_backprop.py +++ b/invtf/generator_const_backprop.py @@ -363,9 +363,11 @@ def fit_generator(self, generator,steps_per_epoch=None,initial_epoch=0, print(to_print) all_losses+=losses val_count+=1 - try: - if enqueuer is not None: - enqueuer.stop() + try: + if enqueuer is not None: + enqueuer.stop() + except: + pass return all_losses def rec(self, X): diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index 41c3d54..b31669b 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -477,9 +477,9 @@ def log_det(self): return tf.zeros((1,)) class Conv3DCirc(LayerWithGrads): -""" - There's an issue with scaling, which intuitively makes step-size VERY small. -""" + """ + There's an issue with scaling, which intuitively makes step-size VERY small. + """ def __init__(self,trainable=True): self.built = False From 74904a0669f72039206a3ee93e8c69725d8e1d1d Mon Sep 17 00:00:00 2001 From: anshuln Date: Thu, 18 Jul 2019 11:33:32 +0530 Subject: [PATCH 10/11] Potential fix for incorrect dy issue --- invtf/layers_const_backprop.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/invtf/layers_const_backprop.py b/invtf/layers_const_backprop.py index b31669b..f4bfe45 100644 --- a/invtf/layers_const_backprop.py +++ b/invtf/layers_const_backprop.py @@ -49,7 +49,7 @@ def compute_gradients(self,x,dy,regularizer=None,scaling=1): with tf.GradientTape() as tape: reg = -regularizer()/scaling grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg) if a[1] is not None] return dy,grads class Linear(LayerWithGrads): @@ -710,22 +710,19 @@ def compute_gradients(self,x,dy,regularizer=None,scaling=1): grads - gradients wrt weights ''' #TODO check if log_det of AffineCouplingLayer depends needs a regularizer. 
-- It does + #TODO fix bug of incorrect dy with tf.GradientTape(persistent=True) as tape: #Since log_det is computed within call tape.watch(x) y_ = self.call(x) #Required to register the operation onto the gradient tape reg = -self._log_det/scaling grads_combined = tape.gradient(y_,[x]+self.trainable_variables,output_gradients=dy) grads_wrt_reg = tape.gradient(reg,self.trainable_variables) + grads_of_inp = tape.gradient(reg,[x]) dy,grads = grads_combined[0],grads_combined[1:] grads = [a[0]+a[1] for a in zip(grads,grads_wrt_reg)] + n = tf.dtypes.cast(tf.shape(x)[0], tf.float32) + dy = [a[1]+a[0] for a in zip(dy,grads_of_inp)] #TODO check this expression, seems numerically approximate del tape #Since tape was persistent, we need this - - - # if regularizer is not None: - # with tf.GradientTape() as tape: - # reg = -regularizer() - # grads_wrt_reg = tape.gradient(reg, self.trainable_variables) - #TODO fix bug of incorrect dy return dy,grads class ReduceNumBits(LayerWithGrads): From 62b88226e3ce414405c425b37862f8f9b89f7b8e Mon Sep 17 00:00:00 2001 From: anshuln Date: Thu, 18 Jul 2019 11:43:07 +0530 Subject: [PATCH 11/11] Fixes order of gradients in tests --- test/gradients.py | 55 +++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/test/gradients.py b/test/gradients.py index 7a605e3..d1aec04 100644 --- a/test/gradients.py +++ b/test/gradients.py @@ -12,6 +12,8 @@ from tensorflow.keras.layers import ReLU, Dense, Flatten, Conv2D class GeneratorGradTest(GenConst): + def prune(self,l): + return [x for sublist in l for x in sublist if len(sublist)>0] def compute_gradients(self,X): x = self.call(X) #I think putting this in context records all operations onto the tape, thereby destroying purpose of checkpointing... 
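		# The reverse sweep below collects one gradient list per layer and prepends it,
		# so `gradients` ends up in forward-layer order; `prune` then flattens the nested
		# list so its entries line up with `self.trainable_variables` for the pairwise
		# comparison in assertGrad. A minimal sketch of that flattening, using
		# hypothetical placeholder values:
		#
		#   gradients = [[g_a1, g_a2], [], [g_c1]]   # per-layer lists, forward order
		#   flat = [g for per_layer in gradients for g in per_layer]
		#   # flat == [g_a1, g_a2, g_c1], matching the order of trainable_variables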
last_layer = self.layers[-1] @@ -28,12 +30,9 @@ def compute_gradients(self,X): for layer in self.layers[::-1]: x = layer.call_inv(y) dy,grads = layer.compute_gradients(x,dy,layer.log_det,d*np.log(2.)) #TODO implement scaling here -- DONE - gradients+=(grads) + gradients=[grads]+gradients y = x - if len(gradients) > 1: #TODO better fix for issue with only single gradient - return [gradients[-1]]+[gradients[-2]]+gradients[0:-2][::-1] - else: - return gradients + return self.prune(gradients) def actual_gradients(self,X): with tf.GradientTape() as tape: @@ -47,34 +46,34 @@ class TestGradients(unittest.TestCase): def assertGrad(self,g,X): computed_grads = g.compute_gradients(X) actual_grads = g.actual_gradients(X) - A = [np.allclose(np.abs(x[0]-x[1]),0,atol=2, rtol=0.1) for x in zip(computed_grads,actual_grads) if x[0] is not None] - print("computed",computed_grads,"actual_grads",actual_grads) + A = [np.allclose(np.abs(x[0]-x[1]),0,atol=1, rtol=0.1) for x in zip(computed_grads,actual_grads) if x[0] is not None] + # print("computed",computed_grads,"actual_grads",actual_grads) print("Max discrepancy in gradients",np.max(np.array([np.max((np.abs(x[0]-x[1]))) for x in zip(computed_grads,actual_grads) if x[0] is not None]))) self.assertTrue(np.array(A).all()) - # def test_circ_conv(self): - # X = TestGradients.X - # d = 32*32*3 - # g = GeneratorGradTest(invtf.latent.Normal(d)) - # g.add(Conv3DCirc()) - # g.predict(X[:1]) - # self.assertGrad(g,X) + def test_circ_conv(self): + X = TestGradients.X + d = 32*32*3 + g = GeneratorGradTest(invtf.latent.Normal(d)) + g.add(Conv3DCirc()) + g.predict(X[:1]) + self.assertGrad(g,X) - # def test_inv_conv(self): - # X = TestGradients.X - # d = 32*32*3 - # g = GeneratorGradTest(invtf.latent.Normal(d)) - # g.add(Inv1x1ConvPLU()) - # g.predict(X[:1]) - # self.assertGrad(g,X) + def test_inv_conv(self): + X = TestGradients.X + d = 32*32*3 + g = GeneratorGradTest(invtf.latent.Normal(d)) + g.add(Inv1x1ConvPLU()) + g.predict(X[:1]) + self.assertGrad(g,X) - # def test_act_norm(self): - # X = TestGradients.X - # d = 32*32*3 - # g = GeneratorGradTest(invtf.latent.Normal(d)) - # g.add(ActNorm()) - # g.predict(X[:1]) - # self.assertGrad(g,X) + def test_act_norm(self): + X = TestGradients.X + d = 32*32*3 + g = GeneratorGradTest(invtf.latent.Normal(d)) + g.add(ActNorm()) + g.predict(X[:1]) + self.assertGrad(g,X) def test_affine_coupling(self): X = np.random.normal(0,1,(5,2,2,2)).astype('f')