diff --git a/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf b/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf new file mode 100644 index 00000000..dc052ce4 Binary files /dev/null and b/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf differ diff --git a/Machine_Learning/src/NLP/SignGAN/bert_utils.py b/Machine_Learning/src/NLP/SignGAN/bert_utils.py new file mode 100644 index 00000000..3316b914 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/bert_utils.py @@ -0,0 +1,85 @@ +import tensorflow as tf +import numpy as np +import bert + +#max_sequence_length = 64 + +class Bert(object): + def __init__(self, max_sequence_length=64): + super(Bert, self).__init__() + self.max_seq_length = max_sequence_length + self.model_dir = 'models/multi_cased_L-12_H-768_A-12' + self.model_ckpt = 'models/multi_cased_L-12_H-768_A-12/bert_model.ckpt' + self.vocab_file = 'models/multi_cased_L-12_H-768_A-12/vocab.txt' + + self.model = self.create_bert_model() + self.tokenizer = self.create_bert_tokenizer() + + def create_bert_model(self): + bert_params = bert.params_from_pretrained_ckpt(self.model_dir) + l_bert = bert.BertModelLayer.from_params(bert_params, name="bert") + + l_input_ids = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype='int32') + output = l_bert(l_input_ids) + model = tf.keras.Model(inputs=l_input_ids, outputs=output) + model.build(input_shape=(None, self.max_seq_length)) + + bert.load_stock_weights(l_bert, self.model_ckpt) + + return model + + def create_bert_tokenizer(self): + model_name = 'multi_cased_L-12_H-768_A-12' + do_lower_case = not (model_name.find("cased") == 0 or model_name.find("multi_cased") == 0) + bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, self.model_ckpt) + tokenizer = bert.bert_tokenization.FullTokenizer(self.vocab_file, do_lower_case) + + return tokenizer + + def tokenize(self, sequence): + sequence = self.tokenizer.tokenize(sequence) + return ["[CLS]"] + sequence + ["[SEP]"] + + def get_sequence_ids(self, tokenized_sequence): + sequence_ids = self.tokenizer.convert_tokens_to_ids(tokenized_sequence) + sequence_ids = sequence_ids + [0] * (self.max_seq_length - len(sequence_ids)) # padding + return sequence_ids + + def preprocess(self, sequence): + return self.get_sequence_ids(self.tokenize(sequence)) + + def preprocess_batch(self, sequence_list): + if type(sequence_list) != list: + sequence_list = [sequence_list] + + preprocessed_sequence_ids = [] + for sequence in sequence_list: + preprocessed_sequence_ids.append(self.preprocess(sequence)) + return np.array(preprocessed_sequence_ids) + + def __call__(self, sequence_list): + #if type(sequence_list) != list or sequence_list: + # sequence_list = [sequence_list] + preprocessed_sequence_ids = self.preprocess_batch(sequence_list) + output = self.model.predict(preprocessed_sequence_ids) # (?, 64, 768) for all sentences in batch + word_embeddings = output # (?, 64, 768) + + sentence_embeddings = [] + for sentence in output: + sentence_vector = sentence[0] # feature vector for [CLS] is the sentence vector for each sentence + sentence_embeddings.append(sentence_vector) + + sentence_embeddings = np.array(sentence_embeddings) # (?, 768) + + return word_embeddings, sentence_embeddings + +''' +def main(): + bert_model = Bert(64) + word_embeddings, sentence_embeddings = bert_model.predict(['sonst wechselhaft mit schauern und gewittern die uns auch am wochenende begleiten', + 'und nun die wettervorhersage für morgen donnerstag den zwölften august']) + 
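+    # NOTE: the Bert class above defines __call__ (plus preprocess/preprocess_batch) rather than
+    # a predict() method, so with the current interface the call would be:
+    #     word_embeddings, sentence_embeddings = bert_model([...])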
print(word_embeddings.shape, sentence_embeddings.shape) + +if __name__ == "__main__": + main() +''' \ No newline at end of file diff --git a/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md b/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md new file mode 100644 index 00000000..12c94828 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md @@ -0,0 +1 @@ +Multilingual BERT was used as German Text Encoder diff --git a/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md b/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md new file mode 100644 index 00000000..55de7852 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md @@ -0,0 +1,2 @@ +Dataset stored here. +Look at utils/video.py for location specifics diff --git a/Machine_Learning/src/NLP/SignGAN/readme.md b/Machine_Learning/src/NLP/SignGAN/readme.md new file mode 100644 index 00000000..c8648d54 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/readme.md @@ -0,0 +1 @@ +# SignGAN diff --git a/Machine_Learning/src/NLP/SignGAN/signgan.ipynb b/Machine_Learning/src/NLP/SignGAN/signgan.ipynb new file mode 100644 index 00000000..d5b04cd0 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/signgan.ipynb @@ -0,0 +1,589 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "research", + "display_name": "research" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SignGAN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing the Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "import glob\n", + "import os\n", + "\n", + "from bert_utils import Bert\n", + "from utils.video import Video\n", + "from utils.conv_attention import *\n", + "from utils.generator import *\n", + "from utils.discriminator import *\n", + "from utils.losses import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "config = tf.compat.v1.ConfigProto()\n", + "config.gpu_options.allow_growth = True\n", + "sess = tf.compat.v1.Session(config=config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pretrained BERT Model\n", + "Multilingual Cased BERT is used" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Done loading 196 BERT weights from: models/multi_cased_L-12_H-768_A-12/bert_model.ckpt into (prefix:bert). Count of weights not found in the checkpoint was: [0]. 
Count of weights with mismatched shape: [0]\nUnused weights from checkpoint: \n\tbert/embeddings/token_type_embeddings\n\tbert/pooler/dense/bias\n\tbert/pooler/dense/kernel\n\tcls/predictions/output_bias\n\tcls/predictions/transform/LayerNorm/beta\n\tcls/predictions/transform/LayerNorm/gamma\n\tcls/predictions/transform/dense/bias\n\tcls/predictions/transform/dense/kernel\n\tcls/seq_relationship/output_bias\n\tcls/seq_relationship/output_weights\n(1, 64, 768) (1, 768)\n" + } + ], + "source": [ + "bert = Bert()\n", + "word_embeddings, sentence_embeddings = bert.predict(['sonst wechselhaft mit schauern und gewittern die uns auch am wochenende begleiten'])\n", + "print(word_embeddings.shape, sentence_embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Global Variables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "num_clips = 32\n", + "T = 16 # let\n", + "MAX_VIDEO_LENGTH = 512 # 475 is the longest\n", + "FRAME_DIM = (64, 64, 3)\n", + "VIDEO_DIM = (512, 64, 64, 3)\n", + "data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Video Object\n", + "with example from training set" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(640, 64, 64, 3)\n(40, 16, 64, 64, 3)\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": "'\\npad_mask = video_obj.padding_mask(current_sequence_length)\\nprint(pad_mask.shape)\\n\\nlook_mask = video_obj.look_ahead_mask()\\nprint(look_mask.shape)\\n'" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "video_obj = Video(T, MAX_VIDEO_LENGTH, FRAME_DIM, VIDEO_DIM, data_dir)\n", + "\n", + "video_real = video_obj.get_video('train', '05January_2010_Tuesday_tagesschau-2664')\n", + "current_sequence_length = video_real.shape[0]\n", + "\n", + "video_real = video_obj.preprocess_video(video_real)\n", + "print(video_real.shape)\n", + "\n", + "video_real = video_obj.divide_sequence(video_real)\n", + "print(video_real.shape)\n", + "\n", + "video_wrong = video_obj.get_video('train', '03June_2011_Friday_tagesschau-7649')\n", + "current_sequence_length = video_wrong.shape[0]\n", + "\n", + "video_wrong = video_obj.preprocess_video(video_wrong)\n", + "print(video_wrong.shape)\n", + "\n", + "video_wrong = video_obj.divide_sequence(video_wrong)\n", + "print(video_wrong.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Attention\n", + "* First, perform convolutions across the whole video, then go for separable self attention for the whole video masked, i.e, (40, 40T, H, W, C), outputs (40, 40T, H, W, C), where T=16 and 40=640//T. 640 being the length of the whole video.\n", + "* Convolution operations will be restricted to only T frames at a time. There will be no intermingling of 2 or more sets of T frames. Thus 3D CONV will take care of extracting local, temporal and spatial features only. Hack : Batched\n", + "* After the conv operations there will be MAX_SEQ_LENGTH // T i.e, 40 (here) elements each of size (T, H, W, C), making the output of convolution ops, to be of dim -> (40, T, H, W, C) converted to (40T, H, W, C). This will be passed through attention blocks and outputs (40, 40T, H, W, C) along with masking.\n", + "* This attention performs masked attention. 
We will have MAX_SEQ_LENGTH // T i.e, 40 (here) masks in total each for time, height, width. Each mask being of shape (40, 40T, H, W, H*W) for time, W*T for height and H*T for width.\n", + "* Only after the whole video is generated are the losses calculated, and backpropped.\n", + "* During testing just \"start\" token will be provided and the rest of the sequence will be just padding, and each time the generator produces T frames, those T frames will be concatenated along with the \"start\" token and then convoluted again to produce the next T frames.\n", + "\n", + "### Attention mechanism will require residual connections otherwise gradients will vanish\n", + "\n", + "## Word Frame Attention\n", + "* Last dimension of both masked_attention_output and semantic_word_matrix must be same\n", + "* 2nd last dimenstion i.e, 1st dimenstion of semantic_word_matrix should equal to H*W of masked-separable-self-attention output \n", + "* that is, we bring both to a common semantic space using conv for frames and dense for embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(4, 16, 8, 8, 3)\n" + } + ], + "source": [ + "attention = Attention(channel_dimension=3)\n", + "\n", + "num_clips = 32\n", + "t, h, w, c = 16, 8, 8, 3\n", + "\n", + "x = tf.random.normal(shape=(t, h, w, c), mean=0.5, stddev=0.5)\n", + "\n", + "# this step is done only before the 1st attention block\n", + "x = tf.reshape(tf.repeat(x, repeats=num_clips, axis=0), (num_clips, t, h, w, c))\n", + "\n", + "mask_t = look_ahead_mask(num_clips, (t, h, w, h*w))\n", + "mask_h = look_ahead_mask(num_clips, (t, h, w, t*w))\n", + "mask_w = look_ahead_mask(num_clips, (t, h, w, t*h))\n", + "\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "\n", + "print(x.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conv-Attention Block\n", + "* The input (640, 64, 64, 3) video will be explicitly split into (40, 16, 64, 64, 3).\n", + "* HACK : This reshaped input will be fed to 3D conv block with batch size being 40(hack), for local spatial and temporal feature extraction\n", + "* Conv Block output will be (40, 16, 8, 8, 64), which will be reshaped to (640, 8, 8, 64) and sent forward for masked-separable-self-attention followed by word-frame-attention for a few number of times (Attention Block), with addition and layer normalisation after each attention block\n", + "* Conv Block : \n", + " * Format : num_filters, kernel, strides, padding\n", + " * {8, (3, 3, 3), (2, 2, 2), same} -> {16, (3, 3, 3), (2, 2, 2), same} -> {32, (3, 3, 3), (2, 2, 2), same} -> {out_channels(64), (3, 3, 3), (2, 2, 2), same}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Overall Video : (64, 64, 64, 3)\nInput Video Shape : (4, 16, 64, 64, 3)\nAttention Output Shape : (4, 8, 8, 8, 64)\n" + } + ], + "source": [ + "num_clips = 4 # instead of 32\n", + "t, h, w, c = 16, 64, 64, 3\n", + "print(\"Overall Video : \", (num_clips * t, h, w, c))\n", + "print(\"Input Video Shape : \", (num_clips, t, h, w, c))\n", + "\n", + "x = tf.random.normal(shape=(num_clips, t, h, w, c), mean=0.5, stddev=0.5)\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "\n", + "conv_attn = ConvAttn(num_attention_blocks=4)\n", + "x = conv_attn(x, word_embeddings)\n", + "print(\"Attention Output Shape : \", x.shape)" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditional GAN (3D)\n", + "* The conditions generated by ConvAttn block will be used along ith upscaled randomness to generate a video with T frames for each batch of size num_clips(40)\n", + "* use LayerNorm instead of BatchNorm, because BatchNorm outputs NaN because of the padding and masking\n", + "* In the future if this fails, even this block may have attention blocks in the intermediate layers\n", + "* Upsampling z : \n", + "* Concatenate upsampled z with conv_attn_output\n", + "* upsample to produce the video\n", + "* this 'z' will be upscaled to num_clips x (num_clips x t, h, w, channels) and concatenated with conv_attn_output, along the channel dimension, which together will be upscaled using deconv to produce a 40 x (16, 64, 64, 3) video. Hack : Use 40 as batch size throughout\n", + "* We will feed 'z' from outside. If it is inside it'll stay constant and won't be random" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Output Shape : (4, 16, 64, 64, 3)\nOutput Video Shape : (64, 64, 64, 3)\n" + } + ], + "source": [ + "z = tf.random.normal(shape=(1, 100))\n", + "cdcgan = CDCGAN()\n", + "z = cdcgan(z, x)\n", + "print(\"Output Shape : \", z.shape)\n", + "print(\"Output Video Shape : \", (z.shape[0] * z.shape[1], z.shape[2], z.shape[3], z.shape[4]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thus output video is in the shape we wanted" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Generator\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(64, 768)\nConv1 Out Shape : (32, 16, 64, 64, 8)\nConv2 Out Shape : (32, 8, 32, 32, 16)\nConv3 Out Shape : (32, 4, 16, 16, 32)\nConv4 Out Shape : (32, 2, 8, 8, 64)\n(32, 8, 8, 8, 256) (32, 8, 8, 8, 512)\n(32, 16, 64, 64, 3)\n" + } + ], + "source": [ + "num_clips = 32 # instead of 32\n", + "t, h, w, c = 16, 64, 64, 3\n", + "\n", + "x = tf.random.normal(shape=(num_clips, t, h, w, c), mean=0.5, stddev=0.5)\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "z = tf.random.normal(shape=(1, 100))\n", + "\n", + "generator = Generator()\n", + "x = generator(x, word_embeddings, z)\n", + "print(x.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Discriminators\n", + "* Batched\n", + "* For discriminators we also consider the batch dimension\n", + "* Before the video goes into the discriminator we have to reshape the video to (1, num_clips * t, h, w, c), 1 -> batch_size" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Video Discriminator\n", + "* Gotta setup proper input pipelines for discriminator training\n", + "* 64, 64, 64, 3 -> 32, 64, 64, 16 -> 16, 64, 64, 32 -> 8, 32, 32, 64 -> 4, 16, 16, 128 -> 2, 8, 8, 256 -> 1, 4, 4, 512" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(\n[[0.45877308]\n [0.6192273 ]], shape=(2, 1), dtype=float32)\n" + } + ], + "source": [ + "v = tf.random.normal((2, 32 * 16, 64, 64, 3)) # batch dimension considered \n", + "s = tf.random.normal((2, 768))\n", + "\n", + "video_disc = VideoDiscriminator()\n", + "vid_disc_out = video_disc(v, s)\n", + 
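+    "# Note: VideoDiscriminator downsamples the (32*16, 64, 64, 3) clip stack to a (1, 4, 4, 1024)\n",
+    "# volume and concatenates the sentence embedding projected to (1, 4, 4, 256) before scoring,\n",
+    "# so v must already be reshaped to (batch_size, num_clips * T, H, W, C) as done above.\n",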
"print(vid_disc_out) # 2 outputs for batch_size = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Frame Discriminator\n", + "* 2-fold\n", + "* Outputs \"{0 ... 1}\" for single frame level\n", + "* Outputs temporal (difference between 2 consecutive frames in euclidean norm, for each pair) downscaled as output, i.e, 1 number as output per pair of consecutive frames\n", + "* One part of both the discriminators are shared" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(2, 8) (2, 7)\n" + } + ], + "source": [ + "# Total frames kept 8 instead of 640 because of ResourceExhaustError\n", + "v = tf.random.normal((2, 8, 64, 64, 3)) # kept batch_size = 2 here\n", + "s = tf.random.normal((2, 768))\n", + "\n", + "frame_disc = FrameDiscriminator()\n", + "frame_disc_out, motion_disc_out = frame_disc(v, s)\n", + "\n", + "print(frame_disc_out.shape, motion_disc_out.shape)\n", + "# Thus 2 outputs of each frame and motion disc, for batch_size=2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Discriminator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "discriminator = Discriminator()\n", + "\n", + "v = tf.random.normal((1, 32 * 16, 64, 64, 3)) # batch dimension considered \n", + "s = tf.random.normal((1, 768)) # keeping batch_size = 1\n", + "\n", + "video_disc_out, frame_disc_out, motion_disc_out = discriminator(v, s)\n", + "print(video_disc_out.shape, frame_disc_out.shape, motion_disc_out.shape)\n", + "print(discriminator.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Losses\n", + "* Matching Aware Losses\n", + "* Output of motion discriminator is a bit high in value (though only used for the generator)\n", + "* Using Scheme 2 of Microsoft" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(0.7362369, shape=(), dtype=float32)\ntf.Tensor(0.7079885, shape=(), dtype=float32)\ntf.Tensor(0.68915033, shape=(), dtype=float32)\n" + } + ], + "source": [ + "# Doing with the same ones, but won't be the case\n", + "vid_loss = video_loss(vid_disc_out, vid_disc_out, vid_disc_out)\n", + "print(vid_loss)\n", + "fr_loss = frame_loss(frame_disc_out, frame_disc_out, frame_disc_out)\n", + "print(fr_loss)\n", + "mot_loss = motion_loss(motion_disc_out, motion_disc_out, motion_disc_out)\n", + "print(mot_loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(0.67633134, shape=(), dtype=float32)\ntf.Tensor(0.20451142, shape=(), dtype=float32)\n" + } + ], + "source": [ + "print(discriminator_loss(vid_loss, fr_loss, mot_loss))\n", + "print(generator_loss(vid_disc_out, frame_disc_out, motion_disc_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train Step" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Takes correct video, wrong video and word embeddings, sentence\n", + "# all of this must be preprocessed (padded and stuff)\n", + "# Video must be explicitly divided into T frames\n", + "def train_step(video_real, video_wrong, w, s):\n", + " num_clips, t, h, w, c = 
video_real.shape\n", + "\n", + " w = tf.squeeze(w)\n", + " z = tf.random.normal(shape=(1, 100))\n", + " \n", + " with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n", + " video_fake = generator(video_real, w, z)\n", + "\n", + " # All frames put together with bs = 1\n", + " video_real = tf.reshape(video_real, (1, num_clips * t, h, w, c))\n", + " video_wrong = tf.reshape(video_wrong, (1, num_clips * t, h, w, c))\n", + " video_fake = tf.reshape(video_fake, (1, num_clips * t, h, w, c))\n", + "\n", + " # Discriminator out\n", + " disc_video_real, disc_frame_real, disc_motion_real = discriminator(video_real, s)\n", + " disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, s)\n", + " disc_video_fake, disc_frame_fake, disc_motion_fake = discriminator(video_fake, s)\n", + "\n", + " # Losses\n", + " total_video_loss = video_loss(disc_video_real, disc_video_wrong, disc_video_fake)\n", + " total_frame_loss = frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake)\n", + " total_motion_loss = motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake)\n", + "\n", + " disc_loss = discriminator_loss(total_video_loss, total_frame_loss, total_motion_loss)\n", + " gen_loss = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake)\n", + "\n", + " gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)\n", + " gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)\n", + "\n", + " generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))\n", + " discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimizers and Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generator = Generator()\n", + "discriminator = Discriminator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learning_rate = 0.0002 # Following microsoft\n", + "generator_optimizer = tf.keras.optimizers.Adam(learning_rate) # rest default\n", + "discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate) # rest default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n", + " discriminator_optimizer=discriminator_optimizer,\n", + " generator=generator,\n", + " discriminator=discriminator)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_step(video_real[:8], video_wrong[:8], word_embeddings, sentence_embeddings)" + ] + } + ] +} \ No newline at end of file diff --git a/Machine_Learning/src/NLP/SignGAN/train.py b/Machine_Learning/src/NLP/SignGAN/train.py new file mode 100644 index 00000000..a8d1b588 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/train.py @@ -0,0 +1,115 @@ +import tensorflow as tf +import numpy as np +import glob +import time +import os + +from bert_utils import Bert +from utils.video import Video +from utils.conv_attention import * +from utils.generator import * +from utils.discriminator import * +from utils.losses import * + +# Optional +config = 
tf.compat.v1.ConfigProto() +config.gpu_options.allow_growth = True +sess = tf.compat.v1.Session(config=config) + +# Models +bert = Bert() +generator = Generator() +discriminator = Discriminator() + +num_clips = 32 +T = 16 # let +MAX_VIDEO_LENGTH = 512 # 475 is the longest +FRAME_DIM = (64, 64, 3) +VIDEO_DIM = (512, 64, 64, 3) +data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/' +video_obj = Video(T, MAX_VIDEO_LENGTH, FRAME_DIM, VIDEO_DIM, data_dir) + +EPOCHS = 10000 + +# Otimizers +learning_rate = 0.000001 +generator_optimizer = tf.keras.optimizers.Adam(learning_rate) +discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate) + +# ckpt +checkpoint_dir = './training_checkpoints' +checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") +checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer, + discriminator_optimizer=discriminator_optimizer, + generator=generator, + discriminator=discriminator) + +# Takes correct video, wrong video and word embeddings, sentence +# all of this must be preprocessed (padded and stuff) +# Video must be explicitly divided into T frames + +def train_step(video_real, video_wrong, text): + num_clips, t, h, w, c = video_real.shape + word, sentence = bert([text]) + word, sentence = tf.convert_to_tensor(word), tf.convert_to_tensor(sentence) + word = tf.squeeze(word) + z = tf.random.normal(shape=(1, 100)) + + with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape: + video_fake = generator(video_real, word, z) + + # All frames put together with bs = 1 + video_real = tf.reshape(video_real, (1, num_clips * t, h, w, c)) + video_wrong = tf.reshape(video_wrong, (1, num_clips * t, h, w, c)) + video_fake = tf.reshape(video_fake, (1, num_clips * t, h, w, c)) + + # Discriminator out + disc_video_real, disc_frame_real, disc_motion_real = discriminator(video_real, sentence) + disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, sentence) + disc_video_fake, disc_frame_fake, disc_motion_fake = discriminator(video_fake, sentence) + + # Losses + total_video_loss = video_loss(disc_video_real, disc_video_wrong, disc_video_fake) + total_frame_loss = frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake) + total_motion_loss = motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake) + + disc_loss = discriminator_loss(total_video_loss, total_frame_loss, total_motion_loss) + gen_loss = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake) + + gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables) + gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables) + + generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables)) + discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables)) + + +def main(): + dataset = open('phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/annotations/manual/PHOENIX-2014-T.train.corpus.csv', encoding='utf-8') + dataset = dataset.readlines()[1:] + + text_data = [i.split('|')[-1][:-1] for i in dataset] # [:-1] to remove '\n' at the end + video_data = [i.split('|')[0] for i in dataset] + + for epoch in range(EPOCHS): + start = time.time() + for text, video in zip(text_data, video_data): + video_real = video_obj.get_video('train', video) + video_real = video_obj.preprocess_video(video_real) + video_real = video_obj.divide_sequence(video_real) + + # Random video from 
dataset as the wrong video + video_wrong = video_obj.get_video('train', video_data[np.random.randint(0, len(video_data))]) + video_wrong = video_obj.preprocess_video(video_wrong) + video_wrong = video_obj.divide_sequence(video_wrong) + + train_step(video_real, video_wrong, text) + + if (epoch + 1) % 10 == 0: + checkpoint.save(file_prefix=checkpoint_prefix) + + print("Epoch {0} :- Time : {1}".format(epoch + 1, time.time() - start)) + + + +if __name__ == '__main__': + main() diff --git a/Machine_Learning/src/NLP/SignGAN/utils/__init__.py b/Machine_Learning/src/NLP/SignGAN/utils/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/__init__.py @@ -0,0 +1 @@ + diff --git a/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py b/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py new file mode 100644 index 00000000..0ad71cdd --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py @@ -0,0 +1,299 @@ +import tensorflow as tf +import numpy as np + +from .errors import * + +@tf.function +def masking(shape, position): + if position > shape[0]: + raise PaddingError("padding position exceeds limits. position must be < shape[0]") + + if len(shape) != 4: + raise MatrixRankError("Shape must be of Rank 4 i.e, (T, H, W, val), val -> H*W or, T*W or, T*H") + + # mask only for time dimension, size -> (t, h*w, h*w) + ''' + zero = tf.zeros((position, shape[1], shape[2], shape[3]), dtype=tf.float32) + fill = tf.fill((shape[0]-position, shape[1], shape[2], shape[3]), -np.inf) + mask = tf.concat([zero, fill], 0) + ''' + + return tf.concat([tf.zeros((position, shape[1], shape[2], shape[3]), dtype=tf.float32), + tf.fill((shape[0]-position, shape[1], shape[2], shape[3]), -np.inf)], 0) + + #return mask + +@tf.function +def look_ahead_mask(num_mask, shape): + if shape[0] % num_mask != 0: + raise DivisionError("Time dimension must be divisible by number of masks") + if len(shape) != 4: + raise MatrixRankError("Shape must be of Rank 4 i.e, (T, H, W, val), val -> H*W or, T*W or, T*H") + + mask = tf.expand_dims(masking(shape, 1 * shape[0] // num_mask), 0) + for mask_pos in range(1, num_mask): # shape[0] = 500 + mask = tf.concat([mask, tf.expand_dims(masking(shape, (mask_pos+1) * shape[0] // num_mask), 0)], 0) + + return mask # shape -> (50, T, H, W, C) + +@tf.function +def dot_product_attention(q, k, v, mask): + # Number of columns of q must be equal to number of columns of k + # i.e the last dimension must be same + + # This if-else block is for self attention and, word embedding attention + if len(k.shape) == 2: + k_T = tf.transpose(k) + else: + k_T = tf.transpose(k, perm=[0, 1, 3, 2]) # which axis will be at which place specified + + qv_correlations = tf.matmul(q, k_T) + + if mask is not None: + num_clips, qv_1, qv_2, qv_3 = qv_correlations.shape + ''' + try: + mask = tf.reshape(mask, (num_clips, qv_1, qv_2, qv_3)) + except: + raise ValueError("Reshaped Mask does not match 'matmul(Q, K.T)' in shape") + ''' + qv_correlations += mask + #print(qv_correlations) + + return tf.matmul(tf.nn.softmax(qv_correlations, axis=0), v) + + +## Word Frame Attention +# Last dimension of both masked_attention_output and semantic_word_matrix must be same +# 2nd last dimenstion i.e, 1st dimenstion of semantic_word_matrix should equal to H*W of masked-separable-self-attention output because the same mask_t will be used +# that is, we bring both to a common semantic space using conv for frames and dense for embeddings +class 
Attention(tf.keras.layers.Layer): + # tf.keras.layers.Dense changes the last dimenstion of n-d matrix + def __init__(self, channel_dimension=64): + # downsample to H*W = 64 i.e, H = 8, W = 8, before the attention block. + # common semantic space = 64, i.e, channel_dimension = 64 + # result after tf.Dense Layer : 1. semantic_word_matrix -> (64, 64) 2. Downsampled masked/repeated video -> (50, 500, 8, 8, 64) + + super(Attention, self).__init__() + + self.channel_dimension = channel_dimension + + # For separable self attention + self.sep_wq1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq1') + self.sep_wk1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk1') + self.sep_wv1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv1') + + self.sep_wq2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq2') + self.sep_wk2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk2') + self.sep_wv2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv2') + + self.sep_wq3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq3') + self.sep_wk3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk3') + self.sep_wv3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv3') + + # For word-frame attention + self.word_wq = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wq') + self.word_wk = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wk') + self.word_wv = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wv') + + # batchnorm causes nan because of the padding and masking, so layernorm + self.layernorm1 = tf.keras.layers.LayerNormalization() + self.layernorm2 = tf.keras.layers.LayerNormalization() + + ''' + def separable_attention(self, x, mask_t, mask_h, mask_w): + try: + num_clips, t, h, w, c = x.shape + except: + raise MatrixRankError("x must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5 or len(mask_h.shape) != 5 or len(mask_h.shape) != 5: + raise MatrixRankError("masks must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + x = tf.reshape(x, (num_clips, t, h*w, c)) + xq, xk, xv = self.sep_wq1(x), self.sep_wk1(x), self.sep_wv1(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) # self Attention + #print(x.shape) + + x = tf.reshape(x, (num_clips, h, t*w, c)) + xq, xk, xv = self.sep_wq2(x), self.sep_wk2(x), self.sep_wv2(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) # self Attention + + x = tf.reshape(x, (num_clips, w, t*h, c)) + xq, xk, xv = self.sep_wq3(x), self.sep_wk3(x), self.sep_wv3(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + return tf.reshape(x, (num_clips, t, h, w, c)) + ''' + ''' + def separable_attention(self, x, mask_t, mask_h, mask_w, xq_t, xk_t, xv_t, xq_h, xk_h, xv_h, xq_w, xk_w, xv_w): + try: + num_clips, t, h, w, c = x.shape + except: + raise MatrixRankError("x must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5 or len(mask_h.shape) != 5 or len(mask_h.shape) != 5: + raise MatrixRankError("masks must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + x = dot_product_attention(xq_t, xk_t ,xv_t, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) # self 
Attention + + x = dot_product_attention(xq_h, xk_h ,xv_h, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) # self Attention + + x = dot_product_attention(xq_w, xk_w ,xv_w, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + return tf.reshape(x, (num_clips, t, h, w, c)) + ''' + ''' + def word_frame_attention(self, frame_features, bert_embeddings, mask_t): # only across time + try: + num_clips, t, h, w, c = frame_features.shape + except: + raise MatrixRankError("frame_features must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5: + raise MatrixRankError("mask_t must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + frame_features = tf.reshape(frame_features, (num_clips, t, h*w, c)) + + frame_features, bert_embeddings, bert_embeddings = self.word_wq(frame_features), self.word_wk(bert_embeddings), self.word_wv(bert_embeddings) + + frame_features = dot_product_attention(frame_features, bert_embeddings, bert_embeddings, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + + return tf.reshape(frame_features, (num_clips, t, h, w, c)) + ''' + + def call(self, frame_features, bert_embeddings, mask_t, mask_h, mask_w): + # make sure that q, k, v have gone through tf.expand_dims, because of the batch simension thing + # for 2nd point above. if q = v that means self attention, hence, q != v + assert (frame_features.shape[-2] * frame_features.shape[-3]) == bert_embeddings.shape[-2] # for 2nd point above. if q = v that means self attention, hence, q != v + # 8*8 == 64 + + #attn_out = self.separable_attention(frame_features, mask_t, mask_h, mask_w) + # Separable self-attn + try: + num_clips, t, h, w, c = frame_features.shape + except: + raise MatrixRankError("frame_features must be of rank 5 i.e, (num_clips, t, h, w, c)") + + x = tf.reshape(frame_features, (num_clips, t, h*w, c)) + xq, xk, xv = self.sep_wq1(x), self.sep_wk1(x), self.sep_wv1(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + + x = tf.reshape(x, (num_clips, h, t*w, c)) + xq, xk, xv = self.sep_wq2(x), self.sep_wk2(x), self.sep_wv2(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) + + x = tf.reshape(x, (num_clips, w, t*h, c)) + xq, xk, xv = self.sep_wq3(x), self.sep_wk3(x), self.sep_wv3(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + x = tf.reshape(x, (num_clips, t, h, w, c)) + frame_features = self.layernorm1(frame_features + x) + + #attn_out = self.word_frame_attention(frame_features, bert_embeddings, mask_t) + # Word-frame attention + x = tf.reshape(frame_features, (num_clips, t, h*w, c)) + x, bert_k, bert_v = self.word_wq(x), self.word_wk(bert_embeddings), self.word_wv(bert_embeddings) + x = dot_product_attention(x, bert_k, bert_v, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + x = tf.reshape(x, (num_clips, t, h, w, c)) + + frame_features = self.layernorm2(frame_features + x) + + return frame_features + + + +# use tf.keras.Model to make it a different model +# Can see summary only after passing an input. 
Simply calling the model won't work +# Gotta pass a sample input to get going +class ConvAttn(tf.keras.layers.Layer): + def __init__(self, num_attention_blocks=4, out_channels=64): + super(ConvAttn, self).__init__() + + self.num_attention_blocks = num_attention_blocks + self.out_channels = out_channels + + # (16, 64, 64, 8) + self.conv1 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # (8, 32, 32, 16) + self.conv2 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (4, 16, 16, 32) + self.conv3 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (2, 8, 8, out_channels) + self.conv4 = tf.keras.layers.Conv3D(filters=self.out_channels, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # LayerNorm and ReLU + self.relu = [] + self.layernorm = [] + for i in range(4): + self.relu.append(tf.keras.layers.Activation('relu')) + self.layernorm.append(tf.keras.layers.LayerNormalization()) + + self.attention = [] + for i in range(self.num_attention_blocks): + self.attention.append(Attention(self.out_channels)) + + def call(self, x, bert_embeddings): + if len(x.shape) != 5: + raise MatrixRankError("x was supposed to be a Rank 5 tensor, i.e, (num_clips, T, H, W, C)") + + assert self.out_channels == bert_embeddings.shape[-2] + + x = self.conv1(x) + x = self.layernorm[0](x) + x = self.relu[0](x) + #print("Conv1 Out Shape : ", x.shape) + + x = self.conv2(x) + x = self.layernorm[1](x) + x = self.relu[1](x) + #print("Conv2 Out Shape : ", x.shape) + + x = self.conv3(x) + x = self.layernorm[2](x) + x = self.relu[2](x) + #print("Conv3 Out Shape : ", x.shape) + + x = self.conv4(x) + x = self.layernorm[3](x) + x = self.relu[3](x) + #print("Conv4 Out Shape : ", x.shape) + + #if np.isnan(np.sum(x)) == np.nan: + # print('-----CONV-----') + + num_clips, t, h, w, c = x.shape + x = tf.reshape(x, (num_clips * t, h, w, c)) + t = num_clips * t + + x = tf.repeat(x, repeats=num_clips, axis=0) + x = tf.reshape(x, (num_clips, t, h, w, c)) + + mask_t = look_ahead_mask(num_clips, (t, h, w, h*w)) + mask_h = look_ahead_mask(num_clips, (t, h, w, t*w)) + mask_w = look_ahead_mask(num_clips, (t, h, w, t*h)) + + for i in range(self.num_attention_blocks): + x = self.attention[i](x, bert_embeddings, mask_t, mask_h, mask_w) + + + return x diff --git a/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py b/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py new file mode 100644 index 00000000..ae4e26c3 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py @@ -0,0 +1,399 @@ +import tensorflow as tf +from .errors import MatrixRankError + +# Using scheme 2 of Microsoft + +# 0 < disc_vals < 1 +class VideoDiscriminator(tf.keras.layers.Layer): + def __init__(self): + super(VideoDiscriminator, self).__init__() + + self.dense1 = tf.keras.layers.Dense(4 * 4 * 256) + + # Input -> (512, 64, 64, 3) + # (512, 64, 64, 8) + self.conv0 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # (256, 64, 64, 16) + self.conv1 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (128, 64, 64, 32) + self.conv2 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (64, 64, 64, 
64) + self.conv3 = tf.keras.layers.Conv3D(filters=64, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (32, 64, 64, 128) + self.conv4 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (16, 64, 64, 256) + self.conv5 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (8, 32, 32, 512) + self.conv6 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (4, 16, 16, 1024) + self.conv7 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (2, 8, 8, 1024) + self.conv8 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (1, 4, 4, 1024) + self.conv9 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # After concat + self.conv10 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + + self.relu = [] + self.batchnorm = [] + for i in range(11): + self.batchnorm.append(tf.keras.layers.BatchNormalization()) + self.relu.append(tf.keras.layers.Activation('relu')) + + self.flatten = tf.keras.layers.Flatten() + + self.dense2 = tf.keras.layers.Dense(128) + + self.dense3 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + + def call(self, v, s): # video and sentence embedding + # s -> sentence vector + if len(v.shape) != 5: + raise MatrixRankError("v must be a Rank 5 Tensor i.e, (batch_size, num_clips * T, H, W, C)") + if len(s.shape) != 2: + raise MatrixRankError("s must be a Rank 2 Tensor i.e, (batch_size, dim)") + + batch_size, t, h, w, c = v.shape + + # Sentence semantic space + s = self.dense1(s) + s = tf.reshape(s, (batch_size, 1, 4, 4, 256)) + + # downscale v + v = self.conv0(v) + v = self.batchnorm[0](v) + v = self.relu[0](v) + + v = self.conv1(v) + v = self.batchnorm[1](v) + v = self.relu[1](v) + + v = self.conv2(v) + v = self.batchnorm[2](v) + v = self.relu[2](v) + + v = self.conv3(v) + v = self.batchnorm[3](v) + v = self.relu[3](v) + + v = self.conv4(v) + v = self.batchnorm[4](v) + v = self.relu[4](v) + + v = self.conv5(v) + v = self.batchnorm[5](v) + v = self.relu[5](v) + + v = self.conv6(v) + v = self.batchnorm[6](v) + v = self.relu[6](v) + + v = self.conv7(v) + v = self.batchnorm[7](v) + v = self.relu[7](v) + + v = self.conv8(v) + v = self.batchnorm[8](v) + v = self.relu[8](v) + + v = self.conv9(v) + v = self.batchnorm[9](v) + v = self.relu[9](v) + + # concat with s + v = tf.concat([v, s], axis=-1) + + v = self.conv10(v) + v = self.batchnorm[10](v) + v = self.relu[10](v) + + v = self.flatten(v) + v = self.dense2(v) + v = self.dense3(v) + + return v + + +# 0 < disc_vals < 1 +class FrameDiscriminator(tf.keras.layers.Layer): + def __init__(self): + super(FrameDiscriminator, self).__init__() + + # 3D conv used instead of 2D, to consider batch size i.e, more than 1 video at a time + # Time dimension technically not considered because stride is always = 1 and kernel size along time is always 1, so temporal relation is not taken into account + + self.dense1 = tf.keras.layers.Dense(4 * 4 * 256) + + # (64, 64, 8) + self.conv0 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) 
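+        # conv0-conv5 form the trunk shared by the frame head (conv_frame) and the motion head
+        # (conv_motion) below; with (1, 3, 3) kernels and unit temporal stride they behave as
+        # per-frame 2D convolutions, so no temporal mixing happens before the two heads split.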
+ + # (32, 32, 16) + self.conv1 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (16, 16, 32) + self.conv2 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (8, 8, 64) + self.conv3 = tf.keras.layers.Conv3D(filters=64, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 128) + self.conv4 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 256) + self.conv5 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + ''' + # (32, 32, 512) + self.conv6 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (16, 16, 1024) + self.conv7 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (8, 8, 1024) + self.conv8 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 1024) + self.conv9 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + ''' + + # After concat (FRAME) + self.conv_frame = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # After concat (MOTION) + self.conv_motion = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + + self.relu = [] + self.batchnorm = [] + for i in range(8): #12 + self.batchnorm.append(tf.keras.layers.BatchNormalization()) + self.relu.append(tf.keras.layers.Activation('relu')) + + self.frame_flatten = tf.keras.layers.Flatten() + + #self.dense_frame_1 = tf.keras.layers.Dense(128) + + self.dense_frame_2 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + self.motion_flatten = tf.keras.layers.Flatten() + + #self.dense_motion_1 = tf.keras.layers.Dense(128) + + self.dense_motion_2 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + + def call(self, v, s): + if len(v.shape) != 5: + raise MatrixRankError("v must be a Rank 5 Tensor i.e, (batch_size, num_clips * T, H, W, C)") + if len(s.shape) != 2: + raise MatrixRankError("s must be a Rank 2 Tensor i.e, (batch_size, dim)") + + batch_size, t, h, w, c = v.shape + + # downscale v + v = self.conv0(v) + v = self.batchnorm[0](v) + v = self.relu[0](v) + + v = self.conv1(v) + v = self.batchnorm[1](v) + v = self.relu[1](v) + + v = self.conv2(v) + v = self.batchnorm[2](v) + v = self.relu[2](v) + + v = self.conv3(v) + v = self.batchnorm[3](v) + v = self.relu[3](v) + + v = self.conv4(v) + v = self.batchnorm[4](v) + v = self.relu[4](v) + + v = self.conv5(v) + v = self.batchnorm[5](v) + v = self.relu[5](v) + ''' + v = self.conv6(v) + v = self.leakyrelu[6](v) + print('7') + + v = self.conv7(v) + v = self.leakyrelu[7](v) + print('8') + + v = self.conv8(v) + v = self.leakyrelu[8](v) + print('9') + + v = self.conv9(v) + v = self.leakyrelu[9](v) # common output + print('10') + ''' + ## Take this output and work for temporal coherence + + ## Frame + # Sentence semantic space + s = self.dense1(s) + s = tf.reshape(s, (batch_size, 1, 4, 4, 256)) + frame_s = tf.repeat(s, repeats=t, axis=1) # time axis next to the batch because 't' frames + # 
Concat and out + frame_out = tf.concat([v, frame_s], axis=-1) + + frame_out = self.conv_frame(frame_out) + frame_out = self.batchnorm[6](frame_out) + frame_out = self.relu[6](frame_out) + # output 1 for each frame so reshape to (batch_size * t, h, w, c) + frame_out = tf.reshape(frame_out, (batch_size * frame_out.shape[1], frame_out.shape[2], frame_out.shape[3], frame_out.shape[4])) + + frame_out = self.frame_flatten(frame_out) # (bs*t, .., .., ..) + #frame_out = self.dense_frame_1(frame_out) # (bs*t, ..) + frame_out = self.dense_frame_2(frame_out) # (bs*t,) + + # reshaped to (batch_size, t) + frame_out = tf.reshape(frame_out, (batch_size, frame_out.shape[0] // batch_size)) # out + # for each frame of each batch, we get an output + + ## Motion + # Sentence Semantic space + motion_s = tf.repeat(s, repeats=t-1, axis=1) + motion_out = tf.subtract(v[:, 1:], v[:, :-1]) + + # Concat and out + motion_out = tf.concat([motion_out, motion_s], axis=-1) + + motion_out = self.conv_motion(motion_out) + motion_out = self.batchnorm[7](motion_out) + motion_out = self.relu[7](motion_out) + + # Scheme 2 (no norm) + motion_out = tf.reshape(motion_out, (batch_size * motion_out.shape[1], motion_out.shape[2], motion_out.shape[3], motion_out.shape[4])) + + motion_out = self.motion_flatten(motion_out) # (bs*(t-1), .., .., ..) + #motion_out = self.dense_motion_1(motion_out) # (bs*(t-1), ..) + motion_out = self.dense_motion_2(motion_out) # (bs*(t-1),) + + # (bs, t-1) + motion_out = tf.reshape(motion_out, (batch_size, motion_out.shape[0] // batch_size)) # out + + return frame_out, motion_out + + +# 0 < disc_vals < 1 +class Discriminator(tf.keras.Model): + def __init__(self): + super(Discriminator, self).__init__() + + self.video_discriminator = VideoDiscriminator() + self.frame_discriminator = FrameDiscriminator() + + def call(self, v, s): + video_disc_out = self.video_discriminator(v, s) + frame_disc_out, motion_disc_out = self.frame_discriminator(v, s) + + return video_disc_out, frame_disc_out, motion_disc_out + diff --git a/Machine_Learning/src/NLP/SignGAN/utils/errors.py b/Machine_Learning/src/NLP/SignGAN/utils/errors.py new file mode 100644 index 00000000..a68004da --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/errors.py @@ -0,0 +1,18 @@ +class Error(Exception): + # base class + pass + +class PaddingError(Error): + def __init__(self, msg): + self.msg = msg + super(PaddingError, self).__init__(self.msg) + +class MatrixRankError(Error): + def __init__(self, msg): + self.msg = msg + super(MatrixRankError, self).__init__(self.msg) + +class DivisionError(Error): + def __init__(self, msg): + self.msg = msg + super(DivisionError, self).__init__(self.msg) diff --git a/Machine_Learning/src/NLP/SignGAN/utils/generator.py b/Machine_Learning/src/NLP/SignGAN/utils/generator.py new file mode 100644 index 00000000..5a0df6b8 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/generator.py @@ -0,0 +1,184 @@ +import numpy as np +import tensorflow as tf +from .conv_attention import ConvAttn + +# '.' added to look outside the directory (else import error) +# We will feed 'z' from outside. 
If it is inside it'll stay constant and won't be random +class CDCGAN(tf.keras.layers.Layer): + def __init__(self): + super(CDCGAN, self).__init__() + + self.dense = tf.keras.layers.Dense(4 * 4 * 4 * 1024) + #self.layernorm = tf.keras.layers.LayerNormalization() + #self.leakyrelu = tf.keras.layers.LeakyReLU() + + self.reshape = tf.keras.layers.Reshape((4, 4, 4, 1024)) + + # (4, 4, 4, 512) + self.deconv1 = tf.keras.layers.Conv3DTranspose(filters=512, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + self.layernorm1 = tf.keras.layers.LayerNormalization() + #self.leakyrelu1 = tf.keras.layers.LeakyReLU() + self.relu1 = tf.keras.layers.Activation('relu') + + # (8, 8, 8, 256) + self.deconv2 = tf.keras.layers.Conv3DTranspose(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + self.layernorm2 = tf.keras.layers.LayerNormalization() + #self.leakyrelu2 = tf.keras.layers.LeakyReLU() + self.relu2 = tf.keras.layers.Activation('relu') + + # attn -> (32, 8, 8, 128) + self.conv1 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm1 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu1 = tf.keras.layers.LeakyReLU() + self.attnrelu1 = tf.keras.layers.Activation('relu') + + # attn -> (16, 8, 8, 256) + self.conv2 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm2 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu2 = tf.keras.layers.LeakyReLU() + self.attnrelu2 = tf.keras.layers.Activation('relu') + + # attn -> (8, 8, 8, 512) + self.conv3 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm3 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu3 = tf.keras.layers.LeakyReLU() + self.attnrelu3 = tf.keras.layers.Activation('relu') + + + # (8, 16, 16, 128) + self.deconv3 = tf.keras.layers.Conv3DTranspose(filters=128, + kernel_size=(3, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + self.layernorm3 = tf.keras.layers.LayerNormalization() + #self.leakyrelu3 = tf.keras.layers.LeakyReLU() + self.relu3 = tf.keras.layers.Activation('relu') + + # (16, 32, 32, 64) + self.deconv4 = tf.keras.layers.Conv3DTranspose(filters=64, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + self.layernorm4 = tf.keras.layers.LayerNormalization() + #self.leakyrelu4 = tf.keras.layers.LeakyReLU() + self.relu4 = tf.keras.layers.Activation('relu') + + # (16, 64, 64, 32) + self.deconv5 = tf.keras.layers.Conv3DTranspose(filters=32, + kernel_size=(3, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + self.layernorm5 = tf.keras.layers.LayerNormalization() + #self.leakyrelu5 = tf.keras.layers.LeakyReLU() + self.relu5 = tf.keras.layers.Activation('relu') + + # (16, 64, 64, 3) + self.deconv6 = tf.keras.layers.Conv3DTranspose(filters=3, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False, + activation='relu') # values > 0 + + + def call(self, z, conv_attn_output): + assert len(z.shape) == 2 # (1, 100) + + # same z repeated for all the clips + z = tf.repeat(z, repeats=conv_attn_output.shape[0], axis=0) + + z = self.dense(z) + #z = self.layernorm(z) + #z = self.leakyrelu(z) + + z = self.reshape(z) + + # Upscaling z + z = self.deconv1(z) + z = self.layernorm1(z) + z = self.relu1(z) + 
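+        # z is (num_clips, 4, 4, 4, 512) here; deconv2 upsamples it to (num_clips, 8, 8, 8, 256)
+        # so that it can be concatenated channel-wise with the downscaled attention output below.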
+ z = self.deconv2(z) + z = self.layernorm2(z) + z = self.relu2(z) # (32, 8, 8, 8, 256) + + # Attn out -> (32, 64, 8, 8, 64) + # Downscale Attention output -> (32, 8, 8, 8, 64) + conv_attn_output = self.conv1(conv_attn_output) + conv_attn_output = self.attnlayernorm1(conv_attn_output) + conv_attn_output = self.attnlayernorm1(conv_attn_output) + + conv_attn_output = self.conv2(conv_attn_output) + conv_attn_output = self.attnlayernorm2(conv_attn_output) + conv_attn_output = self.attnlayernorm2(conv_attn_output) + + conv_attn_output = self.conv3(conv_attn_output) + conv_attn_output = self.attnlayernorm3(conv_attn_output) + conv_attn_output = self.attnlayernorm3(conv_attn_output) + + # Concat condition (downscaled attention output) across channel dimension + z = tf.concat([z, conv_attn_output], axis=-1) + + # upconv to produce 40 clips with 40 as bs, thus generating the whole video + z = self.deconv3(z) + z = self.layernorm3(z) + z = self.relu3(z) + + z = self.deconv4(z) + z = self.layernorm4(z) + z = self.relu4(z) + + z = self.deconv5(z) + z = self.layernorm5(z) + z = self.relu5(z) + + z = self.deconv6(z) + + return z + + +class Generator(tf.keras.Model): + def __init__(self, num_attention_blocks=4, out_channels=64): + super(Generator, self).__init__() + + self.attention = ConvAttn(num_attention_blocks, out_channels) + self.cdcgan = CDCGAN() + + def call(self, x, bert_embeddings, z): + x = self.attention(x, bert_embeddings) + x = self.cdcgan(z, x) + #if np.isnan(np.sum(x)) == np.nan: + # print('-----CDCGAN-----') + + return x diff --git a/Machine_Learning/src/NLP/SignGAN/utils/losses.py b/Machine_Learning/src/NLP/SignGAN/utils/losses.py new file mode 100644 index 00000000..a808347e --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/losses.py @@ -0,0 +1,33 @@ +import tensorflow as tf + +# disc_video_real -> Disc out for Real video matching with the sentence [single value] +# disc_video_wrong -> Disc out wrong pair [single value] +# disc_video_fake -> Disc out for Generated video [single value] +# Mean of whole batch taken + +# Use BCE loss. Refer tf2-GANs +bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) + +@tf.function +def video_loss(disc_video_real, disc_video_wrong, disc_video_fake): + return (bce(tf.ones_like(disc_video_real), disc_video_real) + bce(tf.zeros_like(disc_video_wrong), disc_video_wrong) + bce(tf.zeros_like(disc_video_fake), disc_video_fake)) / 3. + +# disc_frame_real -> Disc out for Real video frames matching with the sentence [Vector of values for each frame] +# disc_frame_wrong -> Disc out wrong pair frames [Vector of values for each frame] +# disc_frame_fake -> Disc out for Generated frames [Vector of values for each frame] +@tf.function +def frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake): + return (bce(tf.ones_like(disc_frame_real), disc_frame_real) + bce(tf.zeros_like(disc_frame_wrong), disc_frame_wrong) + bce(tf.zeros_like(disc_frame_fake), disc_frame_fake)) / 3. + +@tf.function +def motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake): # only for generator + return (bce(tf.ones_like(disc_motion_real), disc_motion_real) + bce(tf.zeros_like(disc_motion_wrong), disc_motion_wrong) + bce(tf.zeros_like(disc_motion_fake), disc_motion_fake)) / 3. + +@tf.function +def discriminator_loss(video_loss, frame_loss, motion_loss): + return (video_loss + frame_loss + motion_loss) / 3. 
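+
+# Matching-aware usage sketch (illustration only; mirrors the pairing used in train.py):
+#   disc_video_real,  disc_frame_real,  disc_motion_real  = discriminator(video_real,  s)  # target 1
+#   disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, s)  # target 0
+#   disc_video_fake,  disc_frame_fake,  disc_motion_fake  = discriminator(video_fake,  s)  # target 0
+#   disc_loss = discriminator_loss(video_loss(disc_video_real, disc_video_wrong, disc_video_fake),
+#                                  frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake),
+#                                  motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake))
+#   gen_loss  = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake)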
+ +# reduce_mean for batch +@tf.function +def generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake): + return (bce(tf.ones_like(disc_video_fake), disc_video_fake) + bce(tf.ones_like(disc_frame_fake), disc_frame_fake) + bce(tf.ones_like(disc_motion_fake), disc_motion_fake)) / 3. diff --git a/Machine_Learning/src/NLP/SignGAN/utils/video.py b/Machine_Learning/src/NLP/SignGAN/utils/video.py new file mode 100644 index 00000000..894e3f66 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/video.py @@ -0,0 +1,94 @@ +import tensorflow as tf +import numpy as np +import glob +import cv2 + +class Video(object): + def __init__(self, T = 16, MAX_VIDEO_LENGTH = 640, FRAME_DIM = (64, 64, 3), VIDEO_DIM = (640, 64, 64, 3), + data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/'): + self.T = T + self.MAX_VIDEO_LENGTH = MAX_VIDEO_LENGTH + self.FRAME_DIM = FRAME_DIM + self.VIDEO_DIM = VIDEO_DIM + self.data_dir = data_dir + + def get_video(self, set_name, name, resize=True, scale_down=True): + vid = [] + for frame in glob.glob(self.data_dir + set_name + '/' + name + '/*.png'): + vid_frame = cv2.imread(frame) + if resize: + vid_frame = cv2.resize(vid_frame, (self.FRAME_DIM[0], self.FRAME_DIM[1])) + if scale_down: + vid_frame = vid_frame / 255. # 0 < pixel values < 1, padding = 0 + vid.append(vid_frame) + return tf.convert_to_tensor(np.array(vid, np.float32)) + + def padding(self, video): + pad_length = self.MAX_VIDEO_LENGTH - video.shape[0] + pad = tf.zeros((pad_length, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=tf.float32) + return tf.concat([video, pad], 0) + + def preprocess_video(self, video): + start_token = tf.fill((self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 256./255) # start token -> 4d array of 0.9 + #print(start_token.shape) + end_token = tf.fill((self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 257./255) # end token -> 4d array of 2.1 + #print(end_token.shape) + extra_token = tf.fill((self.T - video.shape[0] % self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 1.) 
# padding starts only from the nearest 10th, so until the nearest 10, array of 1's + #print(extra_token.shape) + #video = tf.concat([tf.concat([start_token, video], 0), extra_token], 0) + #video = tf.concat([tf.cast(start_token, tf.float32), tf.cast(video, tf.float32), tf.cast(extra_token, tf.float32), tf.cast(end_token, tf.float32)], 0) + video = tf.concat([start_token, video, extra_token, end_token], 0) + #print(video) + + #video = np.append(video, end_token, axis=0) + video = self.padding(video) + return video + + def divide_sequence(self, preprocessed_video): + return tf.reshape(preprocessed_video, (self.MAX_VIDEO_LENGTH//self.T, self.T, preprocessed_video.shape[1], preprocessed_video.shape[2], preprocessed_video.shape[3])) + #return np.array(np.array_split(preprocessed_video, self.MAX_VIDEO_LENGTH//self.T, axis=0)) + + ''' + # may not be needed + def padding_mask(self, current_sequence_length): # so that paddings are not treated as input + division = current_sequence_length // self.T + 2 # includes start_token so 2 + #print(division) + + mask = np.zeros((division, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float) + mask = np.append(mask, np.ones((self.MAX_VIDEO_LENGTH//self.T - division, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float), axis=0) + + return mask + + # may not be needed + def look_ahead_mask(self): + mask = np.zeros((1, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float) + mask = np.append(mask, np.ones((49, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float), axis=0) + mask = np.expand_dims(mask, axis=0) + #print(mask.shape) + for i in range(0, self.MAX_VIDEO_LENGTH - 2 * self.T + 1, self.T): + mask = np.append(mask, np.expand_dims(self.padding_mask(i), axis=0), axis=0) + return mask + ''' + +''' +def main(): + video_obj = Video() + video = video_obj.get_video('train', '05January_2010_Tuesday_tagesschau-2664') + current_sequence_length = video.shape[0] + + video = video_obj.preprocess_video(video) + print(video.shape) + + video = video_obj.divide_sequence(video) + print(video.shape) + + pad_mask = video_obj.padding_mask(current_sequence_length) + print(pad_mask.shape) + + look_mask = video_obj.look_ahead_mask() + print(look_mask.shape) + + +if __name__ == '__main__': + main() +'''
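+
+'''
+# Rough tf.data input-pipeline sketch (an assumption, not wired into train.py): it wraps the
+# per-sample loading done in train.py's main() into a generator so preprocessing can overlap
+# training. The name make_dataset is illustrative; output_signature needs TF >= 2.4.
+def make_dataset(video_obj, video_names, texts, set_name='train'):
+    def gen():
+        for name, text in zip(video_names, texts):
+            video = video_obj.get_video(set_name, name)
+            video = video_obj.preprocess_video(video)
+            video = video_obj.divide_sequence(video)
+            yield video, text
+
+    clip_shape = (video_obj.MAX_VIDEO_LENGTH // video_obj.T, video_obj.T,
+                  video_obj.FRAME_DIM[0], video_obj.FRAME_DIM[1], video_obj.FRAME_DIM[2])
+    return tf.data.Dataset.from_generator(
+        gen,
+        output_signature=(tf.TensorSpec(shape=clip_shape, dtype=tf.float32),
+                          tf.TensorSpec(shape=(), dtype=tf.string))
+    ).prefetch(tf.data.AUTOTUNE)
+'''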