diff --git a/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf b/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf new file mode 100644 index 00000000..dc052ce4 Binary files /dev/null and b/Machine_Learning/src/NLP/SignGAN/Architecture-1 (Transformer).pdf differ diff --git a/Machine_Learning/src/NLP/SignGAN/bert_utils.py b/Machine_Learning/src/NLP/SignGAN/bert_utils.py new file mode 100644 index 00000000..3316b914 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/bert_utils.py @@ -0,0 +1,85 @@ +import tensorflow as tf +import numpy as np +import bert + +#max_sequence_length = 64 + +class Bert(object): + def __init__(self, max_sequence_length=64): + super(Bert, self).__init__() + self.max_seq_length = max_sequence_length + self.model_dir = 'models/multi_cased_L-12_H-768_A-12' + self.model_ckpt = 'models/multi_cased_L-12_H-768_A-12/bert_model.ckpt' + self.vocab_file = 'models/multi_cased_L-12_H-768_A-12/vocab.txt' + + self.model = self.create_bert_model() + self.tokenizer = self.create_bert_tokenizer() + + def create_bert_model(self): + bert_params = bert.params_from_pretrained_ckpt(self.model_dir) + l_bert = bert.BertModelLayer.from_params(bert_params, name="bert") + + l_input_ids = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype='int32') + output = l_bert(l_input_ids) + model = tf.keras.Model(inputs=l_input_ids, outputs=output) + model.build(input_shape=(None, self.max_seq_length)) + + bert.load_stock_weights(l_bert, self.model_ckpt) + + return model + + def create_bert_tokenizer(self): + model_name = 'multi_cased_L-12_H-768_A-12' + do_lower_case = not (model_name.find("cased") == 0 or model_name.find("multi_cased") == 0) + bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, self.model_ckpt) + tokenizer = bert.bert_tokenization.FullTokenizer(self.vocab_file, do_lower_case) + + return tokenizer + + def tokenize(self, sequence): + sequence = self.tokenizer.tokenize(sequence) + return ["[CLS]"] + sequence + ["[SEP]"] + + def get_sequence_ids(self, tokenized_sequence): + sequence_ids = self.tokenizer.convert_tokens_to_ids(tokenized_sequence) + sequence_ids = sequence_ids + [0] * (self.max_seq_length - len(sequence_ids)) # padding + return sequence_ids + + def preprocess(self, sequence): + return self.get_sequence_ids(self.tokenize(sequence)) + + def preprocess_batch(self, sequence_list): + if type(sequence_list) != list: + sequence_list = [sequence_list] + + preprocessed_sequence_ids = [] + for sequence in sequence_list: + preprocessed_sequence_ids.append(self.preprocess(sequence)) + return np.array(preprocessed_sequence_ids) + + def __call__(self, sequence_list): + #if type(sequence_list) != list or sequence_list: + # sequence_list = [sequence_list] + preprocessed_sequence_ids = self.preprocess_batch(sequence_list) + output = self.model.predict(preprocessed_sequence_ids) # (?, 64, 768) for all sentences in batch + word_embeddings = output # (?, 64, 768) + + sentence_embeddings = [] + for sentence in output: + sentence_vector = sentence[0] # feature vector for [CLS] is the sentence vector for each sentence + sentence_embeddings.append(sentence_vector) + + sentence_embeddings = np.array(sentence_embeddings) # (?, 768) + + return word_embeddings, sentence_embeddings + +''' +def main(): + bert_model = Bert(64) + word_embeddings, sentence_embeddings = bert_model.predict(['sonst wechselhaft mit schauern und gewittern die uns auch am wochenende begleiten', + 'und nun die wettervorhersage für morgen donnerstag den zwölften august']) + 
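+    # NOTE: the Bert class above defines __call__ (plus preprocess/preprocess_batch) rather than
+    # a predict() method, so with the current interface the call would be:
+    #     word_embeddings, sentence_embeddings = bert_model([...])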
print(word_embeddings.shape, sentence_embeddings.shape) + +if __name__ == "__main__": + main() +''' \ No newline at end of file diff --git a/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md b/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md new file mode 100644 index 00000000..12c94828 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/models/multi_cased_L-12_H-768_A-12/README.md @@ -0,0 +1 @@ +Multilingual BERT was used as German Text Encoder diff --git a/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md b/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md new file mode 100644 index 00000000..55de7852 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/phoenix-2014-T.v3/README.md @@ -0,0 +1,2 @@ +Dataset stored here. +Look at utils/video.py for location specifics diff --git a/Machine_Learning/src/NLP/SignGAN/readme.md b/Machine_Learning/src/NLP/SignGAN/readme.md new file mode 100644 index 00000000..c8648d54 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/readme.md @@ -0,0 +1 @@ +# SignGAN diff --git a/Machine_Learning/src/NLP/SignGAN/signgan.ipynb b/Machine_Learning/src/NLP/SignGAN/signgan.ipynb new file mode 100644 index 00000000..d5b04cd0 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/signgan.ipynb @@ -0,0 +1,589 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "research", + "display_name": "research" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SignGAN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing the Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "import glob\n", + "import os\n", + "\n", + "from bert_utils import Bert\n", + "from utils.video import Video\n", + "from utils.conv_attention import *\n", + "from utils.generator import *\n", + "from utils.discriminator import *\n", + "from utils.losses import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "config = tf.compat.v1.ConfigProto()\n", + "config.gpu_options.allow_growth = True\n", + "sess = tf.compat.v1.Session(config=config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pretrained BERT Model\n", + "Multilingual Cased BERT is used" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Done loading 196 BERT weights from: models/multi_cased_L-12_H-768_A-12/bert_model.ckpt into (prefix:bert). Count of weights not found in the checkpoint was: [0]. 
Count of weights with mismatched shape: [0]\nUnused weights from checkpoint: \n\tbert/embeddings/token_type_embeddings\n\tbert/pooler/dense/bias\n\tbert/pooler/dense/kernel\n\tcls/predictions/output_bias\n\tcls/predictions/transform/LayerNorm/beta\n\tcls/predictions/transform/LayerNorm/gamma\n\tcls/predictions/transform/dense/bias\n\tcls/predictions/transform/dense/kernel\n\tcls/seq_relationship/output_bias\n\tcls/seq_relationship/output_weights\n(1, 64, 768) (1, 768)\n" + } + ], + "source": [ + "bert = Bert()\n", + "word_embeddings, sentence_embeddings = bert.predict(['sonst wechselhaft mit schauern und gewittern die uns auch am wochenende begleiten'])\n", + "print(word_embeddings.shape, sentence_embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Global Variables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "num_clips = 32\n", + "T = 16 # let\n", + "MAX_VIDEO_LENGTH = 512 # 475 is the longest\n", + "FRAME_DIM = (64, 64, 3)\n", + "VIDEO_DIM = (512, 64, 64, 3)\n", + "data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Video Object\n", + "with example from training set" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(640, 64, 64, 3)\n(40, 16, 64, 64, 3)\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": "'\\npad_mask = video_obj.padding_mask(current_sequence_length)\\nprint(pad_mask.shape)\\n\\nlook_mask = video_obj.look_ahead_mask()\\nprint(look_mask.shape)\\n'" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "video_obj = Video(T, MAX_VIDEO_LENGTH, FRAME_DIM, VIDEO_DIM, data_dir)\n", + "\n", + "video_real = video_obj.get_video('train', '05January_2010_Tuesday_tagesschau-2664')\n", + "current_sequence_length = video_real.shape[0]\n", + "\n", + "video_real = video_obj.preprocess_video(video_real)\n", + "print(video_real.shape)\n", + "\n", + "video_real = video_obj.divide_sequence(video_real)\n", + "print(video_real.shape)\n", + "\n", + "video_wrong = video_obj.get_video('train', '03June_2011_Friday_tagesschau-7649')\n", + "current_sequence_length = video_wrong.shape[0]\n", + "\n", + "video_wrong = video_obj.preprocess_video(video_wrong)\n", + "print(video_wrong.shape)\n", + "\n", + "video_wrong = video_obj.divide_sequence(video_wrong)\n", + "print(video_wrong.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Attention\n", + "* First, perform convolutions across the whole video, then go for separable self attention for the whole video masked, i.e, (40, 40T, H, W, C), outputs (40, 40T, H, W, C), where T=16 and 40=640//T. 640 being the length of the whole video.\n", + "* Convolution operations will be restricted to only T frames at a time. There will be no intermingling of 2 or more sets of T frames. Thus 3D CONV will take care of extracting local, temporal and spatial features only. Hack : Batched\n", + "* After the conv operations there will be MAX_SEQ_LENGTH // T i.e, 40 (here) elements each of size (T, H, W, C), making the output of convolution ops, to be of dim -> (40, T, H, W, C) converted to (40T, H, W, C). This will be passed through attention blocks and outputs (40, 40T, H, W, C) along with masking.\n", + "* This attention performs masked attention. 
We will have MAX_SEQ_LENGTH // T i.e, 40 (here) masks in total each for time, height, width. Each mask being of shape (40, 40T, H, W, H*W) for time, W*T for height and H*T for width.\n", + "* Only after the whole video is generated are the losses calculated, and backpropped.\n", + "* During testing just \"start\" token will be provided and the rest of the sequence will be just padding, and each time the generator produces T frames, those T frames will be concatenated along with the \"start\" token and then convoluted again to produce the next T frames.\n", + "\n", + "### Attention mechanism will require residual connections otherwise gradients will vanish\n", + "\n", + "## Word Frame Attention\n", + "* Last dimension of both masked_attention_output and semantic_word_matrix must be same\n", + "* 2nd last dimenstion i.e, 1st dimenstion of semantic_word_matrix should equal to H*W of masked-separable-self-attention output \n", + "* that is, we bring both to a common semantic space using conv for frames and dense for embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(4, 16, 8, 8, 3)\n" + } + ], + "source": [ + "attention = Attention(channel_dimension=3)\n", + "\n", + "num_clips = 32\n", + "t, h, w, c = 16, 8, 8, 3\n", + "\n", + "x = tf.random.normal(shape=(t, h, w, c), mean=0.5, stddev=0.5)\n", + "\n", + "# this step is done only before the 1st attention block\n", + "x = tf.reshape(tf.repeat(x, repeats=num_clips, axis=0), (num_clips, t, h, w, c))\n", + "\n", + "mask_t = look_ahead_mask(num_clips, (t, h, w, h*w))\n", + "mask_h = look_ahead_mask(num_clips, (t, h, w, t*w))\n", + "mask_w = look_ahead_mask(num_clips, (t, h, w, t*h))\n", + "\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "\n", + "print(x.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conv-Attention Block\n", + "* The input (640, 64, 64, 3) video will be explicitly split into (40, 16, 64, 64, 3).\n", + "* HACK : This reshaped input will be fed to 3D conv block with batch size being 40(hack), for local spatial and temporal feature extraction\n", + "* Conv Block output will be (40, 16, 8, 8, 64), which will be reshaped to (640, 8, 8, 64) and sent forward for masked-separable-self-attention followed by word-frame-attention for a few number of times (Attention Block), with addition and layer normalisation after each attention block\n", + "* Conv Block : \n", + " * Format : num_filters, kernel, strides, padding\n", + " * {8, (3, 3, 3), (2, 2, 2), same} -> {16, (3, 3, 3), (2, 2, 2), same} -> {32, (3, 3, 3), (2, 2, 2), same} -> {out_channels(64), (3, 3, 3), (2, 2, 2), same}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Overall Video : (64, 64, 64, 3)\nInput Video Shape : (4, 16, 64, 64, 3)\nAttention Output Shape : (4, 8, 8, 8, 64)\n" + } + ], + "source": [ + "num_clips = 4 # instead of 32\n", + "t, h, w, c = 16, 64, 64, 3\n", + "print(\"Overall Video : \", (num_clips * t, h, w, c))\n", + "print(\"Input Video Shape : \", (num_clips, t, h, w, c))\n", + "\n", + "x = tf.random.normal(shape=(num_clips, t, h, w, c), mean=0.5, stddev=0.5)\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "\n", + "conv_attn = ConvAttn(num_attention_blocks=4)\n", + "x = conv_attn(x, word_embeddings)\n", + "print(\"Attention Output Shape : \", x.shape)" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditional GAN (3D)\n", + "* The conditions generated by ConvAttn block will be used along ith upscaled randomness to generate a video with T frames for each batch of size num_clips(40)\n", + "* use LayerNorm instead of BatchNorm, because BatchNorm outputs NaN because of the padding and masking\n", + "* In the future if this fails, even this block may have attention blocks in the intermediate layers\n", + "* Upsampling z : \n", + "* Concatenate upsampled z with conv_attn_output\n", + "* upsample to produce the video\n", + "* this 'z' will be upscaled to num_clips x (num_clips x t, h, w, channels) and concatenated with conv_attn_output, along the channel dimension, which together will be upscaled using deconv to produce a 40 x (16, 64, 64, 3) video. Hack : Use 40 as batch size throughout\n", + "* We will feed 'z' from outside. If it is inside it'll stay constant and won't be random" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Output Shape : (4, 16, 64, 64, 3)\nOutput Video Shape : (64, 64, 64, 3)\n" + } + ], + "source": [ + "z = tf.random.normal(shape=(1, 100))\n", + "cdcgan = CDCGAN()\n", + "z = cdcgan(z, x)\n", + "print(\"Output Shape : \", z.shape)\n", + "print(\"Output Video Shape : \", (z.shape[0] * z.shape[1], z.shape[2], z.shape[3], z.shape[4]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thus output video is in the shape we wanted" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Generator\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(64, 768)\nConv1 Out Shape : (32, 16, 64, 64, 8)\nConv2 Out Shape : (32, 8, 32, 32, 16)\nConv3 Out Shape : (32, 4, 16, 16, 32)\nConv4 Out Shape : (32, 2, 8, 8, 64)\n(32, 8, 8, 8, 256) (32, 8, 8, 8, 512)\n(32, 16, 64, 64, 3)\n" + } + ], + "source": [ + "num_clips = 32 # instead of 32\n", + "t, h, w, c = 16, 64, 64, 3\n", + "\n", + "x = tf.random.normal(shape=(num_clips, t, h, w, c), mean=0.5, stddev=0.5)\n", + "word_embeddings = tf.squeeze(word_embeddings)\n", + "z = tf.random.normal(shape=(1, 100))\n", + "\n", + "generator = Generator()\n", + "x = generator(x, word_embeddings, z)\n", + "print(x.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Discriminators\n", + "* Batched\n", + "* For discriminators we also consider the batch dimension\n", + "* Before the video goes into the discriminator we have to reshape the video to (1, num_clips * t, h, w, c), 1 -> batch_size" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Video Discriminator\n", + "* Gotta setup proper input pipelines for discriminator training\n", + "* 64, 64, 64, 3 -> 32, 64, 64, 16 -> 16, 64, 64, 32 -> 8, 32, 32, 64 -> 4, 16, 16, 128 -> 2, 8, 8, 256 -> 1, 4, 4, 512" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(\n[[0.45877308]\n [0.6192273 ]], shape=(2, 1), dtype=float32)\n" + } + ], + "source": [ + "v = tf.random.normal((2, 32 * 16, 64, 64, 3)) # batch dimension considered \n", + "s = tf.random.normal((2, 768))\n", + "\n", + "video_disc = VideoDiscriminator()\n", + "vid_disc_out = video_disc(v, s)\n", + 
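+    "# Note: VideoDiscriminator downsamples the (32*16, 64, 64, 3) clip stack to a (1, 4, 4, 1024)\n",
+    "# volume and concatenates the sentence embedding projected to (1, 4, 4, 256) before scoring,\n",
+    "# so v must already be reshaped to (batch_size, num_clips * T, H, W, C) as done above.\n",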
"print(vid_disc_out) # 2 outputs for batch_size = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Frame Discriminator\n", + "* 2-fold\n", + "* Outputs \"{0 ... 1}\" for single frame level\n", + "* Outputs temporal (difference between 2 consecutive frames in euclidean norm, for each pair) downscaled as output, i.e, 1 number as output per pair of consecutive frames\n", + "* One part of both the discriminators are shared" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "(2, 8) (2, 7)\n" + } + ], + "source": [ + "# Total frames kept 8 instead of 640 because of ResourceExhaustError\n", + "v = tf.random.normal((2, 8, 64, 64, 3)) # kept batch_size = 2 here\n", + "s = tf.random.normal((2, 768))\n", + "\n", + "frame_disc = FrameDiscriminator()\n", + "frame_disc_out, motion_disc_out = frame_disc(v, s)\n", + "\n", + "print(frame_disc_out.shape, motion_disc_out.shape)\n", + "# Thus 2 outputs of each frame and motion disc, for batch_size=2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Full Discriminator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "discriminator = Discriminator()\n", + "\n", + "v = tf.random.normal((1, 32 * 16, 64, 64, 3)) # batch dimension considered \n", + "s = tf.random.normal((1, 768)) # keeping batch_size = 1\n", + "\n", + "video_disc_out, frame_disc_out, motion_disc_out = discriminator(v, s)\n", + "print(video_disc_out.shape, frame_disc_out.shape, motion_disc_out.shape)\n", + "print(discriminator.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Losses\n", + "* Matching Aware Losses\n", + "* Output of motion discriminator is a bit high in value (though only used for the generator)\n", + "* Using Scheme 2 of Microsoft" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(0.7362369, shape=(), dtype=float32)\ntf.Tensor(0.7079885, shape=(), dtype=float32)\ntf.Tensor(0.68915033, shape=(), dtype=float32)\n" + } + ], + "source": [ + "# Doing with the same ones, but won't be the case\n", + "vid_loss = video_loss(vid_disc_out, vid_disc_out, vid_disc_out)\n", + "print(vid_loss)\n", + "fr_loss = frame_loss(frame_disc_out, frame_disc_out, frame_disc_out)\n", + "print(fr_loss)\n", + "mot_loss = motion_loss(motion_disc_out, motion_disc_out, motion_disc_out)\n", + "print(mot_loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "tf.Tensor(0.67633134, shape=(), dtype=float32)\ntf.Tensor(0.20451142, shape=(), dtype=float32)\n" + } + ], + "source": [ + "print(discriminator_loss(vid_loss, fr_loss, mot_loss))\n", + "print(generator_loss(vid_disc_out, frame_disc_out, motion_disc_out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train Step" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Takes correct video, wrong video and word embeddings, sentence\n", + "# all of this must be preprocessed (padded and stuff)\n", + "# Video must be explicitly divided into T frames\n", + "def train_step(video_real, video_wrong, w, s):\n", + " num_clips, t, h, w, c = 
video_real.shape\n", + "\n", + " w = tf.squeeze(w)\n", + " z = tf.random.normal(shape=(1, 100))\n", + " \n", + " with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n", + " video_fake = generator(video_real, w, z)\n", + "\n", + " # All frames put together with bs = 1\n", + " video_real = tf.reshape(video_real, (1, num_clips * t, h, w, c))\n", + " video_wrong = tf.reshape(video_wrong, (1, num_clips * t, h, w, c))\n", + " video_fake = tf.reshape(video_fake, (1, num_clips * t, h, w, c))\n", + "\n", + " # Discriminator out\n", + " disc_video_real, disc_frame_real, disc_motion_real = discriminator(video_real, s)\n", + " disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, s)\n", + " disc_video_fake, disc_frame_fake, disc_motion_fake = discriminator(video_fake, s)\n", + "\n", + " # Losses\n", + " total_video_loss = video_loss(disc_video_real, disc_video_wrong, disc_video_fake)\n", + " total_frame_loss = frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake)\n", + " total_motion_loss = motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake)\n", + "\n", + " disc_loss = discriminator_loss(total_video_loss, total_frame_loss, total_motion_loss)\n", + " gen_loss = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake)\n", + "\n", + " gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)\n", + " gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)\n", + "\n", + " generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))\n", + " discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimizers and Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generator = Generator()\n", + "discriminator = Discriminator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learning_rate = 0.0002 # Following microsoft\n", + "generator_optimizer = tf.keras.optimizers.Adam(learning_rate) # rest default\n", + "discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate) # rest default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n", + " discriminator_optimizer=discriminator_optimizer,\n", + " generator=generator,\n", + " discriminator=discriminator)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_step(video_real[:8], video_wrong[:8], word_embeddings, sentence_embeddings)" + ] + } + ] +} \ No newline at end of file diff --git a/Machine_Learning/src/NLP/SignGAN/train.py b/Machine_Learning/src/NLP/SignGAN/train.py new file mode 100644 index 00000000..a8d1b588 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/train.py @@ -0,0 +1,115 @@ +import tensorflow as tf +import numpy as np +import glob +import time +import os + +from bert_utils import Bert +from utils.video import Video +from utils.conv_attention import * +from utils.generator import * +from utils.discriminator import * +from utils.losses import * + +# Optional +config = 
tf.compat.v1.ConfigProto() +config.gpu_options.allow_growth = True +sess = tf.compat.v1.Session(config=config) + +# Models +bert = Bert() +generator = Generator() +discriminator = Discriminator() + +num_clips = 32 +T = 16 # let +MAX_VIDEO_LENGTH = 512 # 475 is the longest +FRAME_DIM = (64, 64, 3) +VIDEO_DIM = (512, 64, 64, 3) +data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/' +video_obj = Video(T, MAX_VIDEO_LENGTH, FRAME_DIM, VIDEO_DIM, data_dir) + +EPOCHS = 10000 + +# Otimizers +learning_rate = 0.000001 +generator_optimizer = tf.keras.optimizers.Adam(learning_rate) +discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate) + +# ckpt +checkpoint_dir = './training_checkpoints' +checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") +checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer, + discriminator_optimizer=discriminator_optimizer, + generator=generator, + discriminator=discriminator) + +# Takes correct video, wrong video and word embeddings, sentence +# all of this must be preprocessed (padded and stuff) +# Video must be explicitly divided into T frames + +def train_step(video_real, video_wrong, text): + num_clips, t, h, w, c = video_real.shape + word, sentence = bert([text]) + word, sentence = tf.convert_to_tensor(word), tf.convert_to_tensor(sentence) + word = tf.squeeze(word) + z = tf.random.normal(shape=(1, 100)) + + with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape: + video_fake = generator(video_real, word, z) + + # All frames put together with bs = 1 + video_real = tf.reshape(video_real, (1, num_clips * t, h, w, c)) + video_wrong = tf.reshape(video_wrong, (1, num_clips * t, h, w, c)) + video_fake = tf.reshape(video_fake, (1, num_clips * t, h, w, c)) + + # Discriminator out + disc_video_real, disc_frame_real, disc_motion_real = discriminator(video_real, sentence) + disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, sentence) + disc_video_fake, disc_frame_fake, disc_motion_fake = discriminator(video_fake, sentence) + + # Losses + total_video_loss = video_loss(disc_video_real, disc_video_wrong, disc_video_fake) + total_frame_loss = frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake) + total_motion_loss = motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake) + + disc_loss = discriminator_loss(total_video_loss, total_frame_loss, total_motion_loss) + gen_loss = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake) + + gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables) + gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables) + + generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables)) + discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables)) + + +def main(): + dataset = open('phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/annotations/manual/PHOENIX-2014-T.train.corpus.csv', encoding='utf-8') + dataset = dataset.readlines()[1:] + + text_data = [i.split('|')[-1][:-1] for i in dataset] # [:-1] to remove '\n' at the end + video_data = [i.split('|')[0] for i in dataset] + + for epoch in range(EPOCHS): + start = time.time() + for text, video in zip(text_data, video_data): + video_real = video_obj.get_video('train', video) + video_real = video_obj.preprocess_video(video_real) + video_real = video_obj.divide_sequence(video_real) + + # Random video from 
dataset as the wrong video + video_wrong = video_obj.get_video('train', video_data[np.random.randint(0, len(video_data))]) + video_wrong = video_obj.preprocess_video(video_wrong) + video_wrong = video_obj.divide_sequence(video_wrong) + + train_step(video_real, video_wrong, text) + + if (epoch + 1) % 10 == 0: + checkpoint.save(file_prefix=checkpoint_prefix) + + print("Epoch {0} :- Time : {1}".format(epoch + 1, time.time() - start)) + + + +if __name__ == '__main__': + main() diff --git a/Machine_Learning/src/NLP/SignGAN/utils/__init__.py b/Machine_Learning/src/NLP/SignGAN/utils/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/__init__.py @@ -0,0 +1 @@ + diff --git a/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py b/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py new file mode 100644 index 00000000..0ad71cdd --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/conv_attention.py @@ -0,0 +1,299 @@ +import tensorflow as tf +import numpy as np + +from .errors import * + +@tf.function +def masking(shape, position): + if position > shape[0]: + raise PaddingError("padding position exceeds limits. position must be < shape[0]") + + if len(shape) != 4: + raise MatrixRankError("Shape must be of Rank 4 i.e, (T, H, W, val), val -> H*W or, T*W or, T*H") + + # mask only for time dimension, size -> (t, h*w, h*w) + ''' + zero = tf.zeros((position, shape[1], shape[2], shape[3]), dtype=tf.float32) + fill = tf.fill((shape[0]-position, shape[1], shape[2], shape[3]), -np.inf) + mask = tf.concat([zero, fill], 0) + ''' + + return tf.concat([tf.zeros((position, shape[1], shape[2], shape[3]), dtype=tf.float32), + tf.fill((shape[0]-position, shape[1], shape[2], shape[3]), -np.inf)], 0) + + #return mask + +@tf.function +def look_ahead_mask(num_mask, shape): + if shape[0] % num_mask != 0: + raise DivisionError("Time dimension must be divisible by number of masks") + if len(shape) != 4: + raise MatrixRankError("Shape must be of Rank 4 i.e, (T, H, W, val), val -> H*W or, T*W or, T*H") + + mask = tf.expand_dims(masking(shape, 1 * shape[0] // num_mask), 0) + for mask_pos in range(1, num_mask): # shape[0] = 500 + mask = tf.concat([mask, tf.expand_dims(masking(shape, (mask_pos+1) * shape[0] // num_mask), 0)], 0) + + return mask # shape -> (50, T, H, W, C) + +@tf.function +def dot_product_attention(q, k, v, mask): + # Number of columns of q must be equal to number of columns of k + # i.e the last dimension must be same + + # This if-else block is for self attention and, word embedding attention + if len(k.shape) == 2: + k_T = tf.transpose(k) + else: + k_T = tf.transpose(k, perm=[0, 1, 3, 2]) # which axis will be at which place specified + + qv_correlations = tf.matmul(q, k_T) + + if mask is not None: + num_clips, qv_1, qv_2, qv_3 = qv_correlations.shape + ''' + try: + mask = tf.reshape(mask, (num_clips, qv_1, qv_2, qv_3)) + except: + raise ValueError("Reshaped Mask does not match 'matmul(Q, K.T)' in shape") + ''' + qv_correlations += mask + #print(qv_correlations) + + return tf.matmul(tf.nn.softmax(qv_correlations, axis=0), v) + + +## Word Frame Attention +# Last dimension of both masked_attention_output and semantic_word_matrix must be same +# 2nd last dimenstion i.e, 1st dimenstion of semantic_word_matrix should equal to H*W of masked-separable-self-attention output because the same mask_t will be used +# that is, we bring both to a common semantic space using conv for frames and dense for embeddings +class 
Attention(tf.keras.layers.Layer): + # tf.keras.layers.Dense changes the last dimenstion of n-d matrix + def __init__(self, channel_dimension=64): + # downsample to H*W = 64 i.e, H = 8, W = 8, before the attention block. + # common semantic space = 64, i.e, channel_dimension = 64 + # result after tf.Dense Layer : 1. semantic_word_matrix -> (64, 64) 2. Downsampled masked/repeated video -> (50, 500, 8, 8, 64) + + super(Attention, self).__init__() + + self.channel_dimension = channel_dimension + + # For separable self attention + self.sep_wq1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq1') + self.sep_wk1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk1') + self.sep_wv1 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv1') + + self.sep_wq2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq2') + self.sep_wk2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk2') + self.sep_wv2 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv2') + + self.sep_wq3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wq3') + self.sep_wk3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wk3') + self.sep_wv3 = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='sep_wv3') + + # For word-frame attention + self.word_wq = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wq') + self.word_wk = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wk') + self.word_wv = tf.keras.layers.Dense(self.channel_dimension, use_bias=False, name='word_wv') + + # batchnorm causes nan because of the padding and masking, so layernorm + self.layernorm1 = tf.keras.layers.LayerNormalization() + self.layernorm2 = tf.keras.layers.LayerNormalization() + + ''' + def separable_attention(self, x, mask_t, mask_h, mask_w): + try: + num_clips, t, h, w, c = x.shape + except: + raise MatrixRankError("x must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5 or len(mask_h.shape) != 5 or len(mask_h.shape) != 5: + raise MatrixRankError("masks must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + x = tf.reshape(x, (num_clips, t, h*w, c)) + xq, xk, xv = self.sep_wq1(x), self.sep_wk1(x), self.sep_wv1(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) # self Attention + #print(x.shape) + + x = tf.reshape(x, (num_clips, h, t*w, c)) + xq, xk, xv = self.sep_wq2(x), self.sep_wk2(x), self.sep_wv2(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) # self Attention + + x = tf.reshape(x, (num_clips, w, t*h, c)) + xq, xk, xv = self.sep_wq3(x), self.sep_wk3(x), self.sep_wv3(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + return tf.reshape(x, (num_clips, t, h, w, c)) + ''' + ''' + def separable_attention(self, x, mask_t, mask_h, mask_w, xq_t, xk_t, xv_t, xq_h, xk_h, xv_h, xq_w, xk_w, xv_w): + try: + num_clips, t, h, w, c = x.shape + except: + raise MatrixRankError("x must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5 or len(mask_h.shape) != 5 or len(mask_h.shape) != 5: + raise MatrixRankError("masks must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + x = dot_product_attention(xq_t, xk_t ,xv_t, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) # self 
Attention + + x = dot_product_attention(xq_h, xk_h ,xv_h, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) # self Attention + + x = dot_product_attention(xq_w, xk_w ,xv_w, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + return tf.reshape(x, (num_clips, t, h, w, c)) + ''' + ''' + def word_frame_attention(self, frame_features, bert_embeddings, mask_t): # only across time + try: + num_clips, t, h, w, c = frame_features.shape + except: + raise MatrixRankError("frame_features must be of rank 5 i.e, (num_clips, t, h, w, c)") + + if len(mask_t.shape) != 5: + raise MatrixRankError("mask_t must be of rank 5 i.e, (num_clips, t, h, w, val), where val -> H*W or, T*W or, T*H") + + frame_features = tf.reshape(frame_features, (num_clips, t, h*w, c)) + + frame_features, bert_embeddings, bert_embeddings = self.word_wq(frame_features), self.word_wk(bert_embeddings), self.word_wv(bert_embeddings) + + frame_features = dot_product_attention(frame_features, bert_embeddings, bert_embeddings, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + + return tf.reshape(frame_features, (num_clips, t, h, w, c)) + ''' + + def call(self, frame_features, bert_embeddings, mask_t, mask_h, mask_w): + # make sure that q, k, v have gone through tf.expand_dims, because of the batch simension thing + # for 2nd point above. if q = v that means self attention, hence, q != v + assert (frame_features.shape[-2] * frame_features.shape[-3]) == bert_embeddings.shape[-2] # for 2nd point above. if q = v that means self attention, hence, q != v + # 8*8 == 64 + + #attn_out = self.separable_attention(frame_features, mask_t, mask_h, mask_w) + # Separable self-attn + try: + num_clips, t, h, w, c = frame_features.shape + except: + raise MatrixRankError("frame_features must be of rank 5 i.e, (num_clips, t, h, w, c)") + + x = tf.reshape(frame_features, (num_clips, t, h*w, c)) + xq, xk, xv = self.sep_wq1(x), self.sep_wk1(x), self.sep_wv1(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + + x = tf.reshape(x, (num_clips, h, t*w, c)) + xq, xk, xv = self.sep_wq2(x), self.sep_wk2(x), self.sep_wv2(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_h, (num_clips, h, t*w, t*w))) + + x = tf.reshape(x, (num_clips, w, t*h, c)) + xq, xk, xv = self.sep_wq3(x), self.sep_wk3(x), self.sep_wv3(x) + x = dot_product_attention(xq, xk ,xv, tf.reshape(mask_w, (num_clips, w, t*h, t*h))) # self Attention + + x = tf.reshape(x, (num_clips, t, h, w, c)) + frame_features = self.layernorm1(frame_features + x) + + #attn_out = self.word_frame_attention(frame_features, bert_embeddings, mask_t) + # Word-frame attention + x = tf.reshape(frame_features, (num_clips, t, h*w, c)) + x, bert_k, bert_v = self.word_wq(x), self.word_wk(bert_embeddings), self.word_wv(bert_embeddings) + x = dot_product_attention(x, bert_k, bert_v, tf.reshape(mask_t, (num_clips, t, h*w, h*w))) + x = tf.reshape(x, (num_clips, t, h, w, c)) + + frame_features = self.layernorm2(frame_features + x) + + return frame_features + + + +# use tf.keras.Model to make it a different model +# Can see summary only after passing an input. 
Simply calling the model won't work +# Gotta pass a sample input to get going +class ConvAttn(tf.keras.layers.Layer): + def __init__(self, num_attention_blocks=4, out_channels=64): + super(ConvAttn, self).__init__() + + self.num_attention_blocks = num_attention_blocks + self.out_channels = out_channels + + # (16, 64, 64, 8) + self.conv1 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # (8, 32, 32, 16) + self.conv2 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (4, 16, 16, 32) + self.conv3 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (2, 8, 8, out_channels) + self.conv4 = tf.keras.layers.Conv3D(filters=self.out_channels, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # LayerNorm and ReLU + self.relu = [] + self.layernorm = [] + for i in range(4): + self.relu.append(tf.keras.layers.Activation('relu')) + self.layernorm.append(tf.keras.layers.LayerNormalization()) + + self.attention = [] + for i in range(self.num_attention_blocks): + self.attention.append(Attention(self.out_channels)) + + def call(self, x, bert_embeddings): + if len(x.shape) != 5: + raise MatrixRankError("x was supposed to be a Rank 5 tensor, i.e, (num_clips, T, H, W, C)") + + assert self.out_channels == bert_embeddings.shape[-2] + + x = self.conv1(x) + x = self.layernorm[0](x) + x = self.relu[0](x) + #print("Conv1 Out Shape : ", x.shape) + + x = self.conv2(x) + x = self.layernorm[1](x) + x = self.relu[1](x) + #print("Conv2 Out Shape : ", x.shape) + + x = self.conv3(x) + x = self.layernorm[2](x) + x = self.relu[2](x) + #print("Conv3 Out Shape : ", x.shape) + + x = self.conv4(x) + x = self.layernorm[3](x) + x = self.relu[3](x) + #print("Conv4 Out Shape : ", x.shape) + + #if np.isnan(np.sum(x)) == np.nan: + # print('-----CONV-----') + + num_clips, t, h, w, c = x.shape + x = tf.reshape(x, (num_clips * t, h, w, c)) + t = num_clips * t + + x = tf.repeat(x, repeats=num_clips, axis=0) + x = tf.reshape(x, (num_clips, t, h, w, c)) + + mask_t = look_ahead_mask(num_clips, (t, h, w, h*w)) + mask_h = look_ahead_mask(num_clips, (t, h, w, t*w)) + mask_w = look_ahead_mask(num_clips, (t, h, w, t*h)) + + for i in range(self.num_attention_blocks): + x = self.attention[i](x, bert_embeddings, mask_t, mask_h, mask_w) + + + return x diff --git a/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py b/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py new file mode 100644 index 00000000..ae4e26c3 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/discriminator.py @@ -0,0 +1,399 @@ +import tensorflow as tf +from .errors import MatrixRankError + +# Using scheme 2 of Microsoft + +# 0 < disc_vals < 1 +class VideoDiscriminator(tf.keras.layers.Layer): + def __init__(self): + super(VideoDiscriminator, self).__init__() + + self.dense1 = tf.keras.layers.Dense(4 * 4 * 256) + + # Input -> (512, 64, 64, 3) + # (512, 64, 64, 8) + self.conv0 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # (256, 64, 64, 16) + self.conv1 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (128, 64, 64, 32) + self.conv2 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (64, 64, 64, 
64) + self.conv3 = tf.keras.layers.Conv3D(filters=64, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (32, 64, 64, 128) + self.conv4 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (16, 64, 64, 256) + self.conv5 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + # (8, 32, 32, 512) + self.conv6 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (4, 16, 16, 1024) + self.conv7 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (2, 8, 8, 1024) + self.conv8 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # (1, 4, 4, 1024) + self.conv9 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + # After concat + self.conv10 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + + self.relu = [] + self.batchnorm = [] + for i in range(11): + self.batchnorm.append(tf.keras.layers.BatchNormalization()) + self.relu.append(tf.keras.layers.Activation('relu')) + + self.flatten = tf.keras.layers.Flatten() + + self.dense2 = tf.keras.layers.Dense(128) + + self.dense3 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + + def call(self, v, s): # video and sentence embedding + # s -> sentence vector + if len(v.shape) != 5: + raise MatrixRankError("v must be a Rank 5 Tensor i.e, (batch_size, num_clips * T, H, W, C)") + if len(s.shape) != 2: + raise MatrixRankError("s must be a Rank 2 Tensor i.e, (batch_size, dim)") + + batch_size, t, h, w, c = v.shape + + # Sentence semantic space + s = self.dense1(s) + s = tf.reshape(s, (batch_size, 1, 4, 4, 256)) + + # downscale v + v = self.conv0(v) + v = self.batchnorm[0](v) + v = self.relu[0](v) + + v = self.conv1(v) + v = self.batchnorm[1](v) + v = self.relu[1](v) + + v = self.conv2(v) + v = self.batchnorm[2](v) + v = self.relu[2](v) + + v = self.conv3(v) + v = self.batchnorm[3](v) + v = self.relu[3](v) + + v = self.conv4(v) + v = self.batchnorm[4](v) + v = self.relu[4](v) + + v = self.conv5(v) + v = self.batchnorm[5](v) + v = self.relu[5](v) + + v = self.conv6(v) + v = self.batchnorm[6](v) + v = self.relu[6](v) + + v = self.conv7(v) + v = self.batchnorm[7](v) + v = self.relu[7](v) + + v = self.conv8(v) + v = self.batchnorm[8](v) + v = self.relu[8](v) + + v = self.conv9(v) + v = self.batchnorm[9](v) + v = self.relu[9](v) + + # concat with s + v = tf.concat([v, s], axis=-1) + + v = self.conv10(v) + v = self.batchnorm[10](v) + v = self.relu[10](v) + + v = self.flatten(v) + v = self.dense2(v) + v = self.dense3(v) + + return v + + +# 0 < disc_vals < 1 +class FrameDiscriminator(tf.keras.layers.Layer): + def __init__(self): + super(FrameDiscriminator, self).__init__() + + # 3D conv used instead of 2D, to consider batch size i.e, more than 1 video at a time + # Time dimension technically not considered because stride is always = 1 and kernel size along time is always 1, so temporal relation is not taken into account + + self.dense1 = tf.keras.layers.Dense(4 * 4 * 256) + + # (64, 64, 8) + self.conv0 = tf.keras.layers.Conv3D(filters=8, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) 
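+        # conv0-conv5 form the trunk shared by the frame head (conv_frame) and the motion head
+        # (conv_motion) below; with (1, 3, 3) kernels and unit temporal stride they behave as
+        # per-frame 2D convolutions, so no temporal mixing happens before the two heads split.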
+ + # (32, 32, 16) + self.conv1 = tf.keras.layers.Conv3D(filters=16, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (16, 16, 32) + self.conv2 = tf.keras.layers.Conv3D(filters=32, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (8, 8, 64) + self.conv3 = tf.keras.layers.Conv3D(filters=64, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 128) + self.conv4 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 256) + self.conv5 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + ''' + # (32, 32, 512) + self.conv6 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (16, 16, 1024) + self.conv7 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (8, 8, 1024) + self.conv8 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + # (4, 4, 1024) + self.conv9 = tf.keras.layers.Conv3D(filters=1024, + kernel_size=(1, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + ''' + + # After concat (FRAME) + self.conv_frame = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + # After concat (MOTION) + self.conv_motion = tf.keras.layers.Conv3D(filters=512, + kernel_size=(1, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + + self.relu = [] + self.batchnorm = [] + for i in range(8): #12 + self.batchnorm.append(tf.keras.layers.BatchNormalization()) + self.relu.append(tf.keras.layers.Activation('relu')) + + self.frame_flatten = tf.keras.layers.Flatten() + + #self.dense_frame_1 = tf.keras.layers.Dense(128) + + self.dense_frame_2 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + self.motion_flatten = tf.keras.layers.Flatten() + + #self.dense_motion_1 = tf.keras.layers.Dense(128) + + self.dense_motion_2 = tf.keras.layers.Dense(1)#, activation='sigmoid') # output -> [0, 1] + + + def call(self, v, s): + if len(v.shape) != 5: + raise MatrixRankError("v must be a Rank 5 Tensor i.e, (batch_size, num_clips * T, H, W, C)") + if len(s.shape) != 2: + raise MatrixRankError("s must be a Rank 2 Tensor i.e, (batch_size, dim)") + + batch_size, t, h, w, c = v.shape + + # downscale v + v = self.conv0(v) + v = self.batchnorm[0](v) + v = self.relu[0](v) + + v = self.conv1(v) + v = self.batchnorm[1](v) + v = self.relu[1](v) + + v = self.conv2(v) + v = self.batchnorm[2](v) + v = self.relu[2](v) + + v = self.conv3(v) + v = self.batchnorm[3](v) + v = self.relu[3](v) + + v = self.conv4(v) + v = self.batchnorm[4](v) + v = self.relu[4](v) + + v = self.conv5(v) + v = self.batchnorm[5](v) + v = self.relu[5](v) + ''' + v = self.conv6(v) + v = self.leakyrelu[6](v) + print('7') + + v = self.conv7(v) + v = self.leakyrelu[7](v) + print('8') + + v = self.conv8(v) + v = self.leakyrelu[8](v) + print('9') + + v = self.conv9(v) + v = self.leakyrelu[9](v) # common output + print('10') + ''' + ## Take this output and work for temporal coherence + + ## Frame + # Sentence semantic space + s = self.dense1(s) + s = tf.reshape(s, (batch_size, 1, 4, 4, 256)) + frame_s = tf.repeat(s, repeats=t, axis=1) # time axis next to the batch because 't' frames + # 
Concat and out + frame_out = tf.concat([v, frame_s], axis=-1) + + frame_out = self.conv_frame(frame_out) + frame_out = self.batchnorm[6](frame_out) + frame_out = self.relu[6](frame_out) + # output 1 for each frame so reshape to (batch_size * t, h, w, c) + frame_out = tf.reshape(frame_out, (batch_size * frame_out.shape[1], frame_out.shape[2], frame_out.shape[3], frame_out.shape[4])) + + frame_out = self.frame_flatten(frame_out) # (bs*t, .., .., ..) + #frame_out = self.dense_frame_1(frame_out) # (bs*t, ..) + frame_out = self.dense_frame_2(frame_out) # (bs*t,) + + # reshaped to (batch_size, t) + frame_out = tf.reshape(frame_out, (batch_size, frame_out.shape[0] // batch_size)) # out + # for each frame of each batch, we get an output + + ## Motion + # Sentence Semantic space + motion_s = tf.repeat(s, repeats=t-1, axis=1) + motion_out = tf.subtract(v[:, 1:], v[:, :-1]) + + # Concat and out + motion_out = tf.concat([motion_out, motion_s], axis=-1) + + motion_out = self.conv_motion(motion_out) + motion_out = self.batchnorm[7](motion_out) + motion_out = self.relu[7](motion_out) + + # Scheme 2 (no norm) + motion_out = tf.reshape(motion_out, (batch_size * motion_out.shape[1], motion_out.shape[2], motion_out.shape[3], motion_out.shape[4])) + + motion_out = self.motion_flatten(motion_out) # (bs*(t-1), .., .., ..) + #motion_out = self.dense_motion_1(motion_out) # (bs*(t-1), ..) + motion_out = self.dense_motion_2(motion_out) # (bs*(t-1),) + + # (bs, t-1) + motion_out = tf.reshape(motion_out, (batch_size, motion_out.shape[0] // batch_size)) # out + + return frame_out, motion_out + + +# 0 < disc_vals < 1 +class Discriminator(tf.keras.Model): + def __init__(self): + super(Discriminator, self).__init__() + + self.video_discriminator = VideoDiscriminator() + self.frame_discriminator = FrameDiscriminator() + + def call(self, v, s): + video_disc_out = self.video_discriminator(v, s) + frame_disc_out, motion_disc_out = self.frame_discriminator(v, s) + + return video_disc_out, frame_disc_out, motion_disc_out + diff --git a/Machine_Learning/src/NLP/SignGAN/utils/errors.py b/Machine_Learning/src/NLP/SignGAN/utils/errors.py new file mode 100644 index 00000000..a68004da --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/errors.py @@ -0,0 +1,18 @@ +class Error(Exception): + # base class + pass + +class PaddingError(Error): + def __init__(self, msg): + self.msg = msg + super(PaddingError, self).__init__(self.msg) + +class MatrixRankError(Error): + def __init__(self, msg): + self.msg = msg + super(MatrixRankError, self).__init__(self.msg) + +class DivisionError(Error): + def __init__(self, msg): + self.msg = msg + super(DivisionError, self).__init__(self.msg) diff --git a/Machine_Learning/src/NLP/SignGAN/utils/generator.py b/Machine_Learning/src/NLP/SignGAN/utils/generator.py new file mode 100644 index 00000000..5a0df6b8 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/generator.py @@ -0,0 +1,184 @@ +import numpy as np +import tensorflow as tf +from .conv_attention import ConvAttn + +# '.' added to look outside the directory (else import error) +# We will feed 'z' from outside. 
If it is inside it'll stay constant and won't be random +class CDCGAN(tf.keras.layers.Layer): + def __init__(self): + super(CDCGAN, self).__init__() + + self.dense = tf.keras.layers.Dense(4 * 4 * 4 * 1024) + #self.layernorm = tf.keras.layers.LayerNormalization() + #self.leakyrelu = tf.keras.layers.LeakyReLU() + + self.reshape = tf.keras.layers.Reshape((4, 4, 4, 1024)) + + # (4, 4, 4, 512) + self.deconv1 = tf.keras.layers.Conv3DTranspose(filters=512, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False) + + self.layernorm1 = tf.keras.layers.LayerNormalization() + #self.leakyrelu1 = tf.keras.layers.LeakyReLU() + self.relu1 = tf.keras.layers.Activation('relu') + + # (8, 8, 8, 256) + self.deconv2 = tf.keras.layers.Conv3DTranspose(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + self.layernorm2 = tf.keras.layers.LayerNormalization() + #self.leakyrelu2 = tf.keras.layers.LeakyReLU() + self.relu2 = tf.keras.layers.Activation('relu') + + # attn -> (32, 8, 8, 128) + self.conv1 = tf.keras.layers.Conv3D(filters=128, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm1 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu1 = tf.keras.layers.LeakyReLU() + self.attnrelu1 = tf.keras.layers.Activation('relu') + + # attn -> (16, 8, 8, 256) + self.conv2 = tf.keras.layers.Conv3D(filters=256, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm2 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu2 = tf.keras.layers.LeakyReLU() + self.attnrelu2 = tf.keras.layers.Activation('relu') + + # attn -> (8, 8, 8, 512) + self.conv3 = tf.keras.layers.Conv3D(filters=512, + kernel_size=(3, 3, 3), + strides=(2, 1, 1), + padding='same', + use_bias=False) + + self.attnlayernorm3 = tf.keras.layers.LayerNormalization() + #self.attnleakyrelu3 = tf.keras.layers.LeakyReLU() + self.attnrelu3 = tf.keras.layers.Activation('relu') + + + # (8, 16, 16, 128) + self.deconv3 = tf.keras.layers.Conv3DTranspose(filters=128, + kernel_size=(3, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + self.layernorm3 = tf.keras.layers.LayerNormalization() + #self.leakyrelu3 = tf.keras.layers.LeakyReLU() + self.relu3 = tf.keras.layers.Activation('relu') + + # (16, 32, 32, 64) + self.deconv4 = tf.keras.layers.Conv3DTranspose(filters=64, + kernel_size=(3, 3, 3), + strides=(2, 2, 2), + padding='same', + use_bias=False) + + self.layernorm4 = tf.keras.layers.LayerNormalization() + #self.leakyrelu4 = tf.keras.layers.LeakyReLU() + self.relu4 = tf.keras.layers.Activation('relu') + + # (16, 64, 64, 32) + self.deconv5 = tf.keras.layers.Conv3DTranspose(filters=32, + kernel_size=(3, 3, 3), + strides=(1, 2, 2), + padding='same', + use_bias=False) + + self.layernorm5 = tf.keras.layers.LayerNormalization() + #self.leakyrelu5 = tf.keras.layers.LeakyReLU() + self.relu5 = tf.keras.layers.Activation('relu') + + # (16, 64, 64, 3) + self.deconv6 = tf.keras.layers.Conv3DTranspose(filters=3, + kernel_size=(3, 3, 3), + strides=(1, 1, 1), + padding='same', + use_bias=False, + activation='relu') # values > 0 + + + def call(self, z, conv_attn_output): + assert len(z.shape) == 2 # (1, 100) + + # same z repeated for all the clips + z = tf.repeat(z, repeats=conv_attn_output.shape[0], axis=0) + + z = self.dense(z) + #z = self.layernorm(z) + #z = self.leakyrelu(z) + + z = self.reshape(z) + + # Upscaling z + z = self.deconv1(z) + z = self.layernorm1(z) + z = self.relu1(z) + 
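+        # z is (num_clips, 4, 4, 4, 512) here; deconv2 upsamples it to (num_clips, 8, 8, 8, 256)
+        # so that it can be concatenated channel-wise with the downscaled attention output below.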
+ z = self.deconv2(z) + z = self.layernorm2(z) + z = self.relu2(z) # (32, 8, 8, 8, 256) + + # Attn out -> (32, 64, 8, 8, 64) + # Downscale Attention output -> (32, 8, 8, 8, 64) + conv_attn_output = self.conv1(conv_attn_output) + conv_attn_output = self.attnlayernorm1(conv_attn_output) + conv_attn_output = self.attnlayernorm1(conv_attn_output) + + conv_attn_output = self.conv2(conv_attn_output) + conv_attn_output = self.attnlayernorm2(conv_attn_output) + conv_attn_output = self.attnlayernorm2(conv_attn_output) + + conv_attn_output = self.conv3(conv_attn_output) + conv_attn_output = self.attnlayernorm3(conv_attn_output) + conv_attn_output = self.attnlayernorm3(conv_attn_output) + + # Concat condition (downscaled attention output) across channel dimension + z = tf.concat([z, conv_attn_output], axis=-1) + + # upconv to produce 40 clips with 40 as bs, thus generating the whole video + z = self.deconv3(z) + z = self.layernorm3(z) + z = self.relu3(z) + + z = self.deconv4(z) + z = self.layernorm4(z) + z = self.relu4(z) + + z = self.deconv5(z) + z = self.layernorm5(z) + z = self.relu5(z) + + z = self.deconv6(z) + + return z + + +class Generator(tf.keras.Model): + def __init__(self, num_attention_blocks=4, out_channels=64): + super(Generator, self).__init__() + + self.attention = ConvAttn(num_attention_blocks, out_channels) + self.cdcgan = CDCGAN() + + def call(self, x, bert_embeddings, z): + x = self.attention(x, bert_embeddings) + x = self.cdcgan(z, x) + #if np.isnan(np.sum(x)) == np.nan: + # print('-----CDCGAN-----') + + return x diff --git a/Machine_Learning/src/NLP/SignGAN/utils/losses.py b/Machine_Learning/src/NLP/SignGAN/utils/losses.py new file mode 100644 index 00000000..a808347e --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/losses.py @@ -0,0 +1,33 @@ +import tensorflow as tf + +# disc_video_real -> Disc out for Real video matching with the sentence [single value] +# disc_video_wrong -> Disc out wrong pair [single value] +# disc_video_fake -> Disc out for Generated video [single value] +# Mean of whole batch taken + +# Use BCE loss. Refer tf2-GANs +bce = tf.keras.losses.BinaryCrossentropy(from_logits=True) + +@tf.function +def video_loss(disc_video_real, disc_video_wrong, disc_video_fake): + return (bce(tf.ones_like(disc_video_real), disc_video_real) + bce(tf.zeros_like(disc_video_wrong), disc_video_wrong) + bce(tf.zeros_like(disc_video_fake), disc_video_fake)) / 3. + +# disc_frame_real -> Disc out for Real video frames matching with the sentence [Vector of values for each frame] +# disc_frame_wrong -> Disc out wrong pair frames [Vector of values for each frame] +# disc_frame_fake -> Disc out for Generated frames [Vector of values for each frame] +@tf.function +def frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake): + return (bce(tf.ones_like(disc_frame_real), disc_frame_real) + bce(tf.zeros_like(disc_frame_wrong), disc_frame_wrong) + bce(tf.zeros_like(disc_frame_fake), disc_frame_fake)) / 3. + +@tf.function +def motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake): # only for generator + return (bce(tf.ones_like(disc_motion_real), disc_motion_real) + bce(tf.zeros_like(disc_motion_wrong), disc_motion_wrong) + bce(tf.zeros_like(disc_motion_fake), disc_motion_fake)) / 3. + +@tf.function +def discriminator_loss(video_loss, frame_loss, motion_loss): + return (video_loss + frame_loss + motion_loss) / 3. 
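+
+# Matching-aware usage sketch (illustration only; mirrors the pairing used in train.py):
+#   disc_video_real,  disc_frame_real,  disc_motion_real  = discriminator(video_real,  s)  # target 1
+#   disc_video_wrong, disc_frame_wrong, disc_motion_wrong = discriminator(video_wrong, s)  # target 0
+#   disc_video_fake,  disc_frame_fake,  disc_motion_fake  = discriminator(video_fake,  s)  # target 0
+#   disc_loss = discriminator_loss(video_loss(disc_video_real, disc_video_wrong, disc_video_fake),
+#                                  frame_loss(disc_frame_real, disc_frame_wrong, disc_frame_fake),
+#                                  motion_loss(disc_motion_real, disc_motion_wrong, disc_motion_fake))
+#   gen_loss  = generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake)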
+ +# reduce_mean for batch +@tf.function +def generator_loss(disc_video_fake, disc_frame_fake, disc_motion_fake): + return (bce(tf.ones_like(disc_video_fake), disc_video_fake) + bce(tf.ones_like(disc_frame_fake), disc_frame_fake) + bce(tf.ones_like(disc_motion_fake), disc_motion_fake)) / 3. diff --git a/Machine_Learning/src/NLP/SignGAN/utils/video.py b/Machine_Learning/src/NLP/SignGAN/utils/video.py new file mode 100644 index 00000000..894e3f66 --- /dev/null +++ b/Machine_Learning/src/NLP/SignGAN/utils/video.py @@ -0,0 +1,94 @@ +import tensorflow as tf +import numpy as np +import glob +import cv2 + +class Video(object): + def __init__(self, T = 16, MAX_VIDEO_LENGTH = 640, FRAME_DIM = (64, 64, 3), VIDEO_DIM = (640, 64, 64, 3), + data_dir = 'phoenix-2014-T.v3/PHOENIX-2014-T-release-v3/PHOENIX-2014-T/features/fullFrame-210x260px/'): + self.T = T + self.MAX_VIDEO_LENGTH = MAX_VIDEO_LENGTH + self.FRAME_DIM = FRAME_DIM + self.VIDEO_DIM = VIDEO_DIM + self.data_dir = data_dir + + def get_video(self, set_name, name, resize=True, scale_down=True): + vid = [] + for frame in glob.glob(self.data_dir + set_name + '/' + name + '/*.png'): + vid_frame = cv2.imread(frame) + if resize: + vid_frame = cv2.resize(vid_frame, (self.FRAME_DIM[0], self.FRAME_DIM[1])) + if scale_down: + vid_frame = vid_frame / 255. # 0 < pixel values < 1, padding = 0 + vid.append(vid_frame) + return tf.convert_to_tensor(np.array(vid, np.float32)) + + def padding(self, video): + pad_length = self.MAX_VIDEO_LENGTH - video.shape[0] + pad = tf.zeros((pad_length, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=tf.float32) + return tf.concat([video, pad], 0) + + def preprocess_video(self, video): + start_token = tf.fill((self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 256./255) # start token -> 4d array of 0.9 + #print(start_token.shape) + end_token = tf.fill((self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 257./255) # end token -> 4d array of 2.1 + #print(end_token.shape) + extra_token = tf.fill((self.T - video.shape[0] % self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), 1.) 
# padding starts only from the nearest 10th, so until the nearest 10, array of 1's + #print(extra_token.shape) + #video = tf.concat([tf.concat([start_token, video], 0), extra_token], 0) + #video = tf.concat([tf.cast(start_token, tf.float32), tf.cast(video, tf.float32), tf.cast(extra_token, tf.float32), tf.cast(end_token, tf.float32)], 0) + video = tf.concat([start_token, video, extra_token, end_token], 0) + #print(video) + + #video = np.append(video, end_token, axis=0) + video = self.padding(video) + return video + + def divide_sequence(self, preprocessed_video): + return tf.reshape(preprocessed_video, (self.MAX_VIDEO_LENGTH//self.T, self.T, preprocessed_video.shape[1], preprocessed_video.shape[2], preprocessed_video.shape[3])) + #return np.array(np.array_split(preprocessed_video, self.MAX_VIDEO_LENGTH//self.T, axis=0)) + + ''' + # may not be needed + def padding_mask(self, current_sequence_length): # so that paddings are not treated as input + division = current_sequence_length // self.T + 2 # includes start_token so 2 + #print(division) + + mask = np.zeros((division, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float) + mask = np.append(mask, np.ones((self.MAX_VIDEO_LENGTH//self.T - division, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float), axis=0) + + return mask + + # may not be needed + def look_ahead_mask(self): + mask = np.zeros((1, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float) + mask = np.append(mask, np.ones((49, self.T, self.FRAME_DIM[0], self.FRAME_DIM[1], self.FRAME_DIM[2]), dtype=np.float), axis=0) + mask = np.expand_dims(mask, axis=0) + #print(mask.shape) + for i in range(0, self.MAX_VIDEO_LENGTH - 2 * self.T + 1, self.T): + mask = np.append(mask, np.expand_dims(self.padding_mask(i), axis=0), axis=0) + return mask + ''' + +''' +def main(): + video_obj = Video() + video = video_obj.get_video('train', '05January_2010_Tuesday_tagesschau-2664') + current_sequence_length = video.shape[0] + + video = video_obj.preprocess_video(video) + print(video.shape) + + video = video_obj.divide_sequence(video) + print(video.shape) + + pad_mask = video_obj.padding_mask(current_sequence_length) + print(pad_mask.shape) + + look_mask = video_obj.look_ahead_mask() + print(look_mask.shape) + + +if __name__ == '__main__': + main() +'''
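+
+'''
+# Rough tf.data input-pipeline sketch (an assumption, not wired into train.py): it wraps the
+# per-sample loading done in train.py's main() into a generator so preprocessing can overlap
+# training. The name make_dataset is illustrative; output_signature needs TF >= 2.4.
+def make_dataset(video_obj, video_names, texts, set_name='train'):
+    def gen():
+        for name, text in zip(video_names, texts):
+            video = video_obj.get_video(set_name, name)
+            video = video_obj.preprocess_video(video)
+            video = video_obj.divide_sequence(video)
+            yield video, text
+
+    clip_shape = (video_obj.MAX_VIDEO_LENGTH // video_obj.T, video_obj.T,
+                  video_obj.FRAME_DIM[0], video_obj.FRAME_DIM[1], video_obj.FRAME_DIM[2])
+    return tf.data.Dataset.from_generator(
+        gen,
+        output_signature=(tf.TensorSpec(shape=clip_shape, dtype=tf.float32),
+                          tf.TensorSpec(shape=(), dtype=tf.string))
+    ).prefetch(tf.data.AUTOTUNE)
+'''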