Video Classification in Keras, a couple of approaches -

Reading Time: 5 minutes

The purpose of this post is to summarize (with code) three approaches to video classification that I tested a couple of months ago for a personal challenge. As I was completely new to the domain, I googled around to see what the web had to offer for this task. The best resource, in terms of both conciseness and clarity, was by far this Medium post. The author proposes a high-level overview of five strategies:

1. Classifying one frame at a time with a ConvNet
2. Using a time-distributed ConvNet and passing the features to an RNN
3. Using a 3D convolutional network
4. Extracting features from each frame with a ConvNet and passing the sequence to a separate RNN
5. Extracting features from each frame with a ConvNet and passing the sequence to a separate MLP (a simple LogisticRegression classifier in my case)

I opted for three of these: the second, third, and fifth. Due to time constraints, I was not able to explore the remaining two, though the code I put together should be easy to extend to cover them.

Table of Contents

DataSet Class: data related helper functions

class DataSet():
    """Load video metadata from a bucket and serve preprocessed frames to Keras.

    ``bucket`` must expose ``objects.filter(Prefix=...)`` and
    ``Object(key).get()['Body'].read()`` (the calls made below).  Every CSV
    found under the prefix is merged into ``self.dataset``; ``self.classes``
    maps each distinct label to an integer index.
    """

    def __init__(self, bucket):
        self.bucket = bucket
        self.dataset = self.build_dataset()
        self.classes = self.get_classes()

    def build_dataset(self):
        """Merge every CSV index file in the bucket into one DataFrame.

        For each CSV, string cells and column names are stripped of
        surrounding whitespace, ``label`` is set to the third component of
        the CSV's key and ``path`` to the first three components joined by
        ``/`` followed by the row's ``entryurl`` value.

        Raises:
            ValueError: if no CSV file is found (previously this surfaced as
                an opaque AttributeError on the empty frame).
        """
        frames = []
        for file in self.bucket.objects.filter(Prefix='prefix'):
            if 'csv' not in file.key:
                continue
            what = file.key.split('/')
            body = self.bucket.Object(file.key).get()['Body'].read()
            df = pd.read_csv(io.BytesIO(body))
            df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
            df.columns = [col.strip() for col in df.columns]
            df['label'] = what[2]
            df['path'] = '/'.join(what[:3])
            frames.append(df)

        if not frames:
            raise ValueError("no CSV index files found under prefix 'prefix'")

        # A single concat replaces the per-file DataFrame.append, which
        # copied the whole frame on every iteration and no longer exists in
        # pandas 2.x.  Row indices are kept as-is, exactly like append did.
        dataset = pd.concat(frames)
        dataset['path'] = dataset['path'] + '/' + dataset['entryurl']
        return dataset

    def get_classes(self):
        """Return a ``{label: index}`` dict in order of first appearance."""
        return {tag: i for i, tag in enumerate(self.dataset['label'].unique())}

    def get_class_one_hot(self, class_str):
        """Return the one-hot vector for the class named ``class_str``.

        Raises:
            KeyError: if ``class_str`` is not a known label.
        """
        # Encode it first.
        label_encoded = self.classes[class_str]
        # Now one-hot it.
        label_hot = to_categorical(label_encoded, len(self.classes))
        assert len(label_hot) == len(self.classes)
        return label_hot

    def one_hot_encode(self):
        """Fit a LabelBinarizer on all labels (kept in ``self.lb``) and return the encoded matrix."""
        self.lb = preprocessing.LabelBinarizer()
        return self.lb.fit_transform(self.dataset['label'].values)

    @staticmethod
    def _preprocess_single(x, preprocess_input):
        """Apply a Keras ``preprocess_input`` to one image via a batch of size 1."""
        batch = np.expand_dims(x, axis=0)
        return preprocess_input(batch)[0]

    def preprocess_input_resnet50(self, x):
        """Preprocess a single image with the ResNet50 ``preprocess_input``."""
        from keras.applications.resnet50 import preprocess_input
        return self._preprocess_single(x, preprocess_input)

    def preprocess_input_vgg16(self, x):
        """Preprocess a single image with the VGG16 ``preprocess_input``."""
        from keras.applications.vgg16 import preprocess_input
        return self._preprocess_single(x, preprocess_input)

    def preprocess_input_inception(self, x):
        """Preprocess a single image with the InceptionV3 ``preprocess_input``."""
        from keras.applications.inception_v3 import preprocess_input
        return self._preprocess_single(x, preprocess_input)

    def preprocess_input_densenet(self, x):
        """Preprocess a single image with the DenseNet ``preprocess_input``."""
        from keras.applications.densenet import preprocess_input
        return self._preprocess_single(x, preprocess_input)

    def preprocess_standard(self, x):
        """Scale pixel values by 1/255."""
        return x / 255.

    def _get_preprocessing_function(self, which_net):
        """Map a network name to its preprocessing method.

        Raises:
            ValueError: if ``which_net`` is not one of the supported names.
        """
        functions = {
            'resnet50': self.preprocess_input_resnet50,
            'densenet': self.preprocess_input_densenet,
            'inception': self.preprocess_input_inception,
            'vgg': self.preprocess_input_vgg16,
            'standard': self.preprocess_standard,
        }
        if which_net not in functions:
            raise ValueError(
                "unknown which_net {!r}; expected one of {}".format(
                    which_net, sorted(functions)))
        return functions[which_net]

    def prepare_image(self, img, size, preprocessing_function, aug=False):
        """Resize one frame, optionally augment it, then preprocess it.

        Args:
            img: the frame to process.
            size: target size passed to the resize call.
            preprocessing_function: callable applied to the float64 frame.
            aug: when True, apply one of the 7 ``augment`` choices at random.
        """
        # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3, so this
        # line needs an older SciPy; a replacement should be validated against
        # the features already extracted before swapping it in.
        img = scipy.misc.imresize(img, size)
        img = np.array(img).astype(np.float64)
        if aug:
            # Was a bare ``augment(...)`` call, i.e. a NameError when aug=True.
            img = self.augment(img, np.random.randint(7))
        return preprocessing_function(img)

    def augment(self, src, choice):
        """Return ``src`` transformed according to ``choice`` (0-6).

        np.rot90 rotates counter-clockwise, so k=1 is 90 degrees
        counter-clockwise and k=3 is 90 degrees clockwise.  Any other
        ``choice`` (including 6) returns ``src`` unchanged.
        """
        if choice == 0:
            # Rotate 90 counter-clockwise
            src = np.rot90(src, 1)
        elif choice == 1:
            # Flip vertically
            src = np.flipud(src)
        elif choice == 2:
            # Rotate 180
            src = np.rot90(src, 2)
        elif choice == 3:
            # Flip horizontally
            src = np.fliplr(src)
        elif choice == 4:
            # Rotate 90 clockwise
            src = np.rot90(src, 3)
        elif choice == 5:
            # Rotate 180 and flip horizontally
            src = np.fliplr(np.rot90(src, 2))
        return src

    def process_video(self, filepath, size, preprocessing_function, n_frames=20):
        """Download a video and return ``n_frames`` evenly spaced, prepared frames.

        Args:
            filepath: key of the video object inside the bucket.
            size: target frame size forwarded to ``prepare_image``.
            preprocessing_function: forwarded to ``prepare_image``.
            n_frames: number of frames to sample (default 20, the value that
                was previously hard-coded).

        Raises:
            ValueError: if the video has fewer than ``n_frames`` frames
                (previously a ``range()`` step-of-zero error).
        """
        video = self.bucket.Object(filepath)
        payload = io.BytesIO(video.get()['Body'].read())
        # The context manager closes the reader once the frames are extracted.
        with imageio.get_reader(payload, 'ffmpeg') as vid:
            # NOTE(review): assumes the reader reports a finite integer
            # 'nframes' in its metadata - confirm for the imageio version used.
            nframes = vid.get_meta_data()['nframes']
            block = nframes // n_frames
            if block < 1:
                raise ValueError(
                    "video {!r} has {} frames, fewer than the {} required".format(
                        filepath, nframes, n_frames))
            return [
                self.prepare_image(vid.get_data(frame), size, preprocessing_function)
                for frame in range(0, block * n_frames, block)
            ]

    def split_train_test(self):
        """Return a stratified 85/15 split of the dataset and its labels (seed 42)."""
        labels = self.dataset['label']
        return train_test_split(self.dataset, labels, test_size=0.15,
                                random_state=42, stratify=labels)

    def save_bottleneck_features(self, size, which_net):
        """Extract pooled ConvNet features for every video and save them.

        Each video's frames are run through the headless network followed by
        global average pooling, flattened into one row, stacked into an array,
        written to ``<which_net>.npy`` and returned.

        Raises:
            ValueError: if ``which_net`` is not ``'resnet50'``, the only
                network wired up here (previously an unbound-variable error).
        """
        if which_net != 'resnet50':
            raise ValueError(
                "save_bottleneck_features only supports 'resnet50', got {!r}".format(
                    which_net))
        preprocessing_function = self.preprocess_input_resnet50
        base_model = ResNet50(weights='imagenet', include_top=False)

        x = GlobalAveragePooling2D()(base_model.output)
        # Keras 2 spells these keywords ``inputs``/``outputs``.
        model = Model(inputs=base_model.input, outputs=x)

        fin = []
        for path in self.dataset['path']:
            v = np.array(self.process_video(path, size, preprocessing_function))
            # One row per video: all frame features concatenated.
            p = np.squeeze(model.predict_on_batch(v).reshape(1, -1))
            fin.append(p)

        fin = np.array(fin)
        np.save(which_net + '.npy', fin)

        return fin

    def data_generator(self, data, which_net, size=(224,224), batch_size=32):
        """Yield ``(x_batch, y_batch)`` arrays forever, cycling over ``data``.

        Args:
            data: DataFrame with ``path`` and ``label`` columns.
            which_net: name selecting the preprocessing function.
            size: target frame size.
            batch_size: number of videos per batch (the last batch of each
                pass may be smaller).

        Raises:
            ValueError: on the first ``next()`` if ``which_net`` is unknown or
                ``data`` is empty (an empty frame used to spin forever
                without yielding).
        """
        preprocessing_function = self._get_preprocessing_function(which_net)
        if len(data) == 0:
            raise ValueError("data_generator needs at least one row of data")

        while True:
            for start in range(0, len(data), batch_size):
                x_batch = []
                y_batch = []
                end = min(start + batch_size, len(data))
                data_batch = data[start:end]
                for filepath, tag in data_batch.loc[:, ['path', 'label']].values:
                    processed_video = self.process_video(filepath, size, preprocessing_function)
                    x_batch.append(processed_video)
                    y_batch.append(self.get_class_one_hot(tag))
                yield np.array(x_batch), np.array(y_batch)

2. TimeDistributed CNN + LSTM

# Build the dataset index and hold out a stratified test split.
data = DataSet(bucket)
X_train, X_test, y_train, y_test = data.split_train_test()

# The same small ConvNet is applied to each of the 20 frames
# (input is 20 frames of 224x224 RGB); the per-frame feature vectors
# are then fed, in order, to an LSTM.
model = Sequential()
model.add(TimeDistributed(Conv2D(32, (7, 7), strides=(2, 2), activation='relu', padding='same'), input_shape=(20, 224, 224, 3)))
model.add(TimeDistributed(Conv2D(32, (3,3), kernel_initializer="he_normal", activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

model.add(TimeDistributed(Conv2D(64, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(64, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

model.add(TimeDistributed(Conv2D(128, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(128, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

model.add(TimeDistributed(Conv2D(256, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(256, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

model.add(TimeDistributed(Conv2D(512, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(512, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

model.add(TimeDistributed(Flatten()))

model.add(Dropout(0.5))
model.add(LSTM(256, return_sequences=False, dropout=0.5))
# One output unit per class: the generator one-hot encodes labels to
# len(data.classes), so the layer width must follow it rather than a literal.
model.add(Dense(len(data.classes), activation='softmax'))
model.summary()

from keras.utils import multi_gpu_model

parallel_model = multi_gpu_model(model, gpus=4)
parallel_model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics = ['accuracy'])

batch_size = 16
epochs = 10
size = (224, 224)

# Steps must be whole numbers; round up so the final, smaller batch of each
# pass over the data is still consumed within the epoch.
train_steps = int(math.ceil(len(X_train) / float(batch_size)))
valid_steps = int(math.ceil(len(X_test) / float(batch_size)))

parallel_model.fit_generator(data.data_generator(X_train, 'standard', size=size, batch_size=batch_size),
                    train_steps, epochs=epochs, verbose=5,
                    validation_data=data.data_generator(X_test, 'standard', size=size, batch_size=batch_size),
                    validation_steps=valid_steps)

3. 3D Convolutional Network

# 3D ConvNet over the whole clip (20 frames of 224x224 RGB).  Pooling uses a
# window of 1 along the first (frame) axis, so only the two spatial axes are
# down-sampled by the pooling layers.
model = Sequential()
model.add(Conv3D(32, (3,3,3), activation='relu', input_shape=(20, 224, 224, 3)))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(64, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(128, (3,3,3), activation='relu'))
model.add(Conv3D(128, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(256, (2,2,2), activation='relu'))
model.add(Conv3D(256, (2,2,2), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Flatten())
# Dense defaults to a linear activation; without a non-linearity the two
# 1024-unit layers would collapse into a single linear map.
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
# `data` is the DataSet instance created in the previous section; size the
# output layer from its classes instead of a hard-coded 6.
model.add(Dense(len(data.classes), activation='softmax'))

model.summary()

The training code is the same as in the previous section.

5. Extract frame-level Conv Features + LogisticRegression

# Extract one row of pooled ResNet50 features per video
# (all sampled frames concatenated), plus binarized labels.
size = (224, 224)
res_feat = data.save_bottleneck_features(size, 'resnet50')

lab = data.one_hot_encode()

X_train, X_test, y_train, y_test = train_test_split(res_feat, lab, test_size=0.2, random_state=4)

from sklearn.decomposition import PCA

# Each row holds n_frames equally sized per-frame feature blocks.  The block
# width is derived from the data (2048 with the features used here) instead
# of being hard-coded together with the train/test row counts.
n_frames = 20
feats_per_frame = X_train.shape[1] // n_frames

train_parts = []
test_parts = []

for i in range(n_frames):
    lo = feats_per_frame * i
    hi = feats_per_frame * (i + 1)

    # A separate PCA per frame position, fitted on the training rows only.
    pca = PCA(n_components=1024)
    train_parts.append(pca.fit_transform(X_train[:, lo:hi]))
    test_parts.append(pca.transform(X_test[:, lo:hi]))

# Stack once at the end rather than re-allocating with np.append every step.
array_train = np.hstack(train_parts)
array_test = np.hstack(test_parts)

clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf.fit(array_train, y_train)

Appendix: Imports

import boto3
from boto3 import client
import collections
import pandas as pd
import io
import base64
from IPython.display import HTML
import pylab
import imageio
import scipy.misc
import math
import numpy as np
from random import shuffle
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
from mpl_toolkits.axes_grid1 import ImageGrid

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

import keras
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD
from keras.applications.inception_v3 import InceptionV3
from keras.applications.densenet import DenseNet121
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, Callback, CSVLogger
from keras.preprocessing.image import ImageDataGenerator, array_to_img
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50
from keras.applications.xception import Xception
from keras.utils import to_categorical
from keras.layers import TimeDistributed, Input, MaxPooling3D, LSTM, Dense, Conv3D, Conv2D, BatchNormalization, Flatten, Dropout, Convolution2D, Activation, MaxPooling2D, GlobalAveragePooling2D
from keras import losses

I will definitely get back to this codebase in the future, but I hope it will serve as a useful reference for others as well!