The purpose of this post is to summarize (with code) three approaches to video classification that I tested a couple of months ago for a personal challenge. As I was completely new to the domain, I googled around to see what the web had to offer on this task. The best resource, in terms of both conciseness and clarity, was by far this Medium post. The author proposes a high-level overview of five strategies:
- Classifying one frame at a time with a ConvNet
- Using a time-distributed ConvNet and passing the features to an RNN
- Using a 3D convolutional network
- Extracting features from each frame with a ConvNet and passing the sequence to a separate RNN
- Extracting features from each frame with a ConvNet and passing the sequence to a separate MLP (a simple LogisticRegression classifier in my case)
I opted for three of these: the 2nd, 3rd, and 5th in the list above. Due to time constraints, I was not able to explore the remaining two, though the code I put together should be easy to extend to cover them.
DataSet Class: data related helper functions
class DataSet():
    """Index of labelled videos stored in an S3 bucket, plus loading helpers.

    On construction the bucket is scanned for CSV index files, which are
    merged into ``self.dataset`` (one row per video, with ``label`` and
    ``path`` columns), and ``self.classes`` maps each label to an integer id.
    """

    def __init__(self, bucket):
        self.bucket = bucket
        self.dataset = self.build_dataset()
        self.classes = self.get_classes()

    def build_dataset(self):
        """Read every CSV index under the bucket prefix into one DataFrame.

        For each object whose key contains ``'csv'``, the third ``/``-separated
        key component becomes the ``label`` and the first three components
        become the base ``path``; the CSV's ``entryurl`` column is then
        appended to ``path``.

        Raises:
            ValueError: if no CSV index file is found.
        """
        frames = []
        # NOTE(review): 'prefix' looks like a placeholder key prefix - confirm.
        for entry in self.bucket.objects.filter(Prefix='prefix'):
            if 'csv' not in entry.key:
                continue
            what = entry.key.split('/')
            remote = self.bucket.Object(entry.key)
            df = pd.read_csv(io.BytesIO(remote.get()['Body'].read()))
            # Strip stray whitespace from string cells and column headers.
            df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
            df.columns = [col.strip() for col in df.columns]
            df['label'] = what[2]
            df['path'] = '/'.join(what[:3])
            frames.append(df)
        if not frames:
            raise ValueError("no CSV index files found under the bucket prefix")
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        dataset = pd.concat(frames)
        dataset['path'] = dataset['path'] + '/' + dataset['entryurl']
        return dataset

    def get_classes(self):
        """Return ``{label: index}`` in order of first appearance."""
        return {tag: i for i, tag in enumerate(self.dataset.label.unique())}

    def get_class_one_hot(self, class_str):
        """Return the one-hot vector for the class named ``class_str``."""
        label_encoded = self.classes[class_str]
        label_hot = to_categorical(label_encoded, len(self.classes))
        assert len(label_hot) == len(self.classes)
        return label_hot

    def one_hot_encode(self):
        """Binarize all dataset labels; the fitted encoder is kept in ``self.lb``."""
        self.lb = preprocessing.LabelBinarizer()
        return self.lb.fit_transform(self.dataset.label.values)

    @staticmethod
    def _preprocess_single(preprocess_input, x):
        """Apply a batch-oriented Keras ``preprocess_input`` to one image."""
        batch = np.expand_dims(x, axis=0)
        return preprocess_input(batch)[0]

    def preprocess_input_resnet50(self, x):
        """Preprocess one image with the ResNet50 ``preprocess_input``."""
        from keras.applications.resnet50 import preprocess_input
        return self._preprocess_single(preprocess_input, x)

    def preprocess_input_vgg16(self, x):
        """Preprocess one image with the VGG16 ``preprocess_input``."""
        from keras.applications.vgg16 import preprocess_input
        return self._preprocess_single(preprocess_input, x)

    def preprocess_input_inception(self, x):
        """Preprocess one image with the InceptionV3 ``preprocess_input``."""
        from keras.applications.inception_v3 import preprocess_input
        return self._preprocess_single(preprocess_input, x)

    def preprocess_input_densenet(self, x):
        """Preprocess one image with the DenseNet ``preprocess_input``."""
        from keras.applications.densenet import preprocess_input
        return self._preprocess_single(preprocess_input, x)

    def preprocess_standard(self, x):
        """Scale pixel values by 1/255."""
        return x / 255.

    def _preprocessing_for(self, which_net):
        """Return the preprocessing method registered for ``which_net``.

        Raises:
            ValueError: if ``which_net`` is not a known network name.
        """
        table = {
            'resnet50': self.preprocess_input_resnet50,
            'densenet': self.preprocess_input_densenet,
            'inception': self.preprocess_input_inception,
            'vgg': self.preprocess_input_vgg16,
            'standard': self.preprocess_standard,
        }
        if which_net not in table:
            raise ValueError(
                "unknown which_net {!r}; expected one of {}".format(
                    which_net, sorted(table)))
        return table[which_net]

    def prepare_image(self, img, size, preprocessing_function, aug=False):
        """Resize one frame, optionally augment it, then preprocess it.

        Args:
            img: frame as an array-like image.
            size: target size passed to the resize call.
            preprocessing_function: callable applied to the float64 image.
            aug: when True, apply one randomly chosen ``augment`` transform.
        """
        # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3; this
        # call needs an older SciPy or a replacement resize routine.
        img = scipy.misc.imresize(img, size)
        img = np.array(img).astype(np.float64)
        if aug:
            # ``augment`` is a method: the bare-name call raised NameError.
            img = self.augment(img, np.random.randint(7))
        return preprocessing_function(img)

    def augment(self, src, choice):
        """Apply the flip/rotation selected by ``choice`` (0-6) to ``src``.

        Any other ``choice`` value returns ``src`` unchanged.
        """
        if choice == 0:
            # Rotate 90 degrees counter-clockwise.
            src = np.rot90(src, 1)
        elif choice == 1:
            # Flip vertically.
            src = np.flipud(src)
        elif choice == 2:
            # Rotate 180 degrees.
            src = np.rot90(src, 2)
        elif choice == 3:
            # Flip horizontally.
            src = np.fliplr(src)
        elif choice == 4:
            # Three counter-clockwise quarter turns, i.e. 90 degrees clockwise.
            src = np.rot90(src, 3)
        elif choice == 5:
            # Rotate 180 degrees, then flip horizontally.
            src = np.fliplr(np.rot90(src, 2))
        # choice == 6 (or anything else): leave the image as is.
        return src

    @staticmethod
    def _frame_indices(nframes, how):
        """Return ``how`` evenly spaced frame indices for an ``nframes`` video.

        Raises:
            ValueError: if the video is too short to supply ``how`` frames.
        """
        block = nframes // how
        # ``not block >= 1`` also rejects a NaN stride.
        if not block >= 1:
            raise ValueError(
                "cannot sample {} frames from a video reporting {} frames".format(
                    how, nframes))
        return list(range(0, block * how, block))

    def process_video(self, filepath, size, preprocessing_function, n_frames=20):
        """Download a video and return ``n_frames`` prepared, evenly spaced frames.

        Args:
            filepath: object key of the video inside the bucket.
            size: target frame size forwarded to ``prepare_image``.
            preprocessing_function: per-frame preprocessing callable.
            n_frames: number of frames to sample (default 20, the sequence
                length the models in this file expect).

        Raises:
            ValueError: if the video has fewer than ``n_frames`` frames.
        """
        video = self.bucket.Object(filepath)
        vid = imageio.get_reader(io.BytesIO(video.get()['Body'].read()), 'ffmpeg')
        try:
            # NOTE(review): some imageio versions may report a non-finite
            # 'nframes' for ffmpeg streams - confirm for the version in use.
            nframes = vid.get_meta_data()['nframes']
            return [self.prepare_image(vid.get_data(frame), size, preprocessing_function)
                    for frame in self._frame_indices(nframes, n_frames)]
        finally:
            # Release the reader even if decoding fails.
            vid.close()

    def split_train_test(self):
        """Return a stratified 85/15 ``train_test_split`` of the dataset."""
        return train_test_split(self.dataset, self.dataset.label,
                                test_size=0.15, random_state=42,
                                stratify=self.dataset.label)

    def save_bottleneck_features(self, size, which_net):
        """Extract pooled ConvNet features for every video and save them.

        Each video's sampled frames go through the ImageNet-pretrained network
        with global average pooling; the per-frame features are flattened into
        one row. The resulting matrix is written to ``<which_net>.npy``.

        Raises:
            ValueError: if ``which_net`` is not ``'resnet50'``, the only
                backbone implemented here.
        """
        if which_net != 'resnet50':
            raise ValueError(
                "bottleneck features are only implemented for 'resnet50', "
                "got {!r}".format(which_net))
        preprocessing_function = self.preprocess_input_resnet50
        base_model = ResNet50(weights='imagenet', include_top=False)
        pooled = GlobalAveragePooling2D()(base_model.output)
        # Keras 2 keyword names (the Keras 1 spelling was input=/output=).
        model = Model(inputs=base_model.input, outputs=pooled)
        fin = []
        for path in self.dataset.path:
            v = np.array(self.process_video(path, size, preprocessing_function))
            fin.append(model.predict_on_batch(v).reshape(-1))
        fin = np.array(fin)
        np.save(which_net + '.npy', fin)
        return fin

    def data_generator(self, data, which_net, size=(224,224), batch_size=32):
        """Return an endless generator of ``(x_batch, y_batch)`` arrays.

        Args:
            data: DataFrame with ``path`` and ``label`` columns.
            which_net: preprocessing to use ('resnet50', 'densenet',
                'inception', 'vgg' or 'standard').
            size: target frame size.
            batch_size: maximum number of videos per batch.

        Raises:
            ValueError: for an unknown ``which_net`` or empty ``data``; both
                are checked up front rather than on first iteration.
        """
        preprocessing_function = self._preprocessing_for(which_net)
        if len(data) == 0:
            # An empty frame would loop forever without yielding.
            raise ValueError("data is empty; nothing to generate batches from")
        return self._batch_stream(data, preprocessing_function, size, batch_size)

    def _batch_stream(self, data, preprocessing_function, size, batch_size):
        """Cycle over ``data`` forever, yielding one batch at a time."""
        while True:
            for start in range(0, len(data), batch_size):
                end = min(start + batch_size, len(data))
                data_batch = data[start:end]
                x_batch = []
                y_batch = []
                for filepath, tag in data_batch.loc[:, ['path', 'label']].values:
                    x_batch.append(self.process_video(filepath, size, preprocessing_function))
                    y_batch.append(self.get_class_one_hot(tag))
                yield np.array(x_batch), np.array(y_batch)
2. TimeDistributed CNN + LSTM
# Load the dataset index and hold out a stratified test split.
data = DataSet(bucket)
X_train, X_test, y_train, y_test = data.split_train_test()

# Size the output layer from the data so it always matches the one-hot
# labels produced by DataSet.get_class_one_hot (previously hard-coded to 6).
n_classes = len(data.classes)

# The same small ConvNet is applied to each of the 20 frames of a clip.
model = Sequential()
model.add(TimeDistributed(Conv2D(32, (7, 7), strides=(2, 2), activation='relu', padding='same'),
                          input_shape=(20, 224, 224, 3)))
model.add(TimeDistributed(Conv2D(32, (3, 3), kernel_initializer="he_normal", activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

# Four identical stages: two 3x3 'same' convolutions followed by 2x2 pooling.
for filters in (64, 128, 256, 512):
    for _ in range(2):
        model.add(TimeDistributed(Conv2D(filters, (3, 3), padding='same', activation='relu')))
    model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))

# Flatten the per-frame feature maps and let the LSTM summarize the sequence.
model.add(TimeDistributed(Flatten()))
model.add(Dropout(0.5))
model.add(LSTM(256, return_sequences=False, dropout=0.5))
model.add(Dense(n_classes, activation='softmax'))
model.summary()
from keras.utils import multi_gpu_model

# Replicate the model across 4 GPUs and compile the parallel wrapper.
parallel_model = multi_gpu_model(model, gpus=4)
parallel_model.compile(optimizer=Adam(lr=0.0001),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

batch_size = 16
epochs = 10
size = (224, 224)

# Step counts must be integers. Ceil division makes one epoch cover exactly
# the batches the generator yields per pass, including the final partial one.
train_steps = (len(X_train) + batch_size - 1) // batch_size
valid_steps = (len(X_test) + batch_size - 1) // batch_size

parallel_model.fit_generator(
    data.data_generator(X_train, 'standard', size=size, batch_size=batch_size),
    train_steps,
    epochs=epochs,
    verbose=2,  # documented values are 0, 1 and 2; 2 logs one line per epoch
    validation_data=data.data_generator(X_test, 'standard', size=size, batch_size=batch_size),
    validation_steps=valid_steps)
3. 3D Convolutional Network
# Size the output layer from the dataset built earlier so it matches the
# one-hot labels produced by the generator (previously hard-coded to 6).
n_classes = len(data.classes)

# 3D ConvNet over clips of 20 frames at 224x224x3. Pooling acts only on the
# two spatial axes (pool size 1 along the frame axis).
model = Sequential()
model.add(Conv3D(32, (3, 3, 3), activation='relu', input_shape=(20, 224, 224, 3)))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Conv3D(64, (3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Conv3D(128, (3, 3, 3), activation='relu'))
model.add(Conv3D(128, (3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Conv3D(256, (2, 2, 2), activation='relu'))
model.add(Conv3D(256, (2, 2, 2), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

# Classifier head.
# NOTE(review): the two Dense(1024) layers have no activation, so they are
# linear; confirm whether activation='relu' was intended.
model.add(Flatten())
for _ in range(2):
    model.add(Dense(1024))
    model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))
model.summary()
The training code is the same as in the previous section.
5. Extract frame-level Conv Features + LogisticRegression
from sklearn.decomposition import PCA

# Extract (and cache to disk) pooled ResNet50 features: one row per video,
# with the per-frame feature vectors concatenated along the columns.
size = (224, 224)
res_feat = data.save_bottleneck_features(size, 'resnet50')
lab = data.one_hot_encode()
X_train, X_test, y_train, y_test = train_test_split(res_feat, lab, test_size=0.2, random_state=4)

# Derive the layout from the data instead of hard-coding row counts and the
# per-frame feature width (2048 for the ResNet50 features used here).
n_frames = 20
feat_dim = res_feat.shape[1] // n_frames

# Reduce each frame's features to 1024 components with its own PCA, fitted
# on the training split only, then stitch the frames back together.
train_parts = []
test_parts = []
for i in range(n_frames):
    cols = slice(feat_dim * i, feat_dim * (i + 1))
    pca = PCA(n_components=1024)
    train_parts.append(pca.fit_transform(X_train[:, cols]))
    test_parts.append(pca.transform(X_test[:, cols]))
array_train = np.hstack(train_parts)
array_test = np.hstack(test_parts)

clf = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
clf.fit(array_train, y_train)
Appendix: Imports
import boto3 from boto3 import client import collections import pandas as pd import io import base64 from IPython.display import HTML import pylab import imageio import scipy.misc import math import numpy as np from random import shuffle import os import sys import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import pickle from mpl_toolkits.axes_grid1 import ImageGrid from sklearn.model_selection import KFold, StratifiedKFold, train_test_split from sklearn import metrics from sklearn import preprocessing from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import LogisticRegression import keras from keras.models import Model, Sequential from keras.optimizers import Adam, SGD from keras.applications.inception_v3 import InceptionV3 from keras.applications.densenet import DenseNet121 from keras.preprocessing import image from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, Callback, CSVLogger from keras.preprocessing.image import ImageDataGenerator, array_to_img from keras.applications.vgg16 import VGG16 from keras.applications.resnet50 import ResNet50 from keras.applications.xception import Xception from keras.utils import to_categorical from keras.layers import TimeDistributed, Input, MaxPooling3D, LSTM, Dense, Conv3D, Conv2D, BatchNormalization, Flatten, Dropout, Convolution2D, Activation, MaxPooling2D, GlobalAveragePooling2D from keras import losses
I will definitely come back to this codebase in the future, and I hope it will serve as a useful reference for others as well!