import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from tqdm import tqdm,trange
import sklearn.metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchcv.model_provider import get_model as ptcv_get_model
import warnings
warnings.filterwarnings("ignore")
!mkdir api_dir
from os import path
import mlrun
# Set the base project name
project_name_base = 'deepfake-shield'
# Initialize the MLRun environment and save the project name and artifacts path.
# NOTE(review): user_project=True presumably suffixes the project name with the
# current username (the output below shows 'deepfake-shield-mayukh') — confirm
# against the mlrun docs. api_path/artifact_path are local directories, so this
# runs without a remote MLRun service.
project_name, artifact_path = mlrun.set_environment(project=project_name_base,
                                                    user_project=True, api_path = './api_dir', artifact_path = './artifact_dir')
# Display the current project name and artifacts path
print(f'Project name: {project_name}')
print(f'Artifacts path: {artifact_path}')
Project name: deepfake-shield-mayukh Artifacts path: /media/mayukh/Data/storage/repositories/repos/deep-shield-temp/notebooks/artifact_dir
We've used a modified version of the deepfake-detection-challenge dataset from Kaggle, which was part of a competition over a year ago.
This dataset aims to help train models that can determine whether a given face is a deepfake (1) or not (0).
# CSVs listing the per-chunk JSON metadata files for train / validation.
filenames = pd.read_csv('filenames_train.csv').filenames.values
filenames_val = pd.read_csv('filenames_val.csv').filenames.values
# Load every metadata JSON (one DataFrame per DeepFake chunk), with progress bars.
df_trains = [pd.read_json(f) for f in tqdm(filenames)]
df_vals = [pd.read_json(f) for f in tqdm(filenames_val)]
100%|██████████| 47/47 [00:17<00:00, 2.72it/s] 100%|██████████| 3/3 [00:01<00:00, 2.56it/s]
# Chunk indices paired with df_trains below. NOTE(review): this has
# len(df_trains)+1 entries, but the zip() that consumes it truncates to
# len(df_trains), so the extra trailing index is never used.
nums = list(range(len(df_trains)+1))
# Label order fixes the integer encoding used everywhere below: REAL -> 0, FAKE -> 1.
LABELS = ['REAL','FAKE']
# Folder numbers of the validation chunks (DeepFake47..DeepFake49).
val_nums=[47, 48, 49]
def get_path(num, x):
    """Return the on-disk .jpg path for video metadata entry *x* in chunk *num*.

    Parameters
    ----------
    num : int or str
        DeepFake chunk number; zero-padded to two digits in the folder name
        (e.g. 7 -> 'DeepFake07', 47 -> 'DeepFake47').
    x : str
        Video filename from the metadata (e.g. 'abc.mp4'); the extracted face
        image is stored as 'abc.jpg'.

    Raises
    ------
    FileNotFoundError
        If the image file does not exist (FileNotFoundError is a subclass of
        Exception, so existing `except Exception` callers still catch it, and
        the message now says which path was missing).
    """
    # zfill replaces the original duplicated if/else padding branches.
    folder = 'DeepFake' + str(num).zfill(2)
    path = f'training_data/archive/{folder}/{folder}/' + x.replace('.mp4', '') + '.jpg'
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    return path
# Build aligned lists of training image paths and 0/1 labels.
paths=[]
y=[]
for df_train, num in tqdm(zip(df_trains, nums), total=len(df_trains)):
    for x in list(df_train.columns.values):
        # Best-effort: skip frames whose image file is missing on disk or
        # whose label is not in LABELS.
        try:
            p = get_path(num, x)
            label = LABELS.index(df_train[x]['label'])
        except Exception:
            continue
        # BUG FIX: append only after BOTH lookups succeed. The original
        # appended the path first, so a failing label lookup inside the try
        # left `paths` one entry longer than `y` (silent misalignment).
        paths.append(p)
        y.append(label)
100%|██████████| 47/47 [00:34<00:00, 1.34it/s]
# Build aligned lists of validation image paths and 0/1 labels.
val_paths=[]
val_y=[]
for df_val, num in tqdm(zip(df_vals, val_nums), total=len(df_vals)):
    for x in list(df_val.columns.values):
        # Best-effort: skip frames with a missing image or unknown label.
        try:
            p = get_path(num, x)
            label = LABELS.index(df_val[x]['label'])
        except Exception:
            continue
        # BUG FIX: append as a pair only after both lookups succeed, keeping
        # val_paths and val_y aligned (the original could desync them when
        # the label lookup raised after the path had been appended).
        val_paths.append(p)
        val_y.append(label)
100%|██████████| 3/3 [00:01<00:00, 2.73it/s]
# Persist the assembled (path, label) tables; the MLRun data-prep step below
# consumes these CSVs as its inputs.
df_train = pd.DataFrame({'paths': paths, 'labels': y})
df_val = pd.DataFrame({'paths': val_paths, 'labels': val_y})
df_train.to_csv('df_train.csv', index=False)
df_val.to_csv('df_val.csv', index=False)
As seen below, the number of images belonging to the FAKE class is much higher than that of REAL. A quick way to fix that is to use another famous face dataset to increase the number of REAL images.
# mlrun: start-code
import matplotlib.pyplot as plt
import numpy as np
from mlrun.artifacts import PlotArtifact
# https://github.com/mlrun/mlrun/blob/2f707f068f058794f2cbec2e802766a09e483e91/mlrun/datastore/base.py#L219
from mlrun.datastore import DataItem
def prep_data(context, source_train, source_val):
    """MLRun handler: log the train/val CSVs as datasets plus a class-balance plot.

    Parameters
    ----------
    context : MLRun run context, used to log results, datasets, and artifacts.
    source_train, source_val : mlrun DataItems wrapping df_train.csv / df_val.csv.
    """
    '''
    prep training data
    '''
    # Convert the DataItem to a pandas DataFrame
    df_train = source_train.as_df()
    # Re-encode labels as category codes. NOTE(review): for integer labels
    # 0/1 the codes coincide with the original values, so this is effectively
    # a normalization no-op here — confirm if label dtypes ever change.
    df_train['labels'] = df_train['labels'].astype('category').cat.codes
    # Record the DataFrame length after the run
    context.log_result('num_rows_in_training_set', df_train.shape[0])
    context.log_dataset('deepfake_dataset_train', df=df_train, format='csv', index=False)
    '''
    prep validation data
    '''
    # Convert the DataItem to a pandas DataFrame
    df_val = source_val.as_df()
    df_val['labels'] = df_val['labels'].astype('category').cat.codes
    # Record the DataFrame length after the run
    context.log_result('num_rows_in_validation_set', df_val.shape[0])
    context.log_dataset('deepfake_dataset_val', df=df_val, format='csv', index=False)
    '''
    prep data vis
    '''
    # Side-by-side class-count bar charts for train (green) and val (yellow).
    # NOTE(review): the x tick labels assume np.unique returns counts in
    # label order 0 (REAL), 1 (FAKE) — true only when both classes occur.
    fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (8,4))
    fig.suptitle('Visualizing training + validation data')
    unique, counts = np.unique(df_train['labels'].values, return_counts=True)
    ax[0].bar(['REAL', 'FAKE'], counts, alpha = 0.7, color = 'g')
    ax[0].grid()
    unique, counts = np.unique(df_val['labels'].values, return_counts=True)
    ax[1].bar(['REAL', 'FAKE'], counts, alpha = 0.7, color = 'y')
    ax[1].grid()
    # Log the figure itself as an MLRun artifact
    context.log_artifact(PlotArtifact('data_vis', body=fig))
    print('complete :)')
# mlrun: end-code
# Convert the local prep_data function to an MLRun project function.
# kind='job' packages the start-code/end-code cell above so it can also run
# as a containerized job (image 'mlrun/mlrun'); here it is invoked locally.
data_prep_func = mlrun.code_to_function(
    name='prep_data',
    kind='job',
    image='mlrun/mlrun'
)
# Inputs for the data-prep run: the CSVs written above are handed to the
# handler as mlrun DataItems (source_train / source_val parameters).
config = {
    'source_train': 'df_train.csv',
    'source_val': 'df_val.csv'
}
# Run the `data_prep_func` MLRun function locally (local=True: in-process,
# no cluster required)
prep_data_run = data_prep_func.run(name='prep_data',
                                   handler=prep_data,
                                   inputs=config,
                                   local=True)
> 2021-07-03 12:23:10,420 [info] starting run prep_data uid=c59753f84f774967b99652a9dc13eb86 DB=./api_dir complete :)
project | uid | iter | start | state | name | labels | inputs | parameters | results | artifacts |
---|---|---|---|---|---|---|---|---|---|---|
deepfake-shield-mayukh | ...13eb86 |
0 | Jul 03 06:53:10 | completed | prep_data | kind= owner=mayukh host=leopard |
source_train source_val |
num_rows_in_training_set=104890 num_rows_in_validation_set=7366 |
deepfake_dataset_train deepfake_dataset_val data_vis |
to track results use .show() or .logs() or in CLI: !mlrun get run c59753f84f774967b99652a9dc13eb86 --project deepfake-shield-mayukh , !mlrun logs c59753f84f774967b99652a9dc13eb86 --project deepfake-shield-mayukh > 2021-07-03 12:23:10,791 [info] run executed, status=completed
prep_data_run.state()
'completed'
# Fetch the datasets logged by the prep_data run back as pandas DataFrames
train_dataset = mlrun.run.get_dataitem(prep_data_run.outputs['deepfake_dataset_train']).as_df()
val_dataset = mlrun.run.get_dataitem(prep_data_run.outputs['deepfake_dataset_val']).as_df()
# Split each into a path array (inputs) and a label array (targets)
X, y = train_dataset['paths'].values, train_dataset['labels'].values
val_X, val_y = val_dataset['paths'].values, val_dataset['labels'].values
def read_img(path):
    """Load the image at *path* and convert OpenCV's BGR channel order to RGB."""
    bgr = cv2.imread(path)
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
def shuffle(X, y):
    """Jointly shuffle X and y, preserving the pairing between elements.

    Parameters
    ----------
    X, y : equal-length sequences (paths and labels).

    Returns
    -------
    (X, y) : new lists in a common random order; the inputs are not modified.
        Uses the global `random` module state, like the original.
    """
    pairs = list(zip(X, y))
    random.shuffle(pairs)
    if not pairs:
        # Preserve the original empty-input behavior (zip(*[]) would raise).
        return [], []
    shuffled_X, shuffled_y = zip(*pairs)
    return list(shuffled_X), list(shuffled_y)
import random
def get_random_sampling(paths, y, val_paths, val_y, num_train_samples=None, num_val_samples=None):
    """Build the final train/val image-path and label lists.

    Groups the deepfake-dataset images by label, tops up the REAL class with
    FFHQ thumbnails (all real faces) to balance the classes, shuffles both
    splits, and optionally truncates them.

    Parameters
    ----------
    paths, y : training image paths and 0/1 labels (0 = REAL, 1 = FAKE).
    val_paths, val_y : validation image paths and labels.
    num_train_samples, num_val_samples : optional size caps; None keeps all.

    Returns
    -------
    (X, val_X, y, val_y) — shuffled path/label lists.
    """
    def _regroup(items, labels):
        # Split into REAL/FAKE and rebuild with all REAL first — the same
        # ordering the original per-label append loops produced.
        real = [m for m, n in zip(items, labels) if n == 0]
        fake = [m for m, n in zip(items, labels) if n != 0]
        return real + fake, [0] * len(real) + [1] * len(fake)

    X, y = _regroup(paths, y)
    val_X, val_y = _regroup(val_paths, val_y)

    # Balance the REAL class with FFHQ thumbnails.
    ffhq_dir = 'training_data/ffhq/thumbnails128x128'
    ffhq_paths = [f'{ffhq_dir}/{file}' for file in tqdm(os.listdir(ffhq_dir))]
    random.shuffle(ffhq_paths)

    # NOTE(review): these counts (FAKE total minus REAL total per split) are
    # hard-coded for this dataset snapshot — verify if the data changes.
    n_train_extra = 64773 - 12130
    n_val_extra = 6108 - 1258
    X.extend(ffhq_paths[:n_train_extra])
    y.extend([0] * n_train_extra)
    # Remove the consumed entries so the validation split gets fresh images.
    del ffhq_paths[:n_train_extra]
    val_X.extend(ffhq_paths[:n_val_extra])
    val_y.extend([0] * n_val_extra)

    X, y = shuffle(X, y)
    val_X, val_y = shuffle(val_X, val_y)
    if num_train_samples is not None:
        X, y = X[:num_train_samples], y[:num_train_samples]
    if num_val_samples is not None:
        # BUG FIX: the original sliced X/y here, silently returning TRAINING
        # samples as the validation split whenever num_val_samples was set.
        val_X, val_y = val_X[:num_val_samples], val_y[:num_val_samples]
    return X, val_X, y, val_y
from torch.utils.data import Dataset, DataLoader
class ImageDataset(Dataset):
    """Dataset of face images with binary deepfake labels.

    Each item is a dict with an 'image' CHW tensor and its integer 'label'.
    """

    def __init__(self, x, y, training=True, transform=None):
        self.x = x                  # image file paths
        self.y = y                  # integer labels (0 = REAL, 1 = FAKE)
        self.transform = transform  # optional albumentations transform
        self.training = training    # stored but not read elsewhere in this class

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        image = read_img(self.x[idx])
        if self.transform is not None:
            image = self.transform(image=image)['image']
        # HWC numpy array -> CHW tensor
        tensor = torch.tensor(image).permute(-1, 0, 1)
        return {
            'image': tensor,
            'label': self.y[idx]
        }
# Build the balanced, shuffled train/val splits from the label lists above.
X, val_X, Y, val_Y = get_random_sampling(
    paths,
    y,
    val_paths,
    val_y,
    num_train_samples= None, # Set this to None or simply skip this cell if you want to train on the whole dataset
    num_val_samples= None # Set this to None or simply skip this cell if you want to train on the whole dataset
)
# len(X), len(val_X), len(Y), len(val_Y)
import albumentations
from albumentations import (
ShiftScaleRotate,
HorizontalFlip,
Normalize,
GaussNoise,
)
# Normalization statistics — the standard ImageNet mean/std values,
# presumably matching the pretrained EfficientNet backbone used below.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
# Training augmentations: mild geometric jitter, flips and noise,
# then normalize and resize to the 128x128 network input size.
train_transform = albumentations.Compose([
    ShiftScaleRotate(p=0.3, scale_limit=0.25, border_mode=1, rotate_limit=25),
    HorizontalFlip(p=0.2),
    GaussNoise(p=.2),
    Normalize(mean = mean, std = std),
    albumentations.augmentations.geometric.resize.Resize(128,128)
])
# Validation: deterministic — normalize + resize only.
val_transform = albumentations.Compose([
    Normalize(mean = mean, std = std),
    albumentations.augmentations.geometric.resize.Resize(128,128)
])
# NOTE(review): this rebinds train_dataset/val_dataset, replacing the
# DataFrames fetched from the MLRun outputs earlier in the file.
train_dataset = ImageDataset(X, Y, transform=train_transform)
val_dataset = ImageDataset(val_X, val_Y, transform=val_transform)
batch_size = 8
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
import torchvision.transforms as transforms
# Inverse of the Normalize(mean, std) applied above: first divide out the
# std, then add back the mean, so normalized tensors become displayable RGB.
invTrans = transforms.Compose([
    transforms.Normalize(mean=[0., 0., 0.], std=[1/0.229, 1/0.224, 1/0.225]),
    transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1., 1., 1.]),
])

def image_tensor_to_rgb(image_tensor, inverse_transform = invTrans ):
    """Undo normalization and return an HWC tensor suitable for imshow."""
    restored = inverse_transform(image_tensor)
    return restored.permute(1, 2, 0).cpu().detach()
# Sanity-check one training batch: display the 8 images with their labels.
pair = next(iter(train_loader))
nrow, ncol = 2, 4
fig, ax = plt.subplots(nrow, ncol, figsize=(15,8))
for i in range(8):
    # Undo the normalization so the image renders with natural colors
    image = image_tensor_to_rgb(pair['image'][i])
    ax.flat[i].imshow(image)
    ax.flat[i].set_title('FAKE' if pair['label'][i].item() == 1 else 'REAL', fontsize = 24)
    ax.flat[i].axis('off')
class Head(torch.nn.Module):
    """Classification head: Flatten -> BN -> Dropout -> Linear(512) -> ReLU
    -> BN -> Dropout -> Linear(out_f).

    Parameters
    ----------
    in_f : int
        Number of input features after flattening the backbone output.
    out_f : int
        Number of output logits (1 for binary deepfake classification).
    """
    def __init__(self, in_f, out_f):
        super(Head, self).__init__()
        self.f = nn.Flatten()
        self.l = nn.Linear(in_f, 512)
        self.d = nn.Dropout(0.75)  # aggressive dropout, shared by both uses below
        self.o = nn.Linear(512, out_f)
        self.b1 = nn.BatchNorm1d(in_f)
        self.b2 = nn.BatchNorm1d(512)
        self.r = nn.ReLU()

    def forward(self, x):
        # FIX: use the registered Flatten module — the original left self.f
        # unused and flattened manually with reshape(x.size(0), -1), which is
        # equivalent (Flatten defaults to start_dim=1).
        x = self.f(x)
        x = self.b1(x)
        x = self.d(x)
        x = self.l(x)
        x = self.r(x)
        x = self.b2(x)
        x = self.d(x)
        out = self.o(x)
        return out
class FCN(torch.nn.Module):
    """Full classifier: backbone feature extractor followed by a single Head.

    `base` is the feature extractor module; `in_f` is the flattened feature
    width fed to the Head (which emits one logit).
    """

    def __init__(self, base, in_f):
        super(FCN, self).__init__()
        self.base = base
        self.h1 = Head(in_f, 1)

    def forward(self, x):
        features = self.base(x)
        return self.h1(features)
def criterion(pred1, targets):
    """Binary cross-entropy between raw logits and float targets.

    Parameters
    ----------
    pred1 : tensor of raw logits, shape (N, 1).
    targets : float tensor of 0/1 labels, shape (N,).

    Uses the numerically stable logits form; mathematically equal to the
    original F.binary_cross_entropy(F.sigmoid(...)) composition (and
    F.sigmoid is deprecated in favor of torch.sigmoid).
    """
    l1 = F.binary_cross_entropy_with_logits(pred1.squeeze(-1), targets)
    return l1
def train_model(model, epoch, optimizer, scheduler, loss, loader=None):
    """Run one training epoch and return the model and summed loss.

    Parameters
    ----------
    model : the classifier (already on GPU).
    epoch : 0-based epoch index, used only for the progress description.
    optimizer, scheduler : optimizer and LR scheduler.
    loss : unused; kept for backward compatibility (`criterion` is used).
    loader : DataLoader to iterate. New, backward-compatible parameter;
        defaults to the global `train_loader` like the original, which
        silently ignored any per-experiment loader.

    Returns
    -------
    (model, total_loss) where total_loss is a 0-dim tensor (sum of batch
    losses, detached from the autograd graph) so callers can still .item().
    """
    if loader is None:
        loader = train_loader
    model.train()
    total_loss = 0
    t = tqdm(loader, disable=True)
    for i, batch in enumerate(t):
        img_batch = batch['image'].cuda()
        y_batch = batch['label'].cuda().float()
        optimizer.zero_grad()
        out = model(img_batch)
        batch_loss = criterion(out, y_batch)
        # BUG FIX: accumulate a detached value. The original summed live graph
        # tensors (`total_loss += loss`), retaining every batch's autograd
        # graph in memory for the whole epoch.
        total_loss += batch_loss.detach()
        t.set_description(f'Epoch {epoch+1}, LR: %6f, Loss: %.4f'%(optimizer.state_dict()['param_groups'][0]['lr'],total_loss/(i+1)))
        batch_loss.backward()
        optimizer.step()
    # NOTE(review): ReduceLROnPlateau is stepped with a constant metric (1.),
    # so the plateau logic never sees real losses and the LR never reduces —
    # confirm whether a validation metric was intended here.
    scheduler.step(1.)
    return model, total_loss
def get_model():
    """Build the deepfake classifier: a pytorchcv EfficientNet-B4 backbone
    with its original output layer removed, adaptive average pooling, and a
    fresh binary Head; the model is moved to the GPU.

    NOTE(review): no pretrained flag is passed to ptcv_get_model — confirm
    whether pretrained weights were intended.
    """
    model = ptcv_get_model("efficientnet_b4")
    model = nn.Sequential(*list(model.children())[:-1]) # Remove original output layer
    # Replace the backbone's final pooling with adaptive average pooling so
    # any input spatial size collapses to a 1x1 feature map.
    model[0].final_block.pool = nn.Sequential(nn.AdaptiveAvgPool2d(1))
    # 1792 — presumably EfficientNet-B4's final feature width; verify if the
    # backbone variant ever changes.
    model = FCN(model, 1792)
    model = model.cuda()
    return model
MAX_LR = 1e-2
MIN_LR = 9e-4
NUM_SAMPLES_LR = 100
MAX_BATCH_SIZE = 64
MIN_BATCH_SIZE = 11
# Candidate grid for the hyperparameter search: 100 evenly spaced learning
# rates and every integer batch size in [MIN_BATCH_SIZE, MAX_BATCH_SIZE].
possible_learning_rates = list(np.linspace(MIN_LR, MAX_LR, NUM_SAMPLES_LR))
possible_batch_sizes = list(range(MIN_BATCH_SIZE, MAX_BATCH_SIZE + 1))
import mlrun
def experiment(context, batch_size, learning_rate):
    """MLRun hyperparameter-search handler: train one epoch, log the loss.

    NOTE(review): the DataLoader built below only shadows the global
    `train_loader` locally — train_model() reads the *global* train_loader,
    so the `batch_size` hyperparameter has NO effect on training here.
    Confirm and fix (e.g. pass the loader into train_model).
    """
    model = get_model()
    optimizer = torch.optim.AdamW(model.parameters(), lr= learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, mode='min', factor=0.7, verbose=True, min_lr=1e-5)
    # Local loader built from the searched batch size (see NOTE above — unused).
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    model, loss = train_model(model, 0,optimizer, scheduler, loss = criterion)
    context.logger.info(f"batch_size = {batch_size} learning_rate = {learning_rate} loss = {loss}")
    context.log_result("loss", loss.item())
# Hyperparameter grid: MLRun invokes `experiment` once per combination.
grid_params = {
    "batch_size": possible_batch_sizes,
    "learning_rate": possible_learning_rates
}
# selector="min.loss": the iteration with the smallest logged "loss" result
# is marked as the best iteration of the run.
task = mlrun.new_task("deep-shield-grid-search-hyperparams").with_hyper_params(grid_params, selector="min.loss")
run = mlrun.new_function().run(task, handler=experiment)
## output removed
run.show()
project | uid | iter | start | state | name | labels | inputs | parameters | results | artifacts |
---|---|---|---|---|---|---|---|---|---|---|
deepfake-shield-mayukh | ...cfc762 |
0 | Jul 03 08:09:05 | completed | deep-shield-grid-search-hyperparams | kind=handler owner=mayukh |
best_iteration=6 loss=7.21016263961792 |
iteration_results |
# Load the per-iteration results table logged by the grid search.
results = pd.read_csv(run.artifact('iteration_results')['target_path'])
# Select the row(s) that achieved the minimum loss.
min_loss = results['output.loss'].min()
best_iter = results[results['output.loss'] == min_loss]
ideal_params = {
    'learning_rate': best_iter['param.learning_rate'].values[0],
    'batch_size': best_iter['param.batch_size'].values[0]
}
ideal_params
{'learning_rate': 0.002508, 'batch_size': 16}