Paper: Image-to-Image Translation with Conditional Adversarial Networks

Why Pix2Pix Was Proposed

Image to Image Translation is an umbrella term for a class of computer vision problems; image style transfer, image colorization, and image semantic segmentation all fall into this category.

Previously, each of these problems called for its own network architecture and training objective. The emergence and development of GANs made a general-purpose model feasible, so the authors combined GANs with several other prior results to design Pix2Pix, a general model for Image to Image Translation problems.

Pix2Pix

Objective Function

Pix2Pix is built on the cGAN framework, with the objective

$$\mathcal{L}_{cGAN}(G, D) = \mathbb{E}_{x,y}[\log D(x, y)] + \mathbb{E}_{x,z}[\log(1 - D(x, G(x, z)))]$$

where $x$ is the input image (i.e., the conditioning input of the cGAN), $y$ is the real target image, and $z$ is random noise.

In earlier work, $z$ was introduced so that G would not always map the same input to a single fixed output. The pix2pix authors found in their experiments, however, that even when $z$ is supplied, G gradually learns to ignore it during training. They therefore settled on providing noise solely through dropout, applied at both training and test time.
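
A minimal sketch of one way to keep dropout active at inference in Keras (the `training=True` call is standard Keras; the toy model around it is made up here):

from keras.layers import Input, Dense, Dropout
from keras.models import Model

inp = Input((16,))
h = Dense(32, activation='relu')(inp)
# training=True overrides the usual inference-time behaviour, so dropout keeps
# injecting noise at test time and plays the role of z
h = Dropout(0.5)(h, training=True)
model = Model(inp, h)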

The adversarial loss of pix2pix thus becomes

$$\mathcal{L}_{cGAN}(G, D) = \mathbb{E}_{x,y}[\log D(x, y)] + \mathbb{E}_{x}[\log(1 - D(x, G(x)))]$$

Earlier work has also shown that mixing the GAN adversarial loss with a traditional loss often yields better results.

pix2pix therefore adds an L1 loss on G, pushing G's output toward the real image:

$$\mathcal{L}_{L1}(G) = \mathbb{E}_{x,y}[\lVert y - G(x) \rVert_1]$$

L1 was chosen over L2 because the authors found that L2 tends to produce blurrier results.

The final objective for pix2pix's generator is

$$G^* = \arg\min_G \max_D \mathcal{L}_{cGAN}(G, D) + \lambda \mathcal{L}_{L1}(G)$$

where the paper sets $\lambda = 100$.
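
As a quick numerical sketch (a made-up helper with toy inputs, not the authors' code; note the full implementation below uses an MSE adversarial term instead of BCE):

import numpy as np

# composite generator objective: adversarial term + lambda * L1, lambda = 100
def generator_loss(d_out, y, g_out, lam=100.0):
    adversarial = -np.mean(np.log(d_out + 1e-12))  # BCE term: fool D on every patch
    l1 = np.mean(np.abs(y - g_out))                # L1 term: stay close to the real image
    return adversarial + lam * l1

d_out = np.full((1, 32, 32, 1), 0.8)      # D's patch predictions for a generated image
y = np.zeros((1, 256, 256, 3))
g_out = np.full_like(y, 0.1)
print(generator_loss(d_out, y, g_out))    # ~0.22 adversarial + 100 * 0.1 L1 = ~10.22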

Generator Architecture

Many previous Image to Image Translation models use an encoder-decoder architecture. In these problems, however, the input and output often share a great deal of low-level information, such as edges, and that information has difficulty flowing through the entire encoder-decoder to the output, especially across the bottleneck layer.

The pix2pix generator therefore follows U-Net and adds skip connections.

(Figure: the U-Net generator with skip connections)
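
A minimal sketch of a single skip connection in Keras (a toy two-level example, not the full generator defined later):

from keras.layers import Input, Conv2D, UpSampling2D, Concatenate
from keras.models import Model

inp = Input((256, 256, 3))
e1 = Conv2D(64, 4, strides=2, padding='same')(inp)     # 128x128 encoder features
b = Conv2D(128, 4, strides=2, padding='same')(e1)      # 64x64 "bottleneck"
d1 = Conv2D(64, 4, padding='same')(UpSampling2D()(b))  # upsample back to 128x128
d1 = Concatenate()([d1, e1])                           # skip: low-level info bypasses the bottleneck
model = Model(inp, d1)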

Markovian Discriminator (PatchGAN)

First, a brief explanation of image frequency.

The frequency of an image measures how sharply its gray levels change.

High-frequency information is where gray levels change sharply, such as edges, noise, and fine detail; low-frequency information, conversely, is where gray levels change slowly.

Although L1 and L2 losses introduce some blur into generated images (i.e., they do little to encourage high-frequency detail), they are very effective at driving the low-frequency content toward the target.

The authors therefore restrict the GAN's D to modeling only high-frequency structure, relying on the L1 term to keep the low-frequency content accurate.

They call this discriminator a PatchGAN: as the name suggests, it judges whether each N×N patch of the image is real. Experiments in the paper show that N can be quite small, which means D can have very few parameters.
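
As a back-of-envelope check (a helper written here, not from the paper): for the discriminator in the implementation below, three stride-2 4×4 convolutions followed by two stride-1 4×4 convolutions, each output unit sees a 70×70 patch of the input:

def receptive_field(layers):
    # walk backwards from one output unit: r = r * stride + (kernel - stride)
    r = 1
    for k, s in reversed(layers):
        r = r * s + (k - s)
    return r

# disc1-disc3 (stride 2), disc4 (stride 1), and the final validity conv (stride 1)
print(receptive_field([(4, 2), (4, 2), (4, 2), (4, 1), (4, 1)]))  # -> 70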

Example Results

Here are some results shown in the paper.

(Figure: example pix2pix results from the paper)

Keras Implementation

from keras.models import Model
from keras.layers import Dropout, Conv2D, UpSampling2D, \
    LeakyReLU, Input, Concatenate, Activation, BatchNormalization
from keras.optimizers import Adam
from keras.initializers import RandomNormal
import os
import cv2
import numpy as np
import random


class DataLoader:
    def __init__(self, dir_A, dir_B, batch_size, img_shape):
        self.dir_A = dir_A
        self.dir_B = dir_B

        # paired data: dir_A and dir_B hold corresponding images with the same file names
        self.flist = os.listdir(dir_A)
        self.fnum = len(self.flist)

        self.batch_size = batch_size
        self.img_shape = img_shape

        self.idx_cur = 0

    def getNumberOfBatch(self):
        num = self.fnum / self.batch_size
        if self.fnum % self.batch_size != 0:
            num += 1
        return int(num)

    def reset(self):
        self.idx_cur = 0
        random.shuffle(self.flist)

    def __iter__(self):
        return self

    def __next__(self):
        if self.idx_cur >= self.fnum:
            self.reset()  # reshuffle so the next epoch sees a new order
            raise StopIteration

        if self.idx_cur + self.batch_size - 1 < self.fnum:
            length = self.batch_size
            idx_nxt = self.idx_cur + self.batch_size
        else:
            length = self.fnum - self.idx_cur
            idx_nxt = self.fnum

        imgA = np.zeros((length, *self.img_shape))
        imgB = np.zeros((length, *self.img_shape))

        for k in range(length):
            fpath_A = os.path.join(self.dir_A, self.flist[self.idx_cur + k])
            fpath_B = os.path.join(self.dir_B, self.flist[self.idx_cur + k])

            # images are assumed to already match img_shape (256x256x3)
            img_a = cv2.imread(fpath_A, 1)
            img_b = cv2.imread(fpath_B, 1)

            # scale pixels to [-1, 1] to match the generator's tanh output
            imgA[k] = (img_a.astype(np.float32) - 127.5) / 127.5
            imgB[k] = (img_b.astype(np.float32) - 127.5) / 127.5

        self.idx_cur = idx_nxt

        return imgA, imgB


class Pix2Pix:
    def __init__(self):
        self.img_row = 256
        self.img_col = 256
        self.img_channels = 3
        self.img_shape = (self.img_row, self.img_col, self.img_channels)

        # the discriminator downsamples by 2^3, so it emits a 32x32 patch map
        patch = int(self.img_row / 2 ** 3)
        self.discPatch = (patch, patch, 1)

        self.buildGAN()

    def buildGenerator(self):
        initWeight = RandomNormal(stddev=0.02)

        def EnConv2D(inputs, filters, k_size=4, norm=True):
            # encoder block: stride-2 conv -> (BatchNorm) -> LeakyReLU
            x = Conv2D(filters, kernel_size=k_size, strides=2, padding='same',
                       kernel_initializer=initWeight)(inputs)
            if norm:
                x = BatchNormalization()(x)
            outputs = LeakyReLU(alpha=0.2)(x)
            return outputs

        def DeConv2D(inputs, skipInputs, filters, k_size=4, drop_rate=0.0):
            # decoder block: upsample -> conv -> BatchNorm -> ReLU, then
            # concatenate the matching encoder features (U-Net skip connection)
            x = UpSampling2D()(inputs)
            x = Conv2D(filters, kernel_size=k_size, padding='same',
                       kernel_initializer=initWeight)(x)
            x = BatchNormalization()(x)
            x = Activation('relu')(x)
            outputs = Concatenate()([x, skipInputs])
            if drop_rate:
                outputs = Dropout(drop_rate)(outputs)
            return outputs

        img_input = Input(shape=self.img_shape)

        encoder1 = EnConv2D(img_input, 64, norm=False)
        encoder2 = EnConv2D(encoder1, 128)
        encoder3 = EnConv2D(encoder2, 256)
        encoder4 = EnConv2D(encoder3, 512)
        encoder5 = EnConv2D(encoder4, 512)
        encoder6 = EnConv2D(encoder5, 512)
        encoder7 = EnConv2D(encoder6, 512)

        decoder1 = DeConv2D(encoder7, encoder6, 512)
        decoder2 = DeConv2D(decoder1, encoder5, 512)
        decoder3 = DeConv2D(decoder2, encoder4, 512)
        decoder4 = DeConv2D(decoder3, encoder3, 256)
        decoder5 = DeConv2D(decoder4, encoder2, 128)
        decoder6 = DeConv2D(decoder5, encoder1, 64)

        decoder7 = UpSampling2D()(decoder6)
        # tanh keeps the output in [-1, 1], matching the input normalization
        img_output = Conv2D(filters=self.img_channels, kernel_size=4, padding='same',
                            activation='tanh', kernel_initializer=initWeight)(decoder7)

        return Model(img_input, img_output)

    def buildDiscriminator(self):
        initWeight = RandomNormal(stddev=0.02)

        def discLayer(inputs, filters, k_size=4, stride=2, norm=True):
            x = Conv2D(filters, kernel_size=k_size, strides=stride, padding='same',
                       kernel_initializer=initWeight)(inputs)
            if norm:
                x = BatchNormalization()(x)
            outputs = LeakyReLU(alpha=0.2)(x)
            return outputs

        # the discriminator is conditional: it sees the target/generated image
        # concatenated with the conditioning input image
        imgA = Input(shape=self.img_shape)
        imgB = Input(shape=self.img_shape)
        inputImg = Concatenate()([imgA, imgB])

        disc1 = discLayer(inputImg, 64, norm=False)
        disc2 = discLayer(disc1, 128)
        disc3 = discLayer(disc2, 256)
        disc4 = discLayer(disc3, 512, stride=1)

        # PatchGAN output: a 32x32 map of per-patch real/fake probabilities
        validity = Conv2D(filters=1, kernel_size=4, padding='same',
                          activation='sigmoid', kernel_initializer=initWeight)(disc4)

        return Model([imgA, imgB], validity)

    def buildGAN(self):
        self.generator = self.buildGenerator()
        self.discriminator = self.buildDiscriminator()

        # beta_1 = 0.5 as in the paper; note the 'mse' losses make this an
        # LSGAN-style variant of pix2pix (the paper uses binary cross-entropy)
        optimizer = Adam(2e-4, 0.5)
        self.discriminator.compile(loss='mse', optimizer=optimizer, metrics=['accuracy'])

        imgB = Input(shape=self.img_shape)
        fakeA = self.generator(imgB)

        # freeze D inside the combined model so only G is updated through it
        self.discriminator.trainable = False
        validity = self.discriminator([fakeA, imgB])

        # 'mae' is the L1 term; loss_weights=[1, 100] implements lambda = 100
        self.combined = Model(imgB, [validity, fakeA])
        self.combined.compile(loss=['mse', 'mae'], loss_weights=[1, 100], optimizer=optimizer)

    def trainModel(self, epochs, batch_size=1):
        self.dataLoader = DataLoader(
            dir_A='D:/wallpaper/datas/test/trainA',
            dir_B='D:/wallpaper/datas/test/trainB',
            batch_size=batch_size,
            img_shape=self.img_shape
        )

        totalStep = self.dataLoader.getNumberOfBatch()
        for epoch in range(epochs):
            for step, (imgA, imgB) in enumerate(self.dataLoader):
                # per-patch labels matching the discriminator's output shape
                valid = np.ones((imgA.shape[0],) + self.discPatch)
                fake = np.zeros((imgA.shape[0],) + self.discPatch)

                fakeA = self.generator.predict(imgB)

                # train D on real and generated pairs
                D_loss_real = self.discriminator.train_on_batch([imgA, imgB], valid)
                D_loss_fake = self.discriminator.train_on_batch([fakeA, imgB], fake)
                D_loss = 0.5 * np.add(D_loss_real, D_loss_fake)

                # train G to fool D and to stay L1-close to the real image
                G_loss = self.combined.train_on_batch(imgB, [valid, imgA])

                step += 1
                print('Epoch {}/{}, step {}/{} -- D loss: {:.4f}, acc: {:.2f}%, '
                      'G loss: {:.4f}, adver loss: {:.4f}, L1 loss: {:.4f}'.format(
                          epoch + 1, epochs, step, totalStep, D_loss[0], D_loss[1] * 100,
                          G_loss[0], G_loss[1], G_loss[2]))

                if step % 2 == 0:
                    fpath = 'D:/wallpaper/datas/sketch/testB/1047028.png'
                    fname = 'output{}.png'.format(epoch)
                    self.colorizeImage(fpath=fpath, outputDir='output', fname=fname)

    def colorizeImage(self, fpath, outputDir, fname):
        img_input = cv2.imread(fpath, 1)
        img_input = cv2.resize(img_input, (256, 256))

        img_input = np.expand_dims(img_input, 0)
        img_input = (img_input.astype(np.float32) - 127.5) / 127.5

        img_output = self.generator.predict(img_input)[0]
        # map [-1, 1] back to [0, 255] for saving
        img_output = img_output * 127.5 + 127.5
        img_output = img_output.astype(np.uint8)

        outputPath = os.path.join(outputDir, fname)
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)
        cv2.imwrite(outputPath, img_output)
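
A minimal usage sketch (the training paths are the ones hard-coded above; adjust them to your own dataset layout):

if __name__ == '__main__':
    gan = Pix2Pix()
    gan.trainModel(epochs=100, batch_size=1)  # small batch sizes as in the paper's experiments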