% L7_SVM.tex -- Lecture 7: Support Vector Machines
\documentclass[twoside]{article}
\setlength{\oddsidemargin}{0.25 in}
\setlength{\evensidemargin}{-0.25 in}
\setlength{\topmargin}{-0.6 in}
\setlength{\textwidth}{6.5 in}
\setlength{\textheight}{8.5 in}
\setlength{\headsep}{0.75 in}
\setlength{\parindent}{0 in}
\setlength{\parskip}{0.1 in}
\newcommand{\eqdef}{:\mathrel{\mathop=}}
\newcommand{\norm}[1]{\left\lVert #1 \right\rVert}
%
% ADD PACKAGES here:
%
\usepackage{amsmath,amsfonts,graphicx}
%
% The following commands set up the lecnum (lecture number)
% counter and make various numbering schemes work relative
% to the lecture number.
%
\newcounter{lecnum}
\renewcommand{\thepage}{\thelecnum-\arabic{page}}
\renewcommand{\thesection}{\thelecnum.\arabic{section}}
\renewcommand{\theequation}{\thelecnum.\arabic{equation}}
\renewcommand{\thefigure}{\thelecnum.\arabic{figure}}
\renewcommand{\thetable}{\thelecnum.\arabic{table}}
%
% The following macro is used to generate the header.
%
\newcommand{\lecture}[4]{
\pagestyle{myheadings}
\thispagestyle{plain}
\newpage
\setcounter{lecnum}{#1}
\setcounter{page}{1}
\noindent
\begin{center}
\framebox{
\vbox{\vspace{2mm}
\hbox to 6.28in { {\bf Advanced Machine Learning
\hfill Fall 2021} }
\vspace{4mm}
\hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} }
\vspace{2mm}
\hbox to 6.28in { {\it #3 \hfill #4} }
\vspace{2mm}}
}
\end{center}
\markboth{Lecture #1: #2}{Lecture #1: #2}
{\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.}
{\bf Disclaimer}: {\it These notes are adapted from CMU's 10-725 Course, Stanford's CS229 Course, ETH's Advanced Machine Learning Course and Bishop's "Pattern Recognition and Machine Learning" book.}
\vspace*{4mm}
}
%
% Convention for citations is authors' initials followed by the year.
% For example, to cite a paper by Leighton and Maggs you would type
% \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}.
% (To avoid bibliography problems, for now we redefine the \cite command.)
% Also commands that create a suitable format for the reference list.
\renewcommand{\cite}[1]{[#1]}
\def\beginrefs{\begin{list}%
{[\arabic{equation}]}{\usecounter{equation}
\setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}%
\setlength{\labelwidth}{1.6truecm}}}
\def\endrefs{\end{list}}
\def\bibentry#1{\item[\hbox{[#1]}]}
%Use this command for a figure; it puts a figure in wherever you want it.
%usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION}
\newcommand{\fig}[3]{
\vspace{#2}
\begin{center}
Figure \thelecnum.#1:~#3
\end{center}
}
% Use these for theorems, lemmas, proofs, etc.
\newtheorem{theorem}{Theorem}[lecnum]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}}
% **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:
\newcommand\E{\mathbb{E}}
\begin{document}
%FILL IN THE RIGHT INFO.
%\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**}
\lecture{7}{Support Vector Machines}{}{}
%\footnotetext{These notes are partially based on those of Nigel Mansell.}
% **** YOUR NOTES GO HERE:
% Some general latex examples and examples making use of the
% macros follow.
%**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN,
%**** ARE NEVER READ BY ANYBODY.
\section{Lagrangian} % Don't be this informal in your notes!
Consider a general minimization problem. There is no need for it to be convex, let's keep it as general as possible.
\begin{equation*}
\begin{aligned}
& \underset{x}{\text{minimize}}
& & f(x) \\
& \text{subject to}
& & h_i(x) \leq 0, \; i = 1, \ldots, m \\
&&& l_j(x) = 0, \; j = 1, \ldots, r
\end{aligned}
\end{equation*}
\\
The Lagrangian is defined as:
\begin{equation}
\mathcal{L}(x,u,v) = f(x) + \sum_{i=1}^{m} u_i \cdot h_i(x) + \sum_{j=1}^{r} v_j \cdot l_j(x)
\end{equation}
Where $u \in \mathbb{R}^m$ with $u \geq 0$, and $v \in \mathbb{R}^r$.
\\
\\
(Small side-note) We define the Lagrangian $\mathcal{L}(x,u,v) = - \infty$ for values of $u < 0$.\\
\\ We are going to exploit the following property of the Lagrangian: \\
$\forall u \geq 0$ and $\forall v$ :
\textbf{$f(x) \geq \mathcal{L}(x,u,v) $ at each feasible $x$}
\begin{figure}[h]
\caption{Each dotted line shows $\mathcal{L}(x, u)$ for different choice of $u \geq 0$.}
\centering
\includegraphics[width=0.39\textwidth]{img/lagrangian.jpg}
\end{figure}
\newpage
\subsection{Lagrangian Dual Function}
Let $C$ denote the primal feasible set, $f^{*}$ denote primal optimal value. Minimizing $\mathcal{L}(x, u, v)$ over all $x$ gives a
lower bound:
\begin{equation*}
\begin{aligned}
f^{*} \overset{(i)}{\geq} \underset{x \in {C}}{\text{min}}& & \mathcal{L}(x, u, v) \overset{(ii)}{\geq} \underset{x}{\text{min}}& & \mathcal{L}(x, u, v) \eqdef g(u,v)
\end{aligned}
\end{equation*}
\\
Where $(i)$ derives from the property we stated earlier taking the minimum of both sides and $(ii)$ holds because the unconstrained minimum will always be less or equal the constrained one. \\
We call $g(u,v)$ the Lagrangian dual function and it gives a lower bound on $f^{*}$. The main takeaway here is that $(i)$ is often not computable because of the constraints. Hence, we prefer to solve the dual problem $(ii)$.
\subsection{Lagrange Dual Problem}
Given a primal problem
\begin{equation*}
\begin{aligned}
& \underset{x}{\text{minimize}}
& & f(x) \\
& \text{subject to}
& & h_i(x) \leq 0, \; i = 1, \ldots, m \\
&&& l_j(x) = 0, \; j = 1, \ldots, r
\end{aligned}
\end{equation*}
We have shown that our dual function satisfies $f^{*} \geq g(u,v)$ for all $u \geq 0$ and $v$. \\
Thus, we can get the best lower-bound estimate of $f^{*}$ maximizing $g(u,v)$ over the feasible $u,v$, yielding the Lagrange Dual Problem:
\begin{equation*}
\begin{aligned}
& \underset{u,v}{\text{max}}
& & g(u,v) \\
& \text{subject to}
& & u \geq 0, \;
\end{aligned}
\end{equation*}\\
A key property is called \textbf{weak duality}:
\begin{equation*}
f^{*} \geq g^{*}
\end{equation*}
Where $f^{*},g^{*}$ are the optimal values for the primal and dual problems.\\Note that this property always holds, even if the primal problem is not convex. Furthermore, it is easy to prove that \textbf{the dual problem is always a convex optimization problem}, even if the primal problem is non-convex.
\newpage
\subsection{Strong Duality}
In some problems we will have $f^{*}=g^{*}$, this property is called Strong Duality.
\begin{theorem}{(Slater's Condition)}
\\If the primal problem is a convex problem
and there exists at least one strictly feasible point $x$, then Strong Duality holds.
\end{theorem}
In other words, the condition is that there exists $x$ such that:
\begin{align*}
h_i(x) < 0, \; i = 1, \ldots, m \\
l_j (x) = 0,\; j = 1, \ldots, r
\end{align*}
\subsection{Karush-Kuhn-Tucker (KKT) Conditions}
Given a general problem
\begin{equation*}
\begin{aligned}
& \underset{x}{\text{minimize}}
& & f(x) \\
& \text{subject to}
& & h_i(x) \leq 0, \; i = 1, \ldots, m \\
&&& l_j(x) = 0, \; j = 1, \ldots, r
\end{aligned}
\end{equation*}
The KKT conditions are:
\begin{enumerate}
\item $0 \in \partial_x (\mathcal{L}(x, u, v) )$ (stationarity)
\item $u_i \cdot h_i(x) = 0$ for all $i$ (complementary slackness)
\item $h_i(x) \leq 0, l_j(x) = 0$ for all $i,j$ (primal feasibility)
\item $u_i \geq 0$ for all $i$ (dual feasibility)
\end{enumerate}
\begin{theorem}
For $x^{*}$ and $u^{*}$, $v^{*}$ to be primal and dual solutions, KKT conditions are sufficient.
\end{theorem}
\begin{proof}
$g^* = g(u^{*},v^{*}) = f(x^{*}) + \sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*}) + \sum_{j=1}^{r} v_j^{*} \cdot l_j(x^{*}) = f(x^{*}) = f^* $ \\
\\ Where the first equality holds from stationarity and the second equality holds from complementary slackness and primal feasibility.
\end{proof}
\newpage
\begin{theorem}
For a problem with strong duality (e.g. assume Slater's condition holds)\\
$x^{*}$ and $u^{*}$, $v^{*}$ are primal and dual solution $\iff$ $x^{*}$ and $u^{*}$, $v^{*}$ satisfy KKT conditions.
\end{theorem}
\begin{proof}\\
\\
\textbf{Sufficiency:} Follows from Theorem 7.2.\\ \\ \textbf{Necessity:} Suppose $x^{*}$ and $u^{*}$, $v^{*}$ to be primal and dual solution, and suppose strong duality holds. Then:\\
\\ $f^* =g^* = g(u^{*},v^{*}) $ \ (holds by assumptions)\\
\\$= \underset{x}{\text{min}} \left( f(x) + \sum_{i=1}^{m} u_i^{*} \cdot h_i(x) +\sum_{j=1}^{r} v_j^{*} \cdot l_j(x) \right) $ \ (holds by definition) \\ \\
$\leq f(x^{*}) + \sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*}) + \sum_{j=1}^{r} v_j^{*} \cdot l_j(x^{*})$ \ (min(f) is less or equals than the value of f at any other point) \\ \\
$\leq f(x^{*})$ \ (holds by feasibility since the sums must be less or equal than 0) \\
The LHS equals RHS, therefore all the inequalities must be equalities. Looking at KKT Conditions:
\begin{itemize}
\item \textbf{Primal and dual feasibility} hold by virtue of optimality: $x^*,u^*,v^*$ are optima $ \implies$ $x^*,u^*,v^*$ must be feasible
\item \textbf{Stationarity} comes from the fact that $x^{*}$ minimizes $g(u^{*},v^{*})$. Since $x^{*}$ is the minimizer it must be a stationary point for this function.
\item \textbf{Complementary Slackness} comes from the last inequality, since $\sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*})$ must be equal to 0.
\end{itemize}
\end{proof}
\newpage
\section{Maximum Margin Classifiers}
We begin our discussion of Support Vector Machines by returning to the two-class classification problem using a linear model of the form:
\begin{equation*}
y(x) = w^T \phi(x) + b
\end{equation*}
We shall assume for the moment that the training data is linearly separable. SVMs approach this problem through the concept of margin, which is defined to be the smallest distance between the decision boundary and any of the samples.
\subsection{Finding the margin}
Consider an arbitrary point $x$ and let $x_{\bot}$ be its orthogonal projection onto the decision surface, so that:
\begin{equation*}
x = x_{\bot} + r \dfrac{w}{\norm{w}}
\end{equation*}
Multiplying both sides of this result by $w^T$ and adding $b$ we get:
\begin{equation*}
w^T x + b = w^T x_{\bot} + r w^T \dfrac{w}{\norm{w}} + b
\end{equation*}
Applying the definition $y(x) = w^T x + b$:
\begin{equation*}
y(x) = y(x_{\bot}) + r \norm{w}
\end{equation*}
From Figure 7.2 it is clear that $x_{\bot}$ lies on the decision surface, hence $y(x_{\bot})=0$. Solving for r:
\begin{equation*}
r = \dfrac{y(x)}{\norm{w}}
\end{equation*}
Therefore, the perpendicular distance of a point $x$ from a hyperplane defined by $y(x)=0$ is given by:
\begin{equation*}
\dfrac{\lvert y(x) \rvert}{\norm{w}}
\end{equation*}
\begin{figure}[h]
\caption{The decision surface shown in red is perpendicular to w.}
\centering
\includegraphics[width=0.385\textwidth]{img/margin.jpg}
\end{figure}
\newpage
\subsection{SVMs Primal Problem}
We wish to optimize the parameters $w$ and $b$ in order to maximize the minimum margin among all data points:
\begin{equation*}
\max_{w,b} \ \min_{n} \dfrac{\lvert y(x_{n}) \rvert}{\norm{w}}
\end{equation*}
We can take the factor $\dfrac{1}{\norm{w}}$ outside of the optimization over $n$ because it does not depend on $n$:
\begin{equation*}
\max_{w,b} \dfrac{1}{\norm{w}} \ \min_{n} [t_{n} (w^T \phi(x_{n}) +b) ]
\end{equation*}
Direct solution of this optimization problem would be very complex (non-convex), and so we shall convert it into an equivalent problem that is much easier to solve.
To do this, we note that if we make the rescaling $w' = kw$ and $b' = kb$ the margin will remain unchanged:
\begin{equation*}
r' = \dfrac{t_{n}((w')^T \phi(x_{n}) +b')}{\norm{w'}} = \dfrac{k t_{n} (w^T \phi(x_{n}) + b)}{k \norm{w}} = r
\end{equation*}
We can use this freedom to set, for the point that is closest to the decision surface:
\begin{equation*}
t_{n} (w^T \phi(x_{n}) +b) = 1
\end{equation*}
In this case all data points will have to satisfy the constraints:
\begin{equation*}
t_{i} (w^T \phi(x_{i}) +b) \geq 1 \ \ i = 1,\dotsc,N
\end{equation*}
Thus we can reduce the problem to:
\begin{equation*}
\begin{aligned}
& \underset{w,b}{\text{max}}
& & \dfrac{1}{\norm{w}}\\
& \text{subject to}
& & 1 - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\
\end{aligned}
\end{equation*}
Furthermore, maximizing ${\norm{w}^{-1}}$ is equivalent to minimizing $ \norm{w}^2$. We include a factor $1/2$ for later convenience:
\begin{equation*}
\begin{aligned}
& \underset{w,b}{\text{min}}
& & \dfrac{1}{2} \norm{w}^2\\
& \text{subject to}
& & 1 - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\
\end{aligned}
\end{equation*}
Two important observations:
\begin{itemize}
\item It appears that the bias parameter b has disappeared from the optimization. However, it is determined implicitly via the constraints.
\item The solution to a QP problem in $M$ variables has computational complexity that is $O(M^3)$. Thus, the primal problem is only feasible if we constrain ourselves to a fixed set of basis functions (small $M$).
\end{itemize}
\newpage
\subsection{SVMs Dual Problem}
First things first, we should check whether Slater's condition is satisfied (remember that Slater's condition is sufficient for \textbf{strong duality}).\\
Let $w^*,b^*$ be an optimal solution and $\lambda > 1$, then $\lambda w^*, \lambda b^*$ will be strictly feasible:
\begin{equation*}
t_{i}(\lambda w^{*T} \phi(x_{i}) + \lambda b^*) = t_{i} \lambda ( w^{*T} \phi(x_{i}) + b^*) > t_{i} ( w^{*T} \phi(x_{i}) + b^*) \geq 1, \; i = 1, \ldots, N \\
\end{equation*}
Since strong duality holds, we can solve the dual problem:
\begin{equation*}
\begin{aligned}
& \underset{a}{\text{max}}
& \underset{w,b}{\text{min}}
& \ \mathcal{L}(w,b,a) \\
& \text{subject to}
& & a_{i} \geq 0, \; i = 1, \ldots, N \
\end{aligned}
\end{equation*}
Where we define the Lagrangian $\mathcal{L}(w,b,a) = \dfrac{1}{2} \norm{w}^2 + \sum_{i=1}^{N} a_i\{ 1- t_{i}(w^T \phi(x_{i}) +b) \} $; the Lagrangian dual function is its minimum over $w,b$.
Setting the derivatives of $\mathcal{L}(w,b,a)$ with respect to $w$ and $b$ equal to zero, we obtain the following two conditions:
\begin{align}
w = \sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i}) \\
\sum_{i=1}^{N} a_{i}t_{i} = 0
\end{align}
Now we substitute the conditions back into $\mathcal{L}(w,b,a)$:
\begin{equation*}
\mathcal{L}(w,b,a) = \dfrac{1}{2} \norm{w}^2 + \sum_{i=1}^{N} a_{i}
- \overbrace{\sum_{i=1}^{N} a_{i}t_{i}b}^\text{=0} - \sum_{i=1}^{N} a_{i}t_{i} w^T \phi(x_{i})
\end{equation*}
Substituting (7.2) we obtain:
\begin{equation*}
\mathcal{L}(a) = \dfrac{1}{2} \ [\sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i})]^T [\sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i})] + \sum_{i=1}^{N} a_{i} - \sum_{i=1}^{N} a_{i} t_{i} [\sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i})]^T \phi(x_{i})
\end{equation*}
Expanding the products:
\begin{equation*}
\mathcal{L}(a) = \dfrac{1}{2} \ \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}
+ \sum_{i=1}^{N} a_{i} -
\sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}
\end{equation*}
\newpage
This gives the dual representation of the SVM problem:
\begin{equation*}
\begin{aligned}
& \underset{a}{\text{max}}
& & \sum_{i=1}^{N} a_{i} - \dfrac{1}{2}
\sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}\\
& \text{subject to}
& & a_{i} \geq 0, \; i = 1, \ldots, N \\
&&& \sum_{i=1}^{N} a_{i}t_{i} = 0,
\end{aligned}
\end{equation*}
Some important observations:
\begin{itemize}
\item Note that now the time complexity for the QP solver is $O(N^3)$, thus it does not depend on the choice of basis functions. For a fixed set of basis functions whose number $M$ is smaller than the number $N$ of data points, the dual problem appears disadvantageous. However, the dual problem makes it feasible to apply SVMs to feature spaces whose dimensionality exceeds the number of data points, including infinite feature spaces.
\item In order to classify new data points using the trained model, we evaluate the sign of $y(x)$:
\begin{equation*}
y(x) = \sum_{i=1}^{N} a_{i}t_{i} \phi(x_i)^T \phi(x) +b
\end{equation*}
\item Remember that : Strong Duality $\implies$ KKT conditions are satisfied. Hence, the following condition must hold:
\begin{equation*}
a_{i}(t_{i}y(x_{i})-1) = 0, \; i = 1, \ldots, N
\end{equation*}
Any data point for which $a_{i} = 0$ plays no role in making predictions for new data points. The remaining data points correspond to points that lie on the maximum margin hyperplanes in feature space and they are called support vectors. This property is central to the practical applicability of SVMs: once the model is trained, a significant proportion of the data points can be discarded.
\end{itemize}
\section{Soft Margin SVM}
So far, we have assumed that the training data points are linearly separable in the feature space. In practice, however, the class conditional distributions may overlap, in which case exact separation of the training data can lead to poor generalization. We therefore need a way to modify support vector machines so as to allow some of the training points to be misclassified. \\
To do this, we introduce slack variables $\xi_{i} \geq 0$ where $i=1,\ldots,N$. These are defined by $\xi_{i} = 0$ for data points that are on or inside the correct margin boundary and $\xi_{i}= \lvert t_{i} - y(x_{i}) \rvert$ for other points. Thus a data point $x_{i}$ that is on the decision boundary $y(x_{i})=0$ will have $\xi_{i} = 1$, and points with $\xi_{i} > 1$ will be misclassified. \\
\begin{figure}[h]
\caption{Illustration of the slack variables.}
\centering
\includegraphics[width=0.39\textwidth]{img/slack.png}
\end{figure}
The exact classification constraints are then replaced with:
\begin{align*}
t_{i} y(x_{i}) \geq 1 - \xi_{i} \ \ i=1,\ldots,N \\
\xi_{i} \geq 0 \ \ i=1,\ldots,N\\
\end{align*}
Our goal is now to maximize the margin while softly penalizing points that lie on the wrong side of the margin boundary. We therefore minimize:
\begin{equation*}
\begin{aligned}
& \underset{w,b,\xi}{\text{min}}
& & \dfrac{1}{2} \norm{w}^2 + C \sum_{i=1}^{N} \xi_{i}\\
& \text{subject to}
& & 1 - \xi_{i} - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\
&&& - \xi_{i} \leq 0, \; i = 1, \ldots, N
\end{aligned}
\end{equation*}
The parameter $C > 0$ controls the trade-off between the slack variable penalty and the margin (training error vs model complexity). In the limit $C \to \infty$ we will recover the ``hard'' margin SVM.
\subsection{Dual Problem}
It is trivial to show that Slater's condition holds since the constraints are affine functions. Therefore, strong duality holds and we can move to the Lagrangian:
\begin{equation*}
\mathcal{L}(w,b,\xi,a,u) =
\dfrac{1}{2} \norm{w}^2 + C \sum_{i=1}^{N} \xi_{i} + \sum_{i=1}^{N} a_{i} \{ 1 - \xi_{i} - t_{i} (w^T \phi(x_{i}) +b) \}
- \sum_{i=1}^{N} u_{i}\xi_{i}
\end{equation*}
Furthermore, the corresponding KKT conditions are:
\begin{align}
a_{n} \geq 0 \\
t_{n}y(x_{n}) -1 +\xi_{n} \geq 0 \\
a_{n}(t_{n}y(x_{n}) -1 +\xi_{n} ) = 0
\\
u_{n} \geq 0 \\
\xi_{n} \geq 0 \\
u_{n}\xi_{n} = 0
\end{align}
We now optimize out $w,b,\xi_{i}$:
\begin{align*}
\frac{\partial{\mathcal{L}}}{\partial{w}} = 0 \implies w = \sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i}) \\ \frac{\partial{\mathcal{L}}}{\partial{b}} = 0 \implies \sum_{i=1}^{N} a_{i}t_{i} = 0 \\
\frac{\partial{\mathcal{L}}}{\partial{\xi_{i}}} = 0 \implies
a_{i} = C - u_{i}
\end{align*}
Using the previous results from "hard" margin SVM together with the above results, we can eliminate $w,b,\xi_{i}$ from the Lagrangian. We obtain the dual Lagrangian function which is identical to the "hard" margin one, except that the constraints are somewhat different:
\begin{align*}
\mathcal{L}(\xi,a,u) = \sum_{i=1}^{N} a_{i} - \dfrac{1}{2}
\sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} + C \sum_{i=1}^{N} \xi_{i} - \sum_{i=1}^{N} a_{i} \xi_{i} -
\sum_{i=1}^{N} u_{i} \xi_{i} \\
\mathcal{L}(\xi,a,u) = \sum_{i=1}^{N} a_{i} - \dfrac{1}{2}
\sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} + \overbrace{\sum_{i=1}^{N} \xi_{i} (a_{i} + u_{i}) - \sum_{i=1}^{N} a_{i} \xi_{i} -
\sum_{i=1}^{N} u_{i} \xi_{i}}^\text{=0}
\end{align*}
This gives the dual representation of the soft margin SVM problem:
\begin{equation*}
\begin{aligned}
& \underset{a}{\text{max}}
& & \sum_{i=1}^{N} a_{i} - \dfrac{1}{2}
\sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}\\
& \text{subject to}
& & 0 \leq a_{i} \leq C , \; i = 1, \ldots, N \\
&&& \sum_{i=1}^{N} a_{i}t_{i} = 0
\end{aligned}
\end{equation*}
Where the first constraint comes from the fact that $a_{i} = C - u_{i}$ and $a_{i} \geq 0, u_{i} \geq 0$ must be satisfied (KKT).
\\
We can now interpret the resulting solution:
\begin{itemize}
\item data points for which $a_{i} = 0$: they do not contribute to the predictive model.
\item data points for which $0 < a_{i} < C$: lie on the margin since $a_{i} < C \implies u_{i} > 0 \implies \xi_{i} = 0$ \\ where the last implication comes from complementary slackness (7.9)
\item data points for which $a_{i} = C$: lie inside the margin and can either be correctly classified if $\xi_{i} \leq 1$ or misclassified if $\xi_{i} > 1$
\end{itemize}
\newpage
\end{document}