See sources
Finding decision boundary between classes using large margin classification.
$$J(\Theta) = C \sum_{i=0}^{m}[ { y^{(i)} cost_1(\Theta^T f^{(i)}) \ + (1 - y^{(i)}) cost_0(\Theta^T f^{(i)}) }] + \frac{1}{2} \sum_{i = 0}^{n} \theta^2_i$$ $$y^{(i)} = \begin{cases} 1 & \quad \Theta^T f^{(i)} \ge 1 \\ 0 & \quad \Theta^T f^{(i)} \le -1 \end{cases}$$Where $f^{(i)}$ is a similarity function $K(x^{(i)}, x^{(j)})$ between input $x^{(i)}$ and $x^{(j)}$. Similarity function is defined by the kernel.
$m$ - number of training examples, $n$ - number of features.
It's possible to map input feature vector $x \in R^p$ to higher dimensions $\phi (x) \in R^d$ using kernel trick by defining dot product $\langle \phi (x^{(i)}), \phi (x^{(j)}) \rangle$ in dimension $R^d$.
Since $\langle \phi (x^{(i)}), \phi (x^{(j)}) \rangle = p^{(i)} || x^{(j)} ||$ - projection of vector $x^{(i)}$ to $x^{(j)}$, kernel function also defines similarity between two vectors.
The problem is simillar to logistic regression.
$$K(x^{(i)}, x^{(j)}) = (x^{(i)})^T x^{(j)}$$# import libs
import sys
sys.path.append("../")
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import math
from logistic_regression import *
from spam_svm import *
from svm import *
from utils import *
# load data
def load_linear_kernel_data():
return scipy.io.loadmat('../data/ex6data1.mat')
linear_kernel_data = load_linear_kernel_data()
x1 = linear_kernel_data['X']
y1 = linear_kernel_data['y']
print("x", x1.shape, "y", y1.shape)
Regularization parameter $C$ has the same effect as $\frac{1}{\lambda}$ in logistic regression.
def plot_linear_kernel_data():
positive = x1[(y1 == 1).T[0],:]
negative = x1[(y1 == 0).T[0],:]
plt.scatter(positive[:,0], positive[:,1], marker="+")
plt.scatter(negative[:,0], negative[:,1], marker=".")
plt.xlabel("x1")
plt.ylabel("x2")
def plot_svm_decision_boundary(clf):
# plot the decision function
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)
# plot decision boundary and margins
ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5,
linestyles=['--', '-', '--'],
colors=['grey', 'red', 'grey'])
def plot_logistic_regression_boundary(coefficients):
ax = plt.gca()
xx = ax.get_xlim()
yy = [- (coefficients[0] + coefficients[1] * x) / coefficients[2] for x in xx]
plt.plot(xx, yy, "-.b", alpha=0.5)
Low $C$ results to higher regularization, thus wider boundary.
# C = 1
plot_linear_kernel_data()
plot_svm_decision_boundary(fit_svm(x1, y1, "linear", C=1))
plot_logistic_regression_boundary(fit_logistic_regression(x1, y1, regularization_rate=1/1))
plt.show()
Higher $C$ value results to lower regularization, thus tighter boundary.
# C = 100
plot_linear_kernel_data()
plot_svm_decision_boundary(fit_svm(x1, y1, "linear", C=100))
plot_logistic_regression_boundary(fit_logistic_regression(x1, y1, regularization_rate=1/100))
plt.show()
Finding non-linear decision boundaries:
$$K(x^{(i)}, x^{(j)}) = exp( - \frac{||x^{(i)} - x^{(j)}||^2}{2 \sigma^2 } ) = exp( - \frac{ \sum^{n}_{k=0} (x_k^{(i)} - x_k^{(j)})^2}{2 \sigma^2 } )$$$\sigma$ controls how fast similarity metric decreases when $x^{(i)}$ and $x^{(j)}$ go further apart.
def load_gaussian_kernel_data():
return scipy.io.loadmat('../data/ex6data2.mat')
gaussian_kernel_data = load_gaussian_kernel_data()
x2 = gaussian_kernel_data['X']
y2 = gaussian_kernel_data['y']
print("x", x2.shape, "y", y2.shape)
def plot_gaussian_kernel_data():
positive = x2[(y2 == 1).T[0],:]
negative = x2[(y2 == 0).T[0],:]
plt.scatter(positive[:,0], positive[:,1], marker="+")
plt.scatter(negative[:,0], negative[:,1], marker=".")
plt.xlabel("x1")
plt.ylabel("x2")
# gamma = 0.5 (sigma = 1)
plot_gaussian_kernel_data()
plot_svm_decision_boundary(fit_svm(x2, y2, "rbf", C=1, gamma=1))
plt.show()
# gamma = 10 (sigma = 0.22)
plot_gaussian_kernel_data()
plot_svm_decision_boundary(fit_svm(x2, y2, "rbf", C=1, gamma=10))
plt.show()
Using cross validation set to find optimal $C$ and $\sigma$.
def load_test_data():
return scipy.io.loadmat('../data/ex6data3.mat')
test_data = load_test_data()
x3 = test_data["X"]
y3 = test_data["y"]
x3_cv = test_data["Xval"]
y3_cv = test_data["yval"]
print("x", x3.shape, "y", y3.shape)
print("x_cv", x3_cv.shape, "y_cv", y3_cv.shape)
# find optimal parameters
def find_optimal_svm_parameters():
c_values = np.array([(0.01 * math.pow(10, i), 0.05 * math.pow(10, i)) for i in range(4)]).ravel()
gamma_values = np.copy(c_values)
best_prediction = 0
optimal_c = c_values[0]
optimal_gamma = c_values[0]
for c in c_values:
for gamma in gamma_values:
prediction = np.sum(fit_svm(x3, y3, "rbf", C=c, gamma=gamma)
.predict(x3_cv)
.reshape(x3_cv.shape[0], 1) == y3_cv)
if (prediction > best_prediction):
best_prediction = prediction
optimal_c = c
optimal_gamma = gamma
return (optimal_c, optimal_gamma)
optimal_c, optimal_gamma = find_optimal_svm_parameters()
print("C", optimal_c, "gamma", optimal_gamma)
# plot data with optimal coefficients decision boundary
def plot_test_data():
positive = x3[(y3 == 1).T[0],:]
negative = x3[(y3 == 0).T[0],:]
plt.scatter(positive[:,0], positive[:,1], marker="+")
plt.scatter(negative[:,0], negative[:,1], marker=".")
plt.xlabel("x1")
plt.ylabel("x2")
plot_test_data()
plot_svm_decision_boundary(fit_svm(x3, y3, "rbf", C=optimal_c, gamma=optimal_gamma))
plt.show()
# load spam database
def load_spam_data():
return (scipy.io.loadmat('../data/spamTrain.mat'), scipy.io.loadmat('../data/spamTest.mat'))
spam_train_data, spam_test_data = load_spam_data()
x_spam = spam_train_data['X']
y_spam = spam_train_data['y']
print("train x", x_spam.shape, "y", y_spam.shape)
x_spam_test = spam_test_data['Xtest']
y_spam_test = spam_test_data['ytest']
print("test x", x_spam_test.shape, "y", y_spam_test.shape)
vocab = get_vocab_vector("../data/vocab.txt")
# train svm with spam database
clf = fit_svm(x_spam, y_spam, kernel="linear", C=0.1, probability=True)
# check training accuracy on the test set
np.sum(clf.predict(x_spam_test).reshape(x_spam_test.shape[0], 1) == y_spam_test) / y_spam_test.size * 100
Map email body to feature vector $x \in R^v$ where $v$ is a number of words in a spam vocabulary dictionary.
# check email samples
def check_email(email_path):
body = process_email(email_path)
p_false, p_true = clf.predict_proba(map_text_to_feature_vector(body, vocab))[0]
print("not spam =", round(p_false, 2), "/ spam =", round(p_true, 2))
check_email("../data/spamSample1.txt")
check_email("../data/spamSample2.txt")
check_email("../data/emailSample1.txt")
check_email("../data/emailSample2.txt")