# import libs
import sys
sys.path.append("../")
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from mpl_toolkits import mplot3d
from linear_regression import *
from utils import concat_with_x0, normalize_variable, normalize_variable_with_parameters
from optimize import *
# load data
def load_profit_data():
    return pd.read_csv("../data/ex1data1.csv", names=["Population", "Profit"])
profit_data = load_profit_data()
print(profit_data.head())
print(profit_data.shape)
# plot data
def plot_profit_data():
    profit_data.plot(x="Population", y="Profit",
                     kind="scatter", title="Data Set")
plot_profit_data()
plt.show()
Fit the data with a straight line:
$$h(X^{(i)}) = \theta_0 x_0^{(i)} + \theta_1 x_1^{(i)} + ... + \theta_n x_n^{(i)} = \Theta^T X^{(i)}$$
where $X^{(i)}$ is the (n x 1) vector of variables for the i-th example and $x_0^{(i)} = 1$.
Find the vector $\Theta$ for which the value of $J(\Theta)$ is minimal:
$$J(\Theta) = \frac{1}{2m} \sum^m_{i=1}{(h(X^{(i)}) - y^{(i)})^2}$$
x1 = concat_with_x0(profit_data[["Population"]].values)
y1 = profit_data[["Profit"]].values
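The cost_function used below comes in via the wildcard imports and is called as cost_function(theta, x, y); its actual implementation is not shown here, but a minimal vectorized sketch of the squared-error cost above could look like this (an illustrative assumption, not the module's code):
# Illustrative sketch only: vectorized squared-error cost matching the formula above.
# theta: (n+1,) coefficient vector, x: (m, n+1) design matrix with x_0 = 1, y: (m, 1) targets.
def squared_error_cost_sketch(theta, x, y):
    m = y.shape[0]
    residuals = x @ theta.reshape(-1, 1) - y  # h(X) - y for every example
    return np.sum(residuals ** 2) / (2 * m)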
# generate c0 and c1 coefficients, calculate cost
c0, c1 = np.meshgrid(np.linspace(-10, 10, 20), np.linspace(-1, 4, 20))
cost = np.array([[cost_function(np.vstack((c0[i], c1[i]))[:,j], x1, y1)
                  for j in range(c0[i].size)]
                 for i in range(c0.shape[0])])
def plot_profit_data_cost_function_distribution():
    plt.contour(c0, c1, cost, 40, cmap='viridis')
    plt.xlabel("C0")
    plt.ylabel("C1")
    plt.title("Cost Function Contour")
plot_profit_data_cost_function_distribution()
plt.show()
# plot 3d surface of a cost function
def plot_profit_data_cost_function_3d_surface():
    ax = plt.axes(projection="3d")
    ax.plot_surface(c0, c1, cost, cmap='viridis')
    plt.xlabel("C0")
    plt.ylabel("C1")
    plt.title("Cost Function Surface")
plot_profit_data_cost_function_3d_surface()
plt.show()
Use the Gradient Descent algorithm to find the $\Theta$ parameters that minimize $J(\Theta)$.
For $j \in [0, n]$, repeat until convergence:
$$\theta_j = \theta_j - \alpha \frac{d}{d\theta_j} J(\Theta) = \theta_j - \alpha \frac{1}{m}\sum^m_{i = 1}(h(X^{(i)}) - y^{(i)})x_j^{(i)}$$
# calculate coefficients using Gradient Descent
coefficients = gradient_descent(
    2,
    cost_function_derivative,
    args=(x1, y1),
    learning_rate=0.01
)
print(coefficients)
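gradient_descent and cost_function_derivative come from the imported modules; as a rough, hypothetical sketch of what a batch update loop of this kind does (the real function's signature and stopping criteria may differ):
# Hypothetical sketch of batch gradient descent, not the actual optimize.gradient_descent.
# theta is kept as a flat (n,) vector; derivative is expected to return the same shape.
def gradient_descent_sketch(n_coefficients, derivative, args, learning_rate, iterations=1500):
    theta = np.zeros(n_coefficients)
    for _ in range(iterations):
        # step against the gradient of the cost function
        theta = theta - learning_rate * derivative(theta, *args)
    return theta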
# plot line with calculated linear regression coefficients
def plot_profit_data_regression_line():
    x_plot = np.linspace(min(x1[:,1]), max(x1[:,1]), 2)
    y_plot = coefficients[0] + coefficients[1] * x_plot
    plt.plot(x_plot, y_plot, '-r')
plot_profit_data()
plot_profit_data_regression_line()
plt.show()
# load data
def load_housing_data():
    return pd.read_csv("../data/ex1data2.csv", names=["Size", "Bedrooms", "Price"])
housing_data = load_housing_data()
x2 = np.hstack((housing_data[['Size']], housing_data[['Bedrooms']]))
y2 = housing_data[['Price']].values
print(housing_data.head())
print(housing_data.shape)
Subtract the feature mean so that values are centered around 0, then divide by the standard deviation to scale the data:
$$x^{'}_j = \frac{x_j - mean(X_j)}{sd(X_j)}$$
# normalize size and bedrooms variables and plot normalized values distribution
x2_norm = normalize_variable(x2)[0]
size_norm = x2_norm[:,0]
bedrooms_norm = x2_norm[:,1]
def plot_normalized_values():
    plt.plot(size_norm, ".b", label="Size")
    plt.plot(bedrooms_norm, ".g", label="Bedrooms")
    plt.legend(loc="upper left")
plot_normalized_values()
plt.show()
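normalize_variable and normalize_variable_with_parameters are imported from utils; sketches consistent with how they are called here, assuming they implement the formula above (the returned mean and standard deviation are reused for the other sets later on):
# Assumed behaviour of the utils helpers, sketched for reference only.
def normalize_variable_sketch(x):
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    return (x - mu) / sigma, mu, sigma

def normalize_variable_with_parameters_sketch(x, mu, sigma):
    # reuse training-set parameters so all sets share the same scaling
    return (x - mu) / sigma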
# calculate multivariable linear regression coefficients
coefficients = gradient_descent(
    3,
    cost_function_derivative,
    args=(concat_with_x0(x2_norm), y2),
    learning_rate=0.01
)
print(coefficients)
# plot 3d linear regression line with calculated coefficients
def housing_data_linear_regression_line_3d(x1, x2, coefficients):
    """
    :param x1: first variable (1 x m) vector
    :param x2: second variable (1 x m) vector
    :param coefficients: linear regression coefficients (1 x n) vector
    :return: h(x1, x2)
    """
    @np.vectorize
    def calculate(x1, x2):
        return coefficients[0] + coefficients[1] * x1 + coefficients[2] * x2
    return calculate(x1, x2)
def plot_housing_data(coefficients):
    ax = plt.axes(projection="3d")
    ax.scatter(size_norm, bedrooms_norm, y2)
    x = [np.min(size_norm), np.max(size_norm)]
    y = [np.min(bedrooms_norm), np.max(bedrooms_norm)]
    ax.plot(x, y, housing_data_linear_regression_line_3d(x, y, coefficients), "-r")
plot_housing_data(coefficients)
plt.show()
Analytically find the $\Theta$ for which the cost function is minimal by setting the first derivative of $J(\Theta)$ equal to 0. Given that X is the (m x n) design matrix, where m is the number of examples and n the number of variables:
$$\Theta = (X^T X)^{-1} X^T y$$
coefficients = normal_equation(concat_with_x0(x2_norm), y2)
print(coefficients)
# plot 3d linear regression line with analytically calculated coefficients
plot_housing_data(coefficients)
plt.show()
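normal_equation is one of the wildcard-imported helpers; assuming it implements the closed-form expression above, an equivalent sketch would be (pinv is used here to tolerate a singular $X^T X$; the imported function may use a plain inverse):
# Sketch of the normal equation; the imported normal_equation may differ in detail.
def normal_equation_sketch(x, y):
    return np.linalg.pinv(x.T @ x) @ x.T @ y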
def load_water_data():
    return scipy.io.loadmat('../data/ex5data1.mat')
water_data = load_water_data()
x_training = water_data['X']
y_training = water_data['y']
x_cv = water_data['Xval']
y_cv = water_data['yval']
x_test = water_data['Xtest']
y_test = water_data['ytest']
Regularized linear regression cost function:
$$J_{reg}(\Theta) = \frac{1}{2m} \sum^m_{i=1}{(h(X^{(i)}) - y^{(i)})^2} + \frac{\lambda}{2 m} \sum_{j=1}^n \theta_j^2$$
where $\lambda$ is the regularization rate and $\theta_0$ is not regularized.
Gradient descent update with the regularized first derivative:
$$ \theta_j = \theta_j - \alpha \frac{dJ_{reg}}{d\theta_j} \\ = \theta_j - \alpha \left( \frac{1}{m} \sum_{i=1}^{m} (h(X^{(i)}) - y^{(i)})x^{(i)}_j + \frac{\lambda}{m}\theta_j \right) \\ = \theta_j \left(1 - \frac{\alpha \lambda}{m}\right) - \alpha \frac{1}{m} \sum_{i=1}^{m} (h(X^{(i)}) - y^{(i)})x^{(i)}_j $$
# calculate linear regression coefficients
coefficients = find_coefficients(concat_with_x0(x_training), y_training,
                                 regularized_cost_function, regularized_cost_function_derivative, 0)
print(coefficients)
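regularized_cost_function and regularized_cost_function_derivative come from the wildcard imports; below are hedged sketches consistent with the formulas above (the parameter order is an assumption based on how find_coefficients is called, not the modules' actual signatures):
# Illustrative sketches only; the imported functions may differ in signature and detail.
def regularized_cost_sketch(theta, x, y, regularization_rate):
    m = y.shape[0]
    residuals = x @ theta.reshape(-1, 1) - y
    penalty = regularization_rate * np.sum(theta[1:] ** 2)  # theta_0 is not regularized
    return (np.sum(residuals ** 2) + penalty) / (2 * m)

def regularized_cost_derivative_sketch(theta, x, y, regularization_rate):
    m = y.shape[0]
    residuals = x @ theta.reshape(-1, 1) - y
    gradient = (x.T @ residuals).ravel() / m
    gradient[1:] += (regularization_rate / m) * theta[1:]  # skip the bias term
    return gradient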
# plot linear regression line
def plot_water_data():
    plt.scatter(x_training, y_training, label="Data")
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of the dam (y)")

def plot_water_data_regression_line():
    x_plot = np.linspace(min(x_training[:,0]), max(x_training[:,0]), 2)
    y_plot = coefficients[0] + coefficients[1] * x_plot
    plt.plot(x_plot, y_plot, '-r', label="Regression")
    plt.legend(loc="upper left")
plot_water_data()
plot_water_data_regression_line()
plt.show()
The model may underfit (high bias) or overfit (high variance) the data.
To find an optimal model, first split the data set into three parts: training, cross validation and test.
Compute the $\Theta$ parameters on the training set for each candidate model (e.g. different polynomial degrees or regularization rates).
Then choose the model with the minimal error $J_{cv}(\Theta)$ on the cross validation set.
Checking $J_{test}(\Theta)$ on the test set shows the model's performance on unseen data.
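The water data set in ex5data1.mat already comes pre-split into these three parts; for a single unsplit data set, a hypothetical 60/20/20 split could look like this:
# Hypothetical example of a 60/20/20 split (not needed for ex5data1.mat, which is pre-split).
def split_data_sketch(x, y, seed=0):
    rng = np.random.default_rng(seed)
    idx = rng.permutation(x.shape[0])
    m = x.shape[0]
    train, cv, test = idx[:int(0.6 * m)], idx[int(0.6 * m):int(0.8 * m)], idx[int(0.8 * m):]
    return (x[train], y[train]), (x[cv], y[cv]), (x[test], y[test])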
# calculate cost function based on a training set size
def training_size_cost():
    cost = []
    for i in range(1, x_training.shape[0] + 1):
        c = find_coefficients(concat_with_x0(x_training[0:i,:]), y_training[0:i,:],
                              regularized_cost_function, regularized_cost_function_derivative, 0)
        training_cost = cost_function(c, concat_with_x0(x_training[0:i,:]), y_training[0:i,:])
        cv_cost = cost_function(c, concat_with_x0(x_cv), y_cv)
        cost.append((i, training_cost, cv_cost))
    return np.array(cost)
def plot_training_size_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.xlim(left=1)
    plt.legend(loc="upper right")
    plt.xlabel("Number of training examples")
    plt.ylabel("Error")
cost = training_size_cost()
plot_training_size_cost_function(cost)
plt.show()
degree = 8
def map_all_sets_to_degree(d):
    x_training_poly = map_to_degree(x_training, d)
    x_training_poly, mu, sigma = normalize_variable(x_training_poly)
    x_cv_poly = map_to_degree(x_cv, d)
    x_cv_poly = normalize_variable_with_parameters(x_cv_poly, mu, sigma)
    x_test_poly = map_to_degree(x_test, d)
    x_test_poly = normalize_variable_with_parameters(x_test_poly, mu, sigma)
    return [x_training_poly, x_cv_poly, x_test_poly]
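map_to_degree is one of the wildcard-imported helpers; it is assumed here to expand a single-column variable into its powers up to degree d, roughly like this sketch:
# Assumed behaviour of map_to_degree, sketched for reference:
# an (m, 1) column x becomes the (m, d) matrix [x, x^2, ..., x^d].
def map_to_degree_sketch(x, d):
    return np.hstack([x ** p for p in range(1, d + 1)])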
# fit polynomial variables with given degree to the training data
x_training_poly = map_all_sets_to_degree(degree)[0]
c = find_coefficients(concat_with_x0(x_training_poly), y_training,
                      regularized_cost_function, regularized_cost_function_derivative, 0)
def plot_predicted_water_data(coefficients):
    y_plot = concat_with_x0(x_training_poly) @ coefficients.transpose()
    plt.plot(x_training[:,0], y_plot, '+r', label="Polynomial Regression")
    plt.legend(loc="upper left")
plot_water_data()
plot_predicted_water_data(c)
plt.show()
# calculate cost function based on a polynomial degree
def polynomial_degree_cost():
    max_degree = 10
    cost = []
    for d in range(1, max_degree + 1):
        x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(d)
        c = find_coefficients(concat_with_x0(x_training_poly), y_training,
                              regularized_cost_function, regularized_cost_function_derivative, 0)
        training_cost = cost_function(c, concat_with_x0(x_training_poly), y_training)
        cv_cost = cost_function(c, concat_with_x0(x_cv_poly), y_cv)
        cost.append((d, training_cost, cv_cost))
    return np.array(cost)
def plot_polynomial_degree_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.xlim(left=1)
    plt.ylim(-10, 200)
    plt.legend(loc="upper right")
    plt.xlabel("Polynomial degree")
    plt.ylabel("Error")
cost = polynomial_degree_cost()
plot_polynomial_degree_cost_function(cost)
plt.show()
regularization_rates = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
# calculate cost function based on regularization rate
def regularization_rate_cost():
    cost = []
    for reg in regularization_rates:
        x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(degree)
        c = find_coefficients(concat_with_x0(x_training_poly), y_training,
                              regularized_cost_function, regularized_cost_function_derivative, reg)
        training_cost = cost_function(c, concat_with_x0(x_training_poly), y_training)
        cv_cost = cost_function(c, concat_with_x0(x_cv_poly), y_cv)
        cost.append((reg, training_cost, cv_cost))
    return np.array(cost)
def plot_regularization_rate_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.legend(loc="upper right")
    plt.ylim(0, 20)
    plt.xlim(left=0)
    plt.xlabel("Regularization rate")
    plt.ylabel("Error")
cost = regularization_rate_cost()
plot_regularization_rate_cost_function(cost)
plt.show()
# find the best polynomial degree / regularization rate combination
def find_optimal_parameters():
    max_degree = 10
    opt_degree = 1
    opt_rate = 0
    min_cost = float("inf")
    for d in range(1, max_degree + 1):
        x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(d)
        for r in regularization_rates:
            c = find_coefficients(concat_with_x0(x_training_poly), y_training,
                                  regularized_cost_function, regularized_cost_function_derivative, r)
            cost = cost_function(c, concat_with_x0(x_cv_poly), y_cv)
            if cost < min_cost:
                min_cost = cost
                opt_degree = d
                opt_rate = r
    return opt_degree, opt_rate
opt_degree, opt_rate = find_optimal_parameters()
print(opt_degree, opt_rate)
# calculate test set error
x_training_poly, x_cv_poly, x_test_poly = map_all_sets_to_degree(opt_degree)
test_cost = cost_function(
    find_coefficients(concat_with_x0(x_training_poly), y_training,
                      regularized_cost_function, regularized_cost_function_derivative, opt_rate),
    concat_with_x0(x_test_poly),
    y_test
)
print(test_cost)