Machine Learning / Linear Regression

In [21]:
# import libs
import sys
sys.path.append("../")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from mpl_toolkits import mplot3d
from linear_regression import *
from utils import concat_with_x0, normalize_variable, normalize_variable_with_parameters
from optimize import *
In [2]:
# load data
def load_profit_data():
    return pd.read_csv("../data/ex1data1.csv", names=["Population", "Profit"])


profit_data = load_profit_data()
print(profit_data.head())
print(profit_data.shape)
   Population   Profit
0      6.1101  17.5920
1      5.5277   9.1302
2      8.5186  13.6620
3      7.0032  11.8540
4      5.8598   6.8233
(97, 2)
In [3]:
# plot data
def plot_profit_data():
    profit_data.plot(x="Population", y="Profit", 
                     kind="scatter", title = "Data Set")

    
plot_profit_data()
plt.show()

Hypothesis

Fit data with a straight line:

$$h(X^{(i)}) = \theta_0 x_0^{(i)} + \theta_1 x_1^{(i)} + ... + \theta_n x_n^{(i)} = \Theta^T X^{(i)}$$

where $X^{(i)}$ is the $(n+1) \times 1$ feature vector of the $i$-th example, with $x_0^{(i)} = 1$.
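
The $x_0^{(i)} = 1$ column is added by `concat_with_x0` from `utils`, and in vectorized form the hypothesis is a single matrix product; a minimal sketch of how these pieces could look (the actual module code may differ):

def concat_with_x0(x):
    # Prepend a column of ones (the x0 = 1 intercept term) to the raw feature matrix.
    return np.hstack((np.ones((x.shape[0], 1)), x))


def hypothesis(theta, x):
    # h(X) for every example at once: the (m x (n+1)) design matrix times theta.
    return x @ theta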

Cost Function

Find the vector $\Theta$ for which the value of $J(\Theta)$ is minimal:

$$J(\Theta) = \frac{1}{2m} \sum^m_{i=1}{(h(X^{(i)}) - y^{(i)})^2}$$
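
`cost_function` is imported via the wildcard imports above; a minimal sketch of what it likely computes, matching the signature used in the cells below:

def cost_function(theta, x, y):
    # Mean squared error J(theta) over the m rows of the design matrix x.
    m = x.shape[0]
    errors = x @ theta.reshape(-1, 1) - y.reshape(-1, 1)
    return float(np.sum(errors ** 2)) / (2 * m)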
In [4]:
x1 = concat_with_x0(profit_data[["Population"]].values)
y1 = profit_data[["Profit"]].values

# generate c0 and c1 coefficients, calculate cost
c0, c1 = np.meshgrid(np.linspace(-10, 10, 20), np.linspace(-1, 4, 20))
cost = np.array([[cost_function(np.array([c0[i, j], c1[i, j]]), x1, y1)
                  for j in range(c0.shape[1])]
                 for i in range(c0.shape[0])])


def plot_profit_data_cost_function_distribution():
    plt.contour(c0, c1, cost, 40, cmap='viridis')
    plt.xlabel("C0")
    plt.ylabel("C1")
    plt.title("Cost Function Contour")    

    
plot_profit_data_cost_function_distribution()
plt.show()
In [5]:
# plot 3d surface of a cost function
def plot_profit_data_cost_function_3d_surface():
    ax = plt.axes(projection="3d")
    ax.plot_surface(c0, c1, cost, cmap='viridis')
    plt.xlabel("C0")
    plt.ylabel("C1")
    plt.title("Cost Function Surface")
    

plot_profit_data_cost_function_3d_surface()
plt.show()

Gradient Descent

Use the Gradient Descent algorithm to find the $\Theta$ parameters that minimize $J(\Theta)$.

For $j \in [0, n]$, repeat until convergence (updating all $\theta_j$ simultaneously):

$$\theta_j = \theta_j - \alpha \frac{\partial}{\partial\theta_j} J(\Theta) = \theta_j - \alpha \frac{1}{m}\sum^m_{i=1}(h(X^{(i)}) - y^{(i)})x_j^{(i)}$$
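
`gradient_descent` and `cost_function_derivative` are also imported rather than defined in this notebook. A hedged sketch of the update loop, assuming the first argument is the number of coefficients and that a fixed iteration count stands in for a proper convergence check:

def cost_function_derivative(theta, x, y):
    # Gradient of J: (1/m) * X^T (X theta - y), returned as a flat vector.
    m = x.shape[0]
    errors = x @ theta.reshape(-1, 1) - y.reshape(-1, 1)
    return (x.T @ errors).flatten() / m


def gradient_descent(n_coefficients, derivative, args=(), learning_rate=0.01, iterations=1500):
    # Start from theta = 0 and take `iterations` steps against the gradient.
    theta = np.zeros(n_coefficients)
    for _ in range(iterations):
        theta = theta - learning_rate * derivative(theta, *args)
    return theta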
In [6]:
# calculate coefficients using Gradient Descent
coefficients = gradient_descent(
    2,
    cost_function_derivative,
    args=(x1, y1),
    learning_rate=0.01
)

print(coefficients)
[-3.24140214  1.1272942 ]
In [7]:
# plot line with calculated linear regression coefficients
def plot_profit_data_regression_line():
    x_plot = np.linspace(min(x1[:,1]), max(x1[:,1]), 2)
    y_plot = coefficients[0] + coefficients[1] * x_plot
    plt.plot(x_plot, y_plot, '-r')

    
plot_profit_data()
plot_profit_data_regression_line()
plt.show()

Linear Regression With Multiple Variables

In [8]:
# load data
def load_housing_data():
    return pd.read_csv("../data/ex1data2.csv", names=["Size", "Bedrooms", "Price"])


housing_data = load_housing_data()
x2 = housing_data[['Size', 'Bedrooms']].values
y2 = housing_data[['Price']].values

print(housing_data.head())
print(housing_data.shape)
   Size  Bedrooms   Price
0  2104         3  399900
1  1600         3  329900
2  2400         3  369000
3  1416         2  232000
4  3000         4  539900
(47, 3)

Feature Normalization

Subtract the feature mean so that values are centered around 0, then divide by the standard deviation to bring all features to a comparable scale:

$$x^{'}_j = \frac{x_j - \text{mean}(X_j)}{\text{sd}(X_j)}$$
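
`normalize_variable` and `normalize_variable_with_parameters` come from `utils`; a sketch consistent with how they are called below, returning the normalized matrix together with the training-set mean and standard deviation so other sets can reuse them:

def normalize_variable(x):
    # Z-score each column and return the parameters used for the transformation.
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    return (x - mu) / sigma, mu, sigma


def normalize_variable_with_parameters(x, mu, sigma):
    # Normalize x with parameters computed on the training set.
    return (x - mu) / sigma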
In [9]:
# normalize size and bedrooms variables and plot normalized values distribution
x2_norm = normalize_variable(x2)[0]

size_norm = x2_norm[:,0]
bedrooms_norm = x2_norm[:,1]

def plot_normalized_values():
    plt.plot(size_norm, ".b", label="Size")
    plt.plot(bedrooms_norm, ".g", label="Bedrooms")
    plt.legend(loc="upper left")
    

plot_normalized_values()
plt.show()
In [10]:
# calculate multivariable linear regression coefficients    
coefficients = gradient_descent(
    3,
    cost_function_derivative,
    args=(concat_with_x0(x2_norm), y2),
    learning_rate=0.01
)

print(coefficients)
[340397.96353532 108742.65627238  -5873.22993383]
In [11]:
# plot 3d linear regression line with calculated coefficients
def housing_data_linear_regression_line_3d(x1, x2, coefficients):
    """
    :param x1: first variable (1 x m) vector
    :param x2: second variable (1 x m) vector
    :param coefficients: linear regression coefficients (1 x n) vector
    :return: h(x1, x2)
    """
    @np.vectorize
    def calculate(x1, x2):
        return coefficients[0] + coefficients[1] * x1 + coefficients[2] * x2
    
    return calculate(x1, x2)


def plot_housing_data(coefficients):
    ax = plt.axes(projection="3d")
    ax.scatter(size_norm, bedrooms_norm, y2)
    x = [np.min(size_norm), np.max(size_norm)]
    y = [np.min(bedrooms_norm), np.max(bedrooms_norm)]
    ax.plot(x, y, housing_data_linear_regression_line_3d(x, y, coefficients), "-r")

    
plot_housing_data(coefficients)
plt.show()

Normal Equations

Analytically find the $\Theta$ for which the cost function is minimal by setting the first derivative of $J(\Theta)$ equal to 0. Here $X$ is the $m \times (n+1)$ design matrix (including the $x_0 = 1$ column), where $m$ is the number of training examples and $n$ the number of variables:

$$\Theta = (X^T X)^{-1} X^T y$$
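
`normal_equation` is imported via the wildcard imports above; a minimal sketch of the closed-form solution, using the pseudo-inverse instead of a plain inverse in case $X^T X$ is close to singular:

def normal_equation(x, y):
    # Closed-form solution theta = (X^T X)^-1 X^T y, returned as a flat vector.
    return (np.linalg.pinv(x.T @ x) @ x.T @ y).flatten()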
In [12]:
coefficients = normal_equation(concat_with_x0(x2_norm), y2)
print(coefficients)
[340412.65957447 109447.79646964  -6578.35485416]
In [13]:
# plot 3d linear regression line with analytically calculated coefficients
plot_housing_data(coefficients)
plt.show()

Regularized Linear Regression

In [14]:
def load_water_data():
    return scipy.io.loadmat('../data/ex5data1.mat')


water_data = load_water_data()
x_training = water_data['X']
y_training = water_data['y']

x_cv = water_data['Xval']
y_cv = water_data['yval']

x_test = water_data['Xtest']
y_test = water_data['ytest']

Regularized linear regression cost function:

$$J_{reg}(\Theta) = \frac{1}{2m} \sum^m_{i=1}{(h(X^{(i)}) - y^{(i)})^2} + \frac{\lambda}{2 m} \sum_{j=1}^n \theta_j^2$$

where $\lambda$ is the regularization rate and $\theta_0$ is excluded from the penalty term.

The gradient descent update with the regularized derivative (for $j \ge 1$) becomes:

$$\begin{aligned} \theta_j &= \theta_j - \alpha \frac{\partial J_{reg}}{\partial \theta_j} \\ &= \theta_j - \alpha \left( \frac{1}{m} \sum_{i=1}^{m} (h(X^{(i)}) - y^{(i)})\,x^{(i)}_j + \frac{\lambda}{m}\theta_j \right) \\ &= \theta_j \left(1 - \frac{\alpha \lambda}{m}\right) - \alpha \frac{1}{m} \sum_{i=1}^{m} (h(X^{(i)}) - y^{(i)})\,x^{(i)}_j \end{aligned}$$
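
`regularized_cost_function` and `regularized_cost_function_derivative` are imported via the wildcard imports above; a hedged sketch, assuming the regularization rate is passed as the last positional argument:

def regularized_cost_function(theta, x, y, regularization_rate):
    # Squared-error cost plus the L2 penalty on theta_1..theta_n (theta_0 excluded).
    m = x.shape[0]
    errors = x @ theta.reshape(-1, 1) - y.reshape(-1, 1)
    penalty = regularization_rate * np.sum(theta[1:] ** 2)
    return (float(np.sum(errors ** 2)) + penalty) / (2 * m)


def regularized_cost_function_derivative(theta, x, y, regularization_rate):
    # Plain gradient plus (lambda / m) * theta_j for every j >= 1.
    m = x.shape[0]
    errors = x @ theta.reshape(-1, 1) - y.reshape(-1, 1)
    gradient = (x.T @ errors).flatten() / m
    gradient[1:] += regularization_rate / m * theta[1:]
    return gradient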
In [15]:
# calculate linear regression coefficients
coefficients = find_coefficients(concat_with_x0(x_training), y_training,
                                 regularized_cost_function, regularized_cost_function_derivative, 0)
print(coefficients)
[13.08790351  0.36777923]
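
Like the helpers above, `find_coefficients` comes from the local modules; it presumably hands the regularized cost and its gradient to an off-the-shelf optimizer. A hypothetical sketch using `scipy.optimize.minimize` (the initial guess, method, and argument order are assumptions):

from scipy.optimize import minimize


def find_coefficients(x, y, cost, cost_derivative, regularization_rate):
    # Minimize the given cost starting from theta = 0, using its analytic gradient.
    theta_start = np.zeros(x.shape[1])
    result = minimize(cost, theta_start, args=(x, y, regularization_rate),
                      jac=cost_derivative, method="CG")
    return result.x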
In [16]:
# plot linear regression line
def plot_water_data():
    plt.scatter(x_training, y_training, label="Data")
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of the dam (y)")
    
    
def plot_water_data_regression_line():
    x_plot = np.linspace(min(x_training[:,0]), max(x_training[:,0]), 2)
    y_plot = coefficients[0] + coefficients[1] * x_plot
    plt.plot(x_plot, y_plot, '-r', label="Regression")
    plt.legend(loc="upper left")


plot_water_data()
plot_water_data_regression_line()
plt.show()

Bias / Variance

A model may underfit (high bias) or overfit (high variance) the data.

To find an optimal model, we first split the data set into three parts: training, cross validation, and test.

Compute the $\Theta$ parameters on the training set while varying:

  • Number of training examples
  • Polynomial degree
  • Regularization rate

Then choose the model with the minimal error $J_{cv}(\Theta)$ on the cross validation set.

Evaluating $J_{test}(\Theta)$ on the test set then shows how the model performs on unseen data.

Training Set Size

In [17]:
# calculate training and cross validation cost as a function of training set size
def training_size_cost():
    cost = []
    for i in range(1, x_training.shape[0] + 1):
        c = find_coefficients(concat_with_x0(x_training[0:i,:]), y_training[0:i,:], 
                              regularized_cost_function, regularized_cost_function_derivative, 0)

        training_cost = cost_function(c, concat_with_x0(x_training[0:i,:]), y_training[0:i,:])
        cv_cost = cost_function(c, concat_with_x0(x_cv), y_cv)

        cost.append((i, training_cost, cv_cost))
    
    return np.array(cost)


def plot_training_size_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.xlim(left=1)
    plt.legend(loc="upper right")
    plt.xlabel("Number of training examples")
    plt.ylabel("Error")


cost = training_size_cost()
plot_training_size_cost_function(cost)
plt.show()

Polynomial Features

In [18]:
degree = 8
In [19]:
def map_all_sets_to_degree(d):
    x_training_poly = map_to_degree(x_training, d)
    x_training_poly, mu, sigma = normalize_variable(x_training_poly)
    
    x_cv_poly = map_to_degree(x_cv, d)
    x_cv_poly = normalize_variable_with_parameters(x_cv_poly, mu, sigma)

    x_test_poly = map_to_degree(x_test, d)
    x_test_poly = normalize_variable_with_parameters(x_test_poly, mu, sigma)

    return [x_training_poly, x_cv_poly, x_test_poly]
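
`map_to_degree` itself is imported rather than defined in this notebook; a minimal sketch, assuming it expands the single-column variable into powers $1..d$:

def map_to_degree(x, degree):
    # Build the polynomial feature matrix [x, x^2, ..., x^degree] column by column.
    return np.hstack([x ** d for d in range(1, degree + 1)])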
In [22]:
# fit polynomial variables with given degree to the training data
x_training_poly = map_all_sets_to_degree(degree)[0]
c = find_coefficients(concat_with_x0(x_training_poly), y_training, 
                      regularized_cost_function, regularized_cost_function_derivative, 0)


def plot_predicted_water_data(coefficients):
    y_plot = concat_with_x0(x_training_poly) @ coefficients.transpose()
    plt.plot(x_training[:,0], y_plot, '+r', label="Polynomial Regression")
    plt.legend(loc="upper left")


plot_water_data()
plot_predicted_water_data(c)
plt.show()
In [23]:
# calculate training and cross validation cost as a function of polynomial degree
def polynomial_degree_cost():
    max_degree = 10
    cost = []
    for d in range(1, max_degree + 1):
        x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(d)
        c = find_coefficients(concat_with_x0(x_training_poly), y_training, 
                              regularized_cost_function, regularized_cost_function_derivative, 0)

        training_cost = cost_function(c, concat_with_x0(x_training_poly), y_training)
        cv_cost = cost_function(c, concat_with_x0(x_cv_poly), y_cv)

        cost.append((d, training_cost, cv_cost))
        
    return np.array(cost)


def plot_polynomial_degree_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.xlim(left=1)
    plt.ylim(bottom=-10, top=200)
    plt.legend(loc="upper right")
    plt.xlabel("Polynomial degree")
    plt.ylabel("Error")


cost = polynomial_degree_cost()
plot_polynomial_degree_cost_function(cost)
plt.show()

Regularization Rate

In [25]:
regularization_rates = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
In [26]:
# calculate training and cross validation cost as a function of regularization rate
def regularization_rate_cost():
    cost = []
    x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(degree)
    for reg in regularization_rates:
        
        c = find_coefficients(concat_with_x0(x_training_poly), y_training, 
                              regularized_cost_function, regularized_cost_function_derivative, reg)

        training_cost = cost_function(c, concat_with_x0(x_training_poly), y_training)
        cv_cost = cost_function(c, concat_with_x0(x_cv_poly), y_cv)

        cost.append((reg, training_cost, cv_cost))
        
    return np.array(cost)


def plot_regularization_rate_cost_function(cost):
    plt.plot(cost[:, 0], cost[:, 1], '-r', label="Training")
    plt.plot(cost[:, 0], cost[:, 2], '-g', label="Cross Validation")
    plt.legend(loc="upper right")
    plt.ylim(bottom=0, top=20)
    plt.xlim(left=0)
    plt.xlabel("Regularization rate")
    plt.ylabel("Error")


cost = regularization_rate_cost()
plot_regularization_rate_cost_function(cost)
plt.show()

Find Optimal Parameters

In [27]:
# find the best polynomial degree / regularization rate combination
def find_optimal_parameters():
    max_degree = 10
    opt_degree = 1
    opt_rate = 0
    min_cost = float("inf")
    for d in range(1, max_degree + 1):
        x_training_poly, x_cv_poly, _ = map_all_sets_to_degree(d)
        for r in regularization_rates:

            c = find_coefficients(concat_with_x0(x_training_poly), y_training, 
                                  regularized_cost_function, regularized_cost_function_derivative, r)
            cost = cost_function(
                c, 
                concat_with_x0(x_cv_poly), 
                y_cv
            )
                        
            if cost < min_cost:
                min_cost = cost
                opt_degree = d
                opt_rate = r
                
    return opt_degree, opt_rate


opt_degree, opt_rate = find_optimal_parameters()
print(opt_degree, opt_rate)
6 3
In [28]:
# calculate test set error
x_training_poly, x_cv_poly, x_test_poly = map_all_sets_to_degree(opt_degree)

cost_function(
    find_coefficients(concat_with_x0(x_training_poly), y_training, 
                      regularized_cost_function, regularized_cost_function_derivative, opt_rate),
    concat_with_x0(x_test_poly), 
    y_test
)
Out[28]:
4.397642376223809