# Chapter 15 - Prediction

## Chapter 15.3 - The Method of Least Squares

In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots

In [None]:
# Generate somewhat random data for the equation y = 2.5x - 50
def generate_table(number_items):
    """Build a table of noisy sample points scattered around y = 2.5x - 50.

    Each x is drawn uniformly from [0, 100). A uniform perturbation in
    [-10, 10) is added to x *before* the linear map is applied, so each y
    deviates from the true line by up to 25 units.

    The draws use NumPy's global random state, so calling np.random.seed(...)
    beforehand makes the output repeatable.

    Parameters
    ----------
    number_items : int
        Number of rows to generate. Zero or a negative value yields a table
        with the two columns and no rows.

    Returns
    -------
    Table
        A table with float columns "x" and "y".
    """
    # A row-by-row loop over range(n) silently produces nothing for n <= 0;
    # clamp so the vectorized draw behaves the same way instead of raising.
    count = max(number_items, 0)
    x = np.random.random(count) * 100
    delta = 20 * np.random.random(count) - 10
    y = 2.5 * (x + delta) - 50
    # Build the table once from whole columns rather than appending rows.
    return Table().with_columns("x", x, "y", y)

In [None]:
# Draw 100 noisy sample points and show them as a scatter plot against x.
data = generate_table(100)
data.scatter("x")
# Overlay the noise-free line y = 2.5x - 50 between x = 0 (y = -50) and x = 100 (y = 200).
plots.plot([0, 100], [-50, 200], color='red', lw=2);

In [None]:
def standard_units(numbers):
    """Convert any array of numbers to standard units.

    Standard units measure each value as its distance from the mean in
    multiples of the (population) standard deviation.

    Parameters
    ----------
    numbers : array-like of numbers
        A NumPy array, list, or tuple of numeric values.

    Returns
    -------
    numpy.ndarray
        Array of the same length with mean 0 and standard deviation 1.
        If every value is identical the standard deviation is 0 and the
        division produces nan entries (NumPy emits a runtime warning).
    """
    # Coerce first so plain Python sequences support element-wise arithmetic.
    values = np.asarray(numbers)
    return (values - np.mean(values)) / np.std(values)

In [None]:
# Re-express both columns of `data` in standard units (each column then has mean 0).
standardized_data = Table().with_columns(
 'x (standard units)', standard_units(data.column('x')), 
 'y (standard units)', standard_units(data.column('y'))
)
# Scatter the two columns (referenced by position 0 and 1) and pin both axes to [-3, 3].
standardized_data.scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# Calculate r
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table t.

    r is the mean of the element-wise product of the two columns after each
    has been converted to standard units.
    """
    x_in_su = standard_units(t.column(label_x))
    y_in_su = standard_units(t.column(label_y))
    return np.mean(x_in_su * y_in_su)

# Calculate m of y = mx + b
def slope(t, label_x, label_y):
    """Return the slope m of the regression line of label_y on label_x.

    The slope is r scaled by the ratio of the standard deviation of y to
    the standard deviation of x.
    """
    r = correlation(t, label_x, label_y)
    spread_y = np.std(t.column(label_y))
    spread_x = np.std(t.column(label_x))
    return r * spread_y / spread_x

# Calculate b of y = mx + b
def intercept(t, label_x, label_y):
    """Return the intercept b of the regression line of label_y on label_x.

    The regression line passes through the point of averages, so
    b = mean(y) - m * mean(x).
    """
    mean_y = np.mean(t.column(label_y))
    m = slope(t, label_x, label_y)
    mean_x = np.mean(t.column(label_x))
    return mean_y - m * mean_x

In [None]:
# Fit the regression line to the generated data and report its parameters.
# The data were generated around y = 2.5x - 50, so m and b should land near 2.5 and -50.
r = correlation(data, "x", "y")  # correlation coefficient
m = slope(data, "x", "y")  # slope of the regression line
b = intercept(data, "x", "y")  # intercept of the regression line
print("r =", r)
print("m =", m)
print("b =", b)

Question - What is a reasonable way to calculate the error of our candidate line, y = mx + b?
Answer - Calculate the **root mean squared error**.

In [None]:
# A general function to compute the Root Mean Squared Error
def rmse_general(table, x_label, y_label, slope, intercept):
    """Return the root mean squared error of the line y = slope*x + intercept.

    Parameters
    ----------
    table : object exposing column(label)
        Source of the data; column(label) must return a numeric array.
    x_label, y_label : column labels
        Labels of the predictor and response columns.
    slope, intercept : numbers
        Parameters of the candidate line.

    Returns
    -------
    The square root of the mean of the squared differences between the
    observed y values and the values predicted by the line.
    """
    predictor = table.column(x_label)
    observed = table.column(y_label)
    residuals = observed - (slope * predictor + intercept)
    return np.mean(residuals ** 2) ** 0.5

In [None]:
# Report the RMSE of the regression line, using the slope m and intercept b fitted to `data`.
print("Root mean squared error =", rmse_general(data, "x", "y", m, b))

Useful, Provable Fact: **The regression line is the unique straight line that minimizes the mean squared error of 
estimation among all straight lines!**

In [None]:
# A less general function to compute the Root Mean Squared Error
# However, this form works with the minimize function (not a Python built-in;
# presumably provided by the datascience star-import), because the only
# arguments are the quantities being optimized.
def rmse(slope, intercept):
    """Return the RMSE of the line y = slope*x + intercept on the global `data` table.

    The table and its "x"/"y" column labels are fixed so that the function's
    only parameters are the line's slope and intercept. The computation itself
    is delegated to rmse_general to avoid keeping two copies of the formula.
    """
    return rmse_general(data, "x", "y", slope, intercept)

In [None]:
# Search numerically for the (slope, intercept) arguments that make rmse smallest.
# NOTE(review): minimize is not defined in this notebook; presumably it comes from the datascience star-import — confirm.
minimize(rmse)

The slope and intercept found by `minimize` are the same values (more or less) as the `m` and `b` that we calculated using the `slope` and `intercept` functions above.

## Chapter 15.4 - Least Squares Regression

Observations 
- Even if the data is not linear, there is a unique line that minimizes the mean squared error of estimation. 
This line can be identified using the technique of 15.3. 
- The minimize function can be applied to any type of function.

In [None]:
# y = -2x^2 + 7x - 3
# Sample x from 10 up to (but not including) 20 in steps of 0.3, then evaluate
# the quadratic on the whole array at once instead of appending one value per
# loop iteration (np.append copies the array on every call).
x = np.arange(10, 20, .3)
y = -2*x*x + 7*x - 3
quadratic = Table().with_columns(
    "x", x,
    "y", y
)
quadratic.show(3)

In [None]:
# y = ax^2 + bx + c
def rmse_quadratic_example(a, b, c):
    """Return the RMSE of the curve y = a*x^2 + b*x + c on the global `quadratic` table.

    Only the three coefficients are parameters, so the function can be
    handed directly to minimize.
    """
    xs = quadratic.column("x")
    observed = quadratic.column("y")
    residuals = observed - (a * xs * xs + b * xs + c)
    return np.mean(residuals ** 2) ** 0.5

In [None]:
# Search numerically for the coefficients (a, b, c) that make rmse_quadratic_example smallest.
# The table was generated from y = -2x^2 + 7x - 3, so the result should be close to (-2, 7, -3).
minimize(rmse_quadratic_example)

**Active Learning** (using the *grades_and_piazza.csv* file from the Chapter 8 materials):
- Calculate *r* (the correlation coefficient) between GPA and each of the other six data items
- Using the data item whose *r* value is strongest (largest in absolute value), deploy *minimize* to identify the linear equation that minimizes the root mean squared error