# Chapter 15 - Prediction

## Chapter 15.5 - Visual Diagnostics

Previously seen material ...

In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots

In [None]:
# Generate somewhat random data for the equation y = 2.5x - 50
def generate_table(number_items):
    """Return a Table of noisy sample points scattered around y = 2.5x - 50.

    Each x is drawn uniformly from [0, 100).  A noise term ``delta`` is drawn
    uniformly from [-10, 10) and added to x *before* the line equation is
    applied, so each y can sit up to 25 units (2.5 * 10) above or below the
    true line.

    Parameters:
        number_items: how many rows to generate.  Zero or a negative count
            produces a table with "x" and "y" columns and no rows.

    Returns:
        A Table with an "x" column and a "y" column, one row per point.
    """
    # range() silently treats a negative count as "no iterations"; clamp so
    # the vectorised draw below keeps that behaviour instead of raising.
    count = max(number_items, 0)

    # Draw everything in one call instead of growing the table row by row
    # (each with_row call copies the whole table).  Row i of the (count, 2)
    # array holds the two uniform draws for point i: first x, then delta.
    draws = np.random.random((count, 2))
    x = draws[:, 0] * 100
    delta = 20 * draws[:, 1] - 10
    y = 2.5 * (x + delta) - 50
    return Table().with_columns("x", x, "y", y)

In [None]:
# Build a 100-point noisy sample; `data` is reused by the later cells.
data = generate_table(100)
# Scatter plot with "x" on the horizontal axis.
data.scatter("x")
# Display the first 3 rows of the table.
data.show(3)
# Overlay the noise-free line y = 2.5x - 50, drawn through its end points
# (0, -50) and (100, 200).  The trailing semicolon keeps the notebook from
# echoing the return value of the call.
plots.plot([0, 100], [-50, 200], color="red", lw=2);

In [None]:
def standard_units(numbers):
    """Convert an array of numbers to standard units.

    Every value is expressed as its distance from the mean of the array,
    measured in standard deviations of the array.
    """
    center = np.mean(numbers)
    spread = np.std(numbers)
    deviations = numbers - center
    return deviations / spread
 
def correlation(t, label_x, label_y):
    """Calculate r, the correlation between two columns of table t.

    r is the mean of the element-wise products of the two columns after each
    has been converted to standard units.
    """
    x_standard = standard_units(t.column(label_x))
    y_standard = standard_units(t.column(label_y))
    return np.mean(x_standard * y_standard)

def slope(t, label_x, label_y):
    """Calculate m of y = mx + b for the regression of label_y on label_x.

    m is the correlation r scaled by the ratio of the standard deviation of
    the y column to the standard deviation of the x column.
    """
    r = correlation(t, label_x, label_y)
    sd_y = np.std(t.column(label_y))
    sd_x = np.std(t.column(label_x))
    return r * sd_y / sd_x

def intercept(t, label_x, label_y):
    """Calculate b of y = mx + b for the regression of label_y on label_x.

    b is the mean of the y column minus the slope times the mean of the
    x column.
    """
    mean_y = np.mean(t.column(label_y))
    m = slope(t, label_x, label_y)
    mean_x = np.mean(t.column(label_x))
    return mean_y - m * mean_x

def fit(table, x, y):
    """Return the height of the regression line at each x value.

    The line is the regression of column y on column x of the table; the
    result has one entry per value in column x.
    """
    gradient = slope(table, x, y)
    offset = intercept(table, x, y)
    x_values = table.column(x)
    return gradient * x_values + offset

New material ...

**residual** = observed_value - regression_estimate

In [None]:
def residual(table, x, y):
    """Return the residual of each row: observed y minus the regression estimate.

    The regression estimate is the height of the regression line of column y
    on column x at that row's x value.
    """
    observed = table.column(y)
    estimated = fit(table, x, y)
    return observed - estimated

In [None]:
# Extend `data` with the regression estimate and the residual for every row.
data = data.with_columns(
 # Height of the regression line at each x value.
 "Fitted Value", fit(data, "x", "y"),
 # Observed y minus the fitted value.
 "Residual", residual(data, "x", "y")
 )
# Display the first 3 rows, now including the two new columns.
data.show(3)

Plotting the (x, Residual) pairs lets us make a visual diagnosis of the quality of the
linear regression analysis. **The residual plot of a good regression shows no pattern. 
The residuals look about the same, above and below the horizontal line at 0, across the range of 
the predictor variable.**

In [None]:
# Pair each x value with its residual in a separate two-column table for plotting.
residual_table = Table().with_columns(
 "x", data.column("x"),
 "residuals", residual(data, "x", "y")
)
# Scatter the residuals against x.
residual_table.scatter("x", "residuals", color="r")
# Draw the horizontal reference line at residual = 0, spanning the smallest
# to the largest observed x.
xlims = make_array(min(data.column("x")), max(data.column("x")))
plots.plot(xlims, make_array(0, 0), color="darkblue", lw=4)
# The trailing semicolon keeps the notebook from echoing the return value.
plots.title("Residual Plot");

**When a residual plot shows a pattern, there may be a non-linear relation between the variables.** 
Terminology: *Heteroscedasticity* means that the spread of the residuals is uneven across the range of the predictor variable.

Is there a pattern in the above plot? What does this tell us about the likely relation between x and y?