# Chapter 16 - Inference for Regression

In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots

In [None]:
# Load the course data set. read_table is a classmethod, so it is called on
# Table itself instead of on a throwaway empty Table() instance.
education = Table.read_table("grades_and_piazza.csv")
# Bare expression as the last line of the cell so the notebook displays it.
education

## Material Covered in Previous Class Meeting

In [None]:
def standard_units(numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its deviation from the mean of
    ``numbers`` divided by the standard deviation given by ``np.std``.
    """
    deviations = numbers - np.mean(numbers)
    return deviations / np.std(numbers)

def correlation(t, label_x, label_y):
    """Calculate r for the columns label_x and label_y of table t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_standard = standard_units(t.column(label_x))
    y_standard = standard_units(t.column(label_y))
    return np.mean(x_standard * y_standard)

def slope(t, label_x, label_y):
    """Calculate m of y = mx + b for the regression of label_y on label_x.

    m = r * SD(y) / SD(x), where r comes from correlation() and the SDs
    from np.std.
    """
    r = correlation(t, label_x, label_y)
    sd_y = np.std(t.column(label_y))
    sd_x = np.std(t.column(label_x))
    return r * sd_y / sd_x

## 16.3 - Prediction Intervals

In [None]:
def intercept(t, label_x, label_y):
    """Calculate b of y = mx + b for the regression of label_y on label_x.

    b = mean(y) - m * mean(x), where m comes from slope().
    """
    mean_y = np.mean(t.column(label_y))
    m = slope(t, label_x, label_y)
    mean_x = np.mean(t.column(label_x))
    return mean_y - m * mean_x

In [None]:
def fitted_value(table, x, y, given_x):
    """Return the regression-line value of column y at x-value given_x.

    The line is fitted to columns x and y of table using slope() and
    intercept().
    """
    line_slope = slope(table, x, y)
    line_intercept = intercept(table, x, y)
    return line_slope * given_x + line_intercept

In [None]:
# Print the regression-line GPA prediction for 0, 50, 100, ..., 500 views,
# using the line fitted to the full education table.
for views in range(0, 501, 50):
    print("Views =", views, "Predicted GPA =", fitted_value(education, "views", "GPA", views))

In [None]:
def one_bootstrap_prediction(table, x_column, y_column, x_to_predict):
    """Return one bootstrap prediction of y_column at x_to_predict.

    The table is resampled with table.sample(), and the regression line
    fitted to that resample is evaluated at x_to_predict.
    """
    return fitted_value(table.sample(), x_column, y_column, x_to_predict)

In [None]:
# One bootstrap prediction of GPA at 200 views. The table is resampled inside
# one_bootstrap_prediction, so re-running this cell gives a different value.
views_to_predict = 200
one_bootstrap_prediction(education, "views", "GPA", views_to_predict)

In [None]:
def many_bootstrap_predictions(how_many, table, x_column, y_column, x_to_predict):
    """Return an array of how_many bootstrap predictions at x_to_predict.

    Each element is the result of one call to one_bootstrap_prediction(),
    i.e. a prediction from a regression line fitted to a fresh resample of
    table.

    how_many     -- number of bootstrap predictions to generate
    table        -- table holding the original sample
    x_column     -- label of the predictor column
    y_column     -- label of the response column
    x_to_predict -- x-value at which each resampled line is evaluated
    """
    # Collect the predictions in a list and append them to the array once.
    # Calling np.append inside the loop copies the whole array on every
    # iteration, which is quadratic in how_many.
    predictions = [
        one_bootstrap_prediction(table, x_column, y_column, x_to_predict)
        for _ in np.arange(how_many)
    ]
    return np.append(make_array(), predictions)

In [None]:
# Ten bootstrap predictions of GPA at views_to_predict views, shown as an array.
many_bootstrap_predictions(10, education, "views", "GPA", views_to_predict)

In [None]:
def bootstrap_prediction(how_many, table, x_column, y_column, x_to_predict, confidence_interval):
    """Plot and print a bootstrap confidence interval for a regression prediction.

    Generates how_many bootstrap predictions of y_column at x_to_predict,
    draws their histogram, marks the prediction from the original table with
    a dashed red vertical line, marks the interval with a yellow segment on
    the horizontal axis, and prints the prediction and the interval.

    how_many            -- number of bootstrap predictions to generate
    table               -- table holding the original sample
    x_column            -- label of the predictor column
    y_column            -- label of the response column
    x_to_predict        -- x-value at which to predict
    confidence_interval -- confidence level as a percentage, e.g. 95

    Returns None; the results are the plot and the printed text.
    """
    array_predictions = many_bootstrap_predictions(how_many, table, x_column, y_column, x_to_predict)
    table_predictions = Table().with_columns("Predictions", array_predictions)
    table_predictions.hist(bins=20)

    # Cut equal tails off both ends, e.g. 95 -> the 2.5th and 97.5th percentiles.
    low = (100 - confidence_interval) / 2
    high = confidence_interval + low
    left = percentile(low, array_predictions)
    right = percentile(high, array_predictions)

    # Prediction from the original sample. Use the caller's columns; these were
    # previously hard-coded to "views"/"GPA", which was wrong for any other
    # pair of columns. Computed once and reused for the plot and the message.
    prediction = fitted_value(table, x_column, y_column, x_to_predict)
    plots.axvline(prediction, 0, color="red", linestyle="--")
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);

    # Labels come from the column names, so the text for x_column="views",
    # y_column="GPA" is the same as before.
    print("{} prediction of {} {} = {:.2f}".format(y_column, x_to_predict, x_column, prediction))
    print("Bootstrap {}% confidence interval for {}s ranges from {:.2f} - {:.2f}".format(confidence_interval, y_column, left, right))

In [None]:
# 95% bootstrap confidence interval, from 1000 resamples, for the predicted
# GPA at 200 views; draws the histogram and prints the interval.
views_to_predict = 200
bootstrap_prediction(1000, education, "views", "GPA", views_to_predict, 95)

How would the confidence interval change if:
* a 75% confidence interval were desired?
* a 99% confidence interval were desired?
* the number of bootstrap samples were increased to 5000?
* the GPA were predicted for a number of views closer to the mean number of views?
* the GPA were predicted for a number of views farther from the mean number of views?

**Caution** - All of the predictions and tests performed in this chapter assume that the regression model holds. 