{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dfa39e16-3e24-4ef8-b210-60d974b77bde",
   "metadata": {},
   "source": [
    "# Chapter 16 - Inference for Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9538feb6-a0fb-4e3f-9a4f-5d45072870b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4cd5e7e-d1b4-486e-90dd-072145905993",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the course data. read_table is invoked on the Table class directly,\n",
    "# so no throwaway empty Table() instance has to be built first.\n",
    "education = Table.read_table(\"grades_and_piazza.csv\")\n",
    "education"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1eafd0dc-01e1-4783-8f30-bc4b21daa823",
   "metadata": {},
   "source": [
    "## 16.2 - Inference for the True Slope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34e1b5d3-e206-468e-b1cf-2553ebd1ba55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot of GPA against views, with fit_line=True requesting the fitted line.\n",
    "education.scatter(\"views\", \"GPA\", fit_line=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8f5944d-c521-4b83-a109-d710a4335b37",
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(numbers):\n",
    "    \"\"\"Rescale `numbers` to standard units.\n",
    "\n",
    "    The mean of `numbers` is subtracted from every value and the result is\n",
    "    divided by the standard deviation of `numbers` as computed by np.std.\n",
    "    \"\"\"\n",
    "    center = np.mean(numbers)\n",
    "    spread = np.std(numbers)\n",
    "    deviations = numbers - center\n",
    "    return deviations / spread\n",
    "\n",
    "def correlation(t, label_x, label_y):\n",
    "    \"\"\"Return r, the correlation between two columns of table `t`.\n",
    "\n",
    "    r is the mean of the element-wise product of the two columns after each\n",
    "    one has been converted to standard units.\n",
    "    \"\"\"\n",
    "    x_su = standard_units(t.column(label_x))\n",
    "    y_su = standard_units(t.column(label_y))\n",
    "    return np.mean(x_su * y_su)\n",
    "\n",
    "def slope(t, label_x, label_y):\n",
    "    \"\"\"Return m, the slope of the regression line y = mx + b.\n",
    "\n",
    "    Computed as r * SD(y) / SD(x), where r comes from correlation() and the\n",
    "    SDs come from np.std applied to the two columns of `t`.\n",
    "    \"\"\"\n",
    "    sd_y = np.std(t.column(label_y))\n",
    "    sd_x = np.std(t.column(label_x))\n",
    "    return correlation(t, label_x, label_y) * sd_y / sd_x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e0f809d-a070-4fcc-8d43-3536ed17957f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# r between views and GPA in the original sample.\n",
    "correlation(t=education, label_x=\"views\", label_y=\"GPA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d0642e3-209b-47ae-ae2f-a5fb99325c63",
   "metadata": {},
   "source": [
    "One way to estimate the true slope is to use the slope of the regression line:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76318f8e-a57d-4783-bb70-2e0808ef6e86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Slope of the regression line for GPA (y) on views (x) in the original sample.\n",
    "slope(t=education, label_x=\"views\", label_y=\"GPA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a099a7cd-d90b-4886-ac02-903843cf0052",
   "metadata": {},
   "source": [
    "We can also estimate the true slope by bootstrapping the original sample!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c58ce68-4ba9-4d0e-82f8-05d570c5ebdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_bootstrap_slope(table, x_column, y_column):\n",
    "    \"\"\"Return the regression slope computed on one resample of `table`.\n",
    "\n",
    "    The resample is whatever table.sample() produces with its default\n",
    "    arguments; slope() is then evaluated on it for the two given columns.\n",
    "    \"\"\"\n",
    "    return slope(table.sample(), x_column, y_column)\n",
    "\n",
    "def many_bootstrap_slopes(how_many, table, x_column, y_column):\n",
    "    \"\"\"Return an array of `how_many` bootstrap regression slopes.\n",
    "\n",
    "    Each entry is one call to one_bootstrap_slope(table, x_column, y_column).\n",
    "    The slopes are gathered in a list and converted to an array once, rather\n",
    "    than calling np.append per iteration (np.append returns a fresh copy of\n",
    "    the whole array every time it is called).\n",
    "    \"\"\"\n",
    "    resampled = [\n",
    "        one_bootstrap_slope(table, x_column, y_column)\n",
    "        for _ in np.arange(how_many)\n",
    "    ]\n",
    "    # dtype=float keeps the result a float array even when how_many is 0.\n",
    "    return np.array(resampled, dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0443c32b-4740-4c94-b505-ce7c6336c5b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw 1000 bootstrap slopes for GPA on views, then peek at the first five.\n",
    "slopes = many_bootstrap_slopes(\n",
    "    how_many=1000, table=education, x_column=\"views\", y_column=\"GPA\"\n",
    ")\n",
    "slopes[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36b3792f-946a-4d2a-98fe-d481e08e4007",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 95% confidence interval: the span from the 2.5th to the 97.5th\n",
    "# percentile of the bootstrap slopes.\n",
    "left, right = (percentile(p, slopes) for p in (2.5, 97.5))\n",
    "print(\"Left =\", left, \", Right =\", right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81551fe6-6afb-45bb-a259-6616f33d3b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram of the bootstrap slopes, with the interval [left, right] drawn\n",
    "# as a thick yellow segment at height 0.\n",
    "Table().with_column(\"Bootstrap Slopes\", slopes).hist()\n",
    "plots.plot((left, right), (0, 0), color=\"yellow\", linewidth=8);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ad889ff-11aa-4830-9bc4-d7bd27af5872",
   "metadata": {},
   "source": [
    "Could the true slope be 0?  In other words, could there be no linear relation between the variables?\n",
    "* Null Hypothesis. The slope of the true line is 0.\n",
    "* Alternative Hypothesis. The slope of the true line is not 0.\n",
    "\n",
    "Since the 95% confidence interval for the true slope does not contain 0, we can reject the null hypothesis at the 5% significance level!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}