{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ebdea971-475c-4d6f-9cf6-6c5cc2f4155f",
   "metadata": {},
   "source": [
    "# Chapter 17.5 - The Accuracy of the Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9538feb6-a0fb-4e3f-9a4f-5d45072870b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8f5944d-c521-4b83-a109-d710a4335b37",
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(numbers):\n",
    "    \"Convert any array of numbers to standard units.\"\n",
    "    return (numbers - np.mean(numbers))/np.std(numbers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "814095f1-ddd7-4795-802f-6154a008e716",
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance(point1, point2):\n",
    "    return np.sqrt(np.sum((point1 - point2)**2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ceef6cce-34d4-4e32-8759-5ebeb4679476",
   "metadata": {},
   "outputs": [],
   "source": [
    "def all_distances(training, new_point):\n",
    "    attributes = training.drop(\"son\")\n",
    "    def distance_from_point(row):\n",
    "        return distance(np.array(new_point), np.array(row))\n",
    "    return attributes.apply(distance_from_point)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6d6d1d1-770c-45f9-b851-128cf3946aba",
   "metadata": {},
   "outputs": [],
   "source": [
    "def table_with_distances(training, new_point):\n",
    "    return training.with_column(\"Distance\", all_distances(training, new_point))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52297af4-91c3-42b8-8da3-588ba1fb6a56",
   "metadata": {},
   "outputs": [],
   "source": [
    "def closest(training, new_point, k):\n",
    "    with_distances = table_with_distances(training, new_point)\n",
    "    sorted_by_distance = with_distances.sort(\"Distance\")\n",
    "    top_k = sorted_by_distance.take(np.arange(k))\n",
    "    return top_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f90fe8d6-cc30-4e68-ac02-9d86fc94907f",
   "metadata": {},
   "outputs": [],
   "source": [
    "heights = Table().read_table(\"sons_heights.csv\")\n",
    "heights = heights.with_columns(\n",
    "    \"father\", standard_units(heights.column(\"father\")),\n",
    "    \"mother\", standard_units(heights.column(\"mother\")),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76201f2e-0d62-41af-8cb5-b4d20139a568",
   "metadata": {},
   "outputs": [],
   "source": [
    "def classify(training, new_point, k):\n",
    "    closest_k = closest(training, new_point, k)\n",
    "    return np.sum(closest_k.column(\"son\")) / k"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ed41802-f955-4a52-9c66-25057bbd3fc4",
   "metadata": {},
   "source": [
    "### The code above comes from our previous meeting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6583095-d953-44a2-8c1a-51bf0c769ef8",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Number of entries =\", heights.num_rows)\n",
    "heights = heights.sample(with_replacement = False)\n",
    "training_data = heights.take(np.arange(heights.num_rows // 2))\n",
    "test_data = heights.take(np.arange(heights.num_rows // 2, heights.num_rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45617308-79fa-415b-9e03-85ea2ab49f0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_trial(training_data, testing_data, k):\n",
    "    percent_error_total = 0\n",
    "    for person in testing_data.rows:\n",
    "        parents = make_array(person.item(0), person.item(1))   # 0 - father, 1 - mother\n",
    "        true_height = person.item(2)                           # 2 - son\n",
    "        predicted_height = classify(training_data, parents, k)\n",
    "        percent_error_total += abs(predicted_height - true_height) / true_height\n",
    "    return 100 * percent_error_total / testing_data.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbb97ee5-d6cc-4c44-90ca-3249c14f49ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in range(1, 21, 4):\n",
    "    percent_error_average = one_trial(training_data, test_data, k)\n",
    "    print(\"For k = {}, the average accuracy = {:.2f}%\".format(k, 100 - percent_error_average))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "868b9f50-934a-4f1d-af6a-5e83d9bcf667",
   "metadata": {},
   "source": [
    "## 17.6 - Multiple Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5514ee7-921e-438a-bf4a-8e9d2cdbdb12",
   "metadata": {},
   "source": [
    "Terminology\n",
    "- *Regression* - Predicting a numerical value\n",
    "- *Multiple Regression* - Using multiple attributes to predict a numerical value "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76363bd5-bbf8-415d-9ef8-31c0fcb5d7f2",
   "metadata": {},
   "source": [
    "Types of Multiple Regression\n",
    "- Multiple Linear Regression\n",
    "- Multiple K-Nearest Neighbors Regression"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}