{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ebdea971-475c-4d6f-9cf6-6c5cc2f4155f",
   "metadata": {},
   "source": [
    "# Chapter 17.5 - The Accuracy of the Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9538feb6-a0fb-4e3f-9a4f-5d45072870b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8f5944d-c521-4b83-a109-d710a4335b37",
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(numbers):\n",
    "    \"\"\"Convert any array of numbers to standard units.\n",
    "\n",
    "    Each value is re-expressed as its distance from the mean of `numbers`,\n",
    "    measured in standard deviations (np.std, i.e. the population SD).\n",
    "    \"\"\"\n",
    "    return (numbers - np.mean(numbers)) / np.std(numbers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "814095f1-ddd7-4795-802f-6154a008e716",
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance(point1, point2):\n",
    "    \"\"\"Return the Euclidean distance between two points given as NumPy arrays.\"\"\"\n",
    "    return np.sqrt(np.sum((point1 - point2) ** 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ceef6cce-34d4-4e32-8759-5ebeb4679476",
   "metadata": {},
   "outputs": [],
   "source": [
    "def all_distances(training, new_point):\n",
    "    \"\"\"Return an array with the distance from new_point to every row of training.\n",
    "\n",
    "    Every column of `training` except \"son\" is treated as an attribute, so\n",
    "    new_point must list its values in that same column order.\n",
    "    \"\"\"\n",
    "    attributes = training.drop(\"son\")\n",
    "    new_point_array = np.array(new_point)  # convert once instead of once per row\n",
    "\n",
    "    def distance_from_point(row):\n",
    "        return distance(new_point_array, np.array(row))\n",
    "\n",
    "    return attributes.apply(distance_from_point)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6d6d1d1-770c-45f9-b851-128cf3946aba",
   "metadata": {},
   "outputs": [],
   "source": [
    "def table_with_distances(training, new_point):\n",
    "    \"\"\"Return training plus a \"Distance\" column holding each row's distance to new_point.\"\"\"\n",
    "    return training.with_column(\"Distance\", all_distances(training, new_point))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52297af4-91c3-42b8-8da3-588ba1fb6a56",
   "metadata": {},
   "outputs": [],
   "source": [
    "def closest(training, new_point, k):\n",
    "    \"\"\"Return the k rows of training nearest to new_point, closest first.\"\"\"\n",
    "    with_distances = table_with_distances(training, new_point)\n",
    "    sorted_by_distance = with_distances.sort(\"Distance\")\n",
    "    return sorted_by_distance.take(np.arange(k))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f90fe8d6-cc30-4e68-ac02-9d86fc94907f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the raw table under its own name so re-running this cell never\n",
    "# standardizes columns that were already standardized.\n",
    "heights_raw = Table.read_table(\"sons_heights.csv\")\n",
    "heights = heights_raw.with_columns(\n",
    "    \"father\", standard_units(heights_raw.column(\"father\")),\n",
    "    \"mother\", standard_units(heights_raw.column(\"mother\")),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76201f2e-0d62-41af-8cb5-b4d20139a568",
   "metadata": {},
   "outputs": [],
   "source": [
    "def classify(training, new_point, k):\n",
    "    \"\"\"Predict the \"son\" value for new_point as the mean over its k nearest training rows.\"\"\"\n",
    "    closest_k = closest(training, new_point, k)\n",
    "    # np.mean divides by the number of neighbours actually returned.\n",
    "    return np.mean(closest_k.column(\"son\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ed41802-f955-4a52-9c66-25057bbd3fc4",
   "metadata": {},
   "source": [
    "### The code above comes from our previous meeting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6583095-d953-44a2-8c1a-51bf0c769ef8",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same train/test split\n",
    "\n",
    "print(\"Number of entries =\", heights.num_rows)\n",
    "\n",
    "# Table.sample draws from NumPy's global random state, so seed it right before shuffling.\n",
    "np.random.seed(SPLIT_SEED)\n",
    "shuffled_heights = heights.sample(with_replacement=False)\n",
    "\n",
    "# First half of the shuffled rows trains the classifier; the rest is held out for testing.\n",
    "half = shuffled_heights.num_rows // 2\n",
    "training_data = shuffled_heights.take(np.arange(half))\n",
    "test_data = shuffled_heights.take(np.arange(half, shuffled_heights.num_rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45617308-79fa-415b-9e03-85ea2ab49f0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_trial(training_data, testing_data, k):\n",
    "    \"\"\"Return the average absolute percent error of k-nearest-neighbor predictions.\n",
    "\n",
    "    Every row of testing_data is predicted from training_data with `classify`,\n",
    "    and the error is measured relative to the row's true \"son\" value.\n",
    "    The result is a percentage (already multiplied by 100).\n",
    "    \"\"\"\n",
    "    # Build each query point with the same rule all_distances applies to the\n",
    "    # training table (every column except \"son\"), so attribute order and count\n",
    "    # always match instead of relying on hard-coded column positions.\n",
    "    attribute_rows = testing_data.drop(\"son\").rows\n",
    "    true_heights = testing_data.column(\"son\")\n",
    "\n",
    "    percent_error_total = 0\n",
    "    for attributes, true_height in zip(attribute_rows, true_heights):\n",
    "        predicted_height = classify(training_data, np.array(attributes), k)\n",
    "        percent_error_total += abs(predicted_height - true_height) / true_height\n",
    "    return 100 * percent_error_total / testing_data.num_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbb97ee5-d6cc-4c44-90ca-3249c14f49ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in range(1, 21, 4):\n",
    "    percent_error_average = one_trial(training_data, test_data, k)\n",
    "    print(\"For k = {}, the average accuracy = {:.2f}%\".format(k, 100 - percent_error_average))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "868b9f50-934a-4f1d-af6a-5e83d9bcf667",
   "metadata": {},
   "source": [
    "## 17.6 - Multiple Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5514ee7-921e-438a-bf4a-8e9d2cdbdb12",
   "metadata": {},
   "source": [
    "Terminology\n",
    "- *Regression* - Predicting a numerical value\n",
    "- *Multiple Regression* - Using multiple attributes to predict a numerical value"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76363bd5-bbf8-415d-9ef8-31c0fcb5d7f2",
   "metadata": {},
   "source": [
    "Types of Multiple Regression\n",
    "- Multiple Linear Regression\n",
    "- Multiple K-Nearest Neighbors Regression"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}