# Chapter 17.5 - The Accuracy of the Classifier

In [None]:
from datascience import *
import numpy as np

In [None]:
def standard_units(numbers):
    """Convert any array of numbers to standard units.

    Each value is expressed as its offset from the mean of `numbers`,
    divided by the standard deviation of `numbers` (as computed by
    `np.std`).
    """
    center = np.mean(numbers)
    spread = np.std(numbers)
    return (numbers - center) / spread

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between `point1` and `point2`.

    The two points are subtracted element-wise, so they must support
    `-` with each other (for example, NumPy arrays of equal length).
    """
    difference = point1 - point2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

In [None]:
def all_distances(training, new_point):
    """Compute the distance from `new_point` to every row of `training`.

    The "son" column is dropped first, so only the remaining columns
    take part in the distance. `new_point` must list its values in the
    same order as those remaining columns.

    Returns whatever `apply` yields for the per-row distances
    (one distance per row of `training`).
    """
    target = np.array(new_point)
    features = training.drop("son")
    # With no column labels given, apply hands each whole row to the function.
    return features.apply(lambda row: distance(target, np.array(row)))

In [None]:
def table_with_distances(training, new_point):
    """Return `training` with an added "Distance" column.

    The new column holds the result of `all_distances(training, new_point)`,
    i.e. each row's distance to `new_point`.
    """
    distances = all_distances(training, new_point)
    return training.with_column("Distance", distances)


In [None]:
def closest(training, new_point, k):
    """Return the first `k` rows of `training` after sorting by distance.

    A "Distance" column (distance to `new_point`) is attached, the table
    is sorted on it using `sort`'s default order, and rows 0..k-1 of the
    sorted table are returned, still including the "Distance" column.
    """
    ranked = table_with_distances(training, new_point).sort("Distance")
    return ranked.take(np.arange(k))

In [None]:
# Load the heights data, then replace the "father" and "mother" columns
# with their values in standard units. The "son" column is left as loaded.
heights = Table.read_table("sons_heights.csv")
heights = heights.with_column("father", standard_units(heights.column("father")))
heights = heights.with_column("mother", standard_units(heights.column("mother")))

In [None]:
def classify(training, new_point, k):
    """Predict a "son" value for `new_point` from its `k` closest rows.

    The prediction is the sum of the "son" values of the rows returned
    by `closest(training, new_point, k)`, divided by `k`.
    """
    neighbor_heights = closest(training, new_point, k).column("son")
    return np.sum(neighbor_heights) / k

### The code above comes from our previous meeting.

In [None]:
# Shuffle the rows (sampling without replacement), then use the first half
# as training data and the remaining rows as test data.
print("Number of entries =", heights.num_rows)
heights = heights.sample(with_replacement=False)
midpoint = heights.num_rows // 2
training_data = heights.take(np.arange(midpoint))
test_data = heights.take(np.arange(midpoint, heights.num_rows))

In [None]:
def one_trial(training_data, testing_data, k):
    """Return the average percent error of `classify` over `testing_data`.

    For every row of `testing_data`, the values at positions 0 and 1 are
    used as the query point and the value at position 2 as the true value.
    The absolute prediction error is divided by the true value, these
    ratios are averaged over all test rows, and the result is scaled by 100.
    """
    error_sum = 0
    for row in testing_data.rows:
        # Column positions: 0 - father, 1 - mother, 2 - son.
        father, mother, actual = row.item(0), row.item(1), row.item(2)
        estimate = classify(training_data, make_array(father, mother), k)
        relative_error = abs(estimate - actual) / actual
        error_sum += relative_error
    return 100 * error_sum / testing_data.num_rows

In [None]:
# Evaluate the predictor for k = 1, 5, 9, 13, 17 and report
# "accuracy" as 100 minus the average percent error.
for k in range(1, 21, 4):
    percent_error_average = one_trial(training_data, test_data, k)
    average_accuracy = 100 - percent_error_average
    print("For k = {}, the average accuracy = {:.2f}%".format(k, average_accuracy))

## 17.6 - Multiple Regression

Terminology
- *Regression* - Predicting a numerical value (as opposed to classification, which predicts a category)
- *Multiple Regression* - Using multiple attributes to predict a numerical value

Types of Multiple Regression
- Multiple Linear Regression
- Multiple K-Nearest Neighbors Regression