# Chapter 17.3 - Rows of Tables

In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots

In [None]:
# Load the heights data. `read_table` is a class-level constructor, so call it
# on `Table` itself instead of building a throwaway empty Table first.
heights = Table.read_table("sons_heights.csv")
heights.show(3)

In [None]:
def standard_units(numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of `numbers`.

    Parameters
    ----------
    numbers : array of numbers
        Values to standardize.

    Returns
    -------
    The centered values divided by `np.std(numbers)` (NumPy's default,
    i.e. the population standard deviation).
    """
    center = np.mean(numbers)
    spread = np.std(numbers)
    deviations = numbers - center
    return deviations / spread

In [None]:
# Add a standard-units version of each parent's height next to the raw column.
heights = (
    heights
    .with_column("father (SU)", standard_units(heights.column("father")))
    .with_column("mother (SU)", standard_units(heights.column("mother")))
)
heights.show(15)

In [None]:
# Keep only the two standardized parent-height columns; these are the
# attributes that the distance calculation below is run on.
relevant_attributes = heights.select("father (SU)", "mother (SU)")
relevant_attributes.show(15)

In [None]:
def distance(point1, point2):
    """Return the straight-line (Euclidean) distance between two points.

    Works in any number of dimensions: the points are subtracted
    coordinate by coordinate, the differences are squared and summed,
    and the square root of that sum is returned.

    Parameters
    ----------
    point1, point2 : arrays of numbers with the same number of coordinates.
    """
    difference = point1 - point2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

In [None]:
# Demonstrate distance function in 2-d
# (0, 1) vs (1, 0): sqrt(1**2 + 1**2) = sqrt(2), about 1.41
# (assumes make_array returns an array supporting elementwise subtraction)
distance(make_array(0, 1), make_array(1,0))

In [None]:
# Demonstrate the distance function in 3-d
# (3, 4, 5) vs (5, 4, 3): sqrt(2**2 + 0**2 + 2**2) = sqrt(8), about 2.83
# (assumes make_array returns an array supporting elementwise subtraction)
distance(make_array(3, 4, 5), make_array(5, 4, 3))

In [None]:
def distance_from_desired_point(row):
    """Return the distance from the global `desired_point` to `row`.

    `desired_point` is looked up in the notebook's global namespace at call
    time, so it must be assigned before this function is called.

    Parameters
    ----------
    row : a row of attribute values with the same number of coordinates
        as `desired_point`.
    """
    return distance(point1=desired_point, point2=row)

In [None]:
# Predict height of son whose father and mother are both average height.
# In standard units the mean of a column is 0, so "average father, average
# mother" is the point (0, 0).
desired_point = make_array(0,0)

In [None]:
# apply (with no column labels) passes an entire row to the function.
distances = relevant_attributes.apply(distance_from_desired_point)
# Attach the distances, then order the rows by "Distance".
heights = heights.with_column("Distance", distances).sort("Distance")
heights.show(5)

In [None]:
# Number of nearest neighbors whose sons' heights are averaged for the prediction.
k = 5

In [None]:
# `heights` has been sorted by "Distance", so its first k rows are the k
# nearest neighbors. Take exactly k rows (not a literal 5) so that the rows
# being averaged always match the chosen k.
predicted_height = np.mean(heights.take(np.arange(k)).column("son"))
print("The predicted height = {:.2f} inches".format(predicted_height))

# 17.4 Implementing the Classifier

General strategy:
1. Identify some attributes that you think might help you predict the answer to the question.
2. Gather a training set of examples where you know the values of the attributes as well as the correct prediction.
3. To make predictions in the future, measure the value of the attributes and then use
k-nearest neighbor classification to predict the answer to the question.

For predicting a discrete class:
1. Find the closest k neighbors of the instance to classify, i.e., the k instances from the 
training set that are most similar.
2. Look at the classes of those k neighbors, and take the majority vote to find the most-common class. Use that as our predicted class.

For predicting a continuous value (regression, like what we did above):
1. Find the closest k neighbors of the instance to predict, i.e.,
the k instances from the training set that are most similar.
2. Look at the known values of those k neighbors, and take their average as the predicted value.

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between two points of equal dimension.

    Same computation as the `distance` function defined earlier in this
    notebook: square root of the sum of squared coordinate differences.
    """
    gap = point1 - point2
    return np.sqrt(np.sum(gap ** 2))

def all_distances(training, new_point):
    """Return the distance from `new_point` to every row of `training`.

    The "son" column is dropped first, so every remaining column of
    `training` is treated as a coordinate; `new_point` must therefore have
    one value per remaining column, in the same order.

    Parameters
    ----------
    training : table containing a "son" column plus the attribute columns.
    new_point : sequence of attribute values to measure distances from.

    Returns
    -------
    The result of applying the row-distance function to each attribute row
    (one distance per row of `training`).
    """
    attributes = training.drop("son")
    # Convert the query point once; it is the same for every row.
    target = np.array(new_point)

    def row_distance(row):
        return distance(target, np.array(row))

    return attributes.apply(row_distance)

def table_with_distances(training, new_point):
    """Return `training` with a "Distance" column added.

    The new column holds, for each row, the value computed by
    `all_distances(training, new_point)`.
    """
    row_distances = all_distances(training, new_point)
    return training.with_column("Distance", row_distances)

def closest(training, new_point, k):
    """Return the first k rows of `training` after sorting by distance to `new_point`.

    Parameters
    ----------
    training : table of training rows (attributes plus the "son" column).
    new_point : attribute values of the point to find neighbors for.
    k : number of rows to keep.

    Returns
    -------
    A table of k rows, including the added "Distance" column.
    """
    return (
        table_with_distances(training, new_point)
        .sort("Distance")
        .take(np.arange(k))
    )

In [None]:
# Reload the raw data and overwrite the parent columns with their
# standard-unit versions, so the attribute columns that remain after "son" is
# dropped are on a common scale (assuming the CSV holds only the
# father/mother/son columns). `read_table` is called on the `Table` class
# directly rather than on a throwaway empty instance.
heights = Table.read_table("sons_heights.csv")
heights = heights.with_columns(
    "father", standard_units(heights.column("father")),
    "mother", standard_units(heights.column("mother")),
)
# (0, 0) in standard units: both parents of average height.
special_point = make_array(0, 0)
closest(heights, special_point, 5)


In [None]:
def classify(training, new_point, k):
    """Predict the "son" value for `new_point` from its k nearest training rows.

    The prediction is the sum of the "son" column over the rows returned by
    `closest(training, new_point, k)`, divided by k.

    Parameters
    ----------
    training : table of training rows (attributes plus the "son" column).
    new_point : attribute values of the point to predict for.
    k : number of nearest rows to average over.
    """
    neighbors = closest(training, new_point, k)
    son_total = np.sum(neighbors.column("son"))
    return son_total / k

In [None]:
# Predict the son's height for parents who are both of average height
# (special_point is (0, 0) in standard units), using the 5 nearest rows.
predicted_height = classify(heights, special_point, 5)
print("The predicted height = {:.2f} inches".format(predicted_height))