# Chapter 12 - Comparing Two Samples

**A/B Testing** - Deciding whether two numerical samples come from the same underlying distribution

In [None]:
from datascience import *
%matplotlib inline
import numpy as np

In [None]:
# Read the salary data, rename the season-specific salary column to SALARY,
# and keep only the two columns used in this chapter.
salaries = (
    Table.read_table('nba_salaries.csv')
    .relabeled("'15-'16 SALARY", 'SALARY')
    .select('TEAM', 'SALARY')
)
salaries

Question: Is playing for the Golden State Warriors associated with a higher salary?

In [None]:
# Flag each row by whether its TEAM equals "Golden State Warriors" and store
# the result in a new 'GSW MEMBER?' column.
salaries = salaries.with_column('GSW MEMBER?', salaries.column('TEAM') == "Golden State Warriors")
salaries

In [None]:
# TEAM is not needed once the membership flag exists; this leaves SALARY and
# 'GSW MEMBER?'.
salaries = salaries.drop('TEAM')
salaries

In [None]:
# Group the rows by the membership flag. With no aggregation function given,
# group() presumably reports the number of rows per group — confirm.
salaries.group('GSW MEMBER?')

In [None]:
# Histogram of SALARY, split by the membership flag, for a visual comparison
# of the two groups.
salaries.hist('SALARY', group = 'GSW MEMBER?')

- **Null hypothesis**: In the population, the distribution of salaries is the same for GSW players as for players on other teams.
Any difference observed in the sample is due to chance.
- **Alternative hypothesis**: In the population, GSW salaries are higher, on average, than the salaries of players on other teams.

In [None]:
# Apply np.average to SALARY within each membership group. The resulting
# column is read in the next cell under the label "SALARY average".
means_table = salaries.group('GSW MEMBER?', np.average)
means_table

In [None]:
# How much more does a Golden State Warrior earn on average?
# Computed as row 1 minus row 0 of the group means. This assumes group()
# lists the False (non-GSW) row first and the True (GSW) row second —
# TODO confirm against the table displayed above.
observed_difference = means_table.column("SALARY average").item(1) - means_table.column("SALARY average").item(0)
observed_difference

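The test statistic is the difference between the two groups' average salaries (GSW average minus non-GSW average). Under the null hypothesis the labels carry no information about salary, so to simulate the statistic under the null
we can randomly permute the GSW MEMBER? labels and recompute the difference.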

In [None]:
# Show the table again before its labels are shuffled.
salaries

In [None]:
# sample() is called without a size and without replacement; presumably it
# returns every row in random order (confirm), so this column is a random
# permutation of the membership labels.
shuffled_labels = salaries.sample(with_replacement = False).column('GSW MEMBER?')
# Attach the permuted labels alongside the original rows.
original_and_shuffled = salaries.with_column('Shuffled Label', shuffled_labels)
original_and_shuffled

In [None]:
# Keep only the salary and the shuffled label, then apply np.average to
# SALARY within each shuffled group.
shuffled_only = original_and_shuffled.select('SALARY','Shuffled Label')
shuffled_group_means = shuffled_only.group('Shuffled Label', np.average)
shuffled_group_means

In [None]:
def one_simulated_difference_of_means():
    """Return one difference of group average salaries after shuffling labels.

    Reads the notebook-level ``salaries`` table, pairs every SALARY with a
    label taken from a shuffled copy of the 'GSW MEMBER?' column, averages
    SALARY within each shuffled group, and returns the average in row 1 of
    the grouped result minus the average in row 0.
    """
    permuted = salaries.sample(with_replacement = False).column('GSW MEMBER?')
    relabeled = (
        salaries
        .with_column('Shuffled Label', permuted)
        .select('SALARY', 'Shuffled Label')
    )
    group_averages = relabeled.group('Shuffled Label', np.average).column("SALARY average")
    return group_averages.item(1) - group_averages.item(0)

In [None]:
# Draw a single simulated difference as a quick check of the function.
one_simulated_difference_of_means()

In [None]:
def many_simulated_difference_of_means(how_many):
    """Simulate the difference of group means ``how_many`` times.

    Each simulation is one call to ``one_simulated_difference_of_means()``.

    Parameters
    ----------
    how_many : int
        Number of simulations to run.

    Returns
    -------
    numpy.ndarray
        Float array with one simulated difference per simulation, in the
        order they were generated; an empty float array when ``how_many``
        is 0.
    """
    # Collect the results in a list and convert once at the end: np.append
    # copies the whole array on every call, so appending inside the loop
    # does quadratic work.
    simulated = [one_simulated_difference_of_means() for _ in np.arange(how_many)]
    # dtype=float keeps the result a float array even when it is empty.
    return np.array(simulated, dtype=float)

In [None]:
# Small trial run (10 simulations) to inspect the output before running more.
x = many_simulated_difference_of_means(10)
x

In [None]:
# Number of label permutations to simulate.
repetitions = 250
differences = many_simulated_difference_of_means(repetitions)
# Unit-width bins with edges at -5.5, -4.5, ..., 5.5.
Table().with_column('Difference Between Group Means', differences).hist(bins = np.arange(-5.5, 5.6))
print('Observed Difference:', observed_difference)

In [None]:
# Empirical P-value: the fraction of simulated differences that are at least
# as large as the observed one (one-sided, matching the "higher" alternative
# hypothesis stated above).
empirical_p = np.count_nonzero(differences >= observed_difference) / repetitions
empirical_p