# Chapter 10 - Sampling and Empirical Distributions

In [None]:
from datascience import *
import numpy as np
%matplotlib inline

## 10.2 - Sampling from a Population

In [None]:
# Load the salary table and shorten the season-specific column label to
# "SALARY". relabel renames the column on the table itself and returns that
# same table, so the result can be bound directly to nba.
nba = Table.read_table("nba_salaries.csv").relabel("'15-'16 SALARY", "SALARY")
nba

In [None]:
# Bin edges run -0.5, 0.5, ..., 24.5, so every bar is one unit wide and
# centred on a whole number.
salary_bins = np.arange(-0.5, 25.5, 1)
nba.hist("SALARY", unit="million $", bins=salary_bins)

## 10.3 - Empirical Distribution of a Statistic

**Parameter** - A numerical quantity associated with a population, e.g. the population median

In [None]:
# Median of the SALARY column taken over every row of the table.
all_salaries = nba.column("SALARY")
np.median(all_salaries)

In [None]:
# Fraction of rows whose SALARY is at most 3.
at_most_3 = nba.where("SALARY", are.below_or_equal_to(3))
at_most_3.num_rows / nba.num_rows

In [None]:
# Mean of the SALARY column taken over every row of the table.
all_salaries = nba.column("SALARY")
np.average(all_salaries)

In [None]:
# Fraction of rows whose SALARY is at most 5.075.
# NOTE(review): 5.075 presumably matches the average computed in the previous
# cell; confirm against that cell's output.
at_most_cutoff = nba.where("SALARY", are.below_or_equal_to(5.075))
at_most_cutoff.num_rows / nba.num_rows

**Statistic** - Any number computed using the data in a sample (often used to estimate the value of an unknown parameter)

In [None]:
# Number of rows in the table, i.e. the size of the population that the
# following cells draw samples from.
nba.num_rows

In [None]:
# Run this more than once ... the average changes!
# Every run draws a fresh random sample of 100 rows, so the statistic below
# varies from run to run.
nba_100 = nba.sample(100)
sampled_salaries = nba_100.column("SALARY")
np.average(sampled_salaries)

In [None]:
# Simulate the statistic: draw 100 random samples of 50 rows each and record
# the average SALARY of every sample.
# The averages are collected in a list and converted to an array once,
# because np.append allocates and copies a whole new array on every call.
averages = np.array([
    np.average(nba.sample(50).column("SALARY"))
    for _ in range(100)
])

averages

In [None]:
# Mean of the simulated sample averages collected in `averages`.
np.average(averages)

In [None]:
# Wrap the simulated averages in a one-column table so they can be plotted
# with the table's hist method.
simulated_averages = Table().with_columns('Sample Average', averages)
simulated_averages

In [None]:
# Empirical distribution of the statistic.
# Bin edges run 3.75, 4.25, ..., 6.25, so each bin is 0.5 wide.
# NOTE(review): sample averages outside 3.75-6.25 would presumably not be
# drawn; confirm the range is wide enough for the simulated values.
average_bins = np.arange(3.75, 6.75, 0.5)
simulated_averages.hist(bins=average_bins)

## 10.4 - Random Sampling in Python

In [None]:
# Random sample with replacement (produces Table)
# with_replacement=True is the default; it is written out here to contrast
# with the without-replacement cell that follows.
nba.sample(3, with_replacement=True)

In [None]:
# Simple random sample (i.e. a random sample drawn without replacement)
nba.sample(3, with_replacement=False)

In [None]:
# Random sample with replacement (produces array)
# replace=True is np.random.choice's default; it is spelled out for clarity.
np.random.choice(nba.column("SALARY"), size=3, replace=True)

In [None]:
# Random sample without replacement (produces array)
np.random.choice(nba.column("SALARY"), size=3, replace=False)

In [None]:
# Sampling from a categorical distribution
sample_size = 1000
probabilities = [0.5, 0.2, 0.2, 0.1]  # one entry per category; must sum to 1
# Draw sample_size items according to `probabilities` and display the result.
# NOTE(review): the result is presumably the proportion of the sample that
# landed in each category; confirm against the datascience documentation.
rank_distribution = sample_proportions(sample_size, probabilities)
rank_distribution