# Chapter 10 - Sampling and Empirical Distributions

In [None]:
from datascience import *
import numpy as np

In [None]:
# The thirteen rank names, listed from "two" up to "ace".
ranks = "two three four five six seven eight nine ten jack queen king ace".split()
# One suit label per card: each suit is repeated once per rank, giving
# four consecutive runs of 13 labels (52 in total).
suits = [
    suit
    for suit in ("clubs", "diamonds", "hearts", "spades")
    for _ in range(len(ranks))
]

In [None]:
# Assemble the deck one column at a time. The rank list is tiled four times
# so that it lines up row-for-row with the 52 suit labels.
cards = (
    Table()
    .with_column("Rank", ranks * 4)
    .with_column("Suit", suits)
)
# Display only the first 13 rows.
cards.show(13)

## Deterministic Samples

In [None]:
# A deterministic sample: the rows are chosen by fixed position (0, 12 and 13),
# so the same three cards come back every time the cell is run.
cards.take(0, 12, 13)

In [None]:
# Another deterministic selection: keep every row whose "Rank" text contains "two".
cards.where("Rank", are.containing("two"))

## Probability Sample

Definitions
- A **population** is the set of all elements from which a sample will be drawn.
- A **probability sample** is one for which it is possible to calculate, before the sample is drawn, the chance with which any subset of elements will enter the sample.
- A **systematic sample** is formed by choosing a random starting position early in the list and then taking evenly spaced positions after it.

In [None]:
# Systematic sample: pick a random starting row among the first 13 (0..12) ...
start = np.random.randint(13)
# ... then take every 13th row from there to the end of the table.
cards.take(np.arange(start, cards.num_rows, 13))

In [None]:
# Random sampling with replacement: draw 5 row positions from 0..9; the same
# position may come up more than once. An integer population is treated by
# np.random.choice as np.arange of that integer.
# NOTE(review): positions are drawn from the first 10 rows only, not the whole
# deck -- confirm this is intended for the demonstration.
selected = np.random.choice(10, size=5)
cards.take(selected)

In [None]:
# Simple random sample: draw without replacement, so no value can repeat.
# Taking all 10 of the 10 values yields a random ordering of 0..9. The result is
# only displayed here; it is not used to pick rows from `cards`.
np.random.choice(np.arange(10), 10, replace=False)

## 10.1 Empirical Distributions

In [None]:
# Draw 100 cards at random and tally how many of them fall in each suit.
# The request (100) exceeds the 52 rows in the deck, so this relies on
# Table.sample drawing with replacement.
drawn = cards.sample(100).group("Suit")
drawn

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Horizontal bar chart with one bar per suit (the empirical distribution of the draw).
drawn.barh("Suit")

In [None]:
# Turn each suit's count into its share of the whole sample. The divisor is the
# total of the counts rather than a hard-coded 100, so the shares stay correct
# if the sample size chosen when drawing is changed.
# NOTE(review): the values are proportions between 0 and 1 even though the
# column is labelled "Percent"; the label is kept because other cells plot it
# by that name.
suit_counts = drawn.column("count")
drawn = drawn.with_column("Percent", suit_counts / suit_counts.sum())
drawn

In [None]:
# Same chart as before, but with bar lengths taken from the "Percent" column.
drawn.barh("Suit", "Percent")

**Law of Averages** - If a chance experiment is repeated independently and under identical conditions, then, in the long run, the proportion of times that an event occurs gets closer and closer to the theoretical probability of the event.

## 10.2 Sampling from a Population

In [None]:
# Load the salary data and give the season-specific salary column the shorter
# label "SALARY" used in the rest of the notebook.
nba = Table.read_table("nba_salaries.csv").relabeled("'15-'16 SALARY", "SALARY")
nba

In [None]:
# Quick summary of the population: salary range and number of rows.
_salary_values = nba.column("SALARY")
print("Salary minimum = ", _salary_values.min())
print("Salary maximum = ", _salary_values.max())
print("Number of players = ", nba.num_rows)

In [None]:
# Bin edges every 0.25 from 0 to 25. The stop value is 25.1 (not 25) so that
# 25.0 itself is included as the last edge.
salary_bins = np.arange(0, 25.1, .25)
# Histogram of every salary in the table (the population distribution).
nba.hist("SALARY", bins=salary_bins, unit="million $")

In [None]:
# Fraction of rows whose SALARY is greater than 17
# (presumably millions of dollars, going by the histogram unit -- TODO confirm).
nba.where("SALARY", are.above(17)).num_rows/nba.num_rows

In [None]:
# Rebind salary_bins to integer edges 0, 1, ..., 17. Cells run after this one
# see these coarser bins rather than the 0.25-wide ones defined earlier.
salary_bins = np.arange(18)
# NOTE(review): salaries above 17 lie outside these edges and are presumably
# left out of this plot -- confirm against Table.hist behaviour.
nba.hist("SALARY", bins=salary_bins, unit="million $")

In [None]:
# Empirical distribution of a small random sample: histogram of 10 randomly
# drawn rows, using whatever edges salary_bins currently holds. The sample is
# not seeded, so the plot changes from run to run.
nba.sample(10).hist("SALARY", bins = salary_bins, unit = 'million $')