Loading Module…

📊 Statistics with Python

24 topics • Click any card to expand

1. Descriptive Statistics

Descriptive statistics summarize and describe the main features of a dataset. Key measures include central tendency (mean, median, mode) and spread (std, variance, percentiles, IQR). NumPy and scipy.stats provide fast, vectorized implementations.

Mean, median, mode with numpy and scipy
import numpy as np
from scipy import stats

# Fifteen observations; repeated values make the mode informative.
data = [23, 45, 12, 67, 34, 45, 89, 23, 45, 56, 78, 34, 23, 90, 12]

arr = np.asarray(data)
mean = arr.mean()                       # arithmetic average
median = np.median(arr)                 # middle value of the sorted data
mode = stats.mode(arr, keepdims=True)   # most frequent value (smallest wins ties)

print(f"Mean:   {mean:.2f}")
print(f"Median: {median:.2f}")
print(f"Mode:   {mode.mode[0]}  (count: {mode.count[0]})")
Standard deviation, variance, and IQR
import numpy as np
from scipy import stats

data = np.array([12, 15, 14, 10, 18, 21, 13, 16, 14, 15, 17, 20, 11, 19, 14])

# Quartiles and the interquartile range (spread of the middle 50%).
q1, q3 = np.percentile(data, [25, 75])
iqr = stats.iqr(data)

# Spread measures: ddof=0 divides by N (population formula),
# ddof=1 divides by N-1 (sample formula, Bessel's correction).
std_pop = np.std(data, ddof=0)
std_samp = np.std(data, ddof=1)
var_samp = np.var(data, ddof=1)

print(f"Population std:  {std_pop:.4f}")
print(f"Sample std:      {std_samp:.4f}")
print(f"Sample variance: {var_samp:.4f}")
print(f"Q1={q1}, Q3={q3}, IQR={iqr}")
Percentiles and summary statistics with pandas
import numpy as np
import pandas as pd

np.random.seed(42)
# 200 draws from Normal(mean=100, sd=15), wrapped in a pandas Series.
data = pd.Series(np.random.normal(loc=100, scale=15, size=200))

print("=== Summary Statistics ===")
print(data.describe())
# Fixed: the f-string contained a raw line break (a SyntaxError);
# use the \n escape instead.
print(f"\nSkewness:  {data.skew():.4f}")
print(f"Kurtosis:  {data.kurtosis():.4f}")
print(f"5th  pctl: {data.quantile(0.05):.2f}")
print(f"95th pctl: {data.quantile(0.95):.2f}")
Outlier detection using IQR fence
import numpy as np
from scipy import stats

np.random.seed(0)
# 95 well-behaved points plus five planted extremes.
data = np.concatenate([np.random.normal(50, 5, 95), [10, 200, 95, 105, -5]])

# Tukey's fences: 1.5 * IQR beyond the quartiles.
q1, q3 = np.percentile(data, [25, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

outside = (data < lower_fence) | (data > upper_fence)
outliers = data[outside]
z_scores = np.abs(stats.zscore(data))   # standardized distances from the mean

print(f"IQR fences: [{lower_fence:.2f}, {upper_fence:.2f}]")
print(f"Outliers (IQR): {sorted(outliers)}")
print(f"Outliers (|z|>3): {data[z_scores > 3].tolist()}")
💼 Real-World Scenario
A data scientist at a hospital needs to summarize patient blood pressure readings to report to clinicians. They must identify the typical range, spot extreme values, and present skewness of the distribution before modeling.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(7)
# Simulate systolic BP for 300 patients (some hypertensive outliers)
bp = np.concatenate([
    np.random.normal(120, 12, 270),   # normal range
    np.random.normal(170, 10, 30),    # hypertensive group
])
series = pd.Series(bp, name="Systolic_BP")

print("=== Blood Pressure Summary ===")
print(series.describe().round(2))

q1, q3 = np.percentile(bp, [25, 75])
iqr = q3 - q1
# Fixed: the f-string contained a raw line break (SyntaxError); use \n.
print(f"\nIQR:         {iqr:.2f} mmHg")
print(f"Skewness:    {series.skew():.3f}")

# Clinical threshold for hypertension used in the report.
high_bp = bp[bp > 140]
print(f"Hypertensive (>140): {len(high_bp)} patients ({100*len(high_bp)/len(bp):.1f}%)")
🏋️ Practice: Analyze a Sales Dataset
Given a list of daily sales figures, compute mean, median, std, IQR, and identify any outliers using both the IQR fence and z-score methods. Report the percentage of days with sales above the 90th percentile.
Starter Code
import numpy as np
from scipy import stats

# Reproducible synthetic data: 90 typical days plus 10 exceptional ones,
# so both outlier-detection methods below have something to find.
np.random.seed(123)
daily_sales = np.concatenate([
    np.random.normal(5000, 800, 90),
    np.random.normal(12000, 500, 10),  # exceptional days
])

# TODO: Compute mean, median, std, IQR
# TODO: Find outliers using IQR fence
# TODO: Find outliers using z-score (|z| > 2)
# TODO: Report % of days above 90th percentile
✅ Practice Checklist
2. Probability Distributions

Probability distributions model the likelihood of outcomes. scipy.stats provides a unified interface for continuous distributions (normal, exponential) and discrete distributions (binomial, Poisson) with PDF/PMF, CDF, and random variates.

Normal distribution — PDF, CDF, PPF
import numpy as np
from scipy import stats

# Standard normal: mean 0, standard deviation 1.
mu, sigma = 0, 1
dist = stats.norm(loc=mu, scale=sigma)

x_vals = [-2, -1, 0, 1, 2]
print("x     PDF       CDF")
for x in x_vals:
    print(f"{x:+.0f}  {dist.pdf(x):.6f}  {dist.cdf(x):.6f}")

# Percent-point function (inverse CDF)
# Fixed: f-strings cannot contain a raw line break (SyntaxError); use \n.
print(f"\n95th percentile (PPF): {dist.ppf(0.95):.4f}")
print(f"P(-1 < X < 1):         {dist.cdf(1) - dist.cdf(-1):.4f}")
print(f"P(-2 < X < 2):         {dist.cdf(2) - dist.cdf(-2):.4f}")
Binomial distribution — PMF and CDF
import numpy as np
from scipy import stats

n, p = 20, 0.3   # 20 trials, 30% success rate
dist = stats.binom(n=n, p=p)

print(f"Binomial(n={n}, p={p})")
print(f"Mean: {dist.mean():.2f}, Std: {dist.std():.2f}")
print()
print("k    PMF        CDF")
for k in range(0, 11):
    print(f"{k:2d}   {dist.pmf(k):.6f}   {dist.cdf(k):.6f}")

# Probability of at least 8 successes: complement of the CDF at 7.
prob_ge8 = 1 - dist.cdf(7)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(X >= 8): {prob_ge8:.4f}")
Poisson distribution — PMF and expected value
import numpy as np
from scipy import stats

lam = 4.5   # average events per interval
dist = stats.poisson(mu=lam)

print(f"Poisson(lambda={lam})")
# For a Poisson distribution the mean and variance are both lambda.
print(f"Mean: {dist.mean()}, Variance: {dist.var()}")
print()
print("k   PMF")
# Text histogram: one '#' per percentage point of probability mass.
for k in range(0, 13):
    bar = "#" * int(dist.pmf(k) * 100)
    print(f"{k:2d}  {dist.pmf(k):.5f}  {bar}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(X <= 3): {dist.cdf(3):.4f}")
print(f"P(X >= 8): {1 - dist.cdf(7):.4f}")
Exponential distribution — survival function
import numpy as np
from scipy import stats

# Scale = mean inter-arrival time (1/rate)
scale = 10.0    # average 10 minutes between events
dist = stats.expon(scale=scale)

print(f"Exponential(mean={scale})")
print(f"Mean: {dist.mean()}, Std: {dist.std():.4f}")
print()
# Survival function P(X > t)
for t in [5, 10, 15, 20, 30]:
    print(f"P(T > {t:2d} min) = {dist.sf(t):.4f}")

# Memoryless property check: P(T>20 | T>10) should equal P(T>10).
p_gt10       = dist.sf(10)
p_gt20_gt10  = dist.sf(20) / dist.sf(10)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(T>20|T>10) = {p_gt20_gt10:.4f}  (should equal P(T>10) = {p_gt10:.4f})")
💼 Real-World Scenario
An e-commerce platform wants to model the number of customer purchases per hour (Poisson) and the time between purchases (Exponential). They also need the normal approximation for total daily revenue and the probability of a marketing campaign achieving at least 50 conversions out of 200 email sends.
Real-World Code
import numpy as np
from scipy import stats

# Poisson: purchases per hour
lam_hourly = 12
pois = stats.poisson(mu=lam_hourly)
print(f"Expected purchases/hour: {pois.mean()}")
print(f"P(>=15 purchases in 1hr): {1 - pois.cdf(14):.4f}")

# Exponential: minutes between purchases (rate 12/hr -> mean 5 min)
minutes_between = stats.expon(scale=60/lam_hourly)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nAvg time between purchases: {minutes_between.mean():.2f} min")
print(f"P(wait > 8 min): {minutes_between.sf(8):.4f}")

# Binomial: email campaign conversions
n_emails, p_conv = 200, 0.27
binom = stats.binom(n=n_emails, p=p_conv)
print(f"\nEmail campaign — n={n_emails}, p={p_conv}")
print(f"Expected conversions: {binom.mean():.1f}")
print(f"P(>=50 conversions): {1 - binom.cdf(49):.4f}")

# Normal: total daily revenue (CLT: sum of n orders is approx. normal
# with mean n*mu and std sigma*sqrt(n))
avg_order, std_order, n_orders = 85, 30, 150
rev_dist = stats.norm(loc=avg_order*n_orders, scale=std_order*np.sqrt(n_orders))
print(f"\nP(daily revenue > $14000): {rev_dist.sf(14000):.4f}")
🏋️ Practice: Distribution Parameter Fitting
Given a dataset of customer wait times, fit an exponential distribution using scipy.stats.expon.fit(), compute the fitted mean, and calculate the probability that a customer waits more than 5 minutes. Also compute the 90th percentile wait time.
Starter Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Simulate wait times (minutes) — true mean = 3 min.
# Hint: stats.expon.fit(data, floc=0) returns (loc, scale); scale estimates the mean.
wait_times = np.random.exponential(scale=3.0, size=500)

# TODO: Fit exponential distribution with stats.expon.fit()
# TODO: Print fitted scale (mean wait time)
# TODO: Compute P(wait > 5 minutes) using fitted distribution
# TODO: Compute 90th percentile wait time
✅ Practice Checklist
3. Sampling & Central Limit Theorem

Random sampling draws a subset from a population. The Central Limit Theorem (CLT) states that the distribution of sample means approaches normality as sample size grows, regardless of population shape. Bootstrap resampling estimates sampling distributions empirically.

Simple random sampling with numpy
import numpy as np

# Demonstrate simple random sampling from a skewed population:
# larger samples track the population mean/std more closely.
np.random.seed(42)
# Skewed population (exponential)
population = np.random.exponential(scale=5, size=10000)

print(f"Population  β€” mean: {population.mean():.3f}, std: {population.std():.3f}")
print()

# Draw samples of different sizes
# replace=False -> sampling without replacement (a true subset of the population).
for n in [5, 30, 100, 500]:
    sample = np.random.choice(population, size=n, replace=False)
    print(f"n={n:4d} β€” sample mean: {sample.mean():.3f}, sample std: {sample.std(ddof=1):.3f}")
CLT demonstration — sampling distribution of the mean
import numpy as np
from scipy import stats

# Empirical check of the CLT: the distribution of sample means from a
# skewed (exponential) population approaches normality as n grows, with
# standard error pop_std / sqrt(n).
np.random.seed(0)
# Highly skewed population
population = np.random.exponential(scale=5, size=100000)
pop_mean, pop_std = population.mean(), population.std()

n_simulations = 3000
for n in [5, 10, 30, 100]:
    sample_means = [np.mean(np.random.choice(population, n, replace=False))
                    for _ in range(n_simulations)]
    sm = np.array(sample_means)
    # By CLT, sampling dist should be normal
    # Shapiro-Wilk run on a 200-point subset (the test is overly sensitive at large n).
    w_stat, p_val = stats.shapiro(sm[:200])
    print(f"n={n:3d}: mean of means={sm.mean():.3f}, "
          f"std={sm.std():.3f} (theory={pop_std/np.sqrt(n):.3f}), "
          f"Shapiro p={p_val:.4f}")
Bootstrap resampling — estimate sampling distribution
import numpy as np

# Bootstrap: approximate the sampling distribution of the mean by
# resampling the observed sample with replacement.
np.random.seed(7)
sample = np.random.lognormal(mean=2, sigma=0.5, size=50)

n_boot = 5000
# Each bootstrap replicate is the mean of a same-size resample.
boot_means = np.array([
    np.mean(np.random.choice(sample, size=len(sample), replace=True))
    for _ in range(n_boot)
])

print(f"Original sample mean: {sample.mean():.4f}")
print(f"Bootstrap mean:       {boot_means.mean():.4f}")
print(f"Bootstrap std error:  {boot_means.std():.4f}")
# Percentile method: central 95% of the bootstrap replicates.
print(f"Bootstrap 95% CI:     [{np.percentile(boot_means,2.5):.4f}, {np.percentile(boot_means,97.5):.4f}]")
Stratified sampling with pandas
import numpy as np
import pandas as pd

np.random.seed(1)
n = 1000
# Synthetic population: age stratum plus a numeric score.
df = pd.DataFrame({
    "age_group": np.random.choice(["18-34","35-54","55+"], size=n, p=[0.4,0.35,0.25]),
    "score":     np.random.normal(70, 10, n),
})

# Proportional stratified sample (10%): sample the same fraction
# within every stratum so composition matches the population.
sample_rate = 0.10
stratified = (df.groupby("age_group", group_keys=False)
                .apply(lambda g: g.sample(frac=sample_rate, random_state=42)))

print("Stratified sample composition:")
print(stratified["age_group"].value_counts())
# Fixed: the f-string contained a raw line break (SyntaxError); use \n.
print(f"\nPop mean:    {df['score'].mean():.3f}")
print(f"Sample mean: {stratified['score'].mean():.3f}")
💼 Real-World Scenario
A polling firm surveys likely voters ahead of an election. They use stratified random sampling by region (North, South, East, West) to ensure representation, then apply bootstrap resampling to estimate the margin of error for their candidate preference estimate without assuming normality.
Real-World Code
import numpy as np
import pandas as pd

# Stratified polling simulation: a fixed-size sample per region, then a
# bootstrap estimate of the margin of error for overall candidate support.
np.random.seed(42)
true_props = {"North":0.52, "South":0.44, "East":0.55, "West":0.48}
region_sizes = {"North":3000, "South":4500, "East":2000, "West":2500}

# Simulate full voter database
# Each row: (region, 0/1 preference drawn from that region's true proportion).
rows = []
for region, size in region_sizes.items():
    votes = np.random.binomial(1, true_props[region], size)
    rows.extend(zip([region]*size, votes))
population = pd.DataFrame(rows, columns=["region","prefer_A"])

# Stratified sample: 100 per region
sample = (population.groupby("region", group_keys=False)
          .apply(lambda g: g.sample(n=100, random_state=42)))

overall_pct = sample["prefer_A"].mean()
print(f"Sample support for A: {overall_pct:.3f}")

# Bootstrap margin of error
n_boot = 4000
boot = [np.random.choice(sample["prefer_A"], size=len(sample), replace=True).mean()
        for _ in range(n_boot)]
boot = np.array(boot)
# Width of the central 95% of the bootstrap distribution; half of it is the MOE.
moe = np.percentile(boot, 97.5) - np.percentile(boot, 2.5)
print(f"Bootstrap 95% CI:     [{np.percentile(boot,2.5):.3f}, {np.percentile(boot,97.5):.3f}]")
print(f"Margin of error:      Β±{moe/2:.3f}")
🏋️ Practice: Verify CLT with a Uniform Distribution
Draw 2000 samples of size n from a Uniform(0,1) population. For n in [2, 10, 30, 100], compute the mean of sample means, the empirical standard error, and the theoretical standard error (1/sqrt(12*n)). Show that they converge.
Starter Code
import numpy as np

# CLT practice scaffold: compare empirical vs theoretical standard error
# of the sample mean for a Uniform(0,1) population.
np.random.seed(99)
n_simulations = 2000
pop_std_uniform = np.sqrt(1/12)   # std of Uniform(0,1)

for n in [2, 10, 30, 100]:
    # TODO: Draw n_simulations samples of size n from Uniform(0,1)
    # TODO: Compute sample means array
    # TODO: Print mean of sample means, empirical SE, theoretical SE
    pass
✅ Practice Checklist
4. Confidence Intervals

A confidence interval (CI) gives a range of plausible values for a population parameter. A 95% CI means that if we repeated the study many times, 95% of the intervals would contain the true parameter. scipy.stats provides t-intervals; bootstrap CIs require no distributional assumptions.

CI for mean using scipy.stats.t.interval
import numpy as np
from scipy import stats

np.random.seed(42)
# One sample of 30 draws from Normal(mean=50, sd=8).
sample = np.random.normal(loc=50, scale=8, size=30)

n = len(sample)
mean = sample.mean()
se = stats.sem(sample)   # standard error of the mean

# t-based confidence intervals at three confidence levels;
# wider levels give wider intervals.
for conf in (0.90, 0.95, 0.99):
    bounds = stats.t.interval(conf, df=n-1, loc=mean, scale=se)
    lo, hi = bounds
    print(f"{int(conf*100)}% CI: ({lo:.3f}, {hi:.3f})  width={hi-lo:.3f}")
CI for proportion using normal approximation
import numpy as np
from scipy import stats

# 143 successes out of 250 trials
successes, n = 143, 250
p_hat = successes / n
se    = np.sqrt(p_hat * (1 - p_hat) / n)

# Wald (normal-approximation) intervals at several confidence levels.
for conf in [0.90, 0.95, 0.99]:
    z  = stats.norm.ppf((1 + conf) / 2)
    lo = p_hat - z * se
    hi = p_hat + z * se
    print(f"{int(conf*100)}% CI for proportion: ({lo:.4f}, {hi:.4f})")

# Wilson score interval (better for extreme p).
# Fixed: proportion_confint lives in statsmodels, not scipy.stats —
# compute the Wilson interval directly with scipy's normal quantile.
z = stats.norm.ppf(0.975)                     # alpha = 0.05, two-sided
denom  = 1 + z**2 / n
center = (p_hat + z**2 / (2 * n)) / denom
half   = z * np.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2)) / denom
lo_w, hi_w = center - half, center + half
print(f"\nWilson 95% CI: ({lo_w:.4f}, {hi_w:.4f})")
Bootstrap CI — percentile and BCA methods
import numpy as np

# Percentile-bootstrap CI for the median of a skewed (lognormal) sample.
# NOTE(review): only the percentile method is shown here despite the
# heading mentioning BCa; scipy.stats.bootstrap supports method="BCa".
np.random.seed(0)
data = np.random.lognormal(2, 0.8, size=80)

# Percentile bootstrap CI for the median
n_boot = 5000
boot_medians = np.array([
    np.median(np.random.choice(data, size=len(data), replace=True))
    for _ in range(n_boot)
])

# 95% CI = central 95% of the bootstrap distribution of the median.
lo_p, hi_p = np.percentile(boot_medians, [2.5, 97.5])
print(f"Sample median:       {np.median(data):.4f}")
print(f"Bootstrap 95% CI:    ({lo_p:.4f}, {hi_p:.4f})")
print(f"Bootstrap std error: {boot_medians.std():.4f}")
CI for difference of two means
import numpy as np
from scipy import stats

np.random.seed(5)
group_a = np.random.normal(75, 10, 40)
group_b = np.random.normal(70, 12, 35)

# Welch t-test (unequal variances) for the difference in means.
t_stat, p_val = stats.ttest_ind(group_a, group_b, equal_var=False)
diff = group_a.mean() - group_b.mean()

# Manual CI for the difference using the Welch-Satterthwaite df.
# Fixed: the original used the pooled df (n1+n2-2), which is
# inconsistent with the unequal-variance test above.
va = group_a.var(ddof=1) / len(group_a)
vb = group_b.var(ddof=1) / len(group_b)
se_diff = np.sqrt(va + vb)
df = (va + vb) ** 2 / (va**2 / (len(group_a) - 1) + vb**2 / (len(group_b) - 1))
t_crit = stats.t.ppf(0.975, df)
lo, hi = diff - t_crit * se_diff, diff + t_crit * se_diff

print(f"Group A mean: {group_a.mean():.3f}")
print(f"Group B mean: {group_b.mean():.3f}")
print(f"Difference:   {diff:.3f}")
print(f"95% CI for diff: ({lo:.3f}, {hi:.3f})")
print(f"t={t_stat:.3f}, p={p_val:.4f}")
💼 Real-World Scenario
A pharmaceutical company runs a clinical trial measuring the reduction in blood pressure after a new drug. They need 95% confidence intervals for the mean reduction, a bootstrap CI that avoids normality assumptions for their small subgroup analysis, and a CI for the proportion of patients achieving >10 mmHg reduction.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Reduction in systolic BP (mmHg) for 45 patients
reduction = np.random.normal(loc=11.5, scale=5.2, size=45)

mean_red = reduction.mean()
se       = stats.sem(reduction)
lo, hi   = stats.t.interval(0.95, df=len(reduction)-1, loc=mean_red, scale=se)
print(f"Mean reduction: {mean_red:.2f} mmHg")
print(f"95% t-CI:       ({lo:.2f}, {hi:.2f}) mmHg")

# Proportion achieving > 10 mmHg reduction.
# Fixed: proportion_confint is a statsmodels function, not scipy.stats —
# compute the Wilson score interval directly instead.
successes = int(np.sum(reduction > 10))
n = len(reduction)
p_hat = successes / n
z = stats.norm.ppf(0.975)                     # 95% two-sided
denom  = 1 + z**2 / n
center = (p_hat + z**2 / (2 * n)) / denom
half   = z * np.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2)) / denom
lo_w, hi_w = center - half, center + half
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nResponders (>10 mmHg): {successes}/{n} ({successes/n:.1%})")
print(f"Wilson 95% CI:         ({lo_w:.3f}, {hi_w:.3f})")

# Bootstrap CI for median reduction (subgroup) — percentile method.
boot_med = np.array([np.median(np.random.choice(reduction, len(reduction), replace=True))
                     for _ in range(4000)])
print(f"\nBootstrap 95% CI (median): ({np.percentile(boot_med,2.5):.2f}, {np.percentile(boot_med,97.5):.2f})")
🏋️ Practice: Compare Confidence Intervals
Generate two samples from Normal(60, 8) with n=20 and n=100. For each, compute the 95% t-CI for the mean. Then generate a skewed sample from Exponential(scale=5) with n=40 and compute both the t-CI and a bootstrap CI. Discuss which is more appropriate.
Starter Code
import numpy as np
from scipy import stats

# Practice scaffold: compare t-based and bootstrap CIs on normal vs skewed data.
np.random.seed(77)

# TODO: Sample 1 (n=20) from Normal(60,8) — compute 95% t-CI
# TODO: Sample 2 (n=100) from Normal(60,8) — compute 95% t-CI
# TODO: Sample 3 (n=40) from Exponential(scale=5) — compute both t-CI and bootstrap CI
# TODO: Compare widths and discuss
✅ Practice Checklist
5. Hypothesis Testing

Hypothesis testing provides a framework for making statistical decisions. A null hypothesis (H0) is tested against an alternative. The p-value is the probability of observing data at least as extreme as the sample if H0 is true. scipy.stats provides t-tests, with significance typically judged at alpha=0.05.

One-sample t-test
import numpy as np
from scipy import stats

np.random.seed(42)
# H0: population mean = 100; H1: mean != 100
# Data are drawn with true mean 103, so H0 is actually false here.
sample = np.random.normal(loc=103, scale=12, size=35)

# Two-sided one-sample t-test against the hypothesized mean of 100.
t_stat, p_val = stats.ttest_1samp(sample, popmean=100)

print(f"Sample mean: {sample.mean():.4f}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value:     {p_val:.4f}")
print()
alpha = 0.05
# Reject H0 only when the p-value falls below the significance level.
conclusion = "Reject H0" if p_val < alpha else "Fail to reject H0"
print(f"At alpha={alpha}: {conclusion}")
print(f"Evidence that mean != 100: {'Yes' if p_val < alpha else 'No'}")
Independent samples (two-sample) t-test
import numpy as np
from scipy import stats

np.random.seed(7)
control   = np.random.normal(50, 8, 40)
treatment = np.random.normal(55, 9, 38)

# Levene test for equal variances; its outcome selects the t-test variant.
lev_stat, lev_p = stats.levene(control, treatment)
equal_var = lev_p > 0.05

# Student's t-test if variances look equal, Welch's otherwise.
t_stat, p_val = stats.ttest_ind(control, treatment, equal_var=equal_var)

print(f"Control:   n={len(control)}, mean={control.mean():.3f}, std={control.std(ddof=1):.3f}")
print(f"Treatment: n={len(treatment)}, mean={treatment.mean():.3f}, std={treatment.std(ddof=1):.3f}")
# Fixed: raw line break inside the f-string (SyntaxError) and a mojibake arrow.
print(f"\nLevene p={lev_p:.4f} → equal_var={equal_var}")
print(f"t={t_stat:.4f}, p={p_val:.4f}")
print(f"Result: {'Significant difference' if p_val < 0.05 else 'No significant difference'}")
Paired t-test — before vs after
import numpy as np
from scipy import stats

np.random.seed(3)
n = 25
before = np.random.normal(130, 10, n)
after  = before - np.random.normal(8, 5, n)   # intervention reduces by ~8

# Paired test: compares the per-subject differences against zero.
t_stat, p_val = stats.ttest_rel(before, after)
diff = before - after

# Fixed mojibake: the plus-minus signs below were garbled ("Β±").
print(f"Before: {before.mean():.2f} ± {before.std(ddof=1):.2f}")
print(f"After:  {after.mean():.2f} ± {after.std(ddof=1):.2f}")
print(f"Mean diff: {diff.mean():.2f} ± {diff.std(ddof=1):.2f}")
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nt={t_stat:.4f}, p={p_val:.6f}")
# Halving the two-sided p is valid here because the effect is in the
# hypothesized direction (before > after).
print(f"One-tailed p (before > after): {p_val/2:.6f}")
Type I and II errors — power calculation
import numpy as np
from scipy import stats

# Simulate Type I error rate (false positives under H0)
np.random.seed(0)
alpha = 0.05
n_simulations = 10000
n_per_group = 30

# Under H0: both groups have same mean
# Fraction of runs with p < alpha estimates the Type I error rate (~alpha).
false_positives = 0
for _ in range(n_simulations):
    g1 = np.random.normal(50, 10, n_per_group)
    g2 = np.random.normal(50, 10, n_per_group)
    _, p = stats.ttest_ind(g1, g2)
    if p < alpha:
        false_positives += 1

print(f"Simulated Type I error rate: {false_positives/n_simulations:.4f} (expected ~{alpha})")

# Under H1: real effect of size 0.5 SD
# Fraction of runs with p < alpha estimates statistical power (1 - Type II rate).
true_positives = 0
for _ in range(n_simulations):
    g1 = np.random.normal(50, 10, n_per_group)
    g2 = np.random.normal(55, 10, n_per_group)
    _, p = stats.ttest_ind(g1, g2)
    if p < alpha:
        true_positives += 1
print(f"Simulated Power:             {true_positives/n_simulations:.4f}")
💼 Real-World Scenario
An online retailer A/B tests a redesigned checkout page. Conversion rates (number of sales per session) are measured for control (old page) and treatment (new page) groups. The team runs an independent t-test on session revenue values, checks assumptions with Levene's test, and interprets the p-value against their pre-specified alpha of 0.05.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Simulate session revenues (in $) for each group; lognormal mimics the
# right-skew typical of revenue data.
n_ctrl, n_trt = 500, 500
control   = np.random.lognormal(mean=3.5, sigma=1.2, size=n_ctrl)
treatment = np.random.lognormal(mean=3.65, sigma=1.2, size=n_trt)

print("=== A/B Test: Checkout Page Revenue ===")
print(f"Control:   n={n_ctrl}, mean=${control.mean():.2f}, median=${np.median(control):.2f}")
print(f"Treatment: n={n_trt}, mean=${treatment.mean():.2f}, median=${np.median(treatment):.2f}")

# Variance equality
lev_stat, lev_p = stats.levene(control, treatment)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nLevene test: F={lev_stat:.3f}, p={lev_p:.4f}")
equal_var = lev_p > 0.05

# Welch t-test (or Student's, if Levene did not reject equal variances)
t_stat, p_val = stats.ttest_ind(control, treatment, equal_var=equal_var)
print(f"Welch t-test: t={t_stat:.3f}, p={p_val:.4f}")
# Fixed mojibake dash in the decision message.
print(f"Decision (alpha=0.05): {'Reject H0 — new page works' if p_val < 0.05 else 'Fail to reject H0'}")

uplift = (treatment.mean() - control.mean()) / control.mean() * 100
print(f"Revenue uplift: {uplift:.1f}%")
🏋️ Practice: One-Sample and Two-Sample Tests
A company claims their delivery time averages 3 days. You sample 40 recent deliveries. Run a one-sample t-test to check the claim. Then compare morning vs afternoon delivery times using an independent t-test. Report t-statistic, p-value, and conclusion at alpha=0.05.
Starter Code
import numpy as np
from scipy import stats

# Practice scaffold: one-sample test of a claimed mean, then a
# two-sample comparison of morning vs afternoon deliveries.
np.random.seed(11)
deliveries = np.random.normal(loc=3.4, scale=0.8, size=40)
morning    = np.random.normal(loc=3.2, scale=0.7, size=25)
afternoon  = np.random.normal(loc=3.6, scale=0.9, size=25)

# TODO: One-sample t-test: H0: mean = 3.0 days
# TODO: Two-sample t-test: morning vs afternoon
# TODO: Print t, p, and conclusion for each
✅ Practice Checklist
6. ANOVA & Multiple Groups

ANOVA (Analysis of Variance) tests whether means differ across three or more groups simultaneously, avoiding inflated Type I errors from multiple pairwise t-tests. One-way ANOVA requires normality and equal variances; Kruskal-Wallis is the non-parametric alternative. Post-hoc tests identify which specific pairs differ.

One-way ANOVA with scipy.stats.f_oneway
import numpy as np
from scipy import stats

np.random.seed(42)
group_a = np.random.normal(20, 4, 30)
group_b = np.random.normal(24, 4, 30)
group_c = np.random.normal(22, 4, 30)
group_d = np.random.normal(20, 4, 30)

# One-way ANOVA: H0 = all four group means are equal.
f_stat, p_val = stats.f_oneway(group_a, group_b, group_c, group_d)

for name, grp in zip("ABCD", [group_a,group_b,group_c,group_d]):
    print(f"Group {name}: mean={grp.mean():.3f}, std={grp.std(ddof=1):.3f}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nF-statistic: {f_stat:.4f}")
print(f"p-value:     {p_val:.4f}")
print(f"Result: {'Significant difference between groups' if p_val < 0.05 else 'No significant difference'}")
ANOVA assumptions — Levene and Shapiro-Wilk
import numpy as np
from scipy import stats

# Check the two classical one-way ANOVA assumptions on three groups:
# homogeneity of variance (Levene) and per-group normality (Shapiro-Wilk).
np.random.seed(0)
g1 = np.random.normal(10, 2, 25)
g2 = np.random.normal(12, 3, 25)
g3 = np.random.normal(11, 2, 25)

# Homogeneity of variance
lev_stat, lev_p = stats.levene(g1, g2, g3)
print(f"Levene test: F={lev_stat:.3f}, p={lev_p:.4f}")
print(f"  Equal variances assumed: {lev_p > 0.05}")

# Normality within each group
# p > 0.05 means we cannot reject normality for that group.
for i, g in enumerate([g1,g2,g3],1):
    w, p = stats.shapiro(g)
    print(f"Shapiro-Wilk Group {i}: W={w:.4f}, p={p:.4f} β€” {'Normal' if p>0.05 else 'Non-normal'}")
Kruskal-Wallis — non-parametric ANOVA
import numpy as np
from scipy import stats

np.random.seed(5)
# Skewed groups — ANOVA assumptions violated, so Kruskal-Wallis
# (rank-based, no normality assumption) is the appropriate test.
g1 = np.random.exponential(2, 30)
g2 = np.random.exponential(3, 30)
g3 = np.random.exponential(2.5, 30)

kw_stat, kw_p = stats.kruskal(g1, g2, g3)
f_stat, f_p   = stats.f_oneway(g1, g2, g3)   # shown for comparison only

print("Group medians:")
for i, g in enumerate([g1,g2,g3],1):
    print(f"  Group {i}: median={np.median(g):.3f}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nKruskal-Wallis: H={kw_stat:.4f}, p={kw_p:.4f}")
print(f"One-way ANOVA:  F={f_stat:.4f}, p={f_p:.4f}")
Post-hoc pairwise tests with Bonferroni correction
import numpy as np
from scipy import stats
from itertools import combinations

np.random.seed(9)
groups = {
    "Control":   np.random.normal(50, 8, 30),
    "Drug A":    np.random.normal(58, 8, 30),
    "Drug B":    np.random.normal(55, 8, 30),
    "Placebo":   np.random.normal(51, 8, 30),
}

# Omnibus test first: is there any difference among the four means?
f_stat, p_val = stats.f_oneway(*groups.values())
print(f"ANOVA: F={f_stat:.3f}, p={p_val:.4f}")

# Pairwise t-tests with Bonferroni correction: divide alpha by the
# number of comparisons to control the family-wise error rate.
names = list(groups.keys())
pairs = list(combinations(names, 2))
alpha_corr = 0.05 / len(pairs)   # Bonferroni

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nBonferroni alpha: {alpha_corr:.4f} ({len(pairs)} comparisons)")
for n1, n2 in pairs:
    t, p = stats.ttest_ind(groups[n1], groups[n2])
    sig  = "*" if p < alpha_corr else ""
    print(f"  {n1} vs {n2}: p={p:.4f} {sig}")
💼 Real-World Scenario
A nutrition researcher compares weight loss across four diets (Mediterranean, Keto, Vegan, Low-Fat) over 12 weeks. One-way ANOVA tests whether any diet differs. Levene's test checks variance equality. Post-hoc pairwise tests with Bonferroni correction identify which specific diets differ significantly.
Real-World Code
import numpy as np
from scipy import stats
from itertools import combinations

np.random.seed(42)
# Weight loss (kg) over 12 weeks for four diets, 35 subjects each.
diets = {
    "Mediterranean": np.random.normal(5.2, 2.1, 35),
    "Keto":          np.random.normal(7.1, 2.5, 35),
    "Vegan":         np.random.normal(4.8, 1.9, 35),
    "Low-Fat":       np.random.normal(4.1, 2.0, 35),
}

print("=== Diet Weight Loss Study (kg) ===")
for diet, vals in diets.items():
    print(f"{diet:15s}: mean={vals.mean():.2f}, std={vals.std(ddof=1):.2f}")

f_stat, p_val = stats.f_oneway(*diets.values())
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nANOVA: F={f_stat:.3f}, p={p_val:.4f}")

if p_val < 0.05:
    # Fixed mojibake dash in the message below.
    print("Significant — running post-hoc tests")
    names = list(diets.keys())
    pairs = list(combinations(names, 2))
    bonf  = 0.05 / len(pairs)   # Bonferroni-corrected threshold
    print(f"Bonferroni alpha: {bonf:.4f}")
    for n1, n2 in pairs:
        _, p = stats.ttest_ind(diets[n1], diets[n2])
        print(f"  {n1:15s} vs {n2:15s}: p={p:.4f} {'*' if p<bonf else ''}")
🏋️ Practice: Compare Three Teaching Methods
Simulate exam scores for students taught by three methods: Lecture (mean=72), Active Learning (mean=78), Online (mean=74), each with std=10 and n=30. Run one-way ANOVA. If significant, apply Bonferroni-corrected pairwise t-tests. Also run Kruskal-Wallis as a robustness check.
Starter Code
import numpy as np
from scipy import stats
from itertools import combinations

# Practice scaffold: three teaching methods — ANOVA, Bonferroni post-hoc,
# and Kruskal-Wallis as a non-parametric robustness check.
np.random.seed(55)
lecture = np.random.normal(72, 10, 30)
active  = np.random.normal(78, 10, 30)
online  = np.random.normal(74, 10, 30)

# TODO: One-way ANOVA
# TODO: If significant, Bonferroni post-hoc
# TODO: Kruskal-Wallis test
# TODO: Compare ANOVA vs Kruskal-Wallis conclusions
✅ Practice Checklist
7. Chi-Square Tests

Chi-square tests assess relationships between categorical variables. The goodness-of-fit test checks if observed frequencies match expected. The test of independence checks if two categorical variables are related. scipy.stats.chi2_contingency handles contingency tables directly.

Chi-square goodness-of-fit test
import numpy as np
from scipy import stats

# Test if a die is fair (expected: equal frequency for each face)
observed = np.array([48, 55, 52, 60, 45, 40])   # 300 rolls
expected_freq = 300 / 6
expected = np.full(6, expected_freq)

chi2, p_val = stats.chisquare(f_obs=observed, f_exp=expected)
df = len(observed) - 1   # k - 1 categories

print("Die Roll Test (300 rolls)")
print(f"Observed: {observed}")
print(f"Expected: {expected}")
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nChi2 statistic: {chi2:.4f}")
print(f"Degrees of freedom: {df}")
print(f"p-value: {p_val:.4f}")
print(f"Die is {'NOT fair' if p_val < 0.05 else 'fair (fail to reject H0)'}")
Chi-square test of independence β€” contingency table
import numpy as np
from scipy import stats
import pandas as pd

# Customer satisfaction by region
data = {
    "Satisfied":   {"North": 120, "South": 85, "East": 95, "West": 110},
    "Neutral":     {"North":  40, "South": 35, "East": 42, "West":  38},
    "Unsatisfied": {"North":  20, "South": 30, "East": 23, "West":  22},
}
# Transpose so rows = satisfaction level, columns = region.
table = pd.DataFrame(data).T
print("Contingency Table:")
print(table)

chi2, p_val, dof, expected = stats.chi2_contingency(table)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nChi2={chi2:.4f}, df={dof}, p={p_val:.4f}")
print(f"Satisfaction independent of region: {p_val > 0.05}")
Cramer's V — effect size for chi-square
import numpy as np
from scipy import stats

def _effect_label(v):
    """Conventional cut-offs for interpreting Cramer's V."""
    if v < 0.1:
        return "small"
    if v < 0.3:
        return "medium"
    return "large"

# 2x2 contingency: Gender vs. Purchase
table = np.array([[230, 170],   # Male:   bought, did not buy
                  [195, 205]])  # Female: bought, did not buy

chi2, p_val, dof, expected = stats.chi2_contingency(table)
n = table.sum()
k = min(table.shape) - 1

# Cramer's V: chi-square normalized to [0, 1] by n and table shape.
cramers_v = np.sqrt(chi2 / (n * k))
print(f"Chi2={chi2:.4f}, df={dof}, p={p_val:.4f}")
print(f"n={n}, Cramer's V={cramers_v:.4f}")
print(f"Effect size: {_effect_label(cramers_v)}")
Expected frequency check and Fisher's exact test
import numpy as np
from scipy import stats

# Small sample — use Fisher's exact test.
# Rare disease by treatment (rows: treatment, cols: outcome).
table = np.array([[5, 15],
                  [2, 18]])

chi2, p_chi, dof, expected = stats.chi2_contingency(table)
oddsratio, p_fisher = stats.fisher_exact(table)

print("Observed:")
print(table)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nExpected frequencies:")
print(expected.round(2))
print(f"\nChi2 p-value:   {p_chi:.4f}  (valid if all expected >= 5)")
print(f"Fisher p-value: {p_fisher:.4f}  (preferred for small samples)")
print(f"Odds ratio:     {oddsratio:.4f}")
💼 Real-World Scenario
A market research firm surveys 800 shoppers about their preferred payment method (Cash, Card, Mobile) segmented by age group (18-34, 35-54, 55+). They run a chi-square test of independence to determine if payment preference is associated with age, then compute Cramer's V to quantify the strength of association.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
# Contingency table: age group vs payment method
table = np.array([
    [45, 130, 125],   # 18-34: Cash, Card, Mobile
    [80, 155,  65],   # 35-54
    [95, 120,  25],   # 55+
])
row_labels = ["18-34", "35-54", "55+"]
col_labels  = ["Cash", "Card", "Mobile"]

df_table = pd.DataFrame(table, index=row_labels, columns=col_labels)
print("Payment Method by Age Group:")
print(df_table)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nRow totals: {df_table.sum(axis=1).values}")

chi2, p_val, dof, expected = stats.chi2_contingency(table)
n = table.sum()
# Cramer's V: chi-square normalized by n and min(rows, cols) - 1.
cramers_v = np.sqrt(chi2 / (n * (min(table.shape)-1)))

print(f"\nChi2={chi2:.3f}, df={dof}, p={p_val:.4f}")
print(f"Cramer's V = {cramers_v:.4f} ({'small' if cramers_v<0.1 else 'medium' if cramers_v<0.3 else 'large'} effect)")
print(f"Association between age and payment method: {'Yes' if p_val<0.05 else 'No'}")
🏋️ Practice: A/B Test with Chi-Square
An A/B test shows 180 conversions out of 900 visitors (control) and 210 conversions out of 900 visitors (treatment). Build a 2x2 contingency table and run a chi-square test of independence to determine if the conversion rate differs significantly. Compute Cramer's V.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

# Control:   180 converted, 720 did not
# Treatment: 210 converted, 690 did not
table = np.array([
    [180, 720],
    [210, 690],
])

# TODO: Run chi2_contingency
# TODO: Compute and interpret Cramer's V
# TODO: Also run Fisher's exact as a check
✅ Practice Checklist
8. Correlation & Regression

Correlation measures the linear (Pearson) or monotonic (Spearman) relationship between two variables, ranging from -1 to +1. Simple linear regression models one variable as a linear function of another. scipy.stats.linregress provides slope, intercept, r-value, and p-value for the slope.

Pearson and Spearman correlation
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
x = np.random.uniform(0, 10, 50)
y = 2.5 * x + np.random.normal(0, 3, 50)   # linear + noise

# Pearson: linear association; Spearman: monotonic (rank-based) association.
pearson_r,  pearson_p  = stats.pearsonr(x, y)
spearman_r, spearman_p = stats.spearmanr(x, y)

print(f"Pearson r:  {pearson_r:.4f}  (p={pearson_p:.4f})")
print(f"Spearman r: {spearman_r:.4f}  (p={spearman_p:.4f})")

# Correlation matrix with pandas.
# NOTE(review): y can be negative here, so y**0.5 yields NaN for those rows;
# pandas corr() skips NaN pairs — confirm this column is intended.
df = pd.DataFrame({"x": x, "y": y, "z": y**0.5})
print("\nCorrelation matrix:")
print(df.corr(method="pearson").round(4))
Simple linear regression with scipy.stats.linregress
import numpy as np
from scipy import stats

np.random.seed(0)
hours_studied = np.random.uniform(1, 10, 60)
exam_score    = 50 + 4.5 * hours_studied + np.random.normal(0, 5, 60)

# Ordinary least squares fit of exam_score on hours_studied.
result = stats.linregress(hours_studied, exam_score)

print(f"slope:     {result.slope:.4f}")
print(f"intercept: {result.intercept:.4f}")
print(f"r-value:   {result.rvalue:.4f}")
print(f"r-squared: {result.rvalue**2:.4f}")
print(f"p-value:   {result.pvalue:.6f}")
print(f"std error: {result.stderr:.4f}")

# Point prediction from the fitted line.
pred_8hrs = result.slope * 8 + result.intercept
print(f"\nPredicted score for 8 hrs: {pred_8hrs:.2f}")
Multiple correlation using numpy polyfit
import numpy as np
from scipy import stats

np.random.seed(3)
x = np.linspace(0, 10, 80)
y = 3 + 2*x - 0.15*x**2 + np.random.normal(0, 2, 80)

def _r_squared(y_obs, y_fit):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    ss_res = np.sum((y_obs - y_fit) ** 2)
    ss_tot = np.sum((y_obs - y_obs.mean()) ** 2)
    return 1 - ss_res / ss_tot

# Fit polynomials of degree 1 (line) and 2 (parabola) to the same data.
coef1 = np.polyfit(x, y, 1)
coef2 = np.polyfit(x, y, 2)
r2_lin  = _r_squared(y, np.polyval(coef1, x))
r2_quad = _r_squared(y, np.polyval(coef2, x))

print(f"Linear fit:    RΒ² = {r2_lin:.4f}")
print(f"Quadratic fit: RΒ² = {r2_quad:.4f}")
print(f"Quadratic coefficients: {coef2.round(4)}")
Residuals analysis and influential points
import numpy as np
from scipy import stats

np.random.seed(1)
x = np.random.uniform(0, 20, 50)
y = 5 + 1.8 * x + np.random.normal(0, 4, 50)

# Fit the line, then form residuals (observed minus fitted).
result = stats.linregress(x, y)
y_pred = result.intercept + result.slope * x
residuals = y - y_pred

# Standardize residuals; |z| > 2 flags potential outliers (~5% expected).
std_res = (residuals - residuals.mean()) / residuals.std(ddof=1)
outlier_mask = np.abs(std_res) > 2

print(f"n={len(x)}, RΒ²={result.rvalue**2:.4f}")
print(f"Residual std: {residuals.std(ddof=1):.4f}")
print(f"Outliers (|std resid|>2): {outlier_mask.sum()}")
print(f"Expected outliers ~5%: {int(0.05*len(x))}")

# Durbin-Watson statistic (autocorrelation in residuals)
dw = np.square(np.diff(residuals)).sum() / np.square(residuals).sum()
print(f"Durbin-Watson: {dw:.4f}  (2=no autocorr, <2=positive, >2=negative)")
💼 Real-World Scenario
A real estate company models house prices using square footage. They compute Pearson correlation, fit a simple linear regression with scipy.stats.linregress, assess R-squared, and analyze residuals to check for linearity and homoscedasticity. They also identify outlier properties that deviate from the trend.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
# Simulate house data
sqft  = np.random.uniform(800, 3500, 150)
price = 50000 + 180 * sqft + np.random.normal(0, 25000, 150)
# Add a few luxury outliers
price[:5] += 200000

df = pd.DataFrame({"sqft": sqft, "price": price})

# Strength of the linear relationship.
pearson_r, pearson_p = stats.pearsonr(df["sqft"], df["price"])
print(f"Pearson r = {pearson_r:.4f} (p={pearson_p:.2e})")

# Simple OLS: price as a linear function of square footage.
result = stats.linregress(df["sqft"], df["price"])
print(f"\nRegression: price = {result.slope:.2f} * sqft + {result.intercept:,.0f}")
print(f"RΒ² = {result.rvalue**2:.4f}")

# Predict 2000 sqft house
pred = result.slope * 2000 + result.intercept
print(f"Predicted price (2000 sqft): ${pred:,.0f}")

# Residual analysis: standardized residuals flag unusual properties.
residuals = df["price"] - (result.slope * df["sqft"] + result.intercept)
std_res = (residuals - residuals.mean()) / residuals.std(ddof=1)
print(f"\nOutlier properties (|std resid|>2.5): {(np.abs(std_res)>2.5).sum()}")
🏋️ Practice: Advertising Spend vs Sales Regression
Given weekly advertising spend (x) ranging from $1000 to $10000 and weekly sales (y), fit a linear regression. Compute slope, intercept, R-squared, and predict sales for a $7500 spend. Compute both Pearson and Spearman correlations and compare them.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

np.random.seed(88)
ad_spend = np.random.uniform(1000, 10000, 52)   # weekly ad spend
sales    = 2000 + 3.5 * ad_spend + np.random.normal(0, 3000, 52)

# TODO: Compute Pearson and Spearman correlations
# TODO: Fit linear regression with scipy.stats.linregress
# TODO: Print slope, intercept, RΒ²
# TODO: Predict sales for $7500 ad spend
✅ Practice Checklist
9. Non-Parametric Tests

Non-parametric tests make no assumptions about the underlying distribution. They operate on ranks rather than raw values, making them robust to outliers and skewed data. Key tests include Mann-Whitney U (two independent groups), Wilcoxon signed-rank (paired), and Shapiro-Wilk (normality check).

Shapiro-Wilk normality test
import numpy as np
from scipy import stats

np.random.seed(42)
normal_data = np.random.normal(50, 10, 50)
skewed_data = np.random.exponential(scale=5, size=50)
uniform_data = np.random.uniform(0, 100, 50)

# H0: the sample comes from a normal distribution; p > 0.05 => cannot reject.
samples = [("Normal", normal_data), ("Exponential", skewed_data), ("Uniform", uniform_data)]

print("Shapiro-Wilk Normality Test")
print(f"{'Dataset':<15} {'W-stat':>8} {'p-value':>9} {'Normal?':>8}")
for name, data in samples:
    w, p = stats.shapiro(data)
    print(f"{name:<15} {w:>8.4f} {p:>9.4f} {str(p>0.05):>8}")
Mann-Whitney U test β€” two independent samples
import numpy as np
from scipy import stats

np.random.seed(7)
# Skewed distributions β€” t-test assumptions violated
group1 = np.random.exponential(scale=3, size=35)
group2 = np.random.exponential(scale=4, size=38)

# Parametric: Welch's t-test. BUG FIX: equal_var=False makes this actually
# Welch's test, matching the printed label (the default is Student's t-test).
t_stat, t_p = stats.ttest_ind(group1, group2, equal_var=False)
# Non-parametric rank-based alternative.
u_stat, mw_p = stats.mannwhitneyu(group1, group2, alternative="two-sided")

print(f"Group 1: median={np.median(group1):.3f}, mean={group1.mean():.3f}")
print(f"Group 2: median={np.median(group2):.3f}, mean={group2.mean():.3f}")
print(f"\nWelch t-test:     t={t_stat:.3f}, p={t_p:.4f}")
print(f"Mann-Whitney U:   U={u_stat:.1f},  p={mw_p:.4f}")

# Rank-biserial correlation (effect size): r = 1 - 2U / (n1*n2).
r_rb = 1 - (2*u_stat) / (len(group1)*len(group2))
print(f"Rank-biserial r:  {r_rb:.4f}")
Wilcoxon signed-rank test β€” paired samples
import numpy as np
from scipy import stats

np.random.seed(3)
n = 30
before = np.random.exponential(scale=8, size=n)
after  = before * np.random.uniform(0.6, 0.95, n)   # every subject improves

# Parametric paired t-test vs rank-based Wilcoxon signed-rank test.
t_stat, t_p = stats.ttest_rel(before, after)
w_stat, w_p = stats.wilcoxon(before, after)

print(f"Before: median={np.median(before):.3f}")
print(f"After:  median={np.median(after):.3f}")
print(f"\nPaired t-test:          t={t_stat:.3f}, p={t_p:.4f}")
print(f"Wilcoxon signed-rank:   W={w_stat:.1f},  p={w_p:.4f}")
Kolmogorov-Smirnov and Anderson-Darling tests
import numpy as np
from scipy import stats

np.random.seed(0)
data = np.random.normal(5, 2, 100)

# KS test against a normal with parameters estimated from the same sample.
# NOTE(review): estimating loc/scale from the data makes this p-value
# conservative (a Lilliefors correction would be exact) β€” kept for the demo.
ks_stat, ks_p = stats.kstest(data, "norm", args=(data.mean(), data.std()))
print(f"K-S test vs Normal:  D={ks_stat:.4f}, p={ks_p:.4f}")

# Anderson-Darling: compare the statistic against tabulated critical values.
ad_result = stats.anderson(data, dist="norm")
print(f"\nAnderson-Darling: statistic={ad_result.statistic:.4f}")
for sl, cv in zip(ad_result.significance_level, ad_result.critical_values):
    reject = ad_result.statistic > cv
    print(f"  alpha={sl:5.1f}%: critical={cv:.3f}, reject H0={reject}")

# Two-sample KS: are the two samples drawn from the same distribution?
data2 = np.random.normal(5.5, 2, 100)
ks2_stat, ks2_p = stats.ks_2samp(data, data2)
print(f"\nTwo-sample KS test: D={ks2_stat:.4f}, p={ks2_p:.4f}")
💼 Real-World Scenario
A clinical researcher compares pain scores (highly skewed, 0-10 scale) before and after a new pain management protocol using the Wilcoxon signed-rank test. They also compare pain scores between two patient cohorts using Mann-Whitney U. Shapiro-Wilk normality tests confirm non-parametric methods are appropriate.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
n = 40
# Pain scores (0-10, skewed toward lower end)
before = np.random.beta(a=2, b=1.5, size=n) * 10
after  = before * np.random.uniform(0.4, 0.85, n)

# Normality check β€” non-normal data justifies the rank-based tests below.
wb, pb = stats.shapiro(before)
wa, pa = stats.shapiro(after)
print("Shapiro-Wilk normality test:")
print(f"  Before: W={wb:.4f}, p={pb:.4f} β€” {'Normal' if pb>0.05 else 'Non-normal'}")
print(f"  After:  W={wa:.4f}, p={pa:.4f} β€” {'Normal' if pa>0.05 else 'Non-normal'}")

# Wilcoxon signed-rank (paired, non-parametric)
w_stat, w_p = stats.wilcoxon(before, after)
print(f"\nWilcoxon signed-rank: W={w_stat:.1f}, p={w_p:.4f}")
print(f"Median reduction: {np.median(before-after):.2f} points")

# Independent cohort comparison (unpaired, non-parametric)
cohort_a = np.random.beta(2, 1.5, 40) * 10
cohort_b = np.random.beta(1.5, 2, 40) * 10
u_stat, mw_p = stats.mannwhitneyu(cohort_a, cohort_b, alternative="two-sided")
print(f"\nMann-Whitney U (cohort comparison): U={u_stat:.1f}, p={mw_p:.4f}")
🏋️ Practice: Choose Parametric vs Non-Parametric
Generate three datasets: (A) Normal(50,10) n=40, (B) Exponential(scale=5) n=40, (C) Uniform(0,100) n=40. For each pair, first run Shapiro-Wilk, then decide whether to use t-test or Mann-Whitney U. Run both and compare p-values.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

np.random.seed(22)
group_a = np.random.normal(50, 10, 40)
group_b = np.random.exponential(scale=5, size=40)
group_c = np.random.uniform(0, 100, 40)

# TODO: Shapiro-Wilk for each group
# TODO: For A vs B: t-test and Mann-Whitney
# TODO: For A vs C: t-test and Mann-Whitney
# TODO: Discuss which test is appropriate for each pair
✅ Practice Checklist
10. Effect Size & Power Analysis

Effect size quantifies the practical significance of a result, independent of sample size. Cohen's d measures standardized mean differences. Statistical power (1 - beta) is the probability of detecting a true effect. Power analysis determines the sample size needed to achieve adequate power (typically 0.80).

Cohen's d β€” effect size for two groups
import numpy as np
from scipy import stats

np.random.seed(42)
group1 = np.random.normal(50, 10, 100)
group2 = np.random.normal(56, 10, 100)

# Cohen's d using the equal-n pooled standard deviation.
mean_diff  = group1.mean() - group2.mean()
pooled_std = np.sqrt(0.5 * (group1.var(ddof=1) + group2.var(ddof=1)))
cohens_d   = abs(mean_diff) / pooled_std

print(f"Group 1 mean: {group1.mean():.3f}")
print(f"Group 2 mean: {group2.mean():.3f}")
print(f"Pooled std:   {pooled_std:.3f}")
print(f"Cohen's d:    {cohens_d:.4f}")

# Map |d| onto Cohen's conventional magnitude labels.
if cohens_d < 0.2:
    magnitude = "small"
elif cohens_d < 0.5:
    magnitude = "small-medium"
elif cohens_d < 0.8:
    magnitude = "medium"
else:
    magnitude = "large"
print(f"Effect size:  {magnitude}")

_, p_val = stats.ttest_ind(group1, group2)
print(f"p-value:      {p_val:.4f}")
Power analysis for one-sample t-test with statsmodels
# BUG FIX: numpy was previously imported *after* its first use at np.ceil
# below, which raised NameError when the snippet ran top to bottom.
import numpy as np
from statsmodels.stats.power import TTestPower

analysis = TTestPower()

# Solve for n given effect size d=0.5, alpha=0.05, target power=0.80.
n_needed = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.80, alternative="two-sided")
print(f"Sample size for d=0.5, power=0.80: n = {n_needed:.1f} β†’ {int(np.ceil(n_needed))}")

# Power curve: how does power change with n?
print("\nPower by sample size (d=0.5, alpha=0.05):")
for n in [10, 20, 30, 50, 80, 100, 150, 200]:
    pwr = analysis.solve_power(effect_size=0.5, nobs=n, alpha=0.05, alternative="two-sided")
    bar = "#" * int(pwr * 20)
    print(f"  n={n:4d}: power={pwr:.3f}  {bar}")
Power analysis for two-sample t-test
import numpy as np
from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()

# Required per-group n across a range of effect sizes.
print("Required n per group (alpha=0.05, power=0.80):")
for d in [0.2, 0.3, 0.5, 0.8, 1.0, 1.5]:
    n = analysis.solve_power(effect_size=d, alpha=0.05, power=0.80, alternative="two-sided")
    print(f"  d={d}: n = {int(np.ceil(n))} per group  (total={int(np.ceil(n))*2})")

# Achieved power at a fixed per-group n of 30.
print("\nPower with n=30 per group (alpha=0.05):")
for d in [0.2, 0.5, 0.8]:
    pwr = analysis.solve_power(effect_size=d, nobs1=30, alpha=0.05, alternative="two-sided")
    print(f"  d={d}: power={pwr:.4f}")
Empirical power via simulation
import numpy as np
from scipy import stats

np.random.seed(0)
n_sim   = 5000
alpha   = 0.05
n       = 40
effect  = 0.5   # Cohen's d

def _count_rejections(trials, per_group, d, level):
    """Count how often a two-sample t-test rejects H0 when H1 is true."""
    hits = 0
    for _ in range(trials):
        g1 = np.random.normal(0, 1, per_group)
        g2 = np.random.normal(d, 1, per_group)
        if stats.ttest_ind(g1, g2)[1] < level:
            hits += 1
    return hits

# Empirical power = rejection rate under the alternative hypothesis.
rejections = _count_rejections(n_sim, n, effect, alpha)
emp_power = rejections / n_sim

# Analytical power for comparison
from statsmodels.stats.power import TTestIndPower
analytical = TTestIndPower().solve_power(effect_size=effect, nobs1=n, alpha=alpha)

print(f"n={n} per group, d={effect}, alpha={alpha}")
print(f"Empirical power:   {emp_power:.4f}")
print(f"Analytical power:  {analytical:.4f}")
print(f"Type II error (Ξ²): {1-emp_power:.4f}")
💼 Real-World Scenario
A product team plans an A/B test for a new onboarding flow. They estimate the current conversion rate maps to a Cohen's d of 0.35 for time-to-first-purchase. Using statsmodels TTestIndPower, they calculate the minimum sample size needed to achieve 80% power at alpha=0.05, and validate with a simulation study.
Real-World Code
import numpy as np
from scipy import stats
from statsmodels.stats.power import TTestIndPower

np.random.seed(42)

# Business context: expected improvement in days-to-purchase
current_mean, new_mean, common_std = 12.0, 10.5, 4.0
effect_d = (current_mean - new_mean) / common_std
print(f"=== A/B Test Power Planning ===")
print(f"Expected effect size (Cohen's d): {effect_d:.4f}")

# Minimum per-arm sample size for 80% power at alpha = 0.05.
analysis = TTestIndPower()
n_needed = analysis.solve_power(effect_size=effect_d, alpha=0.05, power=0.80)
print(f"Required n per arm: {int(np.ceil(n_needed))}")
print(f"Total sample size:  {int(np.ceil(n_needed))*2}")

# Simulate the actual experiment at the recommended sample size.
n_actual = int(np.ceil(n_needed))
control   = np.random.normal(current_mean, common_std, n_actual)
treatment = np.random.normal(new_mean,     common_std, n_actual)

t_stat, p_val = stats.ttest_ind(control, treatment)
mean_diff  = control.mean() - treatment.mean()
pooled_std = np.sqrt((control.var(ddof=1) + treatment.var(ddof=1)) / 2)
obs_d      = mean_diff / pooled_std

print(f"\nSimulated experiment results:")
print(f"Observed Cohen's d: {obs_d:.4f}")
print(f"p-value:            {p_val:.4f}")
print(f"Decision: {'Reject H0 β€” new flow is better' if p_val<0.05 else 'Fail to reject H0'}")
🏋️ Practice: Design a Clinical Trial Sample Size
A new medication is expected to reduce symptom severity by 2 points (scale 0-20) with a pooled standard deviation of 5. Compute Cohen's d. Then use TTestIndPower to find the sample size needed for 80%, 85%, and 90% power at alpha=0.05. Finally, simulate 3000 trials with the recommended n and estimate empirical power.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats
from statsmodels.stats.power import TTestIndPower

expected_diff = 2.0
pooled_std    = 5.0

# TODO: Compute Cohen's d
# TODO: TTestIndPower: find n for power = 0.80, 0.85, 0.90
# TODO: Simulate 3000 trials with n from 80% power calculation
# TODO: Report empirical power
✅ Practice Checklist
11. Bayesian Inference

Update beliefs with evidence using Bayes' theorem. Compute posterior distributions, credible intervals, and compare Bayesian vs frequentist approaches.

Bayes' theorem β€” coin flip posterior
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Prior: uniform (Beta(1,1)) -> update with 7 heads out of 10 flips
heads, tails = 7, 3
theta = np.linspace(0, 1, 1000)

# Prior: Beta(1,1) = uniform
alpha_prior, beta_prior = 1, 1
# Posterior: Beta(alpha+heads, beta+tails)
# Beta is conjugate to the binomial, so the posterior is available in
# closed form β€” no numerical integration needed.
alpha_post = alpha_prior + heads
beta_post  = beta_prior  + tails

from scipy.stats import beta
prior     = beta.pdf(theta, alpha_prior, beta_prior)
likelihood = theta**heads * (1-theta)**tails
likelihood /= likelihood.sum()  # normalize
posterior  = beta.pdf(theta, alpha_post, beta_post)

# Plot prior, (scaled) likelihood, and posterior on a shared axis.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(theta, prior,     label='Prior Beta(1,1)',      linestyle='--')
ax.plot(theta, likelihood*max(posterior)/max(likelihood), label='Likelihood (scaled)')
ax.plot(theta, posterior, label=f'Posterior Beta({alpha_post},{beta_post})', linewidth=2)
ax.axvline(alpha_post/(alpha_post+beta_post), color='red', linestyle=':', label='Posterior mean')
ax.set_xlabel('theta (P(heads))')
ax.set_title(f'Bayesian Update: {heads}H/{tails}T')
ax.legend(); plt.tight_layout()
plt.savefig('bayesian_coin.png', dpi=80); plt.close()
# Posterior mean of Beta(a,b) is a/(a+b); credible interval via the ppf.
print(f'Posterior mean: {alpha_post/(alpha_post+beta_post):.3f}')
print(f'95% credible interval: ({beta.ppf(0.025,alpha_post,beta_post):.3f}, {beta.ppf(0.975,alpha_post,beta_post):.3f})')
Bayesian A/B test with Beta-Binomial
import numpy as np
from scipy.stats import beta

# A: 120 conversions from 1000 visitors
# B: 140 conversions from 1000 visitors
np.random.seed(42)

# Posteriors under a uniform Beta(1,1) prior: Beta(1+conv, 1+non-conv).
alpha_a, beta_a = 1 + 120, 1 + 880
alpha_b, beta_b = 1 + 140, 1 + 860

# Monte Carlo draws from each posterior.
samples_a = beta.rvs(alpha_a, beta_a, size=100000)
samples_b = beta.rvs(alpha_b, beta_b, size=100000)

lift = samples_b - samples_a
prob_b_better = (samples_b > samples_a).mean()
expected_lift = lift.mean()

print(f'Control A: rate = {120/1000:.1%}')
print(f'Variant B: rate = {140/1000:.1%}')
print(f'P(B > A):  {prob_b_better:.1%}')
print(f'Expected lift: +{expected_lift:.2%}')
print(f'95% CI for lift: ({np.percentile(lift,2.5):.2%}, {np.percentile(lift,97.5):.2%})')
Conjugate priors for different distributions
import numpy as np
from scipy.stats import beta, gamma, norm

np.random.seed(42)

# 1. Beta-Binomial: click-through rate estimation
clicks, views = 45, 200
alpha, beta_param = 1 + clicks, 1 + (views - clicks)
ctr_lo, ctr_hi = beta.ppf([0.025, 0.975], alpha, beta_param)
print('CTR posterior:')
print(f'  Mean: {alpha/(alpha+beta_param):.3f}')
print(f'  95% CI: ({ctr_lo:.3f}, {ctr_hi:.3f})')

# 2. Normal-Normal: mean estimation (known variance sigma^2 = 4)
data = np.random.normal(5.0, 2.0, 30)  # true mean=5
prior_mean, prior_var = 0, 10  # weak prior
likelihood_var = 4 / len(data)  # variance of the sample mean: sigma^2/n
# Precision-weighted combination of prior mean and sample mean.
posterior_var  = 1 / (1/prior_var + 1/likelihood_var)
posterior_mean = posterior_var * (prior_mean/prior_var + data.mean()/likelihood_var)
print(f'\nNormal posterior mean: {posterior_mean:.3f} (true=5.0, sample mean={data.mean():.3f})')

# 3. Gamma-Poisson: event rate estimation
events_per_day = [3, 5, 2, 4, 6, 3, 4]  # observed counts
alpha_g, beta_g = 1 + sum(events_per_day), 1 + len(events_per_day)
print(f'\nPoisson rate posterior mean: {alpha_g/beta_g:.3f} events/day')
Markov Chain Monte Carlo (MCMC) concept
import numpy as np

np.random.seed(42)

def log_posterior(theta, data, prior_std=2.0):
    """Log posterior for normal likelihood, normal prior."""
    log_prior = -0.5 * (theta / prior_std)**2
    log_likelihood = -0.5 * np.sum((data - theta)**2)
    return log_prior + log_likelihood

def metropolis(data, n_samples=5000, proposal_std=0.5):
    """Random-walk Metropolis sampler; returns draws after a 500-step burn-in."""
    chain = np.empty(n_samples)
    current = 0.0
    current_lp = log_posterior(current, data)
    for i in range(n_samples):
        proposed = current + np.random.normal(0, proposal_std)
        proposed_lp = log_posterior(proposed, data)
        # Accept with probability min(1, posterior ratio), on the log scale.
        if np.log(np.random.uniform()) < proposed_lp - current_lp:
            current, current_lp = proposed, proposed_lp
        chain[i] = current
    return chain[500:]

# True parameter = 3.0
data = np.random.normal(3.0, 1.0, 20)
samples = metropolis(data)

print(f'Data mean:        {data.mean():.3f}')
print(f'MCMC posterior mean: {samples.mean():.3f}')
print(f'MCMC 95% CI: ({np.percentile(samples,2.5):.3f}, {np.percentile(samples,97.5):.3f})')
print(f'Acceptance rate: {len(np.unique(samples.round(6)))/len(samples):.1%}')
💼 Real-World Scenario
A marketing team runs an A/B test for 2 weeks. Using a Bayesian Beta-Binomial model, they continuously estimate P(B>A) and stop early once confidence exceeds 95%.
Real-World Code
import numpy as np
from scipy.stats import beta

np.random.seed(42)
# Simulate daily data: A has 8% true rate, B has 10% true rate
true_a, true_b = 0.08, 0.10
n_days = 14
daily_visitors = 100

alpha_a, beta_a = 1, 1  # uniform prior
alpha_b, beta_b = 1, 1

print('Day | Visitors A/B | Conv A/B | P(B>A)')
print('-'*50)
# Sequential Bayesian updating: fold each day's conversions into the Beta
# posteriors, then estimate P(B>A) by Monte Carlo sampling.
for day in range(1, n_days+1):
    conv_a = np.random.binomial(daily_visitors, true_a)
    conv_b = np.random.binomial(daily_visitors, true_b)
    alpha_a += conv_a; beta_a += (daily_visitors - conv_a)
    alpha_b += conv_b; beta_b += (daily_visitors - conv_b)
    # Column 0 = draws from A's posterior, column 1 = draws from B's.
    samples = np.random.beta([alpha_a, alpha_b], [beta_a, beta_b], size=(100000, 2))
    prob_b_better = (samples[:,1] > samples[:,0]).mean()
    print(f' {day:2d} | {daily_visitors}/{daily_visitors}        | {conv_a:3d}/{conv_b:3d}    | {prob_b_better:.1%}')
    # Early stopping once posterior confidence in B exceeds 95%.
    if prob_b_better > 0.95:
        print(f'>>> Declare B winner on day {day}! (P(B>A)={prob_b_better:.1%})')
        break
🏋️ Practice: Bayesian Conversion Rate
You observe 30 conversions from 200 visitors. Use a Beta(1,1) prior. Compute the posterior distribution, posterior mean, and 90% credible interval. Compare to the MLE estimate.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
from scipy.stats import beta
import numpy as np

conversions = 30
visitors = 200

# Prior: Beta(1, 1) = uniform
alpha_prior, beta_prior = 1, 1

# TODO: compute posterior parameters
# TODO: compute posterior mean
# TODO: compute 90% credible interval (5th and 95th percentile)
# TODO: compute MLE estimate (conversions/visitors)
# TODO: print comparison
✅ Practice Checklist
12. Survival Analysis

Analyze time-to-event data β€” customer churn, equipment failure, clinical trials β€” with Kaplan-Meier curves and the log-rank test.

Kaplan-Meier survival curve from scratch
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# (duration, event_occurred) β€” 0 = censored (still alive/active)
data = [(5,1),(8,0),(12,1),(14,1),(2,0),(20,1),(7,1),(18,0),(3,1),(25,1),
        (11,1),(6,0),(15,1),(9,1),(22,0),(4,1),(16,1),(10,0),(13,1),(19,1)]

def km_estimator(data):
    """Kaplan-Meier survival estimate as a list of (time, S(t)) steps.

    BUG FIX: the risk set at each event time is now everyone whose observed
    duration is >= t. Previously, censored subjects only left the risk set
    when their duration coincided with an event time, so interior censoring
    (e.g. t=2, 6, 8 in this data) never shrank the risk set and S(t) was
    biased upward in the denominator.
    """
    S = 1.0
    result = [(0, 1.0)]
    for t in sorted({d for d, e in data if e == 1}):
        at_risk = sum(1 for d, _ in data if d >= t)   # still under observation at t
        events = sum(1 for d, e in data if d == t and e == 1)
        S *= 1 - events / at_risk
        result.append((t, S))
    return result

km = km_estimator(data)
times_km, surv_km = zip(*km)

fig, ax = plt.subplots(figsize=(8, 5))
ax.step(times_km, surv_km, where='post', linewidth=2, color='steelblue', label='KM estimate')
ax.axhline(0.5, color='red', linestyle='--', label='Median survival')
ax.set_xlabel('Time'); ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curve'); ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('km_curve.png', dpi=80); plt.close()
print('Saved km_curve.png')
# First time S(t) drops below 0.5 approximates the median survival time.
median_idx = next(i for i,s in enumerate(surv_km) if s < 0.5)
print(f'Median survival: ~{times_km[median_idx]} units')
Customer churn survival analysis
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
n = 200
# Simulate customer tenures: plan A churns faster
plan_a_times = np.random.exponential(8, n//2).clip(0, 24).round(0)
plan_b_times = np.random.exponential(14, n//2).clip(0, 24).round(0)
events_a = (plan_a_times < 24).astype(int)  # 1=churned, 0=still active (censored)
events_b = (plan_b_times < 24).astype(int)

def km_curve(durations, events):
    # Kaplan-Meier retention curve as parallel (times, survival) sequences.
    # NOTE(review): at_risk only shrinks at event times. In this simulation
    # all censoring happens at the 24-month cap (beyond every event time), so
    # the bookkeeping is exact here β€” but with interior censoring the risk
    # set should be recomputed as "durations >= t"; confirm before reuse.
    n = len(durations)
    times = sorted(set(durations[events==1]))
    S, at_risk = 1.0, n
    curve = [(0, 1.0)]
    for t in times:
        d = sum((durations==t) & (events==1))
        c = sum((durations==t) & (events==0))
        S *= 1 - d/at_risk
        curve.append((t, S)); at_risk -= d + c
    return zip(*curve)

fig, ax = plt.subplots(figsize=(9, 5))
# Unpack each plan's step curve and draw both on a shared axis.
for times, surv, label, color in [
    (*km_curve(plan_a_times, events_a), 'Plan A (basic)', 'tomato'),
    (*km_curve(plan_b_times, events_b), 'Plan B (premium)', 'steelblue'),
]:
    ax.step(list(times), list(surv), where='post', label=label, color=color, linewidth=2)
ax.set_xlabel('Months'); ax.set_ylabel('Retention Rate')
ax.set_title('Customer Retention: Plan A vs Plan B')
ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('churn_survival.png', dpi=80); plt.close()
print('Saved churn_survival.png')
# Crude estimate: share of each 100-customer plan with tenure >= 12 months.
print(f'Plan A 12-month retention: ~{plan_a_times[plan_a_times>=12].shape[0]/n*2:.0%}')
print(f'Plan B 12-month retention: ~{plan_b_times[plan_b_times>=12].shape[0]/n*2:.0%}')
Log-rank test for group comparison
import numpy as np
from scipy.stats import chi2

np.random.seed(42)
# Group 1: faster failure (mean=10)
# Group 2: slower failure (mean=18)
g1_times  = np.random.exponential(10, 50).clip(0,30)
g1_events = (g1_times < 30).astype(int)
g2_times  = np.random.exponential(18, 50).clip(0,30)
g2_events = (g2_times < 30).astype(int)

def log_rank_test(t1, e1, t2, e2):
    """Two-group log-rank test; returns (chi2 statistic, p-value).

    BUG FIX: risk sets are recomputed as "duration >= t" at every event time.
    The previous running subtraction removed every subject with duration <= t
    again on each iteration, collapsing the risk sets far too fast. The
    statistic now uses the standard hypergeometric variance of d1.
    """
    event_times = sorted(set(np.concatenate([t1[e1 == 1], t2[e2 == 1]])))
    O1 = E1 = V = 0.0
    for t in event_times:
        n1 = int((t1 >= t).sum())   # at risk in group 1 just before t
        n2 = int((t2 >= t).sum())
        d1 = int(((t1 == t) & (e1 == 1)).sum())
        d2 = int(((t2 == t) & (e2 == 1)).sum())
        n, d = n1 + n2, d1 + d2
        if n < 2 or d == 0:
            continue
        O1 += d1
        E1 += d * n1 / n
        V  += d * (n1 / n) * (n2 / n) * (n - d) / (n - 1)
    chi2_stat = (O1 - E1) ** 2 / V
    p_value = 1 - chi2.cdf(chi2_stat, df=1)
    return chi2_stat, p_value

chi2_stat, p = log_rank_test(g1_times, g1_events, g2_times, g2_events)
print(f'Log-rank chi2: {chi2_stat:.3f}')
print(f'p-value: {p:.4f}')
print(f'Groups differ significantly: {p < 0.05}')
Hazard rate and hazard ratio
import numpy as np

np.random.seed(42)

# Exponential model: S(t) = exp(-lambda*t), with a constant hazard lambda.
lambda_rate = 0.1  # hazard rate
time_points = np.arange(1, 21)
survival    = np.exp(-lambda_rate * time_points)
hazard      = np.full(time_points.shape, float(lambda_rate))

print('Exponential survival (lambda=0.1):')
print(f'  Survival at t=5:  {np.exp(-0.1*5):.3f}')
print(f'  Survival at t=10: {np.exp(-0.1*10):.3f}')
print(f'  Median survival:  {np.log(2)/0.1:.1f} time units')

# Hazard ratio: relative instantaneous risk of group A vs group B.
lambda_a = 0.15  # higher hazard
lambda_b = 0.08
hr = lambda_a / lambda_b
print(f'\nHazard Ratio (A vs B): {hr:.2f}')
print(f'Group A is {hr:.1f}x more likely to fail at any time point')
print(f'Group A median: {np.log(2)/lambda_a:.1f} | Group B median: {np.log(2)/lambda_b:.1f}')
💼 Real-World Scenario
A SaaS company tracks how long customers stay before canceling. Kaplan-Meier curves compare retention for free-trial vs. direct-purchase cohorts.
Real-World Code
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
# Cohort 1: free trial then convert (longer retention)
# Cohort 2: direct paid (shorter retention but committed)
cohorts = {
    'Free Trial': np.random.exponential(18, 150).clip(0, 36),
    'Direct Paid': np.random.exponential(12, 100).clip(0, 36),
}
# Event flag: 1 = churned inside the 36-month window, 0 = censored at 36.
events = {k: (v < 36).astype(int) for k, v in cohorts.items()}

def km_curve(durations, ev):
    # Kaplan-Meier curve as (times, survival) sequences.
    # NOTE(review): `ar` only shrinks at event times. Here all censoring
    # happens at the 36-month cap (after every event time), so this is exact
    # for this data β€” with interior censoring the risk set should instead be
    # recomputed as "durations >= t"; confirm before reuse.
    n = len(durations)
    times = sorted(set(durations[ev==1]))
    S, ar = 1.0, n
    pts = [(0, 1.0)]
    for t in times:
        d = ((durations==t) & (ev==1)).sum()
        c = ((durations==t) & (ev==0)).sum()
        S *= 1 - d/ar; pts.append((t, S)); ar -= d + c
    return zip(*pts)

fig, ax = plt.subplots(figsize=(9, 5))
colors = ['#2196F3', '#F44336']
# One step curve per cohort, plus a per-cohort churn summary line.
for (cohort, times_arr), color in zip(cohorts.items(), colors):
    t_pts, s_pts = km_curve(times_arr, events[cohort])
    ax.step(list(t_pts), list(s_pts), where='post', label=cohort, color=color, linewidth=2)
    churned = events[cohort].sum()
    print(f'{cohort}: n={len(times_arr)}, churned={churned} ({churned/len(times_arr):.0%}), avg tenure={times_arr.mean():.1f} months')
ax.axhline(0.5, color='gray', linestyle='--', linewidth=1)
ax.set_xlabel('Months Since Signup'); ax.set_ylabel('Retention Rate')
ax.set_title('Customer Retention by Acquisition Channel')
ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('retention_cohorts.png', dpi=80); plt.close()
print('Saved retention_cohorts.png')
🏋️ Practice: Equipment Failure Analysis
Simulate failure times for two machine types (A: exponential mean=100 days, B: exponential mean=150 days, both n=80, max=200 days). Plot KM curves and run a log-rank test.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
# TODO: generate failure times for A (mean=100) and B (mean=150), clip at 200
# TODO: compute event flags (1 if failed < 200, 0 if censored)
# TODO: implement km_curve function (or copy from examples)
# TODO: plot both KM curves on same figure
# TODO: compute and print log-rank test p-value
# TODO: save to 'equipment_survival.png'
✅ Practice Checklist
13. Resampling & Simulation

Use bootstrapping, permutation tests, and Monte Carlo simulation to estimate uncertainty and test hypotheses without distributional assumptions.

Bootstrap confidence intervals
import numpy as np

np.random.seed(42)
data = np.random.exponential(2.0, 50)  # skewed data
print(f'Sample mean: {data.mean():.4f}')

# Percentile bootstrap: resample with replacement, collect the statistic.
n_boot = 10000
boot_means = np.empty(n_boot)
for b in range(n_boot):
    resample = np.random.choice(data, len(data), replace=True)
    boot_means[b] = resample.mean()

ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])
print(f'Bootstrap 95% CI: ({ci_lower:.4f}, {ci_upper:.4f})')
print(f'Bootstrap SE: {boot_means.std():.4f}')

# Compare to normal approximation
from scipy.stats import sem
norm_ci = data.mean() + np.array([-1.96, 1.96]) * sem(data)
print(f'Normal approx 95% CI: ({norm_ci[0]:.4f}, {norm_ci[1]:.4f})')
Permutation test for group difference
import numpy as np

np.random.seed(42)
# Two groups β€” is the mean difference real or chance?
group_a = np.random.normal(10.0, 3.0, 40)
group_b = np.random.normal(11.5, 3.0, 40)
obs_diff = group_b.mean() - group_a.mean()

combined = np.concatenate([group_a, group_b])
n_a = len(group_a)
n_perm = 10000

# BUG FIX: a permutation test must shuffle ONCE per iteration and split the
# shuffled pool into two pseudo-groups. The old code drew two independent
# permutations and differenced their first n_a elements, which does not
# sample the permutation null distribution.
perm_diffs = np.empty(n_perm)
for i in range(n_perm):
    shuffled = np.random.permutation(combined)
    perm_diffs[i] = shuffled[n_a:].mean() - shuffled[:n_a].mean()

# Two-sided p-value: fraction of permuted diffs at least as extreme.
p_value = (np.abs(perm_diffs) >= np.abs(obs_diff)).mean()
print(f'Observed difference: {obs_diff:.4f}')
print(f'Permutation p-value: {p_value:.4f}')
print(f'Significant at 0.05: {p_value < 0.05}')
Monte Carlo simulation β€” Pi estimation
import numpy as np

np.random.seed(42)
n_samples_list = [100, 1000, 10000, 100000, 1000000]

print(f'True pi: {np.pi:.6f}')
print(f"{'N':>10} {'Pi estimate':>12} {'Error':>10}")
for n in n_samples_list:
    # Uniform points in the square [-1,1]^2; the hit ratio inside the unit
    # circle estimates pi/4.
    x, y = np.random.uniform(-1, 1, (2, n))
    hit_ratio = ((x**2 + y**2) <= 1.0).mean()
    pi_est = 4 * hit_ratio
    error = abs(pi_est - np.pi)
    print(f'{n:>10,} {pi_est:>12.6f} {error:>10.6f}')
Bootstrap for median and other statistics
import numpy as np

np.random.seed(42)
wages = np.array([35, 42, 28, 55, 38, 47, 31, 60, 44, 39,
                  52, 36, 29, 48, 41, 65, 33, 57, 43, 37]) * 1000

def bootstrap_ci(data, stat_fn, n_boot=10000, ci=95):
    """Percentile bootstrap CI for an arbitrary statistic of `data`."""
    boot_stats = []
    for _ in range(n_boot):
        resample = np.random.choice(data, len(data), replace=True)
        boot_stats.append(stat_fn(resample))
    tail = (100 - ci) / 2
    return np.percentile(boot_stats, [tail, 100 - tail])

# The same machinery works for any statistic, not just the mean.
stats = {
    'mean':   np.mean,
    'median': np.median,
    'std':    np.std,
    'q75':    lambda x: np.percentile(x, 75),
}

for name, fn in stats.items():
    ci = bootstrap_ci(wages, fn)
    print(f'{name:6s}: {fn(wages):>8,.0f}  95% CI: (${ci[0]:,.0f}, ${ci[1]:,.0f})')
💼 Real-World Scenario
An analyst needs confidence intervals for the median revenue per user, which is skewed. Bootstrap gives reliable CIs without assuming normality.
Real-World Code
import numpy as np

np.random.seed(42)
# Simulated revenue per user (heavy-tailed)
revenue = np.random.lognormal(mean=3.5, sigma=1.2, size=500)

print(f'n={len(revenue)}')
print(f'Mean:   ${revenue.mean():,.2f}')
print(f'Median: ${np.median(revenue):,.2f}')

def bootstrap_stat(data, stat_fn, n_boot=10000):
    """Return n_boot bootstrap replicates of stat_fn applied to `data`."""
    boots = []
    for _ in range(n_boot):
        boots.append(stat_fn(np.random.choice(data, len(data), replace=True)))
    return np.array(boots)

mean_boots   = bootstrap_stat(revenue, np.mean)
median_boots = bootstrap_stat(revenue, np.median)

# Report observed value, percentile CI, and bootstrap SE for each statistic.
for name, boots, obs in [('mean', mean_boots, revenue.mean()), ('median', median_boots, np.median(revenue))]:
    ci = np.percentile(boots, [2.5, 97.5])
    print(f'\n{name.capitalize()} bootstrap:')
    print(f'  Observed: ${obs:,.2f}')
    print(f'  95% CI:   (${ci[0]:,.2f}, ${ci[1]:,.2f})')
    print(f'  SE:       ${boots.std():,.2f}')
🏋️ Practice: A/B Test with Permutation
Group A: 50 users, mean session time 4.2 min. Group B: 50 users, mean 4.8 min. Run a permutation test with 10,000 iterations. Report the p-value and whether the difference is significant.
Starter Code
import numpy as np

# Reproducible simulated session times (minutes) for the A/B exercise.
np.random.seed(42)
group_a = np.random.normal(4.2, 1.5, 50)  # variant A: true mean 4.2 min
group_b = np.random.normal(4.8, 1.5, 50)  # variant B: true mean 4.8 min

# TODO: compute observed difference
# TODO: combine groups, run 10000 permutations
# TODO: compute permutation p-value (two-tailed)
# TODO: print results
✅ Practice Checklist
14. Bootstrap Methods & Resampling

Percentile Bootstrap CI
import numpy as np
np.random.seed(42)
data = np.random.exponential(scale=2, size=50)
B = 10000
# Each replicate is the mean of a same-size resample drawn with replacement.
boot_means = []
for _ in range(B):
    boot_means.append(np.mean(np.random.choice(data, size=len(data), replace=True)))
ci = np.percentile(boot_means, [2.5, 97.5])
print(f"Sample mean: {np.mean(data):.4f}")
print(f"Bootstrap 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
BCa Bootstrap (Bias-Corrected)
import numpy as np
from scipy import stats
np.random.seed(7)
# Skewed sample: the median's sampling distribution is asymmetric, which is
# exactly when BCa improves on the plain percentile interval.
data = np.random.lognormal(0, 0.5, 80)
stat_obs = np.median(data)
B = 5000
# Ordinary bootstrap replicates of the median.
boot_stats = [np.median(np.random.choice(data, size=len(data), replace=True)) for _ in range(B)]
# Bias correction z0: probit of the fraction of replicates below the observed statistic.
z0 = stats.norm.ppf(np.mean(np.array(boot_stats) < stat_obs))
# Acceleration a from jackknife (leave-one-out) medians: captures how fast the
# statistic's standard error changes with the data.
jk = [np.median(np.delete(data, i)) for i in range(len(data))]
jk_mean = np.mean(jk)
a = np.sum((jk_mean - np.array(jk))**3) / (6*np.sum((jk_mean - np.array(jk))**2)**1.5)
alpha = 0.05
z_lo = stats.norm.ppf(alpha/2); z_hi = stats.norm.ppf(1 - alpha/2)
# Efron's BCa formula shifts the percentile levels used for the interval.
p_lo = stats.norm.cdf(z0 + (z0+z_lo)/(1-a*(z0+z_lo)))
p_hi = stats.norm.cdf(z0 + (z0+z_hi)/(1-a*(z0+z_hi)))
ci = np.percentile(boot_stats, [p_lo*100, p_hi*100])
print(f"BCa 95% CI for median: [{ci[0]:.4f}, {ci[1]:.4f}]")
Bootstrap Hypothesis Test (Permutation)
import numpy as np
np.random.seed(0)
group_a = np.random.normal(5.0, 1.5, 40)
group_b = np.random.normal(5.6, 1.5, 40)
obs_diff = np.mean(group_b) - np.mean(group_a)
combined = np.concatenate([group_a, group_b])
B = 10000
# Shuffle the pooled sample and re-split it into two size-40 "groups";
# the resulting mean differences trace out the null distribution.
perm_diffs = []
for _ in range(B):
    shuffled = np.random.permutation(combined)
    first, second = shuffled[:40], shuffled[40:]
    perm_diffs.append(np.mean(second) - np.mean(first))
p_value = np.mean(np.abs(perm_diffs) >= np.abs(obs_diff))
print(f"Observed difference: {obs_diff:.4f}")
print(f"Permutation p-value: {p_value:.4f}")
💼 Real-World Scenario
E-commerce A/B test: compare conversion rates between checkout designs using bootstrap CIs instead of assuming normality for small samples.
Real-World Code
import numpy as np
np.random.seed(42)
# Simulated checkout conversion: design A=0.05, design B=0.065
n = 300
conv_a = np.random.binomial(1, 0.05, n).astype(float)
conv_b = np.random.binomial(1, 0.065, n).astype(float)
obs_diff = conv_b.mean() - conv_a.mean()
B = 20000
# Bootstrap the rate difference by resampling each arm independently.
boot_diffs = []
for _ in range(B):
    rate_b = np.random.choice(conv_b, n, replace=True).mean()
    rate_a = np.random.choice(conv_a, n, replace=True).mean()
    boot_diffs.append(rate_b - rate_a)
ci = np.percentile(boot_diffs, [2.5, 97.5])
p_val = np.mean(np.array(boot_diffs) <= 0)
print(f"Observed lift: {obs_diff:.4f} ({obs_diff/conv_a.mean():.1%} relative)")
print(f"95% Bootstrap CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
print(f"One-sided p-value: {p_val:.4f}")
print("Significant at alpha=0.05:", ci[0] > 0)
🏋️ Practice: Bootstrap Customer Lifetime Value
You have 60 customer LTV values from a heavy-tailed distribution. Compute 90% and 95% percentile bootstrap CIs for the mean and the 75th percentile. Also run a two-sample permutation test comparing 'premium' vs 'standard' customers (30 each). Report p-values and effect sizes.
Starter Code
import numpy as np
np.random.seed(11)
# Heavy-tailed (Pareto) customer LTV values, shifted so the minimum is ~$50.
ltv = np.random.pareto(2, 60) * 100 + 50
premium = ltv[:30]    # first 30 customers labeled 'premium'
standard = ltv[30:]   # remaining 30 labeled 'standard'
B = 10000             # bootstrap / permutation iterations
# TODO: Bootstrap 90% and 95% CIs for mean LTV
# TODO: Bootstrap CI for 75th percentile LTV
# TODO: Permutation test: premium vs standard mean LTV
# TODO: Report Cohen's d effect size
✅ Practice Checklist
15. Multiple Testing & FDR Control

Bonferroni & Holm Corrections
import numpy as np
from scipy import stats
np.random.seed(42)
k = 20
# 18 nulls (no effect) + 2 true positives
p_values = np.concatenate([
    np.random.uniform(0, 1, 18),
    np.array([0.003, 0.011])
])
alpha = 0.05
# Bonferroni: multiply every raw p-value by the number of tests, capped at 1.
bonferroni = np.minimum(p_values * k, 1.0)
reject_bon = bonferroni < alpha
# Holm step-down: sort ascending, multiply the i-th smallest (0-indexed) by
# (k - i), then take a RUNNING MAX so adjusted p-values are monotone.  Without
# this step a hypothesis could be "rejected" even though a smaller raw p-value
# was not, which violates Holm's sequential procedure.
order = np.argsort(p_values)
stepdown = np.maximum.accumulate(p_values[order] * (k - np.arange(k)))
holm = np.empty(k)
holm[order] = np.minimum(stepdown, 1.0)
reject_holm = holm < alpha
print(f"Bonferroni rejects: {reject_bon.sum()} tests")
print(f"Holm rejects:       {reject_holm.sum()} tests")
Benjamini-Hochberg FDR Control
import numpy as np
np.random.seed(5)
k = 50
# 40 nulls + 10 true effects (small p-values)
p_values = np.concatenate([
    np.random.uniform(0.05, 1.0, 40),
    np.random.uniform(0, 0.02, 10)
])
alpha = 0.05
# Compare the sorted p-values against the BH step-up line (rank/k)*alpha.
order = np.argsort(p_values)
sorted_p = p_values[order]
bh_threshold = (np.arange(1, k+1) / k) * alpha
reject_sorted = sorted_p <= bh_threshold
# All tests up to last rejection are rejected
reject_bh = np.zeros(k, dtype=bool)
last_reject = np.where(reject_sorted)[0]
if len(last_reject):
    cutoff = last_reject[-1]
    reject_bh[order[:cutoff+1]] = True
print(f"BH FDR rejects: {reject_bh.sum()} of {k} tests")
print(f"Expected FDR ≀ {alpha}")
Q-value (Storey's Method)
import numpy as np
np.random.seed(2)
k = 100
p_vals = np.concatenate([np.random.uniform(0,1,75), np.random.beta(0.5,5,25)])
# Estimate pi0 (proportion of true nulls).  pi0_hat(lambda) is only unbiased
# as lambda -> 1, so fit a quadratic in lambda and EXTRAPOLATE to lambda = 1.
# (Taking the polyfit constant term evaluates the fit at lambda = 0, where
# pi0_hat is ~1 by construction -- that was the bug.)
lambdas = np.arange(0.05, 0.95, 0.05)
pi0_hat = [(p_vals >= l).sum() / (k * (1-l)) for l in lambdas]
coeffs = np.polyfit(lambdas, pi0_hat, 2)
pi0 = min(1.0, max(np.polyval(coeffs, 1.0), 0.0))
# Compute q-values: BH-style adjusted p-values scaled by pi0.
order = np.argsort(p_vals)
sorted_p = p_vals[order]
q = pi0 * k * sorted_p / (np.arange(1, k+1))
# Enforce monotonicity (a smaller p-value can never have a larger q-value).
for i in range(k-2, -1, -1):
    q[i] = min(q[i], q[i+1])
q_vals = np.empty(k); q_vals[order] = np.minimum(q, 1)
print(f"pi0 estimate: {pi0:.3f}")
print(f"Discoveries at FDR=0.05: {(q_vals <= 0.05).sum()}")
💼 Real-World Scenario
Genomics pipeline: after running 10,000 gene expression tests, apply BH correction to control FDR at 5% and identify truly differentially expressed genes.
Real-World Code
import numpy as np
np.random.seed(99)
n_genes = 10000
n_de = 200  # truly differentially expressed
# Simulate p-values: most null (uniform), some true effects (beta)
p_null = np.random.uniform(0, 1, n_genes - n_de)
p_de   = np.random.beta(0.3, 10, n_de)
p_values = np.concatenate([p_null, p_de])
alpha = 0.05
# BH procedure: step-up comparison of sorted p-values with (rank/m)*alpha.
order = np.argsort(p_values)
sorted_p = p_values[order]
ranks = np.arange(1, n_genes+1)
bh_thresh = (ranks / n_genes) * alpha
reject = np.zeros(n_genes, dtype=bool)
last = np.where(sorted_p <= bh_thresh)[0]
if len(last):
    cutoff = last[-1]
    reject[order[:cutoff+1]] = True
# True positive rate among real DE genes
tp = reject[-n_de:].sum()
fp = reject[:-n_de].sum()
print(f"Significant genes: {reject.sum()}")
print(f"True positives: {tp}/{n_de} ({tp/n_de:.1%} sensitivity)")
print(f"False positives: {fp} (FDR={fp/max(reject.sum(),1):.3f})")
🏋️ Practice: Drug Trial Multiple Endpoints
A clinical trial tests 12 endpoints (primary + secondary). Raw p-values are given. Apply Bonferroni, Holm, and BH corrections. Build a table showing which endpoints remain significant under each method. Discuss the tradeoff between FWER and FDR control in regulatory contexts.
Starter Code
import numpy as np
# 12 endpoint p-values from clinical trial
p_values = np.array([0.001, 0.008, 0.023, 0.046, 0.052, 0.071,
                     0.12, 0.18, 0.21, 0.34, 0.52, 0.74])
k = len(p_values)  # number of endpoints (hypotheses) tested
alpha = 0.05       # nominal significance / FDR level
# TODO: Bonferroni correction + reject/accept
# TODO: Holm step-down correction + reject/accept
# TODO: BH FDR correction + reject/accept
# TODO: Print comparison table
# TODO: Discuss which method to use and why
✅ Practice Checklist
16. Dimensionality Reduction for Statistical Analysis

PCA for Outlier Detection
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
X = np.random.multivariate_normal([0,0,0], [[1,.7,.3],[.7,1,.2],[.3,.2,1]], 200)
X[5] += [4, 4, 4]  # inject outlier
# Standardize, project onto 2 PCs, then map back to the original 3-D space.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
pca = PCA(n_components=2)
scores = pca.fit_transform(Xs)
reconstruction = pca.inverse_transform(scores)
# Points the 2-D subspace cannot represent get large reconstruction residuals.
residuals = ((Xs - reconstruction) ** 2).sum(axis=1)
threshold = np.percentile(residuals, 97.5)
outliers = np.where(residuals > threshold)[0]
print(f"Explained variance: {pca.explained_variance_ratio_.cumsum()[-1]:.2%}")
print(f"Reconstruction outliers (index): {outliers}")
t-SNE for Cluster Visualization
import numpy as np
from sklearn.manifold import TSNE
from sklearn.datasets import make_blobs
np.random.seed(42)
# 300 points in 8 dimensions arranged in 4 blobs.
X, y = make_blobs(n_samples=300, centers=4, n_features=8, cluster_std=1.5)
# Non-linear embedding into 2-D for visualization.
# NOTE(review): n_iter was renamed to max_iter in scikit-learn 1.5 — confirm
# the installed version before relying on this parameter name.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
X_2d = tsne.fit_transform(X)
# Report each true cluster's centroid and size in the embedded space.
for cls in np.unique(y):
    mask = y == cls
    print(f"Cluster {cls}: centroid ({X_2d[mask,0].mean():.2f}, {X_2d[mask,1].mean():.2f}), n={mask.sum()}")
# Lower final KL divergence = the embedding preserved neighborhoods better.
print(f"KL divergence: {tsne.kl_divergence_:.4f}")
UMAP + Statistical Tests on Components
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
np.random.seed(1)
# Two groups with different multivariate structure
g1 = np.random.multivariate_normal([0]*5, np.eye(5), 100)
g2 = np.random.multivariate_normal([0.5]*5, np.eye(5)*1.5, 100)
X = np.vstack([g1, g2])
labels = np.array([0]*100 + [1]*100)
# Project the standardized data onto the top 3 principal components.
pca = PCA(n_components=3)
Xp = pca.fit_transform(StandardScaler().fit_transform(X))
# Test whether the two groups separate along each component.
for i in range(3):
    t, p = stats.ttest_ind(Xp[labels==0, i], Xp[labels==1, i])
    verdict = 'significant' if p < 0.05 else 'not sig'
    print(f"PC{i+1}: t={t:.3f}, p={p:.4f} ({verdict})")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")
💼 Real-World Scenario
Credit risk modelling: apply PCA to 20 correlated financial features, use the top components as inputs for logistic regression, and test if the components differ significantly between default and non-default customers.
Real-World Code
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from scipy import stats
np.random.seed(42)
n = 500
# 20 correlated features simulating financial metrics
cov = 0.4 * np.ones((20, 20)) + 0.6 * np.eye(20)
X_raw = np.random.multivariate_normal(np.zeros(20), cov, n)
# Default probability depends on first latent factor
latent = X_raw[:, :5].mean(axis=1)
prob = 1 / (1 + np.exp(-(latent - 0.3)))
y = np.random.binomial(1, prob, n)
# Standardize, then compress the 20 features into 5 principal components.
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
print(f"Variance explained: {pca.explained_variance_ratio_.cumsum()[-1]:.2%}")
# Do defaulters and non-defaulters separate along any component?
for i in range(5):
    t, p = stats.ttest_ind(X_pca[y==0, i], X_pca[y==1, i])
    print(f"PC{i+1}: defaulters vs non-defaulters t={t:.2f}, p={p:.4f}")
# Cross-validate a logistic model built on the components.
clf = LogisticRegression()
scores = cross_val_score(clf, X_pca, y, cv=5, scoring='roc_auc')
print(f"Logistic Regression AUC (PCA features): {scores.mean():.3f} Β± {scores.std():.3f}")
🏋️ Practice: Customer Segmentation with Dimensionality Reduction
Apply PCA then k-means (k=4) to a simulated customer dataset with 15 features. For each cluster, run one-way ANOVA on the top 3 PCs and 3 original features to test for between-cluster differences. Report F-statistics, p-values, and effect sizes (eta-squared). Visualize clusters in 2D PCA space.
Starter Code
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats
np.random.seed(21)
# 400 samples x 15 features, initially pure noise.
X = np.random.randn(400, 15)
# Inject 4-cluster structure
for i in range(4):
    X[i*100:(i+1)*100, :5] += i * 1.2  # shift first 5 features per block of 100
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
# TODO: PCA to 2D and 5D
# TODO: KMeans(k=4) on 5D PCA
# TODO: ANOVA on top 3 PCs and 3 original features per cluster
# TODO: Report eta-squared effect sizes
# TODO: Plot 2D PCA colored by cluster
✅ Practice Checklist
17. Hypothesis Testing (t-tests & z-tests)

Hypothesis testing evaluates whether observed data provides sufficient evidence against a null hypothesis H0. One-sample, two-sample, and paired t-tests cover most practical scenarios. Always check assumptions: normality, equal variance.

One-sample and two-sample t-tests
from scipy import stats
import numpy as np

np.random.seed(42)

# One-sample t-test: is the mean different from a known value?
weights = np.random.normal(70, 8, 30)
t_stat, p_val = stats.ttest_1samp(weights, popmean=72)
reject_one = p_val < 0.05
print(f"One-sample t-test: t={t_stat:.3f}, p={p_val:.4f}")
print(f"  Reject H0 (mean=72)? {reject_one}")

# Two-sample t-test: do two groups have different means?
group_a = np.random.normal(75, 10, 40)
group_b = np.random.normal(80, 10, 40)
t2, p2 = stats.ttest_ind(group_a, group_b)
reject_two = p2 < 0.05
print(f"Two-sample t-test: t={t2:.3f}, p={p2:.4f}")
print(f"  Reject H0 (equal means)? {reject_two}")

# Welch's t-test — safer default when variances may be unequal.
group_c = np.random.normal(80, 20, 40)  # larger std
t3, p3 = stats.ttest_ind(group_a, group_c, equal_var=False)
print(f"Welch's t-test: t={t3:.3f}, p={p3:.4f}")
print(f"  Mean A={group_a.mean():.2f}, Mean C={group_c.mean():.2f}")
Paired t-test and confidence intervals
from scipy import stats
import numpy as np

np.random.seed(0)
# Paired t-test: before/after measurements on same subjects
before = np.random.normal(130, 15, 25)   # blood pressure before treatment
after  = before - np.random.normal(8, 5, 25)  # treatment lowers BP

t, p = stats.ttest_rel(before, after)
diff = before - after
# CI for the mean paired difference uses a t-distribution with n-1 df.
ci = stats.t.interval(0.95, df=len(diff)-1,
                       loc=diff.mean(), scale=stats.sem(diff))

print(f"Paired t-test: t={t:.3f}, p={p:.4f}")
print(f"Mean reduction: {diff.mean():.2f} mmHg")
print(f"95% CI: ({ci[0]:.2f}, {ci[1]:.2f})")
print(f"Significant reduction? {p < 0.05}")

# Manual CI for a mean
n, mean, se = len(before), before.mean(), stats.sem(before)
ci_manual = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)
print(f"\nCI for baseline BP mean {mean:.1f}: ({ci_manual[0]:.1f}, {ci_manual[1]:.1f})")
Chi-square test and proportions z-test
from scipy import stats
import numpy as np

# Chi-square test for independence
# Question: Is treatment type independent of recovery?
observed = np.array([[45, 15],   # Drug A: recovered, not recovered
                     [30, 30],   # Drug B
                     [38, 22]])  # Placebo

chi2, p, dof, expected = stats.chi2_contingency(observed)
print(f"Chi-square: chi2={chi2:.3f}, p={p:.4f}, dof={dof}")
print(f"Association between treatment and recovery? {p < 0.05}")

# Two-proportion z-test
# Are conversion rates different between landing pages?
# NOTE: scipy.stats has NO proportions_ztest (that function lives in
# statsmodels.stats.proportion), so compute the pooled z statistic directly.
n_a, conv_a = 1000, 120   # page A
n_b, conv_b = 1000, 145   # page B

rate_a, rate_b = conv_a / n_a, conv_b / n_b
p_pool = (conv_a + conv_b) / (n_a + n_b)                 # pooled rate under H0
se = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))    # pooled standard error
z = (rate_b - rate_a) / se
p_prop = 2 * stats.norm.sf(abs(z))                       # two-sided p-value
print(f"\nProportions z-test: z={z:.3f}, p={p_prop:.4f}")
print(f"Rate A={conv_a/n_a:.1%}, Rate B={conv_b/n_b:.1%}")
print(f"Significantly different? {p_prop < 0.05}")
💼 Real-World Scenario
A pharma company tests whether a new drug reduces cholesterol more than a placebo. Paired t-test on 50 patients (pre/post treatment) with a significance level of 0.05.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(7)
n = 50
# Baseline cholesterol for both arms of the trial.
drug_group    = np.random.normal(200, 30, n)
placebo_group = np.random.normal(200, 30, n)

# Post-treatment values: drug lowers cholesterol ~25, placebo ~5.
drug_post    = drug_group    - np.random.normal(25, 10, n)
placebo_post = placebo_group - np.random.normal(5,  10, n)

drug_diff    = drug_group    - drug_post
placebo_diff = placebo_group - placebo_post

# Paired tests within each arm, then an independent test on the reductions.
t_drug,    p_drug    = stats.ttest_rel(drug_group,    drug_post)
t_placebo, p_placebo = stats.ttest_rel(placebo_group, placebo_post)
t_compare, p_compare = stats.ttest_ind(drug_diff,     placebo_diff)

print(f"Drug group:    mean reduction={drug_diff.mean():.1f}, p={p_drug:.4f}")
print(f"Placebo group: mean reduction={placebo_diff.mean():.1f}, p={p_placebo:.4f}")
print(f"Drug vs Placebo: t={t_compare:.3f}, p={p_compare:.4f}")
print(f"Drug significantly better? {p_compare < 0.05}")
🏋️ Practice: A/B Test Analysis
Two email subject lines were tested: A sent to 800 users (92 opened), B to 800 users (117 opened). (1) Run a two-proportion z-test. (2) Compute the 95% CI for the difference in rates. (3) Calculate the lift (relative improvement). (4) Determine the minimum detectable effect if you want 80% power at alpha=0.05.
Starter Code
from scipy import stats
import numpy as np

# Email A/B test: opens out of sends for each subject line.
n_a, open_a = 800, 92
n_b, open_b = 800, 117

# (1) two-proportion z-test
# (2) 95% CI for difference
# (3) lift = (rate_b - rate_a) / rate_a
# (4) minimum detectable effect hint: use stats.norm.isf for z-scores
rate_a = open_a / n_a  # observed open rate, subject line A
rate_b = open_b / n_b  # observed open rate, subject line B
print(f"Rate A: {rate_a:.3f}, Rate B: {rate_b:.3f}")
✅ Practice Checklist
18. ANOVA & Post-hoc Tests

One-way ANOVA tests whether means differ across 3+ groups in a single factor. Two-way ANOVA adds a second factor and interaction. Post-hoc tests (Tukey, Bonferroni) identify which pairs differ.

One-way ANOVA with post-hoc Tukey HSD
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

np.random.seed(42)
# Crop yield under 4 fertilizer types
groups = {
    'Control':  np.random.normal(50, 8, 20),
    'FertA':    np.random.normal(58, 8, 20),
    'FertB':    np.random.normal(55, 8, 20),
    'FertC':    np.random.normal(62, 8, 20),
}

f_stat, p_val = stats.f_oneway(*groups.values())
print(f"One-way ANOVA: F={f_stat:.3f}, p={p_val:.4f}")
print(f"Significant difference exists? {p_val < 0.05}\n")

# Tukey HSD post-hoc.  (The original had `import numpy as np as _np` here,
# which is a SyntaxError; numpy is already imported above.)
all_data   = np.concatenate(list(groups.values()))
all_labels = np.repeat(list(groups.keys()), 20)

tukey = pairwise_tukeyhsd(endog=all_data, groups=all_labels, alpha=0.05)
print(tukey.summary())
Two-way ANOVA with interaction
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np, pandas as pd

np.random.seed(42)
n = 120
# 'yield' is a Python keyword and cannot appear as a bare name in a patsy
# formula (ols('yield ~ ...') raises a parse error), so the response column
# is named 'crop_yield'.
df = pd.DataFrame({
    'fertilizer': np.repeat(['A','B','C'], n//3),
    'irrigation': np.tile(['Low','High'], n//2),
    'crop_yield': (np.random.normal(55, 8, n) +
                   np.where(np.repeat(['A','B','C'], n//3)=='C', 8, 0) +
                   np.where(np.tile(['Low','High'], n//2)=='High', 5, 0)),
})

# Two-way ANOVA with interaction term, type II sums of squares.
model = ols('crop_yield ~ C(fertilizer) + C(irrigation) + C(fertilizer):C(irrigation)',
            data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table.round(4))
print(f"\nFertilizer effect: p={anova_table.loc['C(fertilizer)','PR(>F)']:.4f}")
print(f"Irrigation effect: p={anova_table.loc['C(irrigation)','PR(>F)']:.4f}")
Kruskal-Wallis (non-parametric ANOVA) + Bonferroni
from scipy import stats
from statsmodels.stats.multitest import multipletests
import numpy as np
from itertools import combinations

np.random.seed(5)
# Non-normal data: response times across 3 UI designs
groups = {
    'Design A': np.random.exponential(2.0, 30),
    'Design B': np.random.exponential(1.5, 30),
    'Design C': np.random.exponential(2.5, 30),
}
names = list(groups.keys())
data  = list(groups.values())

# Kruskal-Wallis: rank-based omnibus test, no normality assumption.
H, p = stats.kruskal(*data)
print(f"Kruskal-Wallis: H={H:.3f}, p={p:.4f}")

# Pairwise Mann-Whitney U, Bonferroni-adjusted across the pairs.
pairs = list(combinations(range(len(names)), 2))
raw_p = [stats.mannwhitneyu(data[i], data[j], alternative='two-sided')[1]
         for i, j in pairs]

reject, p_adj, _, _ = multipletests(raw_p, method='bonferroni')
for (i,j), pv, pa, rej in zip(pairs, raw_p, p_adj, reject):
    print(f"  {names[i]} vs {names[j]}: raw_p={pv:.4f}, adj_p={pa:.4f}, reject={rej}")
💼 Real-World Scenario
A UX researcher tests 3 onboarding flows on task completion time across 60 users. ANOVA reveals a significant difference; Tukey HSD pinpoints which flow pairs differ significantly.
Real-World Code
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np, pandas as pd

np.random.seed(99)
# Task completion times (seconds) for three onboarding flows.
flows = {'Flow 1': np.random.normal(45, 12, 20),
         'Flow 2': np.random.normal(38, 10, 20),
         'Flow 3': np.random.normal(52, 14, 20)}

# Omnibus test: do ANY of the flow means differ?
f, p = stats.f_oneway(*flows.values())
print(f"ANOVA: F={f:.3f}, p={p:.4f}")

# Pairwise comparisons with family-wise error control.
all_times  = np.concatenate(list(flows.values()))
all_labels = np.repeat(list(flows.keys()), 20)
tukey = pairwise_tukeyhsd(all_times, all_labels, alpha=0.05)
print(tukey.summary())

# Per-flow descriptive statistics.
for name, times in flows.items():
    print(f"{name}: mean={times.mean():.1f}s, sd={times.std():.1f}s")
🏋️ Practice: Marketing Channel ANOVA
Generate sales data for 4 marketing channels (Email, Social, Search, Display) β€” 25 observations each with means 120, 135, 150, 110 and std=25. (1) Run one-way ANOVA. (2) If significant, run Tukey HSD to identify which channels differ. (3) Compute eta-squared (effect size = SS_between / SS_total).
Starter Code
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

# Simulated weekly sales for 4 marketing channels, 25 observations each.
np.random.seed(42)
channels = {'Email':   np.random.normal(120, 25, 25),
            'Social':  np.random.normal(135, 25, 25),
            'Search':  np.random.normal(150, 25, 25),
            'Display': np.random.normal(110, 25, 25)}

# (1) One-way ANOVA
# (2) Tukey HSD if significant
# (3) eta-squared = SS_between / SS_total
# SS_total = sum of (xi - grand_mean)^2 across all observations
✅ Practice Checklist
19. Correlation Analysis

Pearson measures linear association between continuous variables. Spearman and Kendall are rank-based (robust to outliers and non-linearity). Point-biserial handles binary-continuous pairs.

Pearson, Spearman, Kendall correlation
from scipy import stats
import numpy as np

np.random.seed(42)
n = 100

# Pearson: linear relationship
x = np.random.randn(n)
y_linear = 2 * x + np.random.randn(n) * 0.5

# All three coefficients agree when the relationship is genuinely linear.
r_p, p_p = stats.pearsonr(x, y_linear)
r_s, p_s = stats.spearmanr(x, y_linear)
r_k, p_k = stats.kendalltau(x, y_linear)
print("Linear relationship:")
print(f"  Pearson r={r_p:.4f}  Spearman r={r_s:.4f}  Kendall tau={r_k:.4f}")

# Spearman handles monotonic non-linear relationships
y_mono = np.exp(x) + np.random.randn(n) * 0.3
r_p2, _ = stats.pearsonr(x, y_mono)
r_s2, _ = stats.spearmanr(x, y_mono)
print(f"\nMonotonic (non-linear):")
print(f"  Pearson r={r_p2:.4f}  Spearman r={r_s2:.4f}")

# With outliers: rank-based Spearman barely moves, Pearson collapses.
y_outlier = y_linear.copy()
y_outlier[[5, 10, 15]] = 100  # inject outliers
r_p3, _ = stats.pearsonr(x, y_outlier)
r_s3, _ = stats.spearmanr(x, y_outlier)
print(f"\nWith outliers:")
print(f"  Pearson r={r_p3:.4f}  Spearman r={r_s3:.4f}  (Spearman more robust)")
Correlation matrix and partial correlation
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(42)
n = 200
# Confounding variable Z drives both X and Y
z = np.random.randn(n)
x = 0.7 * z + np.random.randn(n) * 0.5
y = 0.8 * z + np.random.randn(n) * 0.5
w = np.random.randn(n)  # unrelated

df = pd.DataFrame({'X': x, 'Y': y, 'Z': z, 'W': w})

print("Correlation matrix:")
print(df.corr(method='pearson').round(3))

# Partial correlation: X-Y controlling for Z
def partial_corr(x, y, z):
    """First-order partial correlation of x and y, controlling for z."""
    corr = lambda a, b: np.corrcoef(a, b)[0, 1]
    r_xz, r_yz, r_xy = corr(x, z), corr(y, z), corr(x, y)
    return (r_xy - r_xz*r_yz) / (np.sqrt(1-r_xz**2) * np.sqrt(1-r_yz**2))

pc = partial_corr(x, y, z)
print(f"\nX-Y raw correlation:     {np.corrcoef(x,y)[0,1]:.4f}")
print(f"X-Y partial (given Z):   {pc:.4f}  (much lower β€” Z was confounding)")
Point-biserial, phi coefficient, and significance testing
from scipy import stats
import numpy as np

np.random.seed(3)
n = 150

# Point-biserial: continuous vs binary
scores = np.random.normal(70, 10, n)
passed = (scores + np.random.randn(n)*5 > 68).astype(int)
r_pb, p_pb = stats.pointbiserialr(passed, scores)
print(f"Point-biserial r={r_pb:.4f}, p={p_pb:.4f}")

# Phi coefficient: binary vs binary
vaccinated = np.random.choice([0,1], n, p=[0.4,0.6])
infected   = np.where(vaccinated==1,
                       np.random.choice([0,1], n, p=[0.9,0.1]),
                       np.random.choice([0,1], n, p=[0.4,0.6]))
# 2x2 contingency table: vaccination status (rows) vs infection (columns).
table = np.array([[((vaccinated==0)&(infected==0)).sum(),
                   ((vaccinated==0)&(infected==1)).sum()],
                  [((vaccinated==1)&(infected==0)).sum(),
                   ((vaccinated==1)&(infected==1)).sum()]])
chi2, p_chi, _, _ = stats.chi2_contingency(table)
phi = np.sqrt(chi2 / n)
print(f"Phi coefficient={phi:.4f}, chi2_p={p_chi:.4f}")

# Testing significance of Pearson r
r = 0.35
n_test = 50
t = r * np.sqrt(n_test-2) / np.sqrt(1-r**2)
p = 2 * stats.t.sf(abs(t), df=n_test-2)
print(f"\nr=0.35, n=50: t={t:.3f}, p={p:.4f}, significant={p<0.05}")
💼 Real-World Scenario
A sports analyst computes Spearman correlations between athlete metrics (speed, strength, endurance, recovery) and competitive performance, then uses partial correlation to remove the confounding effect of age.
Real-World Code
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(10)
n = 80
# Age confounds the physical metrics: speed and endurance decline with age.
age = np.random.uniform(18, 35, n)
speed      = -0.4*age + np.random.randn(n)*2 + 30
strength   =  0.1*age + np.random.randn(n)*3 + 20
endurance  = -0.3*age + np.random.randn(n)*2 + 25
performance= 0.5*speed + 0.3*strength + 0.4*endurance + np.random.randn(n)*2

df = pd.DataFrame({'age':age,'speed':speed,'strength':strength,
                   'endurance':endurance,'performance':performance})

# Rank-based correlations are robust to non-normal metric distributions.
print("Spearman correlations with performance:")
for col in ['age','speed','strength','endurance']:
    r, p = stats.spearmanr(df[col], df['performance'])
    print(f"  {col:12s}: r={r:.3f}, p={p:.4f}")

# Partial: speed-performance controlling for age
r_sp = np.corrcoef(speed, performance)[0,1]
r_sa = np.corrcoef(speed, age)[0,1]
r_pa = np.corrcoef(performance, age)[0,1]
pc = (r_sp - r_sa*r_pa) / (np.sqrt(1-r_sa**2)*np.sqrt(1-r_pa**2))
print(f"\nSpeed-Perf raw r={r_sp:.3f}, partial (ctrl age)={pc:.3f}")
🏋️ Practice: Feature Correlation Screening
Generate 8 features for 300 samples (some correlated, some independent, one binary). (1) Compute and print the full Pearson correlation matrix. (2) Flag all feature pairs with |r| > 0.7 as highly correlated. (3) Compute point-biserial correlation of the binary feature with each continuous feature. (4) Test each correlation for significance.
Starter Code
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(42)
n = 300
# Shared latent factor that induces correlation between several features.
z = np.random.randn(n)
df = pd.DataFrame({
    'f1': z + np.random.randn(n)*0.3,
    'f2': -0.8*z + np.random.randn(n)*0.3,  # correlated with f1
    'f3': np.random.randn(n),
    'f4': np.random.randn(n),
    'f5': 0.9*z + np.random.randn(n)*0.1,   # highly correlated
    'f6': np.random.exponential(2, n),      # skewed, independent
    'f7': np.random.randn(n),
    'binary': (z > 0).astype(int),          # binary feature derived from z
})

# (1) correlation matrix
# (2) flag |r| > 0.7 pairs
# (3) point-biserial for binary vs continuous
# (4) test significance
✅ Practice Checklist
20. Non-Parametric Tests

Non-parametric tests make no assumptions about distributions. Use them when data is ordinal, heavily skewed, or sample sizes are too small to verify normality. Wilcoxon, Mann-Whitney, and Kolmogorov-Smirnov are the workhorses.

Normality tests: Shapiro-Wilk and D'Agostino
from scipy import stats
import numpy as np

np.random.seed(42)

datasets = {
    'Normal':      np.random.normal(50, 10, 50),
    'Skewed':      np.random.exponential(5, 50),
    'Heavy-tailed':np.random.standard_t(df=3, size=50),
    'Uniform':     np.random.uniform(0, 100, 50),
}

print(f"{'Dataset':15s}  {'Shapiro p':>10s}  {'D\'Agostino p':>13s}  {'Normal?'}")
print('-' * 60)
for name, data in datasets.items():
    _, p_sw  = stats.shapiro(data)
    _, p_da  = stats.normaltest(data)
    is_norm  = p_sw > 0.05 and p_da > 0.05
    print(f"{name:15s}  {p_sw:10.4f}  {p_da:13.4f}  {str(is_norm)}")

# Q-Q plot values (manual)
normal_data = datasets['Normal']
theoretical, observed = stats.probplot(normal_data, dist='norm')[0]
r = np.corrcoef(theoretical, observed)[0,1]
print(f"\nQ-Q correlation for Normal: {r:.4f} (close to 1 = normal)")
Wilcoxon signed-rank and Mann-Whitney U
from scipy import stats
import numpy as np

np.random.seed(7)
# Wilcoxon signed-rank: paired non-parametric alternative to paired t-test
before = np.random.exponential(2.0, 30)
after  = before * 0.7 + np.random.exponential(0.3, 30)

stat_w, p_w = stats.wilcoxon(before, after)
stat_t, p_t = stats.ttest_rel(before, after)
print("Paired comparison (before vs after):")
print(f"  Wilcoxon: stat={stat_w:.1f}, p={p_w:.4f}")
print(f"  t-test:   stat={stat_t:.3f}, p={p_t:.4f}")

# Mann-Whitney U: non-parametric two-sample test
group1 = np.random.exponential(3.0, 40)
group2 = np.random.exponential(4.5, 40)

u_stat, p_u = stats.mannwhitneyu(group1, group2, alternative='two-sided')
t_stat, p_t2 = stats.ttest_ind(group1, group2)
print(f"\nTwo-group comparison:")
print(f"  Mann-Whitney: U={u_stat:.1f}, p={p_u:.4f}")
print(f"  t-test:       t={t_stat:.3f}, p={p_t2:.4f}")

# Effect size r = Z / sqrt(N) for Mann-Whitney
n1, n2 = map(len, (group1, group2))
z = stats.norm.ppf(p_u/2)  # approximate z recovered from the two-sided p
r_effect = abs(z) / np.sqrt(n1+n2)
print(f"  Effect size r: {r_effect:.3f}")
Kolmogorov-Smirnov and Anderson-Darling tests
from scipy import stats
import numpy as np

np.random.seed(0)

# One-sample KS: does data come from a specified distribution?
data_normal = np.random.normal(0, 1, 100)
data_exp    = np.random.exponential(1, 100)

for name, data in [('Normal', data_normal), ('Exponential', data_exp)]:
    ks_n, p_n = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
    # scipy's expon is parameterized by (loc, scale) where SCALE is the mean
    # (1/rate), so the fitted scale is data.mean() -- the original passed
    # 1/data.mean(), which is wrong (and can even be negative for the
    # normal sample).
    ks_e, p_e = stats.kstest(data, 'expon', args=(0, data.mean()))
    print(f"{name}: KS vs Normal p={p_n:.4f}, KS vs Expon p={p_e:.4f}")

# Two-sample KS: do two samples come from the same distribution?
sample_a = np.random.normal(0, 1, 100)
sample_b = np.random.normal(0.5, 1, 100)  # slightly shifted
ks2, p2 = stats.ks_2samp(sample_a, sample_b)
print(f"\n2-sample KS: stat={ks2:.4f}, p={p2:.4f}")

# Anderson-Darling (more powerful than KS for normal testing)
result = stats.anderson(data_normal, dist='norm')
print(f"\nAnderson-Darling statistic: {result.statistic:.4f}")
for sl, cv in zip(result.significance_level, result.critical_values):
    print(f"  {sl}% significance level: crit={cv:.4f}, reject={result.statistic > cv}")
💼 Real-World Scenario
A clinical researcher compares pain scores (skewed, ordinal 1-10) between a drug and control group. Shapiro-Wilk confirms non-normality, so Mann-Whitney U is used instead of a t-test.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(42)

def _simulate_scores(scale, n=40):
    """Skewed pain scores clipped to the ordinal 1-10 range."""
    return np.clip(np.random.exponential(scale, n) + 1, 1, 10).round()

# Pain scores: skewed, bounded 1-10 (drug drawn first to fix the RNG stream)
drug_scores    = _simulate_scores(2.5)
control_scores = _simulate_scores(4.0)

# Check the normality assumption before picking a test
p_drug = stats.shapiro(drug_scores)[1]
p_ctrl = stats.shapiro(control_scores)[1]
print(f"Shapiro p-values: drug={p_drug:.4f}, control={p_ctrl:.4f}")
print(f"Use non-parametric: {p_drug < 0.05 or p_ctrl < 0.05}")

# One-sided Mann-Whitney U: are drug scores stochastically smaller?
u, p = stats.mannwhitneyu(drug_scores, control_scores, alternative='less')
print(f"Mann-Whitney: U={u:.1f}, p={p:.4f}")
print(f"Drug reduces pain? {p < 0.05}")
print(f"Median drug={np.median(drug_scores):.1f}, control={np.median(control_scores):.1f}")
🏋️ Practice: Distribution Comparison Pipeline
Write a function compare_groups(a, b) that: (1) Tests normality of both groups with Shapiro-Wilk, (2) If both normal, runs Levene's test for equal variances, then appropriate t-test (Welch or Student), (3) If either non-normal, runs Mann-Whitney U, (4) Returns a dict with test_used, statistic, p_value, significant. Test with normal and non-normal pairs.
Starter Code
from scipy import stats
import numpy as np

def compare_groups(a, b, alpha=0.05):
    """Starter skeleton for an automatic two-group comparison pipeline."""
    result = {}
    # (1) normality check for each group
    p_a = stats.shapiro(a)[1]
    p_b = stats.shapiro(b)[1]
    both_normal = (p_a > alpha) and (p_b > alpha)
    # (2) if normal: Levene then t-test
    # (3) if not normal: Mann-Whitney
    # (4) return dict
    return result

np.random.seed(42)
normal_a = np.random.normal(10, 2, 30)
normal_b = np.random.normal(11, 2, 30)
skewed_a = np.random.exponential(2, 30)
skewed_b = np.random.exponential(3, 30)

print(compare_groups(normal_a, normal_b))
print(compare_groups(skewed_a, skewed_b))
✅ Practice Checklist
21. Bayesian Inference Basics

Bayesian inference updates prior beliefs with observed data to produce a posterior distribution. Beta-Binomial and Normal-Normal conjugates let you compute posteriors analytically. Compare to frequentist interpretation of the same data.

Beta-Binomial: updating conversion rate beliefs
from scipy import stats
import numpy as np

# Prior: Beta(alpha=2, beta=10) — we expect ~17% conversion
prior_a, prior_b = 2, 10

# Observed data: 45 conversions out of 200 visits
obs_conv, obs_total = 45, 200
obs_failures = obs_total - obs_conv

# Posterior: Beta(alpha + successes, beta + failures) [conjugate]
post_a = prior_a + obs_conv
post_b = prior_b + obs_failures

prior     = stats.beta(prior_a, prior_b)
posterior = stats.beta(post_a, post_b)

print(f"Prior:     mean={prior.mean():.3f}, 95% CI: ({prior.ppf(0.025):.3f}, {prior.ppf(0.975):.3f})")
print(f"Posterior: mean={posterior.mean():.3f}, 95% CI: ({posterior.ppf(0.025):.3f}, {posterior.ppf(0.975):.3f})")
print(f"MLE (freq): {obs_conv/obs_total:.3f}")

# Probability that conversion > 25%
prob_above = 1 - posterior.cdf(0.25)
print(f"P(conversion > 25%): {prob_above:.4f}")

# Posterior predictive: expected conversions in the next 100 visits.
# The predictive is Beta-Binomial(n, post_a, post_b); its variance
# n*mu*(1-mu)*(a+b+n)/(a+b+1) includes BOTH the binomial sampling noise
# and the remaining uncertainty about the rate — sqrt(posterior.var()*n)
# alone would badly understate the spread.
n_next  = 100
mu_post = posterior.mean()
pp_mean = mu_post * n_next
pp_var  = n_next * mu_post * (1 - mu_post) * (post_a + post_b + n_next) / (post_a + post_b + 1)
pp_sd   = np.sqrt(pp_var)
print(f"Expected conversions (next 100): {pp_mean:.1f} Β± {pp_sd:.1f}")
Bayesian A/B test: comparing two rates
from scipy import stats
import numpy as np

np.random.seed(42)

# A/B test with conjugate Beta posteriors
conv_a, n_a = 120, 1000   # page A
conv_b, n_b = 145, 1000   # page B

def _flat_prior_posterior(successes, trials):
    """Beta posterior under the non-informative Beta(1,1) = Uniform prior."""
    return stats.beta(1 + successes, 1 + trials - successes)

post_a = _flat_prior_posterior(conv_a, n_a)
post_b = _flat_prior_posterior(conv_b, n_b)

# Monte Carlo estimate P(B > A); draw A first to keep the RNG stream fixed
draws_a = post_a.rvs(100_000)
draws_b = post_b.rvs(100_000)
p_b_wins = (draws_b > draws_a).mean()

# Relative lift of B over A with a 95% credible interval
lift_samples = (draws_b - draws_a) / draws_a
lift_mean = lift_samples.mean()
lift_ci = np.percentile(lift_samples, [2.5, 97.5])

print(f"P(B > A) = {p_b_wins:.4f}")
print(f"Expected lift: {lift_mean:.1%}")
print(f"95% credible interval for lift: ({lift_ci[0]:.1%}, {lift_ci[1]:.1%})")
print(f"Posterior A: mean={post_a.mean():.4f}")
print(f"Posterior B: mean={post_b.mean():.4f}")
Normal-Normal conjugate for mean estimation
from scipy import stats
import numpy as np

np.random.seed(0)
# Estimating the true mean of a process with KNOWN observation noise.
# Prior belief about the mean: Normal(100, 20); measurement sd: 15.

mu0, sigma0 = 100.0, 20.0   # prior
sigma_obs   = 15.0            # known observation noise

# Observe 30 measurements
true_mean = 112.0
data = np.random.normal(true_mean, sigma_obs, 30)
n, x_bar = len(data), data.mean()

# Conjugate update in precision (1/variance) form:
# precisions add; the posterior mean is a precision-weighted average.
precision_prior = sigma0 ** -2
precision_data  = n * sigma_obs ** -2
precision_post  = precision_prior + precision_data

mu_post    = (precision_prior * mu0 + precision_data * x_bar) / precision_post
sigma_post = precision_post ** -0.5

prior_dist = stats.norm(mu0, sigma0)
post_dist  = stats.norm(mu_post, sigma_post)

print(f"Prior:        mean={mu0:.1f},     95% CI: ({prior_dist.ppf(0.025):.1f}, {prior_dist.ppf(0.975):.1f})")
print(f"Likelihood:   mean={x_bar:.2f},  (n={n} obs, noise={sigma_obs})")
print(f"Posterior:    mean={mu_post:.2f}, 95% CI: ({post_dist.ppf(0.025):.2f}, {post_dist.ppf(0.975):.2f})")
print(f"True mean:    {true_mean}")
print(f"Posterior pulls toward prior when data is limited.")
💼 Real-World Scenario
An e-commerce platform uses Bayesian A/B testing for a checkout redesign. Instead of waiting for p<0.05, they monitor the posterior probability that variant B outperforms A and stop when P(B>A) > 0.95.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(42)

# Sequential Bayesian A/B test: stop as soon as P(B > A) clears 0.95.
true_rate_a, true_rate_b = 0.10, 0.13
n_per_day = 50

# Beta(1,1) priors tracked via their alpha/beta counts; the frozen
# distributions are rebuilt from these counts each day, so there is no
# need to pre-build stats.beta(1, 1) objects up front.
alpha_a, beta_a = 1, 1
alpha_b, beta_b = 1, 1

print(f"{'Day':>5s}  {'n_a':>6s}  {'n_b':>6s}  {'Rate A':>8s}  {'Rate B':>8s}  {'P(B>A)':>8s}")
for day in range(1, 31):
    # New daily data
    conv_a = np.random.binomial(n_per_day, true_rate_a)
    conv_b = np.random.binomial(n_per_day, true_rate_b)
    alpha_a += conv_a; beta_a += n_per_day - conv_a
    alpha_b += conv_b; beta_b += n_per_day - conv_b

    post_a = stats.beta(alpha_a, beta_a)
    post_b = stats.beta(alpha_b, beta_b)
    # Monte Carlo tail probability; 50k draws keeps MC error around 0.002
    p_b_wins = (post_b.rvs(50000) > post_a.rvs(50000)).mean()
    n_total = (alpha_a + beta_a - 2) + (alpha_b + beta_b - 2)

    if day % 5 == 0 or p_b_wins > 0.95:
        print(f"{day:5d}  {n_total//2:6d}  {n_total//2:6d}  {post_a.mean():8.4f}  {post_b.mean():8.4f}  {p_b_wins:8.4f}")
    if p_b_wins > 0.95:
        print(f"  --> Stopping: P(B>A)={p_b_wins:.4f} > 0.95 on day {day}")
        break
🏋️ Practice: Bayesian Coin Fairness
A coin is flipped 100 times with 62 heads. (1) Start with a uniform prior Beta(1,1). (2) Update sequentially after every 10 flips. (3) Print posterior mean and 95% credible interval at each step. (4) Compare to the frequentist 95% CI at each step. (5) Compute P(p > 0.5) from the final posterior.
Starter Code
from scipy import stats
import numpy as np

np.random.seed(0)
n_flips, n_heads = 100, 62
# 62 heads (1) followed by 38 tails (0), then shuffled into a random order
flips = np.repeat([1, 0], [n_heads, n_flips - n_heads])
np.random.shuffle(flips)

a, b = 1, 1  # prior Beta(1,1)

print(f"{'Step':>6s}  {'Heads':>6s}  {'n':>5s}  {'Post mean':>10s}  {'95% Credible':>20s}  {'Freq 95% CI':>20s}")
for step in range(10, n_flips + 1, 10):
    chunk = flips[:step]
    h = chunk.sum()
    n = len(chunk)
    a_post = a + h
    b_post = b + n - h
    post = stats.beta(a_post, b_post)
    # TODO: compute posterior 95% CI from post
    # TODO: compute frequentist 95% CI using stats.proportion_confint or normal approx
    # TODO: print row
    pass

# (5) P(p > 0.5) from final posterior
22. Effect Size & Statistical Power

Effect size quantifies the magnitude of a difference independently of sample size. Cohen's d, r, eta-squared measure effect for different tests. Power analysis determines the sample size needed to detect a given effect.

Cohen's d and r for effect size
from scipy import stats
import numpy as np

def cohens_d(group1, group2):
    """Pooled-SD Cohen's d for two independent samples.

    The (n-1)-weighted pooling formula assumes unbiased *sample*
    variances, so std(ddof=1) is required; numpy's default
    population std (ddof=0) would systematically inflate d,
    most noticeably for small n.
    """
    n1, n2 = len(group1), len(group2)
    s_pooled = np.sqrt(((n1-1)*group1.std(ddof=1)**2 + (n2-1)*group2.std(ddof=1)**2) / (n1+n2-2))
    return (group1.mean() - group2.mean()) / s_pooled

np.random.seed(42)
thresholds = [('Small', 0.2), ('Medium', 0.5), ('Large', 0.8)]

print(f"{'Effect':8s}  {'Cohen d':>8s}  {'Interpretation':>15s}  {'p-value':>8s}")
print('-' * 55)
for label, target_d in thresholds:
    a = np.random.normal(0, 1, 50)
    b = np.random.normal(target_d, 1, 50)
    d = cohens_d(b, a)
    _, p = stats.ttest_ind(a, b)
    print(f"{label:8s}  {d:8.3f}  {label:>15s}  {p:8.4f}")

# r from t-statistic
t_stat = 3.2
n_total = 80
r = np.sqrt(t_stat**2 / (t_stat**2 + n_total - 2))
print(f"\nt={t_stat}, n={n_total} -> r effect size = {r:.3f}")

# Eta-squared from F (ANOVA)
F, df_between, df_within = 8.5, 2, 60
eta2 = (F * df_between) / (F * df_between + df_within)
print(f"F={F}, df_b={df_between}, df_w={df_within} -> etaΒ²={eta2:.3f}")
Sample size calculation for t-test
from scipy import stats
import numpy as np

def sample_size_ttest(effect_size_d, alpha=0.05, power=0.80, tail=2):
    """Per-group n for a two-sample t-test via the normal approximation."""
    crit_sum = stats.norm.ppf(1 - alpha / tail) + stats.norm.ppf(power)
    return int(np.ceil(2 * (crit_sum / effect_size_d) ** 2))

# Tabulate required n per group across (alpha, power) scenarios
print("Required sample size per group (two-sample t-test):")
print(f"{'Effect d':>9s}  {'Ξ±=0.05,80%':>12s}  {'Ξ±=0.05,90%':>12s}  {'Ξ±=0.01,80%':>12s}")
print('-' * 52)
scenarios = [(0.05, 0.80), (0.05, 0.90), (0.01, 0.80)]
for d in [0.2, 0.3, 0.5, 0.8, 1.0]:
    n1, n2, n3 = (sample_size_ttest(d, a, p) for a, p in scenarios)
    print(f"{d:9.1f}  {n1:12d}  {n2:12d}  {n3:12d}")

# Cross-check the normal approximation against statsmodels' solver
from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
n = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.80)
print(f"\nstatsmodels: d=0.5, Ξ±=0.05, power=0.80 -> n={n:.1f} per group")

# Achieved power for given n and effect
power = analysis.solve_power(effect_size=0.3, alpha=0.05, nobs1=100)
print(f"d=0.3, n=100 per group -> achieved power={power:.3f}")
Power curves and minimum detectable effect
from statsmodels.stats.power import TTestIndPower, ChiSquarePower
import numpy as np

analysis = TTestIndPower()

# Power curve: power vs sample size, one column per effect size
effect_sizes = [0.2, 0.3, 0.5, 0.8]

print("Power by sample size and effect size:")
print(f"{'n/grp':>7s}" + "".join(f"  d={d}" for d in effect_sizes))
for n in [20, 50, 100, 200, 500]:
    row = "".join(
        f"  {analysis.solve_power(effect_size=d, alpha=0.05, nobs1=n):.3f}"
        for d in effect_sizes
    )
    print(f"{n:7d}" + row)

# Minimum detectable effect (MDE) given n and desired power
mde = analysis.solve_power(alpha=0.05, power=0.80, nobs1=200)
print(f"\nMDE (d) for n=200/grp, power=80%: {mde:.3f}")

# Chi-square power
chi_power = ChiSquarePower()
n_chi = chi_power.solve_power(effect_size=0.3, alpha=0.05, power=0.80, k_bins=3)
print(f"Chi-square: effect=0.3, k=3, power=80% -> n={n_chi:.1f}")
💼 Real-World Scenario
A product team wants to detect a 10% improvement in task completion rate (from 60% to 66%). They compute required sample size for 80% and 90% power, then run the test and report Cohen's h effect size.
Real-World Code
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats
import numpy as np

# Rate improvement: 60% -> 66%
rate_a, rate_b = 0.60, 0.66

# Cohen's h for proportions (arcsine-sqrt transform stabilizes the variance)
h = 2 * np.arcsin(np.sqrt(rate_b)) - 2 * np.arcsin(np.sqrt(rate_a))
print(f"Cohen's h = {h:.4f}")

# Required sample size per group for several power targets
analysis = NormalIndPower()
for power in [0.80, 0.90, 0.95]:
    n = analysis.solve_power(effect_size=h, alpha=0.05, power=power)
    print(f"Power={power:.0%}: n={n:.0f} per group (total {2*n:.0f})")

# Simulate the test
np.random.seed(42)
n_per = int(analysis.solve_power(effect_size=h, alpha=0.05, power=0.80)) + 1
conv_a = np.random.binomial(n_per, rate_a)
conv_b = np.random.binomial(n_per, rate_b)
# proportions_ztest lives in statsmodels.stats.proportion — scipy.stats has
# no such function, so the previous stats.proportions_ztest raised
# AttributeError at runtime.
_, p = proportions_ztest([conv_a, conv_b], [n_per, n_per])
print(f"\nSimulated test: A={conv_a/n_per:.3f}, B={conv_b/n_per:.3f}, p={p:.4f}")
🏋️ Practice: Power Analysis Dashboard
For a two-sample t-test: (1) Compute required n for all combinations of effect_size in [0.2,0.5,0.8] and power in [0.70,0.80,0.90] at alpha=0.05. Print as a grid. (2) For n=150 per group and alpha=0.05, plot (print) a table of achieved power for effect sizes 0.1 to 0.8 in steps of 0.1. (3) Find the MDE for n=300 at 80% power.
Starter Code
from statsmodels.stats.power import TTestIndPower
import numpy as np

analysis = TTestIndPower()

# (1) n grid
print("Required n per group:")
effects = [0.2, 0.5, 0.8]
powers  = [0.70, 0.80, 0.90]
# TODO: print header and rows

# (2) power table for n=150
print("\nAchieved power for n=150 per group:")
# TODO

# (3) MDE
mde = analysis.solve_power(alpha=0.05, power=0.80, nobs1=300)
print(f"\nMDE for n=300: d={mde:.4f}")
23. Multiple Testing & FDR Corrections

Running many tests inflates the false positive rate. Bonferroni controls family-wise error rate (conservative). Benjamini-Hochberg (BH) controls the false discovery rate β€” preferred for exploratory studies with many hypotheses.

Bonferroni and Holm-Bonferroni corrections
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(42)
n_tests = 20
n_per   = 50

# 20 tests: 3 truly different (true positives), 17 null
true_effect = [0, 0, 0, 0.8, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0]
raw_p = []
for d in true_effect:
    a = np.random.normal(0, 1, n_per)
    b = np.random.normal(d, 1, n_per)
    raw_p.append(stats.ttest_ind(a, b)[1])

effects = np.array(true_effect)
print(f"{'Method':12s}  {'Rejections':>11s}  {'True Pos':>9s}  {'False Pos':>10s}")
for method in ['bonferroni', 'holm', 'fdr_bh', 'fdr_by']:
    reject = multipletests(raw_p, alpha=0.05, method=method)[0]
    n_reject  = reject.sum()
    true_pos  = int((reject & (effects != 0)).sum())
    false_pos = int((reject & (effects == 0)).sum())
    print(f"{method:12s}  {n_reject:11d}  {true_pos:9d}  {false_pos:10d}")

print(f"\nUncorrected (p<0.05): {sum(p < 0.05 for p in raw_p)} rejections")
Benjamini-Hochberg step-by-step
from scipy import stats
import numpy as np

np.random.seed(1)
n_hypotheses = 15
alpha = 0.05

# Simulate p-values (mix of true positives and nulls)
true_effects = [0]*10 + [0.7, 0.9, 0.5, 1.2, 0.6]
raw_p = []
for d in true_effects:
    a = np.random.normal(0, 1, 40)
    b = np.random.normal(d, 1, 40)
    raw_p.append(stats.ttest_ind(a, b)[1])

# BH procedure manually: sort p-values, compare each to the ramp alpha*k/m
order    = np.argsort(raw_p)
sorted_p = np.asarray(raw_p)[order]

print(f"BH procedure (alpha={alpha}, m={n_hypotheses} tests):")
print(f"{'Rank':>5s}  {'p-value':>8s}  {'BH thresh':>10s}  {'Reject?'}")
bh_threshold = alpha * np.arange(1, n_hypotheses + 1) / n_hypotheses
last_reject  = -1
for rank, (p, thr) in enumerate(zip(sorted_p, bh_threshold), 1):
    if p <= thr:
        last_reject = rank  # largest rank whose p clears its threshold

for rank, (p, thr) in enumerate(zip(sorted_p[:8], bh_threshold[:8]), 1):
    print(f"{rank:5d}  {p:8.4f}  {thr:10.4f}  {rank <= last_reject}")
print(f"  ... ({n_hypotheses} total)")
print(f"\nAll hypotheses with rank <= {last_reject} are rejected ({last_reject} total)")
q-values and FDR estimation
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(7)
m = 1000   # large-scale testing (genomics-like)
n_true = 50

# Generate p-values: 50 true effects + 950 nulls
p_true = np.array([stats.ttest_ind(
    np.random.normal(0,1,30), np.random.normal(0.6,1,30))[1]
    for _ in range(n_true)])
p_null = np.random.uniform(0, 1, m - n_true)  # null p-values ~ Uniform
all_p  = np.concatenate([p_true, p_null])
true_labels = np.array([True]*n_true + [False]*(m-n_true))

# Compare FWER control (Bonferroni) against FDR control (BH)
results = {}
for method in ('bonferroni', 'fdr_bh'):
    reject = multipletests(all_p, alpha=0.05, method=method)[0]
    tp = (reject & true_labels).sum()
    fp = (reject & ~true_labels).sum()
    fn = (~reject & true_labels).sum()
    fdr = fp / max(reject.sum(), 1)
    power = tp / n_true
    results[method] = (reject.sum(), tp, fp, fdr, power)
    print(f"{method:12s}: rejections={reject.sum():4d}  TP={tp:3d}  FP={fp:3d}  FDR={fdr:.3f}  Power={power:.3f}")

print(f"\nSummary: Bonferroni is conservative; BH balances FDR and power")
💼 Real-World Scenario
A genomics researcher tests differential expression for 5000 genes. Raw analysis finds 450 significant genes (p<0.05); after BH correction at FDR=0.05, only 180 remain β€” avoiding a list of mostly false discoveries.
Real-World Code
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(42)
n_genes = 5000
n_de    = 150  # truly differentially expressed

# Group A and B expression levels (genes x samples)
expr_a = np.random.normal(0, 1, (n_genes, 30))
expr_b = np.random.normal(0, 1, (n_genes, 30))
# Inject true effects into the first n_de genes of group B
effects = np.random.normal(0.8, 0.3, n_de)
expr_b[:n_de] += effects[:, np.newaxis]

# Per-gene two-sample t-tests
p_vals = np.array([stats.ttest_ind(row_a, row_b)[1]
                   for row_a, row_b in zip(expr_a, expr_b)])

for method in ['none', 'bonferroni', 'fdr_bh']:
    if method == 'none':
        reject, adj_p = p_vals < 0.05, p_vals
    else:
        reject, adj_p = multipletests(p_vals, alpha=0.05, method=method)[:2]

    tp = reject[:n_de].sum()
    fp = reject[n_de:].sum()
    print(f"{method:12s}: significant={reject.sum():5d}  TP={tp:4d}  FP={fp:4d}  FDR={fp/max(reject.sum(),1):.3f}")
🏋️ Practice: Multi-Metric A/B Correction
You ran an A/B test measuring 12 metrics simultaneously (CTR, bounce rate, time on page, conversions, etc.). Generate synthetic p-values for each metric (mix of some truly different and some null). Apply Bonferroni, Holm, BH corrections. For each method, print which metrics are significant. Discuss when you'd prefer BH over Bonferroni.
Starter Code
from statsmodels.stats.multitest import multipletests
import numpy as np
from scipy import stats

np.random.seed(42)
metrics = ['CTR','BounceRate','TimeOnPage','Conversions','PageViews',
           'ScrollDepth','CartAdds','Checkout','Revenue','SignUps','ReturnVisits','NPS']

# Simulate p-values: some metrics truly affected, most not
true_effects = [0.4, 0, 0, 0.6, 0, 0, 0.3, 0.5, 0, 0, 0, 0]
p_values = []
for d in true_effects:
    control = np.random.normal(0, 1, 200)
    variant = np.random.normal(d, 1, 200)
    p_values.append(stats.ttest_ind(control, variant)[1])

# TODO: apply all three corrections
# TODO: print table showing which metrics are significant under each method
# TODO: comment on when to prefer BH vs Bonferroni
24. Bootstrap & Permutation Tests

Bootstrap resampling estimates confidence intervals for any statistic without distributional assumptions. Permutation tests provide exact p-values for two-sample comparisons by shuffling group labels.

Bootstrap confidence intervals
import numpy as np
from scipy import stats

np.random.seed(42)
data = np.random.exponential(scale=5, size=50)  # skewed data

def bootstrap_ci(data, statistic, n_boot=5000, ci=0.95):
    """Percentile-bootstrap confidence interval for an arbitrary statistic."""
    resampled = [statistic(np.random.choice(data, len(data), replace=True))
                 for _ in range(n_boot)]
    tail = (1 - ci) / 2
    return np.percentile(resampled, [tail * 100, (1 - tail) * 100]), np.array(resampled)

# Bootstrap CI for mean, median, and std (analytical CIs exist only for some)
for stat_name, fn in [('mean', np.mean), ('median', np.median), ('std', np.std)]:
    ci_boot, boots = bootstrap_ci(data, fn)
    print(f"{stat_name:7s}: obs={fn(data):.3f}  boot_CI=({ci_boot[0]:.3f}, {ci_boot[1]:.3f})")

# Analytical CI for mean (t-interval)
ci_t = stats.t.interval(0.95, df=len(data)-1, loc=data.mean(), scale=stats.sem(data))
print(f"\nt-interval CI for mean: ({ci_t[0]:.3f}, {ci_t[1]:.3f})")
print(f"Boot CI for mean: above  (similar for large n, more robust for skewed)")
Bootstrap for regression coefficients
import numpy as np
from scipy import stats

np.random.seed(0)
n = 80
x = np.random.randn(n)
y = 2.5 * x + 1.0 + np.random.randn(n) * 1.5

def ols(x, y):
    """Least-squares slope and intercept via scipy's linregress."""
    fit = stats.linregress(x, y)
    return fit.slope, fit.intercept

# Bootstrap CI for the slope: resample (x, y) PAIRS with replacement
n_boot = 5000
slopes = []
for _ in range(n_boot):
    idx = np.random.choice(n, n, replace=True)
    slopes.append(ols(x[idx], y[idx])[0])

slopes = np.array(slopes)
ci_slope = np.percentile(slopes, [2.5, 97.5])
obs_slope, obs_int = ols(x, y)

print(f"Observed slope:      {obs_slope:.4f}")
print(f"Bootstrap CI slope:  ({ci_slope[0]:.4f}, {ci_slope[1]:.4f})")

# Analytical CI from linregress
result = stats.linregress(x, y)
t_crit = stats.t.ppf(0.975, df=n-2)
se_slope = result.stderr
print(f"Analytical CI slope: ({obs_slope - t_crit*se_slope:.4f}, {obs_slope + t_crit*se_slope:.4f})")
print(f"Bootstrap SE of slope: {slopes.std():.4f}")
Permutation test for two-sample comparison
import numpy as np
from scipy import stats

np.random.seed(42)

def permutation_test(a, b, n_perm=10000, stat='mean_diff'):
    """Two-sided permutation test for a difference in group means."""
    obs_stat = a.mean() - b.mean()
    pooled = np.concatenate([a, b])
    split = len(a)
    perm_stats = np.empty(n_perm)
    for i in range(n_perm):
        mixed = np.random.permutation(pooled)
        perm_stats[i] = mixed[:split].mean() - mixed[split:].mean()
    p_val = (np.abs(perm_stats) >= np.abs(obs_stat)).mean()
    return obs_stat, p_val, perm_stats

# Test on small sample with non-normal data
group_a = np.random.exponential(3, 25)
group_b = np.random.exponential(4, 25)

obs, p_perm, perms = permutation_test(group_a, group_b)
p_mann = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')[1]
p_t    = stats.ttest_ind(group_a, group_b)[1]

print(f"Observed mean diff: {obs:.3f}")
print(f"Permutation p-val:  {p_perm:.4f}")
print(f"Mann-Whitney p-val: {p_mann:.4f}")
print(f"t-test p-val:       {p_t:.4f}")
print(f"Percentile of obs in perm dist: {(perms < obs).mean():.3f}")
💼 Real-World Scenario
A data scientist bootstraps the median response time for two API versions (non-normal, heavy-tailed). The bootstrap CI for the difference excludes 0, providing strong evidence version B is faster.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(99)
# API response times (heavy-tailed)
version_a = np.random.pareto(2.0, 200) * 100 + 50
version_b = np.random.pareto(2.5, 200) * 80  + 45

# Bootstrap the difference in medians (A - B)
n_boot = 10000
median_diffs = []
for _ in range(n_boot):
    resamp_a = np.random.choice(version_a, len(version_a), replace=True)
    resamp_b = np.random.choice(version_b, len(version_b), replace=True)
    median_diffs.append(np.median(resamp_a) - np.median(resamp_b))

ci = np.percentile(median_diffs, [2.5, 97.5])
obs_diff = np.median(version_a) - np.median(version_b)

print(f"Median A: {np.median(version_a):.2f}ms")
print(f"Median B: {np.median(version_b):.2f}ms")
print(f"Observed difference: {obs_diff:.2f}ms")
print(f"Bootstrap 95% CI: ({ci[0]:.2f}, {ci[1]:.2f})ms")
print(f"CI excludes 0: {ci[0] > 0 or ci[1] < 0} -> B is {'faster' if obs_diff > 0 else 'slower'}")
🏋️ Practice: Bootstrap vs Analytical CI Comparison
Generate 3 datasets: (1) Normal n=30, (2) Exponential n=30, (3) Heavy-tailed Pareto n=30. For each: compute bootstrap 95% CI for the mean (5000 resamples), the analytical t-interval, and the bootstrap CI for the median. Print all three side-by-side. Observe how they differ for non-normal data.
Starter Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Three shapes of data: symmetric, right-skewed, and heavy-tailed
datasets = {
    'Normal':    np.random.normal(10, 3, 30),
    'Exponential': np.random.exponential(5, 30),
    'Pareto':    np.random.pareto(1.5, 30) * 5 + 5,
}

def bootstrap_ci(data, fn, n_boot=5000):
    """95% percentile-bootstrap CI for the statistic fn."""
    reps = [fn(np.random.choice(data, len(data), replace=True)) for _ in range(n_boot)]
    return np.percentile(reps, [2.5, 97.5])

for name, data in datasets.items():
    boot_mean   = bootstrap_ci(data, np.mean)
    t_ci        = stats.t.interval(0.95, df=len(data)-1, loc=data.mean(), scale=stats.sem(data))
    boot_median = bootstrap_ci(data, np.median)
    print(f"{name}:")
    print(f"  Bootstrap mean CI:    ({boot_mean[0]:.3f}, {boot_mean[1]:.3f})")
    # TODO: print t-interval and bootstrap median CI