Loading Module…

📊 Statistics with Python

24 topics • Click any card to expand

1. Descriptive Statistics

Descriptive statistics summarize and describe the main features of a dataset. Key measures include central tendency (mean, median, mode) and spread (std, variance, percentiles, IQR). NumPy and scipy.stats provide fast, vectorized implementations.

Mean, median, mode with numpy and scipy
import numpy as np
from scipy import stats

# Fifteen observations; repeated values make the mode informative.
data = [23, 45, 12, 67, 34, 45, 89, 23, 45, 56, 78, 34, 23, 90, 12]

arr = np.asarray(data)
mean = arr.mean()                       # arithmetic average
median = np.median(arr)                 # middle value of the sorted data
mode = stats.mode(arr, keepdims=True)   # most frequent value (smallest wins ties)

print(f"Mean:   {mean:.2f}")
print(f"Median: {median:.2f}")
print(f"Mode:   {mode.mode[0]}  (count: {mode.count[0]})")
Standard deviation, variance, and IQR
import numpy as np
from scipy import stats

data = np.array([12, 15, 14, 10, 18, 21, 13, 16, 14, 15, 17, 20, 11, 19, 14])

# Quartiles and the interquartile range (spread of the middle 50%).
q1, q3 = np.percentile(data, [25, 75])
iqr = stats.iqr(data)

# Spread measures: ddof=0 divides by N (population formula),
# ddof=1 divides by N-1 (sample formula, Bessel's correction).
std_pop = np.std(data, ddof=0)
std_samp = np.std(data, ddof=1)
var_samp = np.var(data, ddof=1)

print(f"Population std:  {std_pop:.4f}")
print(f"Sample std:      {std_samp:.4f}")
print(f"Sample variance: {var_samp:.4f}")
print(f"Q1={q1}, Q3={q3}, IQR={iqr}")
Percentiles and summary statistics with pandas
import numpy as np
import pandas as pd

np.random.seed(42)
# 200 draws from Normal(mean=100, sd=15), wrapped in a pandas Series.
data = pd.Series(np.random.normal(loc=100, scale=15, size=200))

print("=== Summary Statistics ===")
print(data.describe())
# Fixed: the f-string contained a raw line break (a SyntaxError);
# use the \n escape instead.
print(f"\nSkewness:  {data.skew():.4f}")
print(f"Kurtosis:  {data.kurtosis():.4f}")
print(f"5th  pctl: {data.quantile(0.05):.2f}")
print(f"95th pctl: {data.quantile(0.95):.2f}")
Outlier detection using IQR fence
import numpy as np
from scipy import stats

np.random.seed(0)
# 95 well-behaved points plus five planted extremes.
data = np.concatenate([np.random.normal(50, 5, 95), [10, 200, 95, 105, -5]])

# Tukey's fences: 1.5 * IQR beyond the quartiles.
q1, q3 = np.percentile(data, [25, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

outside = (data < lower_fence) | (data > upper_fence)
outliers = data[outside]
z_scores = np.abs(stats.zscore(data))   # standardized distances from the mean

print(f"IQR fences: [{lower_fence:.2f}, {upper_fence:.2f}]")
print(f"Outliers (IQR): {sorted(outliers)}")
print(f"Outliers (|z|>3): {data[z_scores > 3].tolist()}")
💼 Real-World Scenario
A data scientist at a hospital needs to summarize patient blood pressure readings to report to clinicians. They must identify the typical range, spot extreme values, and present skewness of the distribution before modeling.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(7)
# Simulate systolic BP for 300 patients (some hypertensive outliers)
bp = np.concatenate([
    np.random.normal(120, 12, 270),   # normal range
    np.random.normal(170, 10, 30),    # hypertensive group
])
series = pd.Series(bp, name="Systolic_BP")

print("=== Blood Pressure Summary ===")
print(series.describe().round(2))

q1, q3 = np.percentile(bp, [25, 75])
iqr = q3 - q1
# Fixed: the f-string contained a raw line break (SyntaxError); use \n.
print(f"\nIQR:         {iqr:.2f} mmHg")
print(f"Skewness:    {series.skew():.3f}")

# Clinical threshold for hypertension used in the report.
high_bp = bp[bp > 140]
print(f"Hypertensive (>140): {len(high_bp)} patients ({100*len(high_bp)/len(bp):.1f}%)")
🏋️ Practice: Analyze a Sales Dataset
Given a list of daily sales figures, compute mean, median, std, IQR, and identify any outliers using both the IQR fence and z-score methods. Report the percentage of days with sales above the 90th percentile.
Starter Code
import numpy as np
from scipy import stats

# Reproducible synthetic data: 90 typical days plus 10 exceptional ones,
# so both outlier-detection methods below have something to find.
np.random.seed(123)
daily_sales = np.concatenate([
    np.random.normal(5000, 800, 90),
    np.random.normal(12000, 500, 10),  # exceptional days
])

# TODO: Compute mean, median, std, IQR
# TODO: Find outliers using IQR fence
# TODO: Find outliers using z-score (|z| > 2)
# TODO: Report % of days above 90th percentile
✅ Practice Checklist
2. Probability Distributions

Probability distributions model the likelihood of outcomes. scipy.stats provides a unified interface for continuous distributions (normal, exponential) and discrete distributions (binomial, Poisson) with PDF/PMF, CDF, and random variates.

Normal distribution — PDF, CDF, PPF
import numpy as np
from scipy import stats

# Standard normal: mean 0, standard deviation 1.
mu, sigma = 0, 1
dist = stats.norm(loc=mu, scale=sigma)

x_vals = [-2, -1, 0, 1, 2]
print("x     PDF       CDF")
for x in x_vals:
    print(f"{x:+.0f}  {dist.pdf(x):.6f}  {dist.cdf(x):.6f}")

# Percent-point function (inverse CDF)
# Fixed: f-strings cannot contain a raw line break (SyntaxError); use \n.
print(f"\n95th percentile (PPF): {dist.ppf(0.95):.4f}")
print(f"P(-1 < X < 1):         {dist.cdf(1) - dist.cdf(-1):.4f}")
print(f"P(-2 < X < 2):         {dist.cdf(2) - dist.cdf(-2):.4f}")
Binomial distribution — PMF and CDF
import numpy as np
from scipy import stats

n, p = 20, 0.3   # 20 trials, 30% success rate
dist = stats.binom(n=n, p=p)

print(f"Binomial(n={n}, p={p})")
print(f"Mean: {dist.mean():.2f}, Std: {dist.std():.2f}")
print()
print("k    PMF        CDF")
for k in range(0, 11):
    print(f"{k:2d}   {dist.pmf(k):.6f}   {dist.cdf(k):.6f}")

# Probability of at least 8 successes: complement of the CDF at 7.
prob_ge8 = 1 - dist.cdf(7)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(X >= 8): {prob_ge8:.4f}")
Poisson distribution — PMF and expected value
import numpy as np
from scipy import stats

lam = 4.5   # average events per interval
dist = stats.poisson(mu=lam)

print(f"Poisson(lambda={lam})")
# For a Poisson distribution the mean and variance are both lambda.
print(f"Mean: {dist.mean()}, Variance: {dist.var()}")
print()
print("k   PMF")
# Text histogram: one '#' per percentage point of probability mass.
for k in range(0, 13):
    bar = "#" * int(dist.pmf(k) * 100)
    print(f"{k:2d}  {dist.pmf(k):.5f}  {bar}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(X <= 3): {dist.cdf(3):.4f}")
print(f"P(X >= 8): {1 - dist.cdf(7):.4f}")
Exponential distribution — survival function
import numpy as np
from scipy import stats

# Scale = mean inter-arrival time (1/rate)
scale = 10.0    # average 10 minutes between events
dist = stats.expon(scale=scale)

print(f"Exponential(mean={scale})")
print(f"Mean: {dist.mean()}, Std: {dist.std():.4f}")
print()
# Survival function P(X > t)
for t in [5, 10, 15, 20, 30]:
    print(f"P(T > {t:2d} min) = {dist.sf(t):.4f}")

# Memoryless property check: P(T>20 | T>10) should equal P(T>10).
p_gt10       = dist.sf(10)
p_gt20_gt10  = dist.sf(20) / dist.sf(10)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nP(T>20|T>10) = {p_gt20_gt10:.4f}  (should equal P(T>10) = {p_gt10:.4f})")
💼 Real-World Scenario
An e-commerce platform wants to model the number of customer purchases per hour (Poisson) and the time between purchases (Exponential). They also need the normal approximation for total daily revenue and the probability of a marketing campaign achieving at least 50 conversions out of 200 email sends.
Real-World Code
import numpy as np
from scipy import stats

# Poisson: purchases per hour
lam_hourly = 12
pois = stats.poisson(mu=lam_hourly)
print(f"Expected purchases/hour: {pois.mean()}")
print(f"P(>=15 purchases in 1hr): {1 - pois.cdf(14):.4f}")

# Exponential: minutes between purchases (rate 12/hr -> mean 5 min)
minutes_between = stats.expon(scale=60/lam_hourly)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nAvg time between purchases: {minutes_between.mean():.2f} min")
print(f"P(wait > 8 min): {minutes_between.sf(8):.4f}")

# Binomial: email campaign conversions
n_emails, p_conv = 200, 0.27
binom = stats.binom(n=n_emails, p=p_conv)
print(f"\nEmail campaign — n={n_emails}, p={p_conv}")
print(f"Expected conversions: {binom.mean():.1f}")
print(f"P(>=50 conversions): {1 - binom.cdf(49):.4f}")

# Normal: total daily revenue (CLT: sum of n orders is approx. normal
# with mean n*mu and std sigma*sqrt(n))
avg_order, std_order, n_orders = 85, 30, 150
rev_dist = stats.norm(loc=avg_order*n_orders, scale=std_order*np.sqrt(n_orders))
print(f"\nP(daily revenue > $14000): {rev_dist.sf(14000):.4f}")
🏋️ Practice: Distribution Parameter Fitting
Given a dataset of customer wait times, fit an exponential distribution using scipy.stats.expon.fit(), compute the fitted mean, and calculate the probability that a customer waits more than 5 minutes. Also compute the 90th percentile wait time.
Starter Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Simulate wait times (minutes) — true mean = 3 min.
# Hint: stats.expon.fit(data, floc=0) returns (loc, scale); scale estimates the mean.
wait_times = np.random.exponential(scale=3.0, size=500)

# TODO: Fit exponential distribution with stats.expon.fit()
# TODO: Print fitted scale (mean wait time)
# TODO: Compute P(wait > 5 minutes) using fitted distribution
# TODO: Compute 90th percentile wait time
✅ Practice Checklist
3. Sampling & Central Limit Theorem

Random sampling draws a subset from a population. The Central Limit Theorem (CLT) states that the distribution of sample means approaches normality as sample size grows, regardless of population shape. Bootstrap resampling estimates sampling distributions empirically.

Simple random sampling with numpy
import numpy as np

# Demonstrate simple random sampling from a skewed population:
# larger samples track the population mean/std more closely.
np.random.seed(42)
# Skewed population (exponential)
population = np.random.exponential(scale=5, size=10000)

print(f"Population  β€” mean: {population.mean():.3f}, std: {population.std():.3f}")
print()

# Draw samples of different sizes
# replace=False -> sampling without replacement (a true subset of the population).
for n in [5, 30, 100, 500]:
    sample = np.random.choice(population, size=n, replace=False)
    print(f"n={n:4d} β€” sample mean: {sample.mean():.3f}, sample std: {sample.std(ddof=1):.3f}")
CLT demonstration — sampling distribution of the mean
import numpy as np
from scipy import stats

# Empirical check of the CLT: the distribution of sample means from a
# skewed (exponential) population approaches normality as n grows, with
# standard error pop_std / sqrt(n).
np.random.seed(0)
# Highly skewed population
population = np.random.exponential(scale=5, size=100000)
pop_mean, pop_std = population.mean(), population.std()

n_simulations = 3000
for n in [5, 10, 30, 100]:
    sample_means = [np.mean(np.random.choice(population, n, replace=False))
                    for _ in range(n_simulations)]
    sm = np.array(sample_means)
    # By CLT, sampling dist should be normal
    # Shapiro-Wilk run on a 200-point subset (the test is overly sensitive at large n).
    w_stat, p_val = stats.shapiro(sm[:200])
    print(f"n={n:3d}: mean of means={sm.mean():.3f}, "
          f"std={sm.std():.3f} (theory={pop_std/np.sqrt(n):.3f}), "
          f"Shapiro p={p_val:.4f}")
Bootstrap resampling — estimate sampling distribution
import numpy as np

# Bootstrap: approximate the sampling distribution of the mean by
# resampling the observed sample with replacement.
np.random.seed(7)
sample = np.random.lognormal(mean=2, sigma=0.5, size=50)

n_boot = 5000
# Each bootstrap replicate is the mean of a same-size resample.
boot_means = np.array([
    np.mean(np.random.choice(sample, size=len(sample), replace=True))
    for _ in range(n_boot)
])

print(f"Original sample mean: {sample.mean():.4f}")
print(f"Bootstrap mean:       {boot_means.mean():.4f}")
print(f"Bootstrap std error:  {boot_means.std():.4f}")
# Percentile method: central 95% of the bootstrap replicates.
print(f"Bootstrap 95% CI:     [{np.percentile(boot_means,2.5):.4f}, {np.percentile(boot_means,97.5):.4f}]")
Stratified sampling with pandas
import numpy as np
import pandas as pd

np.random.seed(1)
n = 1000
# Synthetic population: age stratum plus a numeric score.
df = pd.DataFrame({
    "age_group": np.random.choice(["18-34","35-54","55+"], size=n, p=[0.4,0.35,0.25]),
    "score":     np.random.normal(70, 10, n),
})

# Proportional stratified sample (10%): sample the same fraction
# within every stratum so composition matches the population.
sample_rate = 0.10
stratified = (df.groupby("age_group", group_keys=False)
                .apply(lambda g: g.sample(frac=sample_rate, random_state=42)))

print("Stratified sample composition:")
print(stratified["age_group"].value_counts())
# Fixed: the f-string contained a raw line break (SyntaxError); use \n.
print(f"\nPop mean:    {df['score'].mean():.3f}")
print(f"Sample mean: {stratified['score'].mean():.3f}")
💼 Real-World Scenario
A polling firm surveys likely voters ahead of an election. They use stratified random sampling by region (North, South, East, West) to ensure representation, then apply bootstrap resampling to estimate the margin of error for their candidate preference estimate without assuming normality.
Real-World Code
import numpy as np
import pandas as pd

# Stratified polling simulation: a fixed-size sample per region, then a
# bootstrap estimate of the margin of error for overall candidate support.
np.random.seed(42)
true_props = {"North":0.52, "South":0.44, "East":0.55, "West":0.48}
region_sizes = {"North":3000, "South":4500, "East":2000, "West":2500}

# Simulate full voter database
# Each row: (region, 0/1 preference drawn from that region's true proportion).
rows = []
for region, size in region_sizes.items():
    votes = np.random.binomial(1, true_props[region], size)
    rows.extend(zip([region]*size, votes))
population = pd.DataFrame(rows, columns=["region","prefer_A"])

# Stratified sample: 100 per region
sample = (population.groupby("region", group_keys=False)
          .apply(lambda g: g.sample(n=100, random_state=42)))

overall_pct = sample["prefer_A"].mean()
print(f"Sample support for A: {overall_pct:.3f}")

# Bootstrap margin of error
n_boot = 4000
boot = [np.random.choice(sample["prefer_A"], size=len(sample), replace=True).mean()
        for _ in range(n_boot)]
boot = np.array(boot)
# Width of the central 95% of the bootstrap distribution; half of it is the MOE.
moe = np.percentile(boot, 97.5) - np.percentile(boot, 2.5)
print(f"Bootstrap 95% CI:     [{np.percentile(boot,2.5):.3f}, {np.percentile(boot,97.5):.3f}]")
print(f"Margin of error:      Β±{moe/2:.3f}")
🏋️ Practice: Verify CLT with a Uniform Distribution
Draw 2000 samples of size n from a Uniform(0,1) population. For n in [2, 10, 30, 100], compute the mean of sample means, the empirical standard error, and the theoretical standard error (1/sqrt(12*n)). Show that they converge.
Starter Code
import numpy as np

# CLT practice scaffold: compare empirical vs theoretical standard error
# of the sample mean for a Uniform(0,1) population.
np.random.seed(99)
n_simulations = 2000
pop_std_uniform = np.sqrt(1/12)   # std of Uniform(0,1)

for n in [2, 10, 30, 100]:
    # TODO: Draw n_simulations samples of size n from Uniform(0,1)
    # TODO: Compute sample means array
    # TODO: Print mean of sample means, empirical SE, theoretical SE
    pass
✅ Practice Checklist
4. Confidence Intervals

A confidence interval (CI) gives a range of plausible values for a population parameter. A 95% CI means that if we repeated the study many times, 95% of the intervals would contain the true parameter. scipy.stats provides t-intervals; bootstrap CIs require no distributional assumptions.

CI for mean using scipy.stats.t.interval
import numpy as np
from scipy import stats

np.random.seed(42)
# One sample of 30 draws from Normal(mean=50, sd=8).
sample = np.random.normal(loc=50, scale=8, size=30)

n = len(sample)
mean = sample.mean()
se = stats.sem(sample)   # standard error of the mean

# t-based confidence intervals at three confidence levels;
# wider levels give wider intervals.
for conf in (0.90, 0.95, 0.99):
    bounds = stats.t.interval(conf, df=n-1, loc=mean, scale=se)
    lo, hi = bounds
    print(f"{int(conf*100)}% CI: ({lo:.3f}, {hi:.3f})  width={hi-lo:.3f}")
CI for proportion using normal approximation
import numpy as np
from scipy import stats

# 143 successes out of 250 trials
successes, n = 143, 250
p_hat = successes / n
se    = np.sqrt(p_hat * (1 - p_hat) / n)

# Wald (normal-approximation) intervals at several confidence levels.
for conf in [0.90, 0.95, 0.99]:
    z  = stats.norm.ppf((1 + conf) / 2)
    lo = p_hat - z * se
    hi = p_hat + z * se
    print(f"{int(conf*100)}% CI for proportion: ({lo:.4f}, {hi:.4f})")

# Wilson score interval (better for extreme p).
# Fixed: proportion_confint lives in statsmodels, not scipy.stats —
# compute the Wilson interval directly with scipy's normal quantile.
z = stats.norm.ppf(0.975)                     # alpha = 0.05, two-sided
denom  = 1 + z**2 / n
center = (p_hat + z**2 / (2 * n)) / denom
half   = z * np.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2)) / denom
lo_w, hi_w = center - half, center + half
print(f"\nWilson 95% CI: ({lo_w:.4f}, {hi_w:.4f})")
Bootstrap CI — percentile and BCA methods
import numpy as np

# Percentile-bootstrap CI for the median of a skewed (lognormal) sample.
# NOTE(review): only the percentile method is shown here despite the
# heading mentioning BCa; scipy.stats.bootstrap supports method="BCa".
np.random.seed(0)
data = np.random.lognormal(2, 0.8, size=80)

# Percentile bootstrap CI for the median
n_boot = 5000
boot_medians = np.array([
    np.median(np.random.choice(data, size=len(data), replace=True))
    for _ in range(n_boot)
])

# 95% CI = central 95% of the bootstrap distribution of the median.
lo_p, hi_p = np.percentile(boot_medians, [2.5, 97.5])
print(f"Sample median:       {np.median(data):.4f}")
print(f"Bootstrap 95% CI:    ({lo_p:.4f}, {hi_p:.4f})")
print(f"Bootstrap std error: {boot_medians.std():.4f}")
CI for difference of two means
import numpy as np
from scipy import stats

np.random.seed(5)
group_a = np.random.normal(75, 10, 40)
group_b = np.random.normal(70, 12, 35)

# Welch t-test (unequal variances) for the difference in means.
t_stat, p_val = stats.ttest_ind(group_a, group_b, equal_var=False)
diff = group_a.mean() - group_b.mean()

# Manual CI for the difference using the Welch-Satterthwaite df.
# Fixed: the original used the pooled df (n1+n2-2), which is
# inconsistent with the unequal-variance test above.
va = group_a.var(ddof=1) / len(group_a)
vb = group_b.var(ddof=1) / len(group_b)
se_diff = np.sqrt(va + vb)
df = (va + vb) ** 2 / (va**2 / (len(group_a) - 1) + vb**2 / (len(group_b) - 1))
t_crit = stats.t.ppf(0.975, df)
lo, hi = diff - t_crit * se_diff, diff + t_crit * se_diff

print(f"Group A mean: {group_a.mean():.3f}")
print(f"Group B mean: {group_b.mean():.3f}")
print(f"Difference:   {diff:.3f}")
print(f"95% CI for diff: ({lo:.3f}, {hi:.3f})")
print(f"t={t_stat:.3f}, p={p_val:.4f}")
💼 Real-World Scenario
A pharmaceutical company runs a clinical trial measuring the reduction in blood pressure after a new drug. They need 95% confidence intervals for the mean reduction, a bootstrap CI that avoids normality assumptions for their small subgroup analysis, and a CI for the proportion of patients achieving >10 mmHg reduction.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Reduction in systolic BP (mmHg) for 45 patients
reduction = np.random.normal(loc=11.5, scale=5.2, size=45)

mean_red = reduction.mean()
se       = stats.sem(reduction)
lo, hi   = stats.t.interval(0.95, df=len(reduction)-1, loc=mean_red, scale=se)
print(f"Mean reduction: {mean_red:.2f} mmHg")
print(f"95% t-CI:       ({lo:.2f}, {hi:.2f}) mmHg")

# Proportion achieving > 10 mmHg reduction.
# Fixed: proportion_confint is a statsmodels function, not scipy.stats —
# compute the Wilson score interval directly instead.
successes = int(np.sum(reduction > 10))
n = len(reduction)
p_hat = successes / n
z = stats.norm.ppf(0.975)                     # 95% two-sided
denom  = 1 + z**2 / n
center = (p_hat + z**2 / (2 * n)) / denom
half   = z * np.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2)) / denom
lo_w, hi_w = center - half, center + half
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nResponders (>10 mmHg): {successes}/{n} ({successes/n:.1%})")
print(f"Wilson 95% CI:         ({lo_w:.3f}, {hi_w:.3f})")

# Bootstrap CI for median reduction (subgroup) — percentile method.
boot_med = np.array([np.median(np.random.choice(reduction, len(reduction), replace=True))
                     for _ in range(4000)])
print(f"\nBootstrap 95% CI (median): ({np.percentile(boot_med,2.5):.2f}, {np.percentile(boot_med,97.5):.2f})")
🏋️ Practice: Compare Confidence Intervals
Generate two samples from Normal(60, 8) with n=20 and n=100. For each, compute the 95% t-CI for the mean. Then generate a skewed sample from Exponential(scale=5) with n=40 and compute both the t-CI and a bootstrap CI. Discuss which is more appropriate.
Starter Code
import numpy as np
from scipy import stats

# Practice scaffold: compare t-based and bootstrap CIs on normal vs skewed data.
np.random.seed(77)

# TODO: Sample 1 (n=20) from Normal(60,8) — compute 95% t-CI
# TODO: Sample 2 (n=100) from Normal(60,8) — compute 95% t-CI
# TODO: Sample 3 (n=40) from Exponential(scale=5) — compute both t-CI and bootstrap CI
# TODO: Compare widths and discuss
✅ Practice Checklist
5. Hypothesis Testing

Hypothesis testing provides a framework for making statistical decisions. A null hypothesis (H0) is tested against an alternative. The p-value is the probability of observing data at least as extreme as the sample if H0 is true. scipy.stats provides t-tests, with significance typically judged at alpha=0.05.

One-sample t-test
import numpy as np
from scipy import stats

np.random.seed(42)
# H0: population mean = 100; H1: mean != 100
# Data are drawn with true mean 103, so H0 is actually false here.
sample = np.random.normal(loc=103, scale=12, size=35)

# Two-sided one-sample t-test against the hypothesized mean of 100.
t_stat, p_val = stats.ttest_1samp(sample, popmean=100)

print(f"Sample mean: {sample.mean():.4f}")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value:     {p_val:.4f}")
print()
alpha = 0.05
# Reject H0 only when the p-value falls below the significance level.
conclusion = "Reject H0" if p_val < alpha else "Fail to reject H0"
print(f"At alpha={alpha}: {conclusion}")
print(f"Evidence that mean != 100: {'Yes' if p_val < alpha else 'No'}")
Independent samples (two-sample) t-test
import numpy as np
from scipy import stats

np.random.seed(7)
control   = np.random.normal(50, 8, 40)
treatment = np.random.normal(55, 9, 38)

# Levene test for equal variances; its outcome selects the t-test variant.
lev_stat, lev_p = stats.levene(control, treatment)
equal_var = lev_p > 0.05

# Student's t-test if variances look equal, Welch's otherwise.
t_stat, p_val = stats.ttest_ind(control, treatment, equal_var=equal_var)

print(f"Control:   n={len(control)}, mean={control.mean():.3f}, std={control.std(ddof=1):.3f}")
print(f"Treatment: n={len(treatment)}, mean={treatment.mean():.3f}, std={treatment.std(ddof=1):.3f}")
# Fixed: raw line break inside the f-string (SyntaxError) and a mojibake arrow.
print(f"\nLevene p={lev_p:.4f} → equal_var={equal_var}")
print(f"t={t_stat:.4f}, p={p_val:.4f}")
print(f"Result: {'Significant difference' if p_val < 0.05 else 'No significant difference'}")
Paired t-test — before vs after
import numpy as np
from scipy import stats

np.random.seed(3)
n = 25
before = np.random.normal(130, 10, n)
after  = before - np.random.normal(8, 5, n)   # intervention reduces by ~8

# Paired test: compares the per-subject differences against zero.
t_stat, p_val = stats.ttest_rel(before, after)
diff = before - after

# Fixed mojibake: the plus-minus signs below were garbled ("Β±").
print(f"Before: {before.mean():.2f} ± {before.std(ddof=1):.2f}")
print(f"After:  {after.mean():.2f} ± {after.std(ddof=1):.2f}")
print(f"Mean diff: {diff.mean():.2f} ± {diff.std(ddof=1):.2f}")
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nt={t_stat:.4f}, p={p_val:.6f}")
# Halving the two-sided p is valid here because the effect is in the
# hypothesized direction (before > after).
print(f"One-tailed p (before > after): {p_val/2:.6f}")
Type I and II errors — power calculation
import numpy as np
from scipy import stats

# Simulate Type I error rate (false positives under H0)
np.random.seed(0)
alpha = 0.05
n_simulations = 10000
n_per_group = 30

# Under H0: both groups have same mean
# Fraction of runs with p < alpha estimates the Type I error rate (~alpha).
false_positives = 0
for _ in range(n_simulations):
    g1 = np.random.normal(50, 10, n_per_group)
    g2 = np.random.normal(50, 10, n_per_group)
    _, p = stats.ttest_ind(g1, g2)
    if p < alpha:
        false_positives += 1

print(f"Simulated Type I error rate: {false_positives/n_simulations:.4f} (expected ~{alpha})")

# Under H1: real effect of size 0.5 SD
# Fraction of runs with p < alpha estimates statistical power (1 - Type II rate).
true_positives = 0
for _ in range(n_simulations):
    g1 = np.random.normal(50, 10, n_per_group)
    g2 = np.random.normal(55, 10, n_per_group)
    _, p = stats.ttest_ind(g1, g2)
    if p < alpha:
        true_positives += 1
print(f"Simulated Power:             {true_positives/n_simulations:.4f}")
💼 Real-World Scenario
An online retailer A/B tests a redesigned checkout page. Conversion rates (number of sales per session) are measured for control (old page) and treatment (new page) groups. The team runs an independent t-test on session revenue values, checks assumptions with Levene's test, and interprets the p-value against their pre-specified alpha of 0.05.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Simulate session revenues (in $) for each group; lognormal mimics the
# right-skew typical of revenue data.
n_ctrl, n_trt = 500, 500
control   = np.random.lognormal(mean=3.5, sigma=1.2, size=n_ctrl)
treatment = np.random.lognormal(mean=3.65, sigma=1.2, size=n_trt)

print("=== A/B Test: Checkout Page Revenue ===")
print(f"Control:   n={n_ctrl}, mean=${control.mean():.2f}, median=${np.median(control):.2f}")
print(f"Treatment: n={n_trt}, mean=${treatment.mean():.2f}, median=${np.median(treatment):.2f}")

# Variance equality
lev_stat, lev_p = stats.levene(control, treatment)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nLevene test: F={lev_stat:.3f}, p={lev_p:.4f}")
equal_var = lev_p > 0.05

# Welch t-test (or Student's, if Levene did not reject equal variances)
t_stat, p_val = stats.ttest_ind(control, treatment, equal_var=equal_var)
print(f"Welch t-test: t={t_stat:.3f}, p={p_val:.4f}")
# Fixed mojibake dash in the decision message.
print(f"Decision (alpha=0.05): {'Reject H0 — new page works' if p_val < 0.05 else 'Fail to reject H0'}")

uplift = (treatment.mean() - control.mean()) / control.mean() * 100
print(f"Revenue uplift: {uplift:.1f}%")
🏋️ Practice: One-Sample and Two-Sample Tests
A company claims their delivery time averages 3 days. You sample 40 recent deliveries. Run a one-sample t-test to check the claim. Then compare morning vs afternoon delivery times using an independent t-test. Report t-statistic, p-value, and conclusion at alpha=0.05.
Starter Code
import numpy as np
from scipy import stats

# Practice scaffold: one-sample test of a claimed mean, then a
# two-sample comparison of morning vs afternoon deliveries.
np.random.seed(11)
deliveries = np.random.normal(loc=3.4, scale=0.8, size=40)
morning    = np.random.normal(loc=3.2, scale=0.7, size=25)
afternoon  = np.random.normal(loc=3.6, scale=0.9, size=25)

# TODO: One-sample t-test: H0: mean = 3.0 days
# TODO: Two-sample t-test: morning vs afternoon
# TODO: Print t, p, and conclusion for each
✅ Practice Checklist
6. ANOVA & Multiple Groups

ANOVA (Analysis of Variance) tests whether means differ across three or more groups simultaneously, avoiding inflated Type I errors from multiple pairwise t-tests. One-way ANOVA requires normality and equal variances; Kruskal-Wallis is the non-parametric alternative. Post-hoc tests identify which specific pairs differ.

One-way ANOVA with scipy.stats.f_oneway
import numpy as np
from scipy import stats

np.random.seed(42)
group_a = np.random.normal(20, 4, 30)
group_b = np.random.normal(24, 4, 30)
group_c = np.random.normal(22, 4, 30)
group_d = np.random.normal(20, 4, 30)

# One-way ANOVA: H0 = all four group means are equal.
f_stat, p_val = stats.f_oneway(group_a, group_b, group_c, group_d)

for name, grp in zip("ABCD", [group_a,group_b,group_c,group_d]):
    print(f"Group {name}: mean={grp.mean():.3f}, std={grp.std(ddof=1):.3f}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nF-statistic: {f_stat:.4f}")
print(f"p-value:     {p_val:.4f}")
print(f"Result: {'Significant difference between groups' if p_val < 0.05 else 'No significant difference'}")
ANOVA assumptions — Levene and Shapiro-Wilk
import numpy as np
from scipy import stats

# Check the two classical one-way ANOVA assumptions on three groups:
# homogeneity of variance (Levene) and per-group normality (Shapiro-Wilk).
np.random.seed(0)
g1 = np.random.normal(10, 2, 25)
g2 = np.random.normal(12, 3, 25)
g3 = np.random.normal(11, 2, 25)

# Homogeneity of variance
lev_stat, lev_p = stats.levene(g1, g2, g3)
print(f"Levene test: F={lev_stat:.3f}, p={lev_p:.4f}")
print(f"  Equal variances assumed: {lev_p > 0.05}")

# Normality within each group
# p > 0.05 means we cannot reject normality for that group.
for i, g in enumerate([g1,g2,g3],1):
    w, p = stats.shapiro(g)
    print(f"Shapiro-Wilk Group {i}: W={w:.4f}, p={p:.4f} β€” {'Normal' if p>0.05 else 'Non-normal'}")
Kruskal-Wallis — non-parametric ANOVA
import numpy as np
from scipy import stats

np.random.seed(5)
# Skewed groups — ANOVA assumptions violated, so Kruskal-Wallis
# (rank-based, no normality assumption) is the appropriate test.
g1 = np.random.exponential(2, 30)
g2 = np.random.exponential(3, 30)
g3 = np.random.exponential(2.5, 30)

kw_stat, kw_p = stats.kruskal(g1, g2, g3)
f_stat, f_p   = stats.f_oneway(g1, g2, g3)   # shown for comparison only

print("Group medians:")
for i, g in enumerate([g1,g2,g3],1):
    print(f"  Group {i}: median={np.median(g):.3f}")

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nKruskal-Wallis: H={kw_stat:.4f}, p={kw_p:.4f}")
print(f"One-way ANOVA:  F={f_stat:.4f}, p={f_p:.4f}")
Post-hoc pairwise tests with Bonferroni correction
import numpy as np
from scipy import stats
from itertools import combinations

np.random.seed(9)
groups = {
    "Control":   np.random.normal(50, 8, 30),
    "Drug A":    np.random.normal(58, 8, 30),
    "Drug B":    np.random.normal(55, 8, 30),
    "Placebo":   np.random.normal(51, 8, 30),
}

# Omnibus test first: is there any difference among the four means?
f_stat, p_val = stats.f_oneway(*groups.values())
print(f"ANOVA: F={f_stat:.3f}, p={p_val:.4f}")

# Pairwise t-tests with Bonferroni correction: divide alpha by the
# number of comparisons to control the family-wise error rate.
names = list(groups.keys())
pairs = list(combinations(names, 2))
alpha_corr = 0.05 / len(pairs)   # Bonferroni

# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nBonferroni alpha: {alpha_corr:.4f} ({len(pairs)} comparisons)")
for n1, n2 in pairs:
    t, p = stats.ttest_ind(groups[n1], groups[n2])
    sig  = "*" if p < alpha_corr else ""
    print(f"  {n1} vs {n2}: p={p:.4f} {sig}")
💼 Real-World Scenario
A nutrition researcher compares weight loss across four diets (Mediterranean, Keto, Vegan, Low-Fat) over 12 weeks. One-way ANOVA tests whether any diet differs. Levene's test checks variance equality. Post-hoc pairwise tests with Bonferroni correction identify which specific diets differ significantly.
Real-World Code
import numpy as np
from scipy import stats
from itertools import combinations

np.random.seed(42)
# Weight loss (kg) over 12 weeks for four diets, 35 subjects each.
diets = {
    "Mediterranean": np.random.normal(5.2, 2.1, 35),
    "Keto":          np.random.normal(7.1, 2.5, 35),
    "Vegan":         np.random.normal(4.8, 1.9, 35),
    "Low-Fat":       np.random.normal(4.1, 2.0, 35),
}

print("=== Diet Weight Loss Study (kg) ===")
for diet, vals in diets.items():
    print(f"{diet:15s}: mean={vals.mean():.2f}, std={vals.std(ddof=1):.2f}")

f_stat, p_val = stats.f_oneway(*diets.values())
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nANOVA: F={f_stat:.3f}, p={p_val:.4f}")

if p_val < 0.05:
    # Fixed mojibake dash in the message below.
    print("Significant — running post-hoc tests")
    names = list(diets.keys())
    pairs = list(combinations(names, 2))
    bonf  = 0.05 / len(pairs)   # Bonferroni-corrected threshold
    print(f"Bonferroni alpha: {bonf:.4f}")
    for n1, n2 in pairs:
        _, p = stats.ttest_ind(diets[n1], diets[n2])
        print(f"  {n1:15s} vs {n2:15s}: p={p:.4f} {'*' if p<bonf else ''}")
🏋️ Practice: Compare Three Teaching Methods
Simulate exam scores for students taught by three methods: Lecture (mean=72), Active Learning (mean=78), Online (mean=74), each with std=10 and n=30. Run one-way ANOVA. If significant, apply Bonferroni-corrected pairwise t-tests. Also run Kruskal-Wallis as a robustness check.
Starter Code
import numpy as np
from scipy import stats
from itertools import combinations

# Practice scaffold: three teaching methods — ANOVA, Bonferroni post-hoc,
# and Kruskal-Wallis as a non-parametric robustness check.
np.random.seed(55)
lecture = np.random.normal(72, 10, 30)
active  = np.random.normal(78, 10, 30)
online  = np.random.normal(74, 10, 30)

# TODO: One-way ANOVA
# TODO: If significant, Bonferroni post-hoc
# TODO: Kruskal-Wallis test
# TODO: Compare ANOVA vs Kruskal-Wallis conclusions
✅ Practice Checklist
7. Chi-Square Tests

Chi-square tests assess relationships between categorical variables. The goodness-of-fit test checks if observed frequencies match expected. The test of independence checks if two categorical variables are related. scipy.stats.chi2_contingency handles contingency tables directly.

Chi-square goodness-of-fit test
import numpy as np
from scipy import stats

# Test if a die is fair (expected: equal frequency for each face)
observed = np.array([48, 55, 52, 60, 45, 40])   # 300 rolls
expected_freq = 300 / 6
expected = np.full(6, expected_freq)

chi2, p_val = stats.chisquare(f_obs=observed, f_exp=expected)
df = len(observed) - 1   # k - 1 categories

print("Die Roll Test (300 rolls)")
print(f"Observed: {observed}")
print(f"Expected: {expected}")
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nChi2 statistic: {chi2:.4f}")
print(f"Degrees of freedom: {df}")
print(f"p-value: {p_val:.4f}")
print(f"Die is {'NOT fair' if p_val < 0.05 else 'fair (fail to reject H0)'}")
Chi-square test of independence β€” contingency table
import numpy as np
from scipy import stats
import pandas as pd

# Customer satisfaction by region
data = {
    "Satisfied":   {"North": 120, "South": 85, "East": 95, "West": 110},
    "Neutral":     {"North":  40, "South": 35, "East": 42, "West":  38},
    "Unsatisfied": {"North":  20, "South": 30, "East": 23, "West":  22},
}
# Transpose so rows = satisfaction level, columns = region.
table = pd.DataFrame(data).T
print("Contingency Table:")
print(table)

chi2, p_val, dof, expected = stats.chi2_contingency(table)
# Fixed: raw line break inside the f-string (SyntaxError); use \n.
print(f"\nChi2={chi2:.4f}, df={dof}, p={p_val:.4f}")
print(f"Satisfaction independent of region: {p_val > 0.05}")
Cramer's V — effect size for chi-square
import numpy as np
from scipy import stats

def _effect_label(v):
    """Conventional cut-offs for interpreting Cramer's V."""
    if v < 0.1:
        return "small"
    if v < 0.3:
        return "medium"
    return "large"

# 2x2 contingency: Gender vs. Purchase
table = np.array([[230, 170],   # Male:   bought, did not buy
                  [195, 205]])  # Female: bought, did not buy

chi2, p_val, dof, expected = stats.chi2_contingency(table)
n = table.sum()
k = min(table.shape) - 1

# Cramer's V: chi-square normalized to [0, 1] by n and table shape.
cramers_v = np.sqrt(chi2 / (n * k))
print(f"Chi2={chi2:.4f}, df={dof}, p={p_val:.4f}")
print(f"n={n}, Cramer's V={cramers_v:.4f}")
print(f"Effect size: {_effect_label(cramers_v)}")
Expected frequency check and Fisher's exact test
import numpy as np
from scipy import stats

# Small sample — use Fisher's exact test.
# Rare disease by treatment (rows: treatment, cols: outcome).
table = np.array([[5, 15],
                  [2, 18]])

chi2, p_chi, dof, expected = stats.chi2_contingency(table)
oddsratio, p_fisher = stats.fisher_exact(table)

print("Observed:")
print(table)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nExpected frequencies:")
print(expected.round(2))
print(f"\nChi2 p-value:   {p_chi:.4f}  (valid if all expected >= 5)")
print(f"Fisher p-value: {p_fisher:.4f}  (preferred for small samples)")
print(f"Odds ratio:     {oddsratio:.4f}")
💼 Real-World Scenario
A market research firm surveys 800 shoppers about their preferred payment method (Cash, Card, Mobile) segmented by age group (18-34, 35-54, 55+). They run a chi-square test of independence to determine if payment preference is associated with age, then compute Cramer's V to quantify the strength of association.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
# Contingency table: age group vs payment method
table = np.array([
    [45, 130, 125],   # 18-34: Cash, Card, Mobile
    [80, 155,  65],   # 35-54
    [95, 120,  25],   # 55+
])
row_labels = ["18-34", "35-54", "55+"]
col_labels  = ["Cash", "Card", "Mobile"]

df_table = pd.DataFrame(table, index=row_labels, columns=col_labels)
print("Payment Method by Age Group:")
print(df_table)
# Fixed: raw line breaks inside the f-strings below (SyntaxError); use \n.
print(f"\nRow totals: {df_table.sum(axis=1).values}")

chi2, p_val, dof, expected = stats.chi2_contingency(table)
n = table.sum()
# Cramer's V: chi-square normalized by n and min(rows, cols) - 1.
cramers_v = np.sqrt(chi2 / (n * (min(table.shape)-1)))

print(f"\nChi2={chi2:.3f}, df={dof}, p={p_val:.4f}")
print(f"Cramer's V = {cramers_v:.4f} ({'small' if cramers_v<0.1 else 'medium' if cramers_v<0.3 else 'large'} effect)")
print(f"Association between age and payment method: {'Yes' if p_val<0.05 else 'No'}")
🏋️ Practice: A/B Test with Chi-Square
An A/B test shows 180 conversions out of 900 visitors (control) and 210 conversions out of 900 visitors (treatment). Build a 2x2 contingency table and run a chi-square test of independence to determine if the conversion rate differs significantly. Compute Cramer's V.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

# Control:   180 converted, 720 did not
# Treatment: 210 converted, 690 did not
table = np.array([
    [180, 720],
    [210, 690],
])

# TODO: Run chi2_contingency
# TODO: Compute and interpret Cramer's V
# TODO: Also run Fisher's exact as a check
✅ Practice Checklist
8. Correlation & Regression

Correlation measures the linear (Pearson) or monotonic (Spearman) relationship between two variables, ranging from -1 to +1. Simple linear regression models one variable as a linear function of another. scipy.stats.linregress provides slope, intercept, r-value, and p-value for the slope.

Pearson and Spearman correlation
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
x = np.random.uniform(0, 10, 50)
y = 2.5 * x + np.random.normal(0, 3, 50)   # linear + noise

# Pearson: linear association; Spearman: monotonic (rank-based) association.
pearson_r,  pearson_p  = stats.pearsonr(x, y)
spearman_r, spearman_p = stats.spearmanr(x, y)

print(f"Pearson r:  {pearson_r:.4f}  (p={pearson_p:.4f})")
print(f"Spearman r: {spearman_r:.4f}  (p={spearman_p:.4f})")

# Correlation matrix with pandas.
# NOTE(review): y can be negative here, so y**0.5 yields NaN for those rows;
# pandas corr() skips NaN pairs — confirm this column is intended.
df = pd.DataFrame({"x": x, "y": y, "z": y**0.5})
print("\nCorrelation matrix:")
print(df.corr(method="pearson").round(4))
Simple linear regression with scipy.stats.linregress
import numpy as np
from scipy import stats

np.random.seed(0)
hours_studied = np.random.uniform(1, 10, 60)
exam_score    = 50 + 4.5 * hours_studied + np.random.normal(0, 5, 60)

# Ordinary least squares fit of exam_score on hours_studied.
result = stats.linregress(hours_studied, exam_score)

print(f"slope:     {result.slope:.4f}")
print(f"intercept: {result.intercept:.4f}")
print(f"r-value:   {result.rvalue:.4f}")
print(f"r-squared: {result.rvalue**2:.4f}")
print(f"p-value:   {result.pvalue:.6f}")
print(f"std error: {result.stderr:.4f}")

# Point prediction from the fitted line.
pred_8hrs = result.slope * 8 + result.intercept
print(f"\nPredicted score for 8 hrs: {pred_8hrs:.2f}")
Multiple correlation using numpy polyfit
import numpy as np
from scipy import stats

np.random.seed(3)
x = np.linspace(0, 10, 80)
y = 3 + 2*x - 0.15*x**2 + np.random.normal(0, 2, 80)

def _r_squared(y_obs, y_fit):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    ss_res = np.sum((y_obs - y_fit) ** 2)
    ss_tot = np.sum((y_obs - y_obs.mean()) ** 2)
    return 1 - ss_res / ss_tot

# Fit polynomials of degree 1 (line) and 2 (parabola) to the same data.
coef1 = np.polyfit(x, y, 1)
coef2 = np.polyfit(x, y, 2)
r2_lin  = _r_squared(y, np.polyval(coef1, x))
r2_quad = _r_squared(y, np.polyval(coef2, x))

print(f"Linear fit:    RΒ² = {r2_lin:.4f}")
print(f"Quadratic fit: RΒ² = {r2_quad:.4f}")
print(f"Quadratic coefficients: {coef2.round(4)}")
Residuals analysis and influential points
import numpy as np
from scipy import stats

np.random.seed(1)
x = np.random.uniform(0, 20, 50)
y = 5 + 1.8 * x + np.random.normal(0, 4, 50)

# Fit the line, then form residuals (observed minus fitted).
result = stats.linregress(x, y)
y_pred = result.intercept + result.slope * x
residuals = y - y_pred

# Standardize residuals; |z| > 2 flags potential outliers (~5% expected).
std_res = (residuals - residuals.mean()) / residuals.std(ddof=1)
outlier_mask = np.abs(std_res) > 2

print(f"n={len(x)}, RΒ²={result.rvalue**2:.4f}")
print(f"Residual std: {residuals.std(ddof=1):.4f}")
print(f"Outliers (|std resid|>2): {outlier_mask.sum()}")
print(f"Expected outliers ~5%: {int(0.05*len(x))}")

# Durbin-Watson statistic (autocorrelation in residuals)
dw = np.square(np.diff(residuals)).sum() / np.square(residuals).sum()
print(f"Durbin-Watson: {dw:.4f}  (2=no autocorr, <2=positive, >2=negative)")
💼 Real-World Scenario
A real estate company models house prices using square footage. They compute Pearson correlation, fit a simple linear regression with scipy.stats.linregress, assess R-squared, and analyze residuals to check for linearity and homoscedasticity. They also identify outlier properties that deviate from the trend.
Real-World Code
import numpy as np
import pandas as pd
from scipy import stats

np.random.seed(42)
# Simulate house data
sqft  = np.random.uniform(800, 3500, 150)
price = 50000 + 180 * sqft + np.random.normal(0, 25000, 150)
# Add a few luxury outliers
price[:5] += 200000

df = pd.DataFrame({"sqft": sqft, "price": price})

# Strength of the linear relationship.
pearson_r, pearson_p = stats.pearsonr(df["sqft"], df["price"])
print(f"Pearson r = {pearson_r:.4f} (p={pearson_p:.2e})")

# Simple OLS: price as a linear function of square footage.
result = stats.linregress(df["sqft"], df["price"])
print(f"\nRegression: price = {result.slope:.2f} * sqft + {result.intercept:,.0f}")
print(f"RΒ² = {result.rvalue**2:.4f}")

# Predict 2000 sqft house
pred = result.slope * 2000 + result.intercept
print(f"Predicted price (2000 sqft): ${pred:,.0f}")

# Residual analysis: standardized residuals flag unusual properties.
residuals = df["price"] - (result.slope * df["sqft"] + result.intercept)
std_res = (residuals - residuals.mean()) / residuals.std(ddof=1)
print(f"\nOutlier properties (|std resid|>2.5): {(np.abs(std_res)>2.5).sum()}")
🏋️ Practice: Advertising Spend vs Sales Regression
Given weekly advertising spend (x) ranging from $1000 to $10000 and weekly sales (y), fit a linear regression. Compute slope, intercept, R-squared, and predict sales for a $7500 spend. Compute both Pearson and Spearman correlations and compare them.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

np.random.seed(88)
ad_spend = np.random.uniform(1000, 10000, 52)   # weekly ad spend
sales    = 2000 + 3.5 * ad_spend + np.random.normal(0, 3000, 52)

# TODO: Compute Pearson and Spearman correlations
# TODO: Fit linear regression with scipy.stats.linregress
# TODO: Print slope, intercept, RΒ²
# TODO: Predict sales for $7500 ad spend
✅ Practice Checklist
9. Non-Parametric Tests

Non-parametric tests make no assumptions about the underlying distribution. They operate on ranks rather than raw values, making them robust to outliers and skewed data. Key tests include Mann-Whitney U (two independent groups), Wilcoxon signed-rank (paired), and Shapiro-Wilk (normality check).

Shapiro-Wilk normality test
import numpy as np
from scipy import stats

np.random.seed(42)
normal_data = np.random.normal(50, 10, 50)
skewed_data = np.random.exponential(scale=5, size=50)
uniform_data = np.random.uniform(0, 100, 50)

# H0: the sample comes from a normal distribution; p > 0.05 => cannot reject.
samples = [("Normal", normal_data), ("Exponential", skewed_data), ("Uniform", uniform_data)]

print("Shapiro-Wilk Normality Test")
print(f"{'Dataset':<15} {'W-stat':>8} {'p-value':>9} {'Normal?':>8}")
for name, data in samples:
    w, p = stats.shapiro(data)
    print(f"{name:<15} {w:>8.4f} {p:>9.4f} {str(p>0.05):>8}")
Mann-Whitney U test β€” two independent samples
import numpy as np
from scipy import stats

np.random.seed(7)
# Skewed distributions β€” t-test assumptions violated
group1 = np.random.exponential(scale=3, size=35)
group2 = np.random.exponential(scale=4, size=38)

# Parametric: Welch's t-test. BUG FIX: equal_var=False makes this actually
# Welch's test, matching the printed label (the default is Student's t-test).
t_stat, t_p = stats.ttest_ind(group1, group2, equal_var=False)
# Non-parametric rank-based alternative.
u_stat, mw_p = stats.mannwhitneyu(group1, group2, alternative="two-sided")

print(f"Group 1: median={np.median(group1):.3f}, mean={group1.mean():.3f}")
print(f"Group 2: median={np.median(group2):.3f}, mean={group2.mean():.3f}")
print(f"\nWelch t-test:     t={t_stat:.3f}, p={t_p:.4f}")
print(f"Mann-Whitney U:   U={u_stat:.1f},  p={mw_p:.4f}")

# Rank-biserial correlation (effect size): r = 1 - 2U / (n1*n2).
r_rb = 1 - (2*u_stat) / (len(group1)*len(group2))
print(f"Rank-biserial r:  {r_rb:.4f}")
Wilcoxon signed-rank test β€” paired samples
import numpy as np
from scipy import stats

np.random.seed(3)
n = 30
before = np.random.exponential(scale=8, size=n)
after  = before * np.random.uniform(0.6, 0.95, n)   # every subject improves

# Parametric paired t-test vs rank-based Wilcoxon signed-rank test.
t_stat, t_p = stats.ttest_rel(before, after)
w_stat, w_p = stats.wilcoxon(before, after)

print(f"Before: median={np.median(before):.3f}")
print(f"After:  median={np.median(after):.3f}")
print(f"\nPaired t-test:          t={t_stat:.3f}, p={t_p:.4f}")
print(f"Wilcoxon signed-rank:   W={w_stat:.1f},  p={w_p:.4f}")
Kolmogorov-Smirnov and Anderson-Darling tests
import numpy as np
from scipy import stats

np.random.seed(0)
data = np.random.normal(5, 2, 100)

# KS test against a normal with parameters estimated from the same sample.
# NOTE(review): estimating loc/scale from the data makes this p-value
# conservative (a Lilliefors correction would be exact) β€” kept for the demo.
ks_stat, ks_p = stats.kstest(data, "norm", args=(data.mean(), data.std()))
print(f"K-S test vs Normal:  D={ks_stat:.4f}, p={ks_p:.4f}")

# Anderson-Darling: compare the statistic against tabulated critical values.
ad_result = stats.anderson(data, dist="norm")
print(f"\nAnderson-Darling: statistic={ad_result.statistic:.4f}")
for sl, cv in zip(ad_result.significance_level, ad_result.critical_values):
    reject = ad_result.statistic > cv
    print(f"  alpha={sl:5.1f}%: critical={cv:.3f}, reject H0={reject}")

# Two-sample KS: are the two samples drawn from the same distribution?
data2 = np.random.normal(5.5, 2, 100)
ks2_stat, ks2_p = stats.ks_2samp(data, data2)
print(f"\nTwo-sample KS test: D={ks2_stat:.4f}, p={ks2_p:.4f}")
💼 Real-World Scenario
A clinical researcher compares pain scores (highly skewed, 0-10 scale) before and after a new pain management protocol using the Wilcoxon signed-rank test. They also compare pain scores between two patient cohorts using Mann-Whitney U. Shapiro-Wilk normality tests confirm non-parametric methods are appropriate.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(42)
n = 40
# Pain scores (0-10, skewed toward lower end)
before = np.random.beta(a=2, b=1.5, size=n) * 10
after  = before * np.random.uniform(0.4, 0.85, n)

# Normality check β€” non-normal data justifies the rank-based tests below.
wb, pb = stats.shapiro(before)
wa, pa = stats.shapiro(after)
print("Shapiro-Wilk normality test:")
print(f"  Before: W={wb:.4f}, p={pb:.4f} β€” {'Normal' if pb>0.05 else 'Non-normal'}")
print(f"  After:  W={wa:.4f}, p={pa:.4f} β€” {'Normal' if pa>0.05 else 'Non-normal'}")

# Wilcoxon signed-rank (paired, non-parametric)
w_stat, w_p = stats.wilcoxon(before, after)
print(f"\nWilcoxon signed-rank: W={w_stat:.1f}, p={w_p:.4f}")
print(f"Median reduction: {np.median(before-after):.2f} points")

# Independent cohort comparison (unpaired, non-parametric)
cohort_a = np.random.beta(2, 1.5, 40) * 10
cohort_b = np.random.beta(1.5, 2, 40) * 10
u_stat, mw_p = stats.mannwhitneyu(cohort_a, cohort_b, alternative="two-sided")
print(f"\nMann-Whitney U (cohort comparison): U={u_stat:.1f}, p={mw_p:.4f}")
🏋️ Practice: Choose Parametric vs Non-Parametric
Generate three datasets: (A) Normal(50,10) n=40, (B) Exponential(scale=5) n=40, (C) Uniform(0,100) n=40. For each pair, first run Shapiro-Wilk, then decide whether to use t-test or Mann-Whitney U. Run both and compare p-values.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats

np.random.seed(22)
group_a = np.random.normal(50, 10, 40)
group_b = np.random.exponential(scale=5, size=40)
group_c = np.random.uniform(0, 100, 40)

# TODO: Shapiro-Wilk for each group
# TODO: For A vs B: t-test and Mann-Whitney
# TODO: For A vs C: t-test and Mann-Whitney
# TODO: Discuss which test is appropriate for each pair
✅ Practice Checklist
10. Effect Size & Power Analysis

Effect size quantifies the practical significance of a result, independent of sample size. Cohen's d measures standardized mean differences. Statistical power (1 - beta) is the probability of detecting a true effect. Power analysis determines the sample size needed to achieve adequate power (typically 0.80).

Cohen's d β€” effect size for two groups
import numpy as np
from scipy import stats

np.random.seed(42)
group1 = np.random.normal(50, 10, 100)
group2 = np.random.normal(56, 10, 100)

# Cohen's d using the equal-n pooled standard deviation.
mean_diff  = group1.mean() - group2.mean()
pooled_std = np.sqrt(0.5 * (group1.var(ddof=1) + group2.var(ddof=1)))
cohens_d   = abs(mean_diff) / pooled_std

print(f"Group 1 mean: {group1.mean():.3f}")
print(f"Group 2 mean: {group2.mean():.3f}")
print(f"Pooled std:   {pooled_std:.3f}")
print(f"Cohen's d:    {cohens_d:.4f}")

# Map |d| onto Cohen's conventional magnitude labels.
if cohens_d < 0.2:
    magnitude = "small"
elif cohens_d < 0.5:
    magnitude = "small-medium"
elif cohens_d < 0.8:
    magnitude = "medium"
else:
    magnitude = "large"
print(f"Effect size:  {magnitude}")

_, p_val = stats.ttest_ind(group1, group2)
print(f"p-value:      {p_val:.4f}")
Power analysis for one-sample t-test with statsmodels
# BUG FIX: numpy was previously imported *after* its first use at np.ceil
# below, which raised NameError when the snippet ran top to bottom.
import numpy as np
from statsmodels.stats.power import TTestPower

analysis = TTestPower()

# Solve for n given effect size d=0.5, alpha=0.05, target power=0.80.
n_needed = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.80, alternative="two-sided")
print(f"Sample size for d=0.5, power=0.80: n = {n_needed:.1f} β†’ {int(np.ceil(n_needed))}")

# Power curve: how does power change with n?
print("\nPower by sample size (d=0.5, alpha=0.05):")
for n in [10, 20, 30, 50, 80, 100, 150, 200]:
    pwr = analysis.solve_power(effect_size=0.5, nobs=n, alpha=0.05, alternative="two-sided")
    bar = "#" * int(pwr * 20)
    print(f"  n={n:4d}: power={pwr:.3f}  {bar}")
Power analysis for two-sample t-test
import numpy as np
from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()

# Required per-group n across a range of effect sizes.
print("Required n per group (alpha=0.05, power=0.80):")
for d in [0.2, 0.3, 0.5, 0.8, 1.0, 1.5]:
    n = analysis.solve_power(effect_size=d, alpha=0.05, power=0.80, alternative="two-sided")
    print(f"  d={d}: n = {int(np.ceil(n))} per group  (total={int(np.ceil(n))*2})")

# Achieved power at a fixed per-group n of 30.
print("\nPower with n=30 per group (alpha=0.05):")
for d in [0.2, 0.5, 0.8]:
    pwr = analysis.solve_power(effect_size=d, nobs1=30, alpha=0.05, alternative="two-sided")
    print(f"  d={d}: power={pwr:.4f}")
Empirical power via simulation
import numpy as np
from scipy import stats

np.random.seed(0)
n_sim   = 5000
alpha   = 0.05
n       = 40
effect  = 0.5   # Cohen's d

def _count_rejections(trials, per_group, d, level):
    """Count how often a two-sample t-test rejects H0 when H1 is true."""
    hits = 0
    for _ in range(trials):
        g1 = np.random.normal(0, 1, per_group)
        g2 = np.random.normal(d, 1, per_group)
        if stats.ttest_ind(g1, g2)[1] < level:
            hits += 1
    return hits

# Empirical power = rejection rate under the alternative hypothesis.
rejections = _count_rejections(n_sim, n, effect, alpha)
emp_power = rejections / n_sim

# Analytical power for comparison
from statsmodels.stats.power import TTestIndPower
analytical = TTestIndPower().solve_power(effect_size=effect, nobs1=n, alpha=alpha)

print(f"n={n} per group, d={effect}, alpha={alpha}")
print(f"Empirical power:   {emp_power:.4f}")
print(f"Analytical power:  {analytical:.4f}")
print(f"Type II error (Ξ²): {1-emp_power:.4f}")
💼 Real-World Scenario
A product team plans an A/B test for a new onboarding flow. They estimate the current conversion rate maps to a Cohen's d of 0.35 for time-to-first-purchase. Using statsmodels TTestIndPower, they calculate the minimum sample size needed to achieve 80% power at alpha=0.05, and validate with a simulation study.
Real-World Code
import numpy as np
from scipy import stats
from statsmodels.stats.power import TTestIndPower

np.random.seed(42)

# Business context: expected improvement in days-to-purchase
current_mean, new_mean, common_std = 12.0, 10.5, 4.0
effect_d = (current_mean - new_mean) / common_std
print(f"=== A/B Test Power Planning ===")
print(f"Expected effect size (Cohen's d): {effect_d:.4f}")

# Minimum per-arm sample size for 80% power at alpha = 0.05.
analysis = TTestIndPower()
n_needed = analysis.solve_power(effect_size=effect_d, alpha=0.05, power=0.80)
print(f"Required n per arm: {int(np.ceil(n_needed))}")
print(f"Total sample size:  {int(np.ceil(n_needed))*2}")

# Simulate the actual experiment at the recommended sample size.
n_actual = int(np.ceil(n_needed))
control   = np.random.normal(current_mean, common_std, n_actual)
treatment = np.random.normal(new_mean,     common_std, n_actual)

t_stat, p_val = stats.ttest_ind(control, treatment)
mean_diff  = control.mean() - treatment.mean()
pooled_std = np.sqrt((control.var(ddof=1) + treatment.var(ddof=1)) / 2)
obs_d      = mean_diff / pooled_std

print(f"\nSimulated experiment results:")
print(f"Observed Cohen's d: {obs_d:.4f}")
print(f"p-value:            {p_val:.4f}")
print(f"Decision: {'Reject H0 β€” new flow is better' if p_val<0.05 else 'Fail to reject H0'}")
🏋️ Practice: Design a Clinical Trial Sample Size
A new medication is expected to reduce symptom severity by 2 points (scale 0-20) with a pooled standard deviation of 5. Compute Cohen's d. Then use TTestIndPower to find the sample size needed for 80%, 85%, and 90% power at alpha=0.05. Finally, simulate 3000 trials with the recommended n and estimate empirical power.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
from scipy import stats
from statsmodels.stats.power import TTestIndPower

expected_diff = 2.0
pooled_std    = 5.0

# TODO: Compute Cohen's d
# TODO: TTestIndPower: find n for power = 0.80, 0.85, 0.90
# TODO: Simulate 3000 trials with n from 80% power calculation
# TODO: Report empirical power
✅ Practice Checklist
11. Bayesian Inference

Update beliefs with evidence using Bayes' theorem. Compute posterior distributions, credible intervals, and compare Bayesian vs frequentist approaches.

Bayes' theorem β€” coin flip posterior
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Prior: uniform (Beta(1,1)) -> update with 7 heads out of 10 flips
heads, tails = 7, 3
theta = np.linspace(0, 1, 1000)

# Prior: Beta(1,1) = uniform
alpha_prior, beta_prior = 1, 1
# Posterior: Beta(alpha+heads, beta+tails)
# Beta is conjugate to the binomial, so the posterior is available in
# closed form β€” no numerical integration needed.
alpha_post = alpha_prior + heads
beta_post  = beta_prior  + tails

from scipy.stats import beta
prior     = beta.pdf(theta, alpha_prior, beta_prior)
likelihood = theta**heads * (1-theta)**tails
likelihood /= likelihood.sum()  # normalize
posterior  = beta.pdf(theta, alpha_post, beta_post)

# Plot prior, (scaled) likelihood, and posterior on a shared axis.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(theta, prior,     label='Prior Beta(1,1)',      linestyle='--')
ax.plot(theta, likelihood*max(posterior)/max(likelihood), label='Likelihood (scaled)')
ax.plot(theta, posterior, label=f'Posterior Beta({alpha_post},{beta_post})', linewidth=2)
ax.axvline(alpha_post/(alpha_post+beta_post), color='red', linestyle=':', label='Posterior mean')
ax.set_xlabel('theta (P(heads))')
ax.set_title(f'Bayesian Update: {heads}H/{tails}T')
ax.legend(); plt.tight_layout()
plt.savefig('bayesian_coin.png', dpi=80); plt.close()
# Posterior mean of Beta(a,b) is a/(a+b); credible interval via the ppf.
print(f'Posterior mean: {alpha_post/(alpha_post+beta_post):.3f}')
print(f'95% credible interval: ({beta.ppf(0.025,alpha_post,beta_post):.3f}, {beta.ppf(0.975,alpha_post,beta_post):.3f})')
Bayesian A/B test with Beta-Binomial
import numpy as np
from scipy.stats import beta

# A: 120 conversions from 1000 visitors
# B: 140 conversions from 1000 visitors
np.random.seed(42)

# Posteriors under a uniform Beta(1,1) prior: Beta(1+conv, 1+non-conv).
alpha_a, beta_a = 1 + 120, 1 + 880
alpha_b, beta_b = 1 + 140, 1 + 860

# Monte Carlo draws from each posterior.
samples_a = beta.rvs(alpha_a, beta_a, size=100000)
samples_b = beta.rvs(alpha_b, beta_b, size=100000)

lift = samples_b - samples_a
prob_b_better = (samples_b > samples_a).mean()
expected_lift = lift.mean()

print(f'Control A: rate = {120/1000:.1%}')
print(f'Variant B: rate = {140/1000:.1%}')
print(f'P(B > A):  {prob_b_better:.1%}')
print(f'Expected lift: +{expected_lift:.2%}')
print(f'95% CI for lift: ({np.percentile(lift,2.5):.2%}, {np.percentile(lift,97.5):.2%})')
Conjugate priors for different distributions
import numpy as np
from scipy.stats import beta, gamma, norm

np.random.seed(42)

# 1. Beta-Binomial: click-through rate estimation
clicks, views = 45, 200
alpha, beta_param = 1 + clicks, 1 + (views - clicks)
ctr_lo, ctr_hi = beta.ppf([0.025, 0.975], alpha, beta_param)
print('CTR posterior:')
print(f'  Mean: {alpha/(alpha+beta_param):.3f}')
print(f'  95% CI: ({ctr_lo:.3f}, {ctr_hi:.3f})')

# 2. Normal-Normal: mean estimation (known variance sigma^2 = 4)
data = np.random.normal(5.0, 2.0, 30)  # true mean=5
prior_mean, prior_var = 0, 10  # weak prior
likelihood_var = 4 / len(data)  # variance of the sample mean: sigma^2/n
# Precision-weighted combination of prior mean and sample mean.
posterior_var  = 1 / (1/prior_var + 1/likelihood_var)
posterior_mean = posterior_var * (prior_mean/prior_var + data.mean()/likelihood_var)
print(f'\nNormal posterior mean: {posterior_mean:.3f} (true=5.0, sample mean={data.mean():.3f})')

# 3. Gamma-Poisson: event rate estimation
events_per_day = [3, 5, 2, 4, 6, 3, 4]  # observed counts
alpha_g, beta_g = 1 + sum(events_per_day), 1 + len(events_per_day)
print(f'\nPoisson rate posterior mean: {alpha_g/beta_g:.3f} events/day')
Markov Chain Monte Carlo (MCMC) concept
import numpy as np

np.random.seed(42)

def log_posterior(theta, data, prior_std=2.0):
    """Log posterior for normal likelihood, normal prior."""
    log_prior = -0.5 * (theta / prior_std)**2
    log_likelihood = -0.5 * np.sum((data - theta)**2)
    return log_prior + log_likelihood

def metropolis(data, n_samples=5000, proposal_std=0.5):
    """Random-walk Metropolis sampler; returns draws after a 500-step burn-in."""
    chain = np.empty(n_samples)
    current = 0.0
    current_lp = log_posterior(current, data)
    for i in range(n_samples):
        proposed = current + np.random.normal(0, proposal_std)
        proposed_lp = log_posterior(proposed, data)
        # Accept with probability min(1, posterior ratio), on the log scale.
        if np.log(np.random.uniform()) < proposed_lp - current_lp:
            current, current_lp = proposed, proposed_lp
        chain[i] = current
    return chain[500:]

# True parameter = 3.0
data = np.random.normal(3.0, 1.0, 20)
samples = metropolis(data)

print(f'Data mean:        {data.mean():.3f}')
print(f'MCMC posterior mean: {samples.mean():.3f}')
print(f'MCMC 95% CI: ({np.percentile(samples,2.5):.3f}, {np.percentile(samples,97.5):.3f})')
print(f'Acceptance rate: {len(np.unique(samples.round(6)))/len(samples):.1%}')
💼 Real-World Scenario
A marketing team runs an A/B test for 2 weeks. Using a Bayesian Beta-Binomial model, they continuously estimate P(B>A) and stop early once confidence exceeds 95%.
Real-World Code
import numpy as np
from scipy.stats import beta

np.random.seed(42)
# Simulate daily data: A has 8% true rate, B has 10% true rate
true_a, true_b = 0.08, 0.10
n_days = 14
daily_visitors = 100

alpha_a, beta_a = 1, 1  # uniform prior
alpha_b, beta_b = 1, 1

print('Day | Visitors A/B | Conv A/B | P(B>A)')
print('-'*50)
# Sequential Bayesian updating: fold each day's conversions into the Beta
# posteriors, then estimate P(B>A) by Monte Carlo sampling.
for day in range(1, n_days+1):
    conv_a = np.random.binomial(daily_visitors, true_a)
    conv_b = np.random.binomial(daily_visitors, true_b)
    alpha_a += conv_a; beta_a += (daily_visitors - conv_a)
    alpha_b += conv_b; beta_b += (daily_visitors - conv_b)
    # Column 0 = draws from A's posterior, column 1 = draws from B's.
    samples = np.random.beta([alpha_a, alpha_b], [beta_a, beta_b], size=(100000, 2))
    prob_b_better = (samples[:,1] > samples[:,0]).mean()
    print(f' {day:2d} | {daily_visitors}/{daily_visitors}        | {conv_a:3d}/{conv_b:3d}    | {prob_b_better:.1%}')
    # Early stopping once posterior confidence in B exceeds 95%.
    if prob_b_better > 0.95:
        print(f'>>> Declare B winner on day {day}! (P(B>A)={prob_b_better:.1%})')
        break
🏋️ Practice: Bayesian Conversion Rate
You observe 30 conversions from 200 visitors. Use a Beta(1,1) prior. Compute the posterior distribution, posterior mean, and 90% credible interval. Compare to the MLE estimate.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
from scipy.stats import beta
import numpy as np

conversions = 30
visitors = 200

# Prior: Beta(1, 1) = uniform
alpha_prior, beta_prior = 1, 1

# TODO: compute posterior parameters
# TODO: compute posterior mean
# TODO: compute 90% credible interval (5th and 95th percentile)
# TODO: compute MLE estimate (conversions/visitors)
# TODO: print comparison
✅ Practice Checklist
12. Survival Analysis

Analyze time-to-event data β€” customer churn, equipment failure, clinical trials β€” with Kaplan-Meier curves and the log-rank test.

Kaplan-Meier survival curve from scratch
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# (duration, event_occurred) β€” 0 = censored (still alive/active)
data = [(5,1),(8,0),(12,1),(14,1),(2,0),(20,1),(7,1),(18,0),(3,1),(25,1),
        (11,1),(6,0),(15,1),(9,1),(22,0),(4,1),(16,1),(10,0),(13,1),(19,1)]

def km_estimator(data):
    """Kaplan-Meier survival estimate as a list of (time, S(t)) steps.

    BUG FIX: the risk set at each event time is now everyone whose observed
    duration is >= t. Previously, censored subjects only left the risk set
    when their duration coincided with an event time, so interior censoring
    (e.g. t=2, 6, 8 in this data) never shrank the risk set and S(t) was
    biased upward in the denominator.
    """
    S = 1.0
    result = [(0, 1.0)]
    for t in sorted({d for d, e in data if e == 1}):
        at_risk = sum(1 for d, _ in data if d >= t)   # still under observation at t
        events = sum(1 for d, e in data if d == t and e == 1)
        S *= 1 - events / at_risk
        result.append((t, S))
    return result

km = km_estimator(data)
times_km, surv_km = zip(*km)

fig, ax = plt.subplots(figsize=(8, 5))
ax.step(times_km, surv_km, where='post', linewidth=2, color='steelblue', label='KM estimate')
ax.axhline(0.5, color='red', linestyle='--', label='Median survival')
ax.set_xlabel('Time'); ax.set_ylabel('Survival Probability')
ax.set_title('Kaplan-Meier Survival Curve'); ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('km_curve.png', dpi=80); plt.close()
print('Saved km_curve.png')
# First time S(t) drops below 0.5 approximates the median survival time.
median_idx = next(i for i,s in enumerate(surv_km) if s < 0.5)
print(f'Median survival: ~{times_km[median_idx]} units')
Customer churn survival analysis
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
n = 200
# Simulate customer tenures: plan A churns faster
plan_a_times = np.random.exponential(8, n//2).clip(0, 24).round(0)
plan_b_times = np.random.exponential(14, n//2).clip(0, 24).round(0)
events_a = (plan_a_times < 24).astype(int)  # 1=churned, 0=still active (censored)
events_b = (plan_b_times < 24).astype(int)

def km_curve(durations, events):
    # Kaplan-Meier retention curve as parallel (times, survival) sequences.
    # NOTE(review): at_risk only shrinks at event times. In this simulation
    # all censoring happens at the 24-month cap (beyond every event time), so
    # the bookkeeping is exact here β€” but with interior censoring the risk
    # set should be recomputed as "durations >= t"; confirm before reuse.
    n = len(durations)
    times = sorted(set(durations[events==1]))
    S, at_risk = 1.0, n
    curve = [(0, 1.0)]
    for t in times:
        d = sum((durations==t) & (events==1))
        c = sum((durations==t) & (events==0))
        S *= 1 - d/at_risk
        curve.append((t, S)); at_risk -= d + c
    return zip(*curve)

fig, ax = plt.subplots(figsize=(9, 5))
# Unpack each plan's step curve and draw both on a shared axis.
for times, surv, label, color in [
    (*km_curve(plan_a_times, events_a), 'Plan A (basic)', 'tomato'),
    (*km_curve(plan_b_times, events_b), 'Plan B (premium)', 'steelblue'),
]:
    ax.step(list(times), list(surv), where='post', label=label, color=color, linewidth=2)
ax.set_xlabel('Months'); ax.set_ylabel('Retention Rate')
ax.set_title('Customer Retention: Plan A vs Plan B')
ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('churn_survival.png', dpi=80); plt.close()
print('Saved churn_survival.png')
# Crude estimate: share of each 100-customer plan with tenure >= 12 months.
print(f'Plan A 12-month retention: ~{plan_a_times[plan_a_times>=12].shape[0]/n*2:.0%}')
print(f'Plan B 12-month retention: ~{plan_b_times[plan_b_times>=12].shape[0]/n*2:.0%}')
Log-rank test for group comparison
import numpy as np
from scipy.stats import chi2

np.random.seed(42)
# Group 1: faster failure (mean=10)
# Group 2: slower failure (mean=18)
g1_times  = np.random.exponential(10, 50).clip(0,30)
g1_events = (g1_times < 30).astype(int)
g2_times  = np.random.exponential(18, 50).clip(0,30)
g2_events = (g2_times < 30).astype(int)

def log_rank_test(t1, e1, t2, e2):
    """Two-group log-rank test; returns (chi2 statistic, p-value).

    BUG FIX: risk sets are recomputed as "duration >= t" at every event time.
    The previous running subtraction removed every subject with duration <= t
    again on each iteration, collapsing the risk sets far too fast. The
    statistic now uses the standard hypergeometric variance of d1.
    """
    event_times = sorted(set(np.concatenate([t1[e1 == 1], t2[e2 == 1]])))
    O1 = E1 = V = 0.0
    for t in event_times:
        n1 = int((t1 >= t).sum())   # at risk in group 1 just before t
        n2 = int((t2 >= t).sum())
        d1 = int(((t1 == t) & (e1 == 1)).sum())
        d2 = int(((t2 == t) & (e2 == 1)).sum())
        n, d = n1 + n2, d1 + d2
        if n < 2 or d == 0:
            continue
        O1 += d1
        E1 += d * n1 / n
        V  += d * (n1 / n) * (n2 / n) * (n - d) / (n - 1)
    chi2_stat = (O1 - E1) ** 2 / V
    p_value = 1 - chi2.cdf(chi2_stat, df=1)
    return chi2_stat, p_value

chi2_stat, p = log_rank_test(g1_times, g1_events, g2_times, g2_events)
print(f'Log-rank chi2: {chi2_stat:.3f}')
print(f'p-value: {p:.4f}')
print(f'Groups differ significantly: {p < 0.05}')
Hazard rate and hazard ratio
import numpy as np

np.random.seed(42)

# Exponential model: S(t) = exp(-lambda*t), with a constant hazard lambda.
lambda_rate = 0.1  # hazard rate
time_points = np.arange(1, 21)
survival    = np.exp(-lambda_rate * time_points)
hazard      = np.full(time_points.shape, float(lambda_rate))

print('Exponential survival (lambda=0.1):')
print(f'  Survival at t=5:  {np.exp(-0.1*5):.3f}')
print(f'  Survival at t=10: {np.exp(-0.1*10):.3f}')
print(f'  Median survival:  {np.log(2)/0.1:.1f} time units')

# Hazard ratio: relative instantaneous risk of group A vs group B.
lambda_a = 0.15  # higher hazard
lambda_b = 0.08
hr = lambda_a / lambda_b
print(f'\nHazard Ratio (A vs B): {hr:.2f}')
print(f'Group A is {hr:.1f}x more likely to fail at any time point')
print(f'Group A median: {np.log(2)/lambda_a:.1f} | Group B median: {np.log(2)/lambda_b:.1f}')
💼 Real-World Scenario
A SaaS company tracks how long customers stay before canceling. Kaplan-Meier curves compare retention for free-trial vs. direct-purchase cohorts.
Real-World Code
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
# Cohort 1: free trial then convert (longer retention)
# Cohort 2: direct paid (shorter retention but committed)
cohorts = {
    'Free Trial': np.random.exponential(18, 150).clip(0, 36),
    'Direct Paid': np.random.exponential(12, 100).clip(0, 36),
}
# Event flag: 1 = churned inside the 36-month window, 0 = censored at 36.
events = {k: (v < 36).astype(int) for k, v in cohorts.items()}

def km_curve(durations, ev):
    # Kaplan-Meier curve as (times, survival) sequences.
    # NOTE(review): `ar` only shrinks at event times. Here all censoring
    # happens at the 36-month cap (after every event time), so this is exact
    # for this data β€” with interior censoring the risk set should instead be
    # recomputed as "durations >= t"; confirm before reuse.
    n = len(durations)
    times = sorted(set(durations[ev==1]))
    S, ar = 1.0, n
    pts = [(0, 1.0)]
    for t in times:
        d = ((durations==t) & (ev==1)).sum()
        c = ((durations==t) & (ev==0)).sum()
        S *= 1 - d/ar; pts.append((t, S)); ar -= d + c
    return zip(*pts)

fig, ax = plt.subplots(figsize=(9, 5))
colors = ['#2196F3', '#F44336']
# One step curve per cohort, plus a per-cohort churn summary line.
for (cohort, times_arr), color in zip(cohorts.items(), colors):
    t_pts, s_pts = km_curve(times_arr, events[cohort])
    ax.step(list(t_pts), list(s_pts), where='post', label=cohort, color=color, linewidth=2)
    churned = events[cohort].sum()
    print(f'{cohort}: n={len(times_arr)}, churned={churned} ({churned/len(times_arr):.0%}), avg tenure={times_arr.mean():.1f} months')
ax.axhline(0.5, color='gray', linestyle='--', linewidth=1)
ax.set_xlabel('Months Since Signup'); ax.set_ylabel('Retention Rate')
ax.set_title('Customer Retention by Acquisition Channel')
ax.legend(); ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('retention_cohorts.png', dpi=80); plt.close()
print('Saved retention_cohorts.png')
🏋️ Practice: Equipment Failure Analysis
Simulate failure times for two machine types (A: exponential mean=100 days, B: exponential mean=150 days, both n=80, max=200 days). Plot KM curves and run a log-rank test.
Starter Code
# Starter scaffold: fill in the TODOs to complete the exercise.
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

np.random.seed(42)
# TODO: generate failure times for A (mean=100) and B (mean=150), clip at 200
# TODO: compute event flags (1 if failed < 200, 0 if censored)
# TODO: implement km_curve function (or copy from examples)
# TODO: plot both KM curves on same figure
# TODO: compute and print log-rank test p-value
# TODO: save to 'equipment_survival.png'
✅ Practice Checklist
13. Resampling & Simulation

Use bootstrapping, permutation tests, and Monte Carlo simulation to estimate uncertainty and test hypotheses without distributional assumptions.

Bootstrap confidence intervals
import numpy as np

np.random.seed(42)
data = np.random.exponential(2.0, 50)  # skewed data
print(f'Sample mean: {data.mean():.4f}')

# Percentile bootstrap: resample with replacement, collect the statistic.
n_boot = 10000
boot_means = np.empty(n_boot)
for b in range(n_boot):
    resample = np.random.choice(data, len(data), replace=True)
    boot_means[b] = resample.mean()

ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])
print(f'Bootstrap 95% CI: ({ci_lower:.4f}, {ci_upper:.4f})')
print(f'Bootstrap SE: {boot_means.std():.4f}')

# Compare to normal approximation
from scipy.stats import sem
norm_ci = data.mean() + np.array([-1.96, 1.96]) * sem(data)
print(f'Normal approx 95% CI: ({norm_ci[0]:.4f}, {norm_ci[1]:.4f})')
Permutation test for group difference
import numpy as np

np.random.seed(42)
# Two groups β€” is the mean difference real or chance?
group_a = np.random.normal(10.0, 3.0, 40)
group_b = np.random.normal(11.5, 3.0, 40)
obs_diff = group_b.mean() - group_a.mean()

combined = np.concatenate([group_a, group_b])
n_a = len(group_a)
n_perm = 10000

# BUG FIX: a permutation test must shuffle ONCE per iteration and split the
# shuffled pool into two pseudo-groups. The old code drew two independent
# permutations and differenced their first n_a elements, which does not
# sample the permutation null distribution.
perm_diffs = np.empty(n_perm)
for i in range(n_perm):
    shuffled = np.random.permutation(combined)
    perm_diffs[i] = shuffled[n_a:].mean() - shuffled[:n_a].mean()

# Two-sided p-value: fraction of permuted diffs at least as extreme.
p_value = (np.abs(perm_diffs) >= np.abs(obs_diff)).mean()
print(f'Observed difference: {obs_diff:.4f}')
print(f'Permutation p-value: {p_value:.4f}')
print(f'Significant at 0.05: {p_value < 0.05}')
Monte Carlo simulation β€” Pi estimation
import numpy as np

np.random.seed(42)
n_samples_list = [100, 1000, 10000, 100000, 1000000]

print(f'True pi: {np.pi:.6f}')
print(f"{'N':>10} {'Pi estimate':>12} {'Error':>10}")
for n in n_samples_list:
    # Uniform points in the square [-1,1]^2; the hit ratio inside the unit
    # circle estimates pi/4.
    x, y = np.random.uniform(-1, 1, (2, n))
    hit_ratio = ((x**2 + y**2) <= 1.0).mean()
    pi_est = 4 * hit_ratio
    error = abs(pi_est - np.pi)
    print(f'{n:>10,} {pi_est:>12.6f} {error:>10.6f}')
Bootstrap for median and other statistics
import numpy as np

np.random.seed(42)
wages = np.array([35, 42, 28, 55, 38, 47, 31, 60, 44, 39,
                  52, 36, 29, 48, 41, 65, 33, 57, 43, 37]) * 1000

def bootstrap_ci(data, stat_fn, n_boot=10000, ci=95):
    """Percentile bootstrap CI for an arbitrary statistic of `data`."""
    boot_stats = []
    for _ in range(n_boot):
        resample = np.random.choice(data, len(data), replace=True)
        boot_stats.append(stat_fn(resample))
    tail = (100 - ci) / 2
    return np.percentile(boot_stats, [tail, 100 - tail])

# The same machinery works for any statistic, not just the mean.
stats = {
    'mean':   np.mean,
    'median': np.median,
    'std':    np.std,
    'q75':    lambda x: np.percentile(x, 75),
}

for name, fn in stats.items():
    ci = bootstrap_ci(wages, fn)
    print(f'{name:6s}: {fn(wages):>8,.0f}  95% CI: (${ci[0]:,.0f}, ${ci[1]:,.0f})')
💼 Real-World Scenario
An analyst needs confidence intervals for the median revenue per user, which is skewed. Bootstrap gives reliable CIs without assuming normality.
Real-World Code
import numpy as np

np.random.seed(42)
# Simulated revenue per user (heavy-tailed)
revenue = np.random.lognormal(mean=3.5, sigma=1.2, size=500)

print(f'n={len(revenue)}')
print(f'Mean:   ${revenue.mean():,.2f}')
print(f'Median: ${np.median(revenue):,.2f}')

def bootstrap_stat(data, stat_fn, n_boot=10000):
    """Return n_boot bootstrap replicates of stat_fn applied to `data`."""
    boots = []
    for _ in range(n_boot):
        boots.append(stat_fn(np.random.choice(data, len(data), replace=True)))
    return np.array(boots)

mean_boots   = bootstrap_stat(revenue, np.mean)
median_boots = bootstrap_stat(revenue, np.median)

# Report observed value, percentile CI, and bootstrap SE for each statistic.
for name, boots, obs in [('mean', mean_boots, revenue.mean()), ('median', median_boots, np.median(revenue))]:
    ci = np.percentile(boots, [2.5, 97.5])
    print(f'\n{name.capitalize()} bootstrap:')
    print(f'  Observed: ${obs:,.2f}')
    print(f'  95% CI:   (${ci[0]:,.2f}, ${ci[1]:,.2f})')
    print(f'  SE:       ${boots.std():,.2f}')
🏋️ Practice: A/B Test with Permutation
Group A: 50 users, mean session time 4.2 min. Group B: 50 users, mean 4.8 min. Run a permutation test with 10,000 iterations. Report the p-value and whether the difference is significant.
Starter Code
import numpy as np

# Reproducible simulated session times (minutes) for the A/B exercise.
np.random.seed(42)
group_a = np.random.normal(4.2, 1.5, 50)  # variant A: true mean 4.2 min
group_b = np.random.normal(4.8, 1.5, 50)  # variant B: true mean 4.8 min

# TODO: compute observed difference
# TODO: combine groups, run 10000 permutations
# TODO: compute permutation p-value (two-tailed)
# TODO: print results
✅ Practice Checklist
14. Bootstrap Methods & Resampling

Percentile Bootstrap CI
import numpy as np
np.random.seed(42)
data = np.random.exponential(scale=2, size=50)
B = 10000
# Each replicate is the mean of a same-size resample drawn with replacement.
boot_means = []
for _ in range(B):
    boot_means.append(np.mean(np.random.choice(data, size=len(data), replace=True)))
ci = np.percentile(boot_means, [2.5, 97.5])
print(f"Sample mean: {np.mean(data):.4f}")
print(f"Bootstrap 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
BCa Bootstrap (Bias-Corrected)
import numpy as np
from scipy import stats
np.random.seed(7)
# Skewed sample: the median's sampling distribution is asymmetric, which is
# exactly when BCa improves on the plain percentile interval.
data = np.random.lognormal(0, 0.5, 80)
stat_obs = np.median(data)
B = 5000
# Ordinary bootstrap replicates of the median.
boot_stats = [np.median(np.random.choice(data, size=len(data), replace=True)) for _ in range(B)]
# Bias correction z0: probit of the fraction of replicates below the observed statistic.
z0 = stats.norm.ppf(np.mean(np.array(boot_stats) < stat_obs))
# Acceleration a from jackknife (leave-one-out) medians: captures how fast the
# statistic's standard error changes with the data.
jk = [np.median(np.delete(data, i)) for i in range(len(data))]
jk_mean = np.mean(jk)
a = np.sum((jk_mean - np.array(jk))**3) / (6*np.sum((jk_mean - np.array(jk))**2)**1.5)
alpha = 0.05
z_lo = stats.norm.ppf(alpha/2); z_hi = stats.norm.ppf(1 - alpha/2)
# Efron's BCa formula shifts the percentile levels used for the interval.
p_lo = stats.norm.cdf(z0 + (z0+z_lo)/(1-a*(z0+z_lo)))
p_hi = stats.norm.cdf(z0 + (z0+z_hi)/(1-a*(z0+z_hi)))
ci = np.percentile(boot_stats, [p_lo*100, p_hi*100])
print(f"BCa 95% CI for median: [{ci[0]:.4f}, {ci[1]:.4f}]")
Bootstrap Hypothesis Test (Permutation)
import numpy as np
np.random.seed(0)
group_a = np.random.normal(5.0, 1.5, 40)
group_b = np.random.normal(5.6, 1.5, 40)
obs_diff = np.mean(group_b) - np.mean(group_a)
combined = np.concatenate([group_a, group_b])
B = 10000
# Shuffle the pooled sample and re-split it into two size-40 "groups";
# the resulting mean differences trace out the null distribution.
perm_diffs = []
for _ in range(B):
    shuffled = np.random.permutation(combined)
    first, second = shuffled[:40], shuffled[40:]
    perm_diffs.append(np.mean(second) - np.mean(first))
p_value = np.mean(np.abs(perm_diffs) >= np.abs(obs_diff))
print(f"Observed difference: {obs_diff:.4f}")
print(f"Permutation p-value: {p_value:.4f}")
💼 Real-World Scenario
E-commerce A/B test: compare conversion rates between checkout designs using bootstrap CIs instead of assuming normality for small samples.
Real-World Code
import numpy as np
np.random.seed(42)
# Simulated checkout conversion: design A=0.05, design B=0.065
n = 300
conv_a = np.random.binomial(1, 0.05, n).astype(float)
conv_b = np.random.binomial(1, 0.065, n).astype(float)
obs_diff = conv_b.mean() - conv_a.mean()
B = 20000
# Bootstrap the rate difference by resampling each arm independently.
boot_diffs = []
for _ in range(B):
    rate_b = np.random.choice(conv_b, n, replace=True).mean()
    rate_a = np.random.choice(conv_a, n, replace=True).mean()
    boot_diffs.append(rate_b - rate_a)
ci = np.percentile(boot_diffs, [2.5, 97.5])
p_val = np.mean(np.array(boot_diffs) <= 0)
print(f"Observed lift: {obs_diff:.4f} ({obs_diff/conv_a.mean():.1%} relative)")
print(f"95% Bootstrap CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
print(f"One-sided p-value: {p_val:.4f}")
print("Significant at alpha=0.05:", ci[0] > 0)
🏋️ Practice: Bootstrap Customer Lifetime Value
You have 60 customer LTV values from a heavy-tailed distribution. Compute 90% and 95% percentile bootstrap CIs for the mean and the 75th percentile. Also run a two-sample permutation test comparing 'premium' vs 'standard' customers (30 each). Report p-values and effect sizes.
Starter Code
import numpy as np
np.random.seed(11)
# Heavy-tailed (Pareto) customer LTV values, shifted so the minimum is ~$50.
ltv = np.random.pareto(2, 60) * 100 + 50
premium = ltv[:30]    # first 30 customers labeled 'premium'
standard = ltv[30:]   # remaining 30 labeled 'standard'
B = 10000             # bootstrap / permutation iterations
# TODO: Bootstrap 90% and 95% CIs for mean LTV
# TODO: Bootstrap CI for 75th percentile LTV
# TODO: Permutation test: premium vs standard mean LTV
# TODO: Report Cohen's d effect size
✅ Practice Checklist
15. Multiple Testing & FDR Control

Bonferroni & Holm Corrections
import numpy as np
from scipy import stats
np.random.seed(42)
k = 20
# 18 nulls (no effect) + 2 true positives
p_values = np.concatenate([
    np.random.uniform(0, 1, 18),
    np.array([0.003, 0.011])
])
alpha = 0.05
# Bonferroni: multiply every raw p-value by the number of tests, capped at 1.
bonferroni = np.minimum(p_values * k, 1.0)
reject_bon = bonferroni < alpha
# Holm step-down: sort ascending, multiply the i-th smallest (0-indexed) by
# (k - i), then take a RUNNING MAX so adjusted p-values are monotone.  Without
# this step a hypothesis could be "rejected" even though a smaller raw p-value
# was not, which violates Holm's sequential procedure.
order = np.argsort(p_values)
stepdown = np.maximum.accumulate(p_values[order] * (k - np.arange(k)))
holm = np.empty(k)
holm[order] = np.minimum(stepdown, 1.0)
reject_holm = holm < alpha
print(f"Bonferroni rejects: {reject_bon.sum()} tests")
print(f"Holm rejects:       {reject_holm.sum()} tests")
Benjamini-Hochberg FDR Control
import numpy as np
np.random.seed(5)
k = 50
# 40 nulls + 10 true effects (small p-values)
p_values = np.concatenate([
    np.random.uniform(0.05, 1.0, 40),
    np.random.uniform(0, 0.02, 10)
])
alpha = 0.05
# Compare the sorted p-values against the BH step-up line (rank/k)*alpha.
order = np.argsort(p_values)
sorted_p = p_values[order]
bh_threshold = (np.arange(1, k+1) / k) * alpha
reject_sorted = sorted_p <= bh_threshold
# All tests up to last rejection are rejected
reject_bh = np.zeros(k, dtype=bool)
last_reject = np.where(reject_sorted)[0]
if len(last_reject):
    cutoff = last_reject[-1]
    reject_bh[order[:cutoff+1]] = True
print(f"BH FDR rejects: {reject_bh.sum()} of {k} tests")
print(f"Expected FDR ≀ {alpha}")
Q-value (Storey's Method)
import numpy as np
np.random.seed(2)
k = 100
p_vals = np.concatenate([np.random.uniform(0,1,75), np.random.beta(0.5,5,25)])
# Estimate pi0 (proportion of true nulls).  pi0_hat(lambda) is only unbiased
# as lambda -> 1, so fit a quadratic in lambda and EXTRAPOLATE to lambda = 1.
# (Taking the polyfit constant term evaluates the fit at lambda = 0, where
# pi0_hat is ~1 by construction -- that was the bug.)
lambdas = np.arange(0.05, 0.95, 0.05)
pi0_hat = [(p_vals >= l).sum() / (k * (1-l)) for l in lambdas]
coeffs = np.polyfit(lambdas, pi0_hat, 2)
pi0 = min(1.0, max(np.polyval(coeffs, 1.0), 0.0))
# Compute q-values: BH-style adjusted p-values scaled by pi0.
order = np.argsort(p_vals)
sorted_p = p_vals[order]
q = pi0 * k * sorted_p / (np.arange(1, k+1))
# Enforce monotonicity (a smaller p-value can never have a larger q-value).
for i in range(k-2, -1, -1):
    q[i] = min(q[i], q[i+1])
q_vals = np.empty(k); q_vals[order] = np.minimum(q, 1)
print(f"pi0 estimate: {pi0:.3f}")
print(f"Discoveries at FDR=0.05: {(q_vals <= 0.05).sum()}")
💼 Real-World Scenario
Genomics pipeline: after running 10,000 gene expression tests, apply BH correction to control FDR at 5% and identify truly differentially expressed genes.
Real-World Code
import numpy as np
np.random.seed(99)
n_genes = 10000
n_de = 200  # truly differentially expressed
# Simulate p-values: most null (uniform), some true effects (beta)
p_null = np.random.uniform(0, 1, n_genes - n_de)
p_de   = np.random.beta(0.3, 10, n_de)
p_values = np.concatenate([p_null, p_de])
alpha = 0.05
# BH procedure: step-up comparison of sorted p-values with (rank/m)*alpha.
order = np.argsort(p_values)
sorted_p = p_values[order]
ranks = np.arange(1, n_genes+1)
bh_thresh = (ranks / n_genes) * alpha
reject = np.zeros(n_genes, dtype=bool)
last = np.where(sorted_p <= bh_thresh)[0]
if len(last):
    cutoff = last[-1]
    reject[order[:cutoff+1]] = True
# True positive rate among real DE genes
tp = reject[-n_de:].sum()
fp = reject[:-n_de].sum()
print(f"Significant genes: {reject.sum()}")
print(f"True positives: {tp}/{n_de} ({tp/n_de:.1%} sensitivity)")
print(f"False positives: {fp} (FDR={fp/max(reject.sum(),1):.3f})")
🏋️ Practice: Drug Trial Multiple Endpoints
A clinical trial tests 12 endpoints (primary + secondary). Raw p-values are given. Apply Bonferroni, Holm, and BH corrections. Build a table showing which endpoints remain significant under each method. Discuss the tradeoff between FWER and FDR control in regulatory contexts.
Starter Code
import numpy as np
# 12 endpoint p-values from clinical trial
p_values = np.array([0.001, 0.008, 0.023, 0.046, 0.052, 0.071,
                     0.12, 0.18, 0.21, 0.34, 0.52, 0.74])
k = len(p_values)  # number of endpoints (hypotheses) tested
alpha = 0.05       # nominal significance / FDR level
# TODO: Bonferroni correction + reject/accept
# TODO: Holm step-down correction + reject/accept
# TODO: BH FDR correction + reject/accept
# TODO: Print comparison table
# TODO: Discuss which method to use and why
✅ Practice Checklist
16. Dimensionality Reduction for Statistical Analysis

PCA for Outlier Detection
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
X = np.random.multivariate_normal([0,0,0], [[1,.7,.3],[.7,1,.2],[.3,.2,1]], 200)
X[5] += [4, 4, 4]  # inject outlier
# Standardize, project onto 2 PCs, then map back to the original 3-D space.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
pca = PCA(n_components=2)
scores = pca.fit_transform(Xs)
reconstruction = pca.inverse_transform(scores)
# Points the 2-D subspace cannot represent get large reconstruction residuals.
residuals = ((Xs - reconstruction) ** 2).sum(axis=1)
threshold = np.percentile(residuals, 97.5)
outliers = np.where(residuals > threshold)[0]
print(f"Explained variance: {pca.explained_variance_ratio_.cumsum()[-1]:.2%}")
print(f"Reconstruction outliers (index): {outliers}")
t-SNE for Cluster Visualization
import numpy as np
from sklearn.manifold import TSNE
from sklearn.datasets import make_blobs
np.random.seed(42)
# 300 points in 8 dimensions arranged in 4 blobs.
X, y = make_blobs(n_samples=300, centers=4, n_features=8, cluster_std=1.5)
# Non-linear embedding into 2-D for visualization.
# NOTE(review): n_iter was renamed to max_iter in scikit-learn 1.5 — confirm
# the installed version before relying on this parameter name.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
X_2d = tsne.fit_transform(X)
# Report each true cluster's centroid and size in the embedded space.
for cls in np.unique(y):
    mask = y == cls
    print(f"Cluster {cls}: centroid ({X_2d[mask,0].mean():.2f}, {X_2d[mask,1].mean():.2f}), n={mask.sum()}")
# Lower final KL divergence = the embedding preserved neighborhoods better.
print(f"KL divergence: {tsne.kl_divergence_:.4f}")
UMAP + Statistical Tests on Components
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
np.random.seed(1)
# Two groups with different multivariate structure
g1 = np.random.multivariate_normal([0]*5, np.eye(5), 100)
g2 = np.random.multivariate_normal([0.5]*5, np.eye(5)*1.5, 100)
X = np.vstack([g1, g2])
labels = np.array([0]*100 + [1]*100)
# Project the standardized data onto the top 3 principal components.
pca = PCA(n_components=3)
Xp = pca.fit_transform(StandardScaler().fit_transform(X))
# Test whether the two groups separate along each component.
for i in range(3):
    t, p = stats.ttest_ind(Xp[labels==0, i], Xp[labels==1, i])
    verdict = 'significant' if p < 0.05 else 'not sig'
    print(f"PC{i+1}: t={t:.3f}, p={p:.4f} ({verdict})")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")
💼 Real-World Scenario
Credit risk modelling: apply PCA to 20 correlated financial features, use the top components as inputs for logistic regression, and test if the components differ significantly between default and non-default customers.
Real-World Code
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from scipy import stats
np.random.seed(42)
n = 500
# 20 correlated features simulating financial metrics
cov = 0.4 * np.ones((20, 20)) + 0.6 * np.eye(20)
X_raw = np.random.multivariate_normal(np.zeros(20), cov, n)
# Default probability depends on first latent factor
latent = X_raw[:, :5].mean(axis=1)
prob = 1 / (1 + np.exp(-(latent - 0.3)))
y = np.random.binomial(1, prob, n)
# Standardize, then compress the 20 features into 5 principal components.
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
print(f"Variance explained: {pca.explained_variance_ratio_.cumsum()[-1]:.2%}")
# Do defaulters and non-defaulters separate along any component?
for i in range(5):
    t, p = stats.ttest_ind(X_pca[y==0, i], X_pca[y==1, i])
    print(f"PC{i+1}: defaulters vs non-defaulters t={t:.2f}, p={p:.4f}")
# Cross-validate a logistic model built on the components.
clf = LogisticRegression()
scores = cross_val_score(clf, X_pca, y, cv=5, scoring='roc_auc')
print(f"Logistic Regression AUC (PCA features): {scores.mean():.3f} Β± {scores.std():.3f}")
🏋️ Practice: Customer Segmentation with Dimensionality Reduction
Apply PCA then k-means (k=4) to a simulated customer dataset with 15 features. For each cluster, run one-way ANOVA on the top 3 PCs and 3 original features to test for between-cluster differences. Report F-statistics, p-values, and effect sizes (eta-squared). Visualize clusters in 2D PCA space.
Starter Code
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats
np.random.seed(21)
# 400 samples x 15 features, initially pure noise.
X = np.random.randn(400, 15)
# Inject 4-cluster structure
for i in range(4):
    X[i*100:(i+1)*100, :5] += i * 1.2  # shift first 5 features per block of 100
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
# TODO: PCA to 2D and 5D
# TODO: KMeans(k=4) on 5D PCA
# TODO: ANOVA on top 3 PCs and 3 original features per cluster
# TODO: Report eta-squared effect sizes
# TODO: Plot 2D PCA colored by cluster
✅ Practice Checklist
17. Hypothesis Testing (t-tests & z-tests)

Hypothesis testing evaluates whether observed data provides sufficient evidence against a null hypothesis H0. One-sample, two-sample, and paired t-tests cover most practical scenarios. Always check assumptions: normality, equal variance.

One-sample and two-sample t-tests
from scipy import stats
import numpy as np

np.random.seed(42)

# One-sample t-test: is the mean different from a known value?
weights = np.random.normal(70, 8, 30)
t_stat, p_val = stats.ttest_1samp(weights, popmean=72)
reject_one = p_val < 0.05
print(f"One-sample t-test: t={t_stat:.3f}, p={p_val:.4f}")
print(f"  Reject H0 (mean=72)? {reject_one}")

# Two-sample t-test: do two groups have different means?
group_a = np.random.normal(75, 10, 40)
group_b = np.random.normal(80, 10, 40)
t2, p2 = stats.ttest_ind(group_a, group_b)
reject_two = p2 < 0.05
print(f"Two-sample t-test: t={t2:.3f}, p={p2:.4f}")
print(f"  Reject H0 (equal means)? {reject_two}")

# Welch's t-test — safer default when variances may be unequal.
group_c = np.random.normal(80, 20, 40)  # larger std
t3, p3 = stats.ttest_ind(group_a, group_c, equal_var=False)
print(f"Welch's t-test: t={t3:.3f}, p={p3:.4f}")
print(f"  Mean A={group_a.mean():.2f}, Mean C={group_c.mean():.2f}")
Paired t-test and confidence intervals
from scipy import stats
import numpy as np

np.random.seed(0)
# Paired t-test: before/after measurements on same subjects
before = np.random.normal(130, 15, 25)   # blood pressure before treatment
after  = before - np.random.normal(8, 5, 25)  # treatment lowers BP

t, p = stats.ttest_rel(before, after)
diff = before - after
# CI for the mean paired difference uses a t-distribution with n-1 df.
ci = stats.t.interval(0.95, df=len(diff)-1,
                       loc=diff.mean(), scale=stats.sem(diff))

print(f"Paired t-test: t={t:.3f}, p={p:.4f}")
print(f"Mean reduction: {diff.mean():.2f} mmHg")
print(f"95% CI: ({ci[0]:.2f}, {ci[1]:.2f})")
print(f"Significant reduction? {p < 0.05}")

# Manual CI for a mean
n, mean, se = len(before), before.mean(), stats.sem(before)
ci_manual = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)
print(f"\nCI for baseline BP mean {mean:.1f}: ({ci_manual[0]:.1f}, {ci_manual[1]:.1f})")
Chi-square test and proportions z-test
from scipy import stats
import numpy as np

# Chi-square test for independence
# Question: Is treatment type independent of recovery?
observed = np.array([[45, 15],   # Drug A: recovered, not recovered
                     [30, 30],   # Drug B
                     [38, 22]])  # Placebo

chi2, p, dof, expected = stats.chi2_contingency(observed)
print(f"Chi-square: chi2={chi2:.3f}, p={p:.4f}, dof={dof}")
print(f"Association between treatment and recovery? {p < 0.05}")

# Two-proportion z-test
# Are conversion rates different between landing pages?
# NOTE: scipy.stats has NO proportions_ztest (that function lives in
# statsmodels.stats.proportion), so compute the pooled z statistic directly.
n_a, conv_a = 1000, 120   # page A
n_b, conv_b = 1000, 145   # page B

rate_a, rate_b = conv_a / n_a, conv_b / n_b
p_pool = (conv_a + conv_b) / (n_a + n_b)                 # pooled rate under H0
se = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))    # pooled standard error
z = (rate_b - rate_a) / se
p_prop = 2 * stats.norm.sf(abs(z))                       # two-sided p-value
print(f"\nProportions z-test: z={z:.3f}, p={p_prop:.4f}")
print(f"Rate A={conv_a/n_a:.1%}, Rate B={conv_b/n_b:.1%}")
print(f"Significantly different? {p_prop < 0.05}")
💼 Real-World Scenario
A pharma company tests whether a new drug reduces cholesterol more than a placebo. Paired t-test on 50 patients (pre/post treatment) with a significance level of 0.05.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(7)
n = 50
# Baseline cholesterol for both arms of the trial.
drug_group    = np.random.normal(200, 30, n)
placebo_group = np.random.normal(200, 30, n)

# Post-treatment values: drug lowers cholesterol ~25, placebo ~5.
drug_post    = drug_group    - np.random.normal(25, 10, n)
placebo_post = placebo_group - np.random.normal(5,  10, n)

drug_diff    = drug_group    - drug_post
placebo_diff = placebo_group - placebo_post

# Paired tests within each arm, then an independent test on the reductions.
t_drug,    p_drug    = stats.ttest_rel(drug_group,    drug_post)
t_placebo, p_placebo = stats.ttest_rel(placebo_group, placebo_post)
t_compare, p_compare = stats.ttest_ind(drug_diff,     placebo_diff)

print(f"Drug group:    mean reduction={drug_diff.mean():.1f}, p={p_drug:.4f}")
print(f"Placebo group: mean reduction={placebo_diff.mean():.1f}, p={p_placebo:.4f}")
print(f"Drug vs Placebo: t={t_compare:.3f}, p={p_compare:.4f}")
print(f"Drug significantly better? {p_compare < 0.05}")
🏋️ Practice: A/B Test Analysis
Two email subject lines were tested: A sent to 800 users (92 opened), B to 800 users (117 opened). (1) Run a two-proportion z-test. (2) Compute the 95% CI for the difference in rates. (3) Calculate the lift (relative improvement). (4) Determine the minimum detectable effect if you want 80% power at alpha=0.05.
Starter Code
from scipy import stats
import numpy as np

# Email A/B test: opens out of sends for each subject line.
n_a, open_a = 800, 92
n_b, open_b = 800, 117

# (1) two-proportion z-test
# (2) 95% CI for difference
# (3) lift = (rate_b - rate_a) / rate_a
# (4) minimum detectable effect hint: use stats.norm.isf for z-scores
rate_a = open_a / n_a  # observed open rate, subject line A
rate_b = open_b / n_b  # observed open rate, subject line B
print(f"Rate A: {rate_a:.3f}, Rate B: {rate_b:.3f}")
✅ Practice Checklist
18. ANOVA & Post-hoc Tests

One-way ANOVA tests whether means differ across 3+ groups in a single factor. Two-way ANOVA adds a second factor and interaction. Post-hoc tests (Tukey, Bonferroni) identify which pairs differ.

One-way ANOVA with post-hoc Tukey HSD
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

np.random.seed(42)
# Crop yield under 4 fertilizer types
groups = {
    'Control':  np.random.normal(50, 8, 20),
    'FertA':    np.random.normal(58, 8, 20),
    'FertB':    np.random.normal(55, 8, 20),
    'FertC':    np.random.normal(62, 8, 20),
}

f_stat, p_val = stats.f_oneway(*groups.values())
print(f"One-way ANOVA: F={f_stat:.3f}, p={p_val:.4f}")
print(f"Significant difference exists? {p_val < 0.05}\n")

# Tukey HSD post-hoc.  (The original had `import numpy as np as _np` here,
# which is a SyntaxError; numpy is already imported above.)
all_data   = np.concatenate(list(groups.values()))
all_labels = np.repeat(list(groups.keys()), 20)

tukey = pairwise_tukeyhsd(endog=all_data, groups=all_labels, alpha=0.05)
print(tukey.summary())
Two-way ANOVA with interaction
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np, pandas as pd

np.random.seed(42)
n = 120
# 'yield' is a Python keyword and cannot appear as a bare name in a patsy
# formula (ols('yield ~ ...') raises a parse error), so the response column
# is named 'crop_yield'.
df = pd.DataFrame({
    'fertilizer': np.repeat(['A','B','C'], n//3),
    'irrigation': np.tile(['Low','High'], n//2),
    'crop_yield': (np.random.normal(55, 8, n) +
                   np.where(np.repeat(['A','B','C'], n//3)=='C', 8, 0) +
                   np.where(np.tile(['Low','High'], n//2)=='High', 5, 0)),
})

# Two-way ANOVA with interaction term, type II sums of squares.
model = ols('crop_yield ~ C(fertilizer) + C(irrigation) + C(fertilizer):C(irrigation)',
            data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table.round(4))
print(f"\nFertilizer effect: p={anova_table.loc['C(fertilizer)','PR(>F)']:.4f}")
print(f"Irrigation effect: p={anova_table.loc['C(irrigation)','PR(>F)']:.4f}")
Kruskal-Wallis (non-parametric ANOVA) + Bonferroni
from scipy import stats
from statsmodels.stats.multitest import multipletests
import numpy as np
from itertools import combinations

np.random.seed(5)
# Non-normal data: response times across 3 UI designs
groups = {
    'Design A': np.random.exponential(2.0, 30),
    'Design B': np.random.exponential(1.5, 30),
    'Design C': np.random.exponential(2.5, 30),
}
names = list(groups.keys())
data  = list(groups.values())

# Kruskal-Wallis: rank-based omnibus test, no normality assumption.
H, p = stats.kruskal(*data)
print(f"Kruskal-Wallis: H={H:.3f}, p={p:.4f}")

# Pairwise Mann-Whitney U, Bonferroni-adjusted across the pairs.
pairs = list(combinations(range(len(names)), 2))
raw_p = [stats.mannwhitneyu(data[i], data[j], alternative='two-sided')[1]
         for i, j in pairs]

reject, p_adj, _, _ = multipletests(raw_p, method='bonferroni')
for (i,j), pv, pa, rej in zip(pairs, raw_p, p_adj, reject):
    print(f"  {names[i]} vs {names[j]}: raw_p={pv:.4f}, adj_p={pa:.4f}, reject={rej}")
💼 Real-World Scenario
A UX researcher tests 3 onboarding flows on task completion time across 60 users. ANOVA reveals a significant difference; Tukey HSD pinpoints which flow pairs differ significantly.
Real-World Code
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np, pandas as pd

np.random.seed(99)
# Task completion times (seconds) for three onboarding flows.
flows = {'Flow 1': np.random.normal(45, 12, 20),
         'Flow 2': np.random.normal(38, 10, 20),
         'Flow 3': np.random.normal(52, 14, 20)}

# Omnibus test: do ANY of the flow means differ?
f, p = stats.f_oneway(*flows.values())
print(f"ANOVA: F={f:.3f}, p={p:.4f}")

# Pairwise comparisons with family-wise error control.
all_times  = np.concatenate(list(flows.values()))
all_labels = np.repeat(list(flows.keys()), 20)
tukey = pairwise_tukeyhsd(all_times, all_labels, alpha=0.05)
print(tukey.summary())

# Per-flow descriptive statistics.
for name, times in flows.items():
    print(f"{name}: mean={times.mean():.1f}s, sd={times.std():.1f}s")
🏋️ Practice: Marketing Channel ANOVA
Generate sales data for 4 marketing channels (Email, Social, Search, Display) β€” 25 observations each with means 120, 135, 150, 110 and std=25. (1) Run one-way ANOVA. (2) If significant, run Tukey HSD to identify which channels differ. (3) Compute eta-squared (effect size = SS_between / SS_total).
Starter Code
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np

# Simulated weekly sales for 4 marketing channels, 25 observations each.
np.random.seed(42)
channels = {'Email':   np.random.normal(120, 25, 25),
            'Social':  np.random.normal(135, 25, 25),
            'Search':  np.random.normal(150, 25, 25),
            'Display': np.random.normal(110, 25, 25)}

# (1) One-way ANOVA
# (2) Tukey HSD if significant
# (3) eta-squared = SS_between / SS_total
# SS_total = sum of (xi - grand_mean)^2 across all observations
✅ Practice Checklist
19. Correlation Analysis

Pearson measures linear association between continuous variables. Spearman and Kendall are rank-based (robust to outliers and non-linearity). Point-biserial handles binary-continuous pairs.

Pearson, Spearman, Kendall correlation
from scipy import stats
import numpy as np

np.random.seed(42)
n = 100

# Pearson: linear relationship
x = np.random.randn(n)
y_linear = 2 * x + np.random.randn(n) * 0.5

# All three coefficients agree when the relationship is genuinely linear.
r_p, p_p = stats.pearsonr(x, y_linear)
r_s, p_s = stats.spearmanr(x, y_linear)
r_k, p_k = stats.kendalltau(x, y_linear)
print("Linear relationship:")
print(f"  Pearson r={r_p:.4f}  Spearman r={r_s:.4f}  Kendall tau={r_k:.4f}")

# Spearman handles monotonic non-linear relationships
y_mono = np.exp(x) + np.random.randn(n) * 0.3
r_p2, _ = stats.pearsonr(x, y_mono)
r_s2, _ = stats.spearmanr(x, y_mono)
print(f"\nMonotonic (non-linear):")
print(f"  Pearson r={r_p2:.4f}  Spearman r={r_s2:.4f}")

# With outliers: rank-based Spearman barely moves, Pearson collapses.
y_outlier = y_linear.copy()
y_outlier[[5, 10, 15]] = 100  # inject outliers
r_p3, _ = stats.pearsonr(x, y_outlier)
r_s3, _ = stats.spearmanr(x, y_outlier)
print(f"\nWith outliers:")
print(f"  Pearson r={r_p3:.4f}  Spearman r={r_s3:.4f}  (Spearman more robust)")
Correlation matrix and partial correlation
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(42)
n = 200
# Confounding variable Z drives both X and Y
z = np.random.randn(n)
x = 0.7 * z + np.random.randn(n) * 0.5
y = 0.8 * z + np.random.randn(n) * 0.5
w = np.random.randn(n)  # unrelated

df = pd.DataFrame({'X': x, 'Y': y, 'Z': z, 'W': w})

print("Correlation matrix:")
print(df.corr(method='pearson').round(3))

# Partial correlation: X-Y controlling for Z
def partial_corr(x, y, z):
    """First-order partial correlation of x and y, controlling for z."""
    corr = lambda a, b: np.corrcoef(a, b)[0, 1]
    r_xz, r_yz, r_xy = corr(x, z), corr(y, z), corr(x, y)
    return (r_xy - r_xz*r_yz) / (np.sqrt(1-r_xz**2) * np.sqrt(1-r_yz**2))

pc = partial_corr(x, y, z)
print(f"\nX-Y raw correlation:     {np.corrcoef(x,y)[0,1]:.4f}")
print(f"X-Y partial (given Z):   {pc:.4f}  (much lower β€” Z was confounding)")
Point-biserial, phi coefficient, and significance testing
from scipy import stats
import numpy as np

np.random.seed(3)
n = 150

# Point-biserial: continuous vs binary
scores = np.random.normal(70, 10, n)
passed = (scores + np.random.randn(n)*5 > 68).astype(int)
r_pb, p_pb = stats.pointbiserialr(passed, scores)
print(f"Point-biserial r={r_pb:.4f}, p={p_pb:.4f}")

# Phi coefficient: binary vs binary
vaccinated = np.random.choice([0,1], n, p=[0.4,0.6])
infected   = np.where(vaccinated==1,
                       np.random.choice([0,1], n, p=[0.9,0.1]),
                       np.random.choice([0,1], n, p=[0.4,0.6]))
# 2x2 contingency table: vaccination status (rows) vs infection (columns).
table = np.array([[((vaccinated==0)&(infected==0)).sum(),
                   ((vaccinated==0)&(infected==1)).sum()],
                  [((vaccinated==1)&(infected==0)).sum(),
                   ((vaccinated==1)&(infected==1)).sum()]])
chi2, p_chi, _, _ = stats.chi2_contingency(table)
phi = np.sqrt(chi2 / n)
print(f"Phi coefficient={phi:.4f}, chi2_p={p_chi:.4f}")

# Testing significance of Pearson r
r = 0.35
n_test = 50
t = r * np.sqrt(n_test-2) / np.sqrt(1-r**2)
p = 2 * stats.t.sf(abs(t), df=n_test-2)
print(f"\nr=0.35, n=50: t={t:.3f}, p={p:.4f}, significant={p<0.05}")
💼 Real-World Scenario
A sports analyst computes Spearman correlations between athlete metrics (speed, strength, endurance, recovery) and competitive performance, then uses partial correlation to remove the confounding effect of age.
Real-World Code
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(10)
n = 80
# Age confounds the physical metrics: speed and endurance decline with age.
age = np.random.uniform(18, 35, n)
speed      = -0.4*age + np.random.randn(n)*2 + 30
strength   =  0.1*age + np.random.randn(n)*3 + 20
endurance  = -0.3*age + np.random.randn(n)*2 + 25
performance= 0.5*speed + 0.3*strength + 0.4*endurance + np.random.randn(n)*2

df = pd.DataFrame({'age':age,'speed':speed,'strength':strength,
                   'endurance':endurance,'performance':performance})

# Rank-based correlations are robust to non-normal metric distributions.
print("Spearman correlations with performance:")
for col in ['age','speed','strength','endurance']:
    r, p = stats.spearmanr(df[col], df['performance'])
    print(f"  {col:12s}: r={r:.3f}, p={p:.4f}")

# Partial: speed-performance controlling for age
r_sp = np.corrcoef(speed, performance)[0,1]
r_sa = np.corrcoef(speed, age)[0,1]
r_pa = np.corrcoef(performance, age)[0,1]
pc = (r_sp - r_sa*r_pa) / (np.sqrt(1-r_sa**2)*np.sqrt(1-r_pa**2))
print(f"\nSpeed-Perf raw r={r_sp:.3f}, partial (ctrl age)={pc:.3f}")
🏋️ Practice: Feature Correlation Screening
Generate 8 features for 300 samples (some correlated, some independent, one binary). (1) Compute and print the full Pearson correlation matrix. (2) Flag all feature pairs with |r| > 0.7 as highly correlated. (3) Compute point-biserial correlation of the binary feature with each continuous feature. (4) Test each correlation for significance.
Starter Code
from scipy import stats
import numpy as np, pandas as pd

np.random.seed(42)
n = 300
# Shared latent factor that induces correlation between several features.
z = np.random.randn(n)
df = pd.DataFrame({
    'f1': z + np.random.randn(n)*0.3,
    'f2': -0.8*z + np.random.randn(n)*0.3,  # correlated with f1
    'f3': np.random.randn(n),
    'f4': np.random.randn(n),
    'f5': 0.9*z + np.random.randn(n)*0.1,   # highly correlated
    'f6': np.random.exponential(2, n),      # skewed, independent
    'f7': np.random.randn(n),
    'binary': (z > 0).astype(int),          # binary feature derived from z
})

# (1) correlation matrix
# (2) flag |r| > 0.7 pairs
# (3) point-biserial for binary vs continuous
# (4) test significance
✅ Practice Checklist
20. Non-Parametric Tests

Non-parametric tests make no assumptions about distributions. Use them when data is ordinal, heavily skewed, or sample sizes are too small to verify normality. Wilcoxon, Mann-Whitney, and Kolmogorov-Smirnov are the workhorses.

Normality tests: Shapiro-Wilk and D'Agostino
from scipy import stats
import numpy as np

np.random.seed(42)

datasets = {
    'Normal':      np.random.normal(50, 10, 50),
    'Skewed':      np.random.exponential(5, 50),
    'Heavy-tailed':np.random.standard_t(df=3, size=50),
    'Uniform':     np.random.uniform(0, 100, 50),
}

print(f"{'Dataset':15s}  {'Shapiro p':>10s}  {'D\'Agostino p':>13s}  {'Normal?'}")
print('-' * 60)
for name, data in datasets.items():
    _, p_sw  = stats.shapiro(data)
    _, p_da  = stats.normaltest(data)
    is_norm  = p_sw > 0.05 and p_da > 0.05
    print(f"{name:15s}  {p_sw:10.4f}  {p_da:13.4f}  {str(is_norm)}")

# Q-Q plot values (manual)
normal_data = datasets['Normal']
theoretical, observed = stats.probplot(normal_data, dist='norm')[0]
r = np.corrcoef(theoretical, observed)[0,1]
print(f"\nQ-Q correlation for Normal: {r:.4f} (close to 1 = normal)")
Wilcoxon signed-rank and Mann-Whitney U
from scipy import stats
import numpy as np

np.random.seed(7)
# Wilcoxon signed-rank: paired non-parametric alternative to paired t-test
before = np.random.exponential(2.0, 30)
after  = before * 0.7 + np.random.exponential(0.3, 30)

stat_w, p_w = stats.wilcoxon(before, after)
stat_t, p_t = stats.ttest_rel(before, after)
print("Paired comparison (before vs after):")
print(f"  Wilcoxon: stat={stat_w:.1f}, p={p_w:.4f}")
print(f"  t-test:   stat={stat_t:.3f}, p={p_t:.4f}")

# Mann-Whitney U: non-parametric two-sample test
group1 = np.random.exponential(3.0, 40)
group2 = np.random.exponential(4.5, 40)

u_stat, p_u = stats.mannwhitneyu(group1, group2, alternative='two-sided')
t_stat, p_t2 = stats.ttest_ind(group1, group2)
print(f"\nTwo-group comparison:")
print(f"  Mann-Whitney: U={u_stat:.1f}, p={p_u:.4f}")
print(f"  t-test:       t={t_stat:.3f}, p={p_t2:.4f}")

# Effect size r = Z / sqrt(N) for Mann-Whitney
n1, n2 = map(len, (group1, group2))
z = stats.norm.ppf(p_u/2)  # approximate z recovered from the two-sided p
r_effect = abs(z) / np.sqrt(n1+n2)
print(f"  Effect size r: {r_effect:.3f}")
Kolmogorov-Smirnov and Anderson-Darling tests
from scipy import stats
import numpy as np

np.random.seed(0)

# One-sample KS: does data come from a specified distribution?
data_normal = np.random.normal(0, 1, 100)
data_exp    = np.random.exponential(1, 100)

for name, data in [('Normal', data_normal), ('Exponential', data_exp)]:
    ks_n, p_n = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
    # scipy's expon is parameterized by (loc, scale) where SCALE is the mean
    # (1/rate), so the fitted scale is data.mean() -- the original passed
    # 1/data.mean(), which is wrong (and can even be negative for the
    # normal sample).
    ks_e, p_e = stats.kstest(data, 'expon', args=(0, data.mean()))
    print(f"{name}: KS vs Normal p={p_n:.4f}, KS vs Expon p={p_e:.4f}")

# Two-sample KS: do two samples come from the same distribution?
sample_a = np.random.normal(0, 1, 100)
sample_b = np.random.normal(0.5, 1, 100)  # slightly shifted
ks2, p2 = stats.ks_2samp(sample_a, sample_b)
print(f"\n2-sample KS: stat={ks2:.4f}, p={p2:.4f}")

# Anderson-Darling (more powerful than KS for normal testing)
result = stats.anderson(data_normal, dist='norm')
print(f"\nAnderson-Darling statistic: {result.statistic:.4f}")
for sl, cv in zip(result.significance_level, result.critical_values):
    print(f"  {sl}% significance level: crit={cv:.4f}, reject={result.statistic > cv}")
💼 Real-World Scenario
A clinical researcher compares pain scores (skewed, ordinal 1-10) between a drug and control group. Shapiro-Wilk confirms non-normality, so Mann-Whitney U is used instead of a t-test.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(42)

def _simulate_scores(scale, n=40):
    """Skewed pain scores clipped to the ordinal 1-10 range."""
    return np.clip(np.random.exponential(scale, n) + 1, 1, 10).round()

# Pain scores: skewed, bounded 1-10 (drug drawn first to fix the RNG stream)
drug_scores    = _simulate_scores(2.5)
control_scores = _simulate_scores(4.0)

# Check the normality assumption before picking a test
p_drug = stats.shapiro(drug_scores)[1]
p_ctrl = stats.shapiro(control_scores)[1]
print(f"Shapiro p-values: drug={p_drug:.4f}, control={p_ctrl:.4f}")
print(f"Use non-parametric: {p_drug < 0.05 or p_ctrl < 0.05}")

# One-sided Mann-Whitney U: are drug scores stochastically smaller?
u, p = stats.mannwhitneyu(drug_scores, control_scores, alternative='less')
print(f"Mann-Whitney: U={u:.1f}, p={p:.4f}")
print(f"Drug reduces pain? {p < 0.05}")
print(f"Median drug={np.median(drug_scores):.1f}, control={np.median(control_scores):.1f}")
🏋️ Practice: Distribution Comparison Pipeline
Write a function compare_groups(a, b) that: (1) Tests normality of both groups with Shapiro-Wilk, (2) If both normal, runs Levene's test for equal variances, then appropriate t-test (Welch or Student), (3) If either non-normal, runs Mann-Whitney U, (4) Returns a dict with test_used, statistic, p_value, significant. Test with normal and non-normal pairs.
Starter Code
from scipy import stats
import numpy as np

def compare_groups(a, b, alpha=0.05):
    """Starter skeleton for an automatic two-group comparison pipeline."""
    result = {}
    # (1) normality check for each group
    p_a = stats.shapiro(a)[1]
    p_b = stats.shapiro(b)[1]
    both_normal = (p_a > alpha) and (p_b > alpha)
    # (2) if normal: Levene then t-test
    # (3) if not normal: Mann-Whitney
    # (4) return dict
    return result

np.random.seed(42)
normal_a = np.random.normal(10, 2, 30)
normal_b = np.random.normal(11, 2, 30)
skewed_a = np.random.exponential(2, 30)
skewed_b = np.random.exponential(3, 30)

print(compare_groups(normal_a, normal_b))
print(compare_groups(skewed_a, skewed_b))
✅ Practice Checklist
21. Bayesian Inference Basics

Bayesian inference updates prior beliefs with observed data to produce a posterior distribution. Beta-Binomial and Normal-Normal conjugates let you compute posteriors analytically. Compare to frequentist interpretation of the same data.

Beta-Binomial: updating conversion rate beliefs
from scipy import stats
import numpy as np

# Prior: Beta(alpha=2, beta=10) — we expect ~17% conversion
prior_a, prior_b = 2, 10

# Observed data: 45 conversions out of 200 visits
obs_conv, obs_total = 45, 200
obs_failures = obs_total - obs_conv

# Posterior: Beta(alpha + successes, beta + failures) [conjugate]
post_a = prior_a + obs_conv
post_b = prior_b + obs_failures

prior     = stats.beta(prior_a, prior_b)
posterior = stats.beta(post_a, post_b)

print(f"Prior:     mean={prior.mean():.3f}, 95% CI: ({prior.ppf(0.025):.3f}, {prior.ppf(0.975):.3f})")
print(f"Posterior: mean={posterior.mean():.3f}, 95% CI: ({posterior.ppf(0.025):.3f}, {posterior.ppf(0.975):.3f})")
print(f"MLE (freq): {obs_conv/obs_total:.3f}")

# Probability that conversion > 25%
prob_above = 1 - posterior.cdf(0.25)
print(f"P(conversion > 25%): {prob_above:.4f}")

# Posterior predictive: expected conversions in the next 100 visits.
# The predictive is Beta-Binomial(n, post_a, post_b); its variance
# n*mu*(1-mu)*(a+b+n)/(a+b+1) includes BOTH the binomial sampling noise
# and the remaining uncertainty about the rate — sqrt(posterior.var()*n)
# alone would badly understate the spread.
n_next  = 100
mu_post = posterior.mean()
pp_mean = mu_post * n_next
pp_var  = n_next * mu_post * (1 - mu_post) * (post_a + post_b + n_next) / (post_a + post_b + 1)
pp_sd   = np.sqrt(pp_var)
print(f"Expected conversions (next 100): {pp_mean:.1f} Β± {pp_sd:.1f}")
Bayesian A/B test: comparing two rates
from scipy import stats
import numpy as np

np.random.seed(42)

# A/B test with conjugate Beta posteriors
conv_a, n_a = 120, 1000   # page A
conv_b, n_b = 145, 1000   # page B

def _flat_prior_posterior(successes, trials):
    """Beta posterior under the non-informative Beta(1,1) = Uniform prior."""
    return stats.beta(1 + successes, 1 + trials - successes)

post_a = _flat_prior_posterior(conv_a, n_a)
post_b = _flat_prior_posterior(conv_b, n_b)

# Monte Carlo estimate P(B > A); draw A first to keep the RNG stream fixed
draws_a = post_a.rvs(100_000)
draws_b = post_b.rvs(100_000)
p_b_wins = (draws_b > draws_a).mean()

# Relative lift of B over A with a 95% credible interval
lift_samples = (draws_b - draws_a) / draws_a
lift_mean = lift_samples.mean()
lift_ci = np.percentile(lift_samples, [2.5, 97.5])

print(f"P(B > A) = {p_b_wins:.4f}")
print(f"Expected lift: {lift_mean:.1%}")
print(f"95% credible interval for lift: ({lift_ci[0]:.1%}, {lift_ci[1]:.1%})")
print(f"Posterior A: mean={post_a.mean():.4f}")
print(f"Posterior B: mean={post_b.mean():.4f}")
Normal-Normal conjugate for mean estimation
from scipy import stats
import numpy as np

np.random.seed(0)
# Estimating the true mean of a process with KNOWN observation noise.
# Prior belief about the mean: Normal(100, 20); measurement sd: 15.

mu0, sigma0 = 100.0, 20.0   # prior
sigma_obs   = 15.0            # known observation noise

# Observe 30 measurements
true_mean = 112.0
data = np.random.normal(true_mean, sigma_obs, 30)
n, x_bar = len(data), data.mean()

# Conjugate update in precision (1/variance) form:
# precisions add; the posterior mean is a precision-weighted average.
precision_prior = sigma0 ** -2
precision_data  = n * sigma_obs ** -2
precision_post  = precision_prior + precision_data

mu_post    = (precision_prior * mu0 + precision_data * x_bar) / precision_post
sigma_post = precision_post ** -0.5

prior_dist = stats.norm(mu0, sigma0)
post_dist  = stats.norm(mu_post, sigma_post)

print(f"Prior:        mean={mu0:.1f},     95% CI: ({prior_dist.ppf(0.025):.1f}, {prior_dist.ppf(0.975):.1f})")
print(f"Likelihood:   mean={x_bar:.2f},  (n={n} obs, noise={sigma_obs})")
print(f"Posterior:    mean={mu_post:.2f}, 95% CI: ({post_dist.ppf(0.025):.2f}, {post_dist.ppf(0.975):.2f})")
print(f"True mean:    {true_mean}")
print(f"Posterior pulls toward prior when data is limited.")
💼 Real-World Scenario
An e-commerce platform uses Bayesian A/B testing for a checkout redesign. Instead of waiting for p<0.05, they monitor the posterior probability that variant B outperforms A and stop when P(B>A) > 0.95.
Real-World Code
from scipy import stats
import numpy as np

np.random.seed(42)

# Sequential Bayesian A/B test: stop as soon as P(B > A) clears 0.95.
true_rate_a, true_rate_b = 0.10, 0.13
n_per_day = 50

# Beta(1,1) priors tracked via their alpha/beta counts; the frozen
# distributions are rebuilt from these counts each day, so there is no
# need to pre-build stats.beta(1, 1) objects up front.
alpha_a, beta_a = 1, 1
alpha_b, beta_b = 1, 1

print(f"{'Day':>5s}  {'n_a':>6s}  {'n_b':>6s}  {'Rate A':>8s}  {'Rate B':>8s}  {'P(B>A)':>8s}")
for day in range(1, 31):
    # New daily data
    conv_a = np.random.binomial(n_per_day, true_rate_a)
    conv_b = np.random.binomial(n_per_day, true_rate_b)
    alpha_a += conv_a; beta_a += n_per_day - conv_a
    alpha_b += conv_b; beta_b += n_per_day - conv_b

    post_a = stats.beta(alpha_a, beta_a)
    post_b = stats.beta(alpha_b, beta_b)
    # Monte Carlo tail probability; 50k draws keeps MC error around 0.002
    p_b_wins = (post_b.rvs(50000) > post_a.rvs(50000)).mean()
    n_total = (alpha_a + beta_a - 2) + (alpha_b + beta_b - 2)

    if day % 5 == 0 or p_b_wins > 0.95:
        print(f"{day:5d}  {n_total//2:6d}  {n_total//2:6d}  {post_a.mean():8.4f}  {post_b.mean():8.4f}  {p_b_wins:8.4f}")
    if p_b_wins > 0.95:
        print(f"  --> Stopping: P(B>A)={p_b_wins:.4f} > 0.95 on day {day}")
        break
🏋️ Practice: Bayesian Coin Fairness
A coin is flipped 100 times with 62 heads. (1) Start with a uniform prior Beta(1,1). (2) Update sequentially after every 10 flips. (3) Print posterior mean and 95% credible interval at each step. (4) Compare to the frequentist 95% CI at each step. (5) Compute P(p > 0.5) from the final posterior.
Starter Code
from scipy import stats
import numpy as np

np.random.seed(0)
n_flips, n_heads = 100, 62
# 62 heads (1) followed by 38 tails (0), then shuffled into a random order
flips = np.repeat([1, 0], [n_heads, n_flips - n_heads])
np.random.shuffle(flips)

a, b = 1, 1  # prior Beta(1,1)

print(f"{'Step':>6s}  {'Heads':>6s}  {'n':>5s}  {'Post mean':>10s}  {'95% Credible':>20s}  {'Freq 95% CI':>20s}")
for step in range(10, n_flips + 1, 10):
    chunk = flips[:step]
    h = chunk.sum()
    n = len(chunk)
    a_post = a + h
    b_post = b + n - h
    post = stats.beta(a_post, b_post)
    # TODO: compute posterior 95% CI from post
    # TODO: compute frequentist 95% CI using stats.proportion_confint or normal approx
    # TODO: print row
    pass

# (5) P(p > 0.5) from final posterior
22. Effect Size & Statistical Power

Effect size quantifies the magnitude of a difference independently of sample size. Cohen's d, r, eta-squared measure effect for different tests. Power analysis determines the sample size needed to detect a given effect.

Cohen's d and r for effect size
from scipy import stats
import numpy as np

def cohens_d(group1, group2):
    """Pooled-SD Cohen's d for two independent samples.

    The (n-1)-weighted pooling formula assumes unbiased *sample*
    variances, so std(ddof=1) is required; numpy's default
    population std (ddof=0) would systematically inflate d,
    most noticeably for small n.
    """
    n1, n2 = len(group1), len(group2)
    s_pooled = np.sqrt(((n1-1)*group1.std(ddof=1)**2 + (n2-1)*group2.std(ddof=1)**2) / (n1+n2-2))
    return (group1.mean() - group2.mean()) / s_pooled

np.random.seed(42)
thresholds = [('Small', 0.2), ('Medium', 0.5), ('Large', 0.8)]

print(f"{'Effect':8s}  {'Cohen d':>8s}  {'Interpretation':>15s}  {'p-value':>8s}")
print('-' * 55)
for label, target_d in thresholds:
    a = np.random.normal(0, 1, 50)
    b = np.random.normal(target_d, 1, 50)
    d = cohens_d(b, a)
    _, p = stats.ttest_ind(a, b)
    print(f"{label:8s}  {d:8.3f}  {label:>15s}  {p:8.4f}")

# r from t-statistic
t_stat = 3.2
n_total = 80
r = np.sqrt(t_stat**2 / (t_stat**2 + n_total - 2))
print(f"\nt={t_stat}, n={n_total} -> r effect size = {r:.3f}")

# Eta-squared from F (ANOVA)
F, df_between, df_within = 8.5, 2, 60
eta2 = (F * df_between) / (F * df_between + df_within)
print(f"F={F}, df_b={df_between}, df_w={df_within} -> etaΒ²={eta2:.3f}")
Sample size calculation for t-test
from scipy import stats
import numpy as np

def sample_size_ttest(effect_size_d, alpha=0.05, power=0.80, tail=2):
    """Per-group n for a two-sample t-test via the normal approximation."""
    crit_sum = stats.norm.ppf(1 - alpha / tail) + stats.norm.ppf(power)
    return int(np.ceil(2 * (crit_sum / effect_size_d) ** 2))

# Tabulate required n per group across (alpha, power) scenarios
print("Required sample size per group (two-sample t-test):")
print(f"{'Effect d':>9s}  {'Ξ±=0.05,80%':>12s}  {'Ξ±=0.05,90%':>12s}  {'Ξ±=0.01,80%':>12s}")
print('-' * 52)
scenarios = [(0.05, 0.80), (0.05, 0.90), (0.01, 0.80)]
for d in [0.2, 0.3, 0.5, 0.8, 1.0]:
    n1, n2, n3 = (sample_size_ttest(d, a, p) for a, p in scenarios)
    print(f"{d:9.1f}  {n1:12d}  {n2:12d}  {n3:12d}")

# Cross-check the normal approximation against statsmodels' solver
from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
n = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.80)
print(f"\nstatsmodels: d=0.5, Ξ±=0.05, power=0.80 -> n={n:.1f} per group")

# Achieved power for given n and effect
power = analysis.solve_power(effect_size=0.3, alpha=0.05, nobs1=100)
print(f"d=0.3, n=100 per group -> achieved power={power:.3f}")
Power curves and minimum detectable effect
from statsmodels.stats.power import TTestIndPower, ChiSquarePower
import numpy as np

analysis = TTestIndPower()

# Power curve: power vs sample size, one column per effect size
effect_sizes = [0.2, 0.3, 0.5, 0.8]

print("Power by sample size and effect size:")
print(f"{'n/grp':>7s}" + "".join(f"  d={d}" for d in effect_sizes))
for n in [20, 50, 100, 200, 500]:
    row = "".join(
        f"  {analysis.solve_power(effect_size=d, alpha=0.05, nobs1=n):.3f}"
        for d in effect_sizes
    )
    print(f"{n:7d}" + row)

# Minimum detectable effect (MDE) given n and desired power
mde = analysis.solve_power(alpha=0.05, power=0.80, nobs1=200)
print(f"\nMDE (d) for n=200/grp, power=80%: {mde:.3f}")

# Chi-square power
chi_power = ChiSquarePower()
n_chi = chi_power.solve_power(effect_size=0.3, alpha=0.05, power=0.80, k_bins=3)
print(f"Chi-square: effect=0.3, k=3, power=80% -> n={n_chi:.1f}")
💼 Real-World Scenario
A product team wants to detect a 10% improvement in task completion rate (from 60% to 66%). They compute required sample size for 80% and 90% power, then run the test and report Cohen's h effect size.
Real-World Code
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats
import numpy as np

# Rate improvement: 60% -> 66%
rate_a, rate_b = 0.60, 0.66

# Cohen's h for proportions (arcsine-sqrt transform stabilizes the variance)
h = 2 * np.arcsin(np.sqrt(rate_b)) - 2 * np.arcsin(np.sqrt(rate_a))
print(f"Cohen's h = {h:.4f}")

# Required sample size per group for several power targets
analysis = NormalIndPower()
for power in [0.80, 0.90, 0.95]:
    n = analysis.solve_power(effect_size=h, alpha=0.05, power=power)
    print(f"Power={power:.0%}: n={n:.0f} per group (total {2*n:.0f})")

# Simulate the test
np.random.seed(42)
n_per = int(analysis.solve_power(effect_size=h, alpha=0.05, power=0.80)) + 1
conv_a = np.random.binomial(n_per, rate_a)
conv_b = np.random.binomial(n_per, rate_b)
# proportions_ztest lives in statsmodels.stats.proportion — scipy.stats has
# no such function, so the previous stats.proportions_ztest raised
# AttributeError at runtime.
_, p = proportions_ztest([conv_a, conv_b], [n_per, n_per])
print(f"\nSimulated test: A={conv_a/n_per:.3f}, B={conv_b/n_per:.3f}, p={p:.4f}")
🏋️ Practice: Power Analysis Dashboard
For a two-sample t-test: (1) Compute required n for all combinations of effect_size in [0.2,0.5,0.8] and power in [0.70,0.80,0.90] at alpha=0.05. Print as a grid. (2) For n=150 per group and alpha=0.05, plot (print) a table of achieved power for effect sizes 0.1 to 0.8 in steps of 0.1. (3) Find the MDE for n=300 at 80% power.
Starter Code
from statsmodels.stats.power import TTestIndPower
import numpy as np

analysis = TTestIndPower()

# (1) n grid
print("Required n per group:")
effects = [0.2, 0.5, 0.8]
powers  = [0.70, 0.80, 0.90]
# TODO: print header and rows

# (2) power table for n=150
print("\nAchieved power for n=150 per group:")
# TODO

# (3) MDE
mde = analysis.solve_power(alpha=0.05, power=0.80, nobs1=300)
print(f"\nMDE for n=300: d={mde:.4f}")
23. Multiple Testing & FDR Corrections

Running many tests inflates the false positive rate. Bonferroni controls family-wise error rate (conservative). Benjamini-Hochberg (BH) controls the false discovery rate β€” preferred for exploratory studies with many hypotheses.

Bonferroni and Holm-Bonferroni corrections
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(42)
n_tests = 20
n_per   = 50

# 20 tests: 3 truly different (true positives), 17 null
true_effect = [0, 0, 0, 0.8, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0]
raw_p = []
for d in true_effect:
    a = np.random.normal(0, 1, n_per)
    b = np.random.normal(d, 1, n_per)
    raw_p.append(stats.ttest_ind(a, b)[1])

effects = np.array(true_effect)
print(f"{'Method':12s}  {'Rejections':>11s}  {'True Pos':>9s}  {'False Pos':>10s}")
for method in ['bonferroni', 'holm', 'fdr_bh', 'fdr_by']:
    reject = multipletests(raw_p, alpha=0.05, method=method)[0]
    n_reject  = reject.sum()
    true_pos  = int((reject & (effects != 0)).sum())
    false_pos = int((reject & (effects == 0)).sum())
    print(f"{method:12s}  {n_reject:11d}  {true_pos:9d}  {false_pos:10d}")

print(f"\nUncorrected (p<0.05): {sum(p < 0.05 for p in raw_p)} rejections")
Benjamini-Hochberg step-by-step
from scipy import stats
import numpy as np

np.random.seed(1)
n_hypotheses = 15
alpha = 0.05

# Simulate p-values (mix of true positives and nulls)
true_effects = [0]*10 + [0.7, 0.9, 0.5, 1.2, 0.6]
raw_p = []
for d in true_effects:
    a = np.random.normal(0, 1, 40)
    b = np.random.normal(d, 1, 40)
    raw_p.append(stats.ttest_ind(a, b)[1])

# BH procedure manually: sort p-values, compare each to the ramp alpha*k/m
order    = np.argsort(raw_p)
sorted_p = np.asarray(raw_p)[order]

print(f"BH procedure (alpha={alpha}, m={n_hypotheses} tests):")
print(f"{'Rank':>5s}  {'p-value':>8s}  {'BH thresh':>10s}  {'Reject?'}")
bh_threshold = alpha * np.arange(1, n_hypotheses + 1) / n_hypotheses
last_reject  = -1
for rank, (p, thr) in enumerate(zip(sorted_p, bh_threshold), 1):
    if p <= thr:
        last_reject = rank  # largest rank whose p clears its threshold

for rank, (p, thr) in enumerate(zip(sorted_p[:8], bh_threshold[:8]), 1):
    print(f"{rank:5d}  {p:8.4f}  {thr:10.4f}  {rank <= last_reject}")
print(f"  ... ({n_hypotheses} total)")
print(f"\nAll hypotheses with rank <= {last_reject} are rejected ({last_reject} total)")
q-values and FDR estimation
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(7)
m = 1000   # large-scale testing (genomics-like)
n_true = 50

# Generate p-values: 50 true effects + 950 nulls
p_true = np.array([stats.ttest_ind(
    np.random.normal(0,1,30), np.random.normal(0.6,1,30))[1]
    for _ in range(n_true)])
p_null = np.random.uniform(0, 1, m - n_true)  # null p-values ~ Uniform
all_p  = np.concatenate([p_true, p_null])
true_labels = np.array([True]*n_true + [False]*(m-n_true))

# Compare FWER control (Bonferroni) against FDR control (BH)
results = {}
for method in ('bonferroni', 'fdr_bh'):
    reject = multipletests(all_p, alpha=0.05, method=method)[0]
    tp = (reject & true_labels).sum()
    fp = (reject & ~true_labels).sum()
    fn = (~reject & true_labels).sum()
    fdr = fp / max(reject.sum(), 1)
    power = tp / n_true
    results[method] = (reject.sum(), tp, fp, fdr, power)
    print(f"{method:12s}: rejections={reject.sum():4d}  TP={tp:3d}  FP={fp:3d}  FDR={fdr:.3f}  Power={power:.3f}")

print(f"\nSummary: Bonferroni is conservative; BH balances FDR and power")
💼 Real-World Scenario
A genomics researcher tests differential expression for 5000 genes. Raw analysis finds 450 significant genes (p<0.05); after BH correction at FDR=0.05, only 180 remain β€” avoiding a list of mostly false discoveries.
Real-World Code
from statsmodels.stats.multitest import multipletests
from scipy import stats
import numpy as np

np.random.seed(42)
n_genes = 5000
n_de    = 150  # truly differentially expressed

# Group A and B expression levels (genes x samples)
expr_a = np.random.normal(0, 1, (n_genes, 30))
expr_b = np.random.normal(0, 1, (n_genes, 30))
# Inject true effects into the first n_de genes of group B
effects = np.random.normal(0.8, 0.3, n_de)
expr_b[:n_de] += effects[:, np.newaxis]

# Per-gene two-sample t-tests
p_vals = np.array([stats.ttest_ind(row_a, row_b)[1]
                   for row_a, row_b in zip(expr_a, expr_b)])

for method in ['none', 'bonferroni', 'fdr_bh']:
    if method == 'none':
        reject, adj_p = p_vals < 0.05, p_vals
    else:
        reject, adj_p = multipletests(p_vals, alpha=0.05, method=method)[:2]

    tp = reject[:n_de].sum()
    fp = reject[n_de:].sum()
    print(f"{method:12s}: significant={reject.sum():5d}  TP={tp:4d}  FP={fp:4d}  FDR={fp/max(reject.sum(),1):.3f}")
🏋️ Practice: Multi-Metric A/B Correction
You ran an A/B test measuring 12 metrics simultaneously (CTR, bounce rate, time on page, conversions, etc.). Generate synthetic p-values for each metric (mix of some truly different and some null). Apply Bonferroni, Holm, BH corrections. For each method, print which metrics are significant. Discuss when you'd prefer BH over Bonferroni.
Starter Code
from statsmodels.stats.multitest import multipletests
import numpy as np
from scipy import stats

np.random.seed(42)
metrics = ['CTR','BounceRate','TimeOnPage','Conversions','PageViews',
           'ScrollDepth','CartAdds','Checkout','Revenue','SignUps','ReturnVisits','NPS']

# Simulate p-values: some metrics truly affected, most not
true_effects = [0.4, 0, 0, 0.6, 0, 0, 0.3, 0.5, 0, 0, 0, 0]
p_values = []
for d in true_effects:
    control = np.random.normal(0, 1, 200)
    variant = np.random.normal(d, 1, 200)
    p_values.append(stats.ttest_ind(control, variant)[1])

# TODO: apply all three corrections
# TODO: print table showing which metrics are significant under each method
# TODO: comment on when to prefer BH vs Bonferroni
24. Bootstrap & Permutation Tests

Bootstrap resampling estimates confidence intervals for any statistic without distributional assumptions. Permutation tests provide exact p-values for two-sample comparisons by shuffling group labels.

Bootstrap confidence intervals
import numpy as np
from scipy import stats

np.random.seed(42)
data = np.random.exponential(scale=5, size=50)  # skewed data

def bootstrap_ci(data, statistic, n_boot=5000, ci=0.95):
    """Percentile-bootstrap confidence interval for an arbitrary statistic."""
    resampled = [statistic(np.random.choice(data, len(data), replace=True))
                 for _ in range(n_boot)]
    tail = (1 - ci) / 2
    return np.percentile(resampled, [tail * 100, (1 - tail) * 100]), np.array(resampled)

# Bootstrap CI for mean, median, and std (analytical CIs exist only for some)
for stat_name, fn in [('mean', np.mean), ('median', np.median), ('std', np.std)]:
    ci_boot, boots = bootstrap_ci(data, fn)
    print(f"{stat_name:7s}: obs={fn(data):.3f}  boot_CI=({ci_boot[0]:.3f}, {ci_boot[1]:.3f})")

# Analytical CI for mean (t-interval)
ci_t = stats.t.interval(0.95, df=len(data)-1, loc=data.mean(), scale=stats.sem(data))
print(f"\nt-interval CI for mean: ({ci_t[0]:.3f}, {ci_t[1]:.3f})")
print(f"Boot CI for mean: above  (similar for large n, more robust for skewed)")
Bootstrap for regression coefficients
import numpy as np
from scipy import stats

np.random.seed(0)
n = 80
x = np.random.randn(n)
y = 2.5 * x + 1.0 + np.random.randn(n) * 1.5

def ols(x, y):
    """Least-squares slope and intercept via scipy's linregress."""
    fit = stats.linregress(x, y)
    return fit.slope, fit.intercept

# Bootstrap CI for the slope: resample (x, y) PAIRS with replacement
n_boot = 5000
slopes = []
for _ in range(n_boot):
    idx = np.random.choice(n, n, replace=True)
    slopes.append(ols(x[idx], y[idx])[0])

slopes = np.array(slopes)
ci_slope = np.percentile(slopes, [2.5, 97.5])
obs_slope, obs_int = ols(x, y)

print(f"Observed slope:      {obs_slope:.4f}")
print(f"Bootstrap CI slope:  ({ci_slope[0]:.4f}, {ci_slope[1]:.4f})")

# Analytical CI from linregress
result = stats.linregress(x, y)
t_crit = stats.t.ppf(0.975, df=n-2)
se_slope = result.stderr
print(f"Analytical CI slope: ({obs_slope - t_crit*se_slope:.4f}, {obs_slope + t_crit*se_slope:.4f})")
print(f"Bootstrap SE of slope: {slopes.std():.4f}")
Permutation test for two-sample comparison
import numpy as np
from scipy import stats

np.random.seed(42)

def permutation_test(a, b, n_perm=10000, stat='mean_diff'):
    """Two-sided permutation test for a difference in group means."""
    obs_stat = a.mean() - b.mean()
    pooled = np.concatenate([a, b])
    split = len(a)
    perm_stats = np.empty(n_perm)
    for i in range(n_perm):
        mixed = np.random.permutation(pooled)
        perm_stats[i] = mixed[:split].mean() - mixed[split:].mean()
    p_val = (np.abs(perm_stats) >= np.abs(obs_stat)).mean()
    return obs_stat, p_val, perm_stats

# Test on small sample with non-normal data
group_a = np.random.exponential(3, 25)
group_b = np.random.exponential(4, 25)

obs, p_perm, perms = permutation_test(group_a, group_b)
p_mann = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')[1]
p_t    = stats.ttest_ind(group_a, group_b)[1]

print(f"Observed mean diff: {obs:.3f}")
print(f"Permutation p-val:  {p_perm:.4f}")
print(f"Mann-Whitney p-val: {p_mann:.4f}")
print(f"t-test p-val:       {p_t:.4f}")
print(f"Percentile of obs in perm dist: {(perms < obs).mean():.3f}")
💼 Real-World Scenario
A data scientist bootstraps the median response time for two API versions (non-normal, heavy-tailed). The bootstrap CI for the difference excludes 0, providing strong evidence version B is faster.
Real-World Code
import numpy as np
from scipy import stats

np.random.seed(99)
# API response times (heavy-tailed)
version_a = np.random.pareto(2.0, 200) * 100 + 50
version_b = np.random.pareto(2.5, 200) * 80  + 45

# Bootstrap the difference in medians (A - B)
n_boot = 10000
median_diffs = []
for _ in range(n_boot):
    resamp_a = np.random.choice(version_a, len(version_a), replace=True)
    resamp_b = np.random.choice(version_b, len(version_b), replace=True)
    median_diffs.append(np.median(resamp_a) - np.median(resamp_b))

ci = np.percentile(median_diffs, [2.5, 97.5])
obs_diff = np.median(version_a) - np.median(version_b)

print(f"Median A: {np.median(version_a):.2f}ms")
print(f"Median B: {np.median(version_b):.2f}ms")
print(f"Observed difference: {obs_diff:.2f}ms")
print(f"Bootstrap 95% CI: ({ci[0]:.2f}, {ci[1]:.2f})ms")
print(f"CI excludes 0: {ci[0] > 0 or ci[1] < 0} -> B is {'faster' if obs_diff > 0 else 'slower'}")
🏋️ Practice: Bootstrap vs Analytical CI Comparison
Generate 3 datasets: (1) Normal n=30, (2) Exponential n=30, (3) Heavy-tailed Pareto n=30. For each: compute bootstrap 95% CI for the mean (5000 resamples), the analytical t-interval, and the bootstrap CI for the median. Print all three side-by-side. Observe how they differ for non-normal data.
Starter Code
import numpy as np
from scipy import stats

np.random.seed(42)
# Three shapes of data: symmetric, right-skewed, and heavy-tailed
datasets = {
    'Normal':    np.random.normal(10, 3, 30),
    'Exponential': np.random.exponential(5, 30),
    'Pareto':    np.random.pareto(1.5, 30) * 5 + 5,
}

def bootstrap_ci(data, fn, n_boot=5000):
    """95% percentile-bootstrap CI for the statistic fn."""
    reps = [fn(np.random.choice(data, len(data), replace=True)) for _ in range(n_boot)]
    return np.percentile(reps, [2.5, 97.5])

for name, data in datasets.items():
    boot_mean   = bootstrap_ci(data, np.mean)
    t_ci        = stats.t.interval(0.95, df=len(data)-1, loc=data.mean(), scale=stats.sem(data))
    boot_median = bootstrap_ci(data, np.median)
    print(f"{name}:")
    print(f"  Bootstrap mean CI:    ({boot_mean[0]:.3f}, {boot_mean[1]:.3f})")
    # TODO: print t-interval and bootstrap median CI