Confidence Intervals in SimPy: Quantifying Uncertainty
A point estimate without a confidence interval is a guess with false precision. Here's how to do it properly.
What's a Confidence Interval?
A 95% confidence interval means: if you ran this analysis 100 times with different random seeds, about 95 of those intervals would contain the true mean.
It's not "95% probability the true value is in this range." That's a common misconception.
Basic Calculation
import numpy as np
from scipy import stats
def confidence_interval(data, confidence=0.95):
    """Compute a two-sided confidence interval for the mean of *data*.

    Uses the Student's t distribution, appropriate for the small sample
    sizes typical of simulation replications.

    Args:
        data: Sequence of replication results (at least 2 values).
        confidence: Confidence level in (0, 1); default 0.95.

    Returns:
        dict with keys 'mean', 'lower', 'upper', 'margin' (half-width)
        and 'n' (sample size).

    Raises:
        ValueError: if fewer than two observations are supplied (the
            standard error is undefined for n < 2), or if confidence
            is not strictly between 0 and 1.
    """
    n = len(data)
    if n < 2:
        raise ValueError("Need at least 2 observations to compute a CI")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be in (0, 1)")
    mean = np.mean(data)
    se = stats.sem(data)  # standard error of the mean (uses ddof=1)
    # Two-sided critical value from the t distribution with n-1 dof.
    t_value = stats.t.ppf((1 + confidence) / 2, df=n - 1)
    margin = t_value * se
    return {
        'mean': mean,
        'lower': mean - margin,
        'upper': mean + margin,
        'margin': margin,
        'n': n,
    }
# Example: summarise ten independent replication results.
results = [4.2, 5.1, 3.8, 4.5, 5.2, 4.1, 4.8, 3.9, 4.6, 5.0]
ci = confidence_interval(results)
mean_line = f"Mean: {ci['mean']:.2f}"
interval_line = f"95% CI: [{ci['lower']:.2f}, {ci['upper']:.2f}]"
print(mean_line)
print(interval_line)
From Multiple Replications
The standard workflow:
def run_replication(seed):
    """Run one simulation replication, return mean wait time."""
    # Seed the global RNG so each replication is reproducible and
    # distinct replications can use distinct seeds.
    # NOTE(review): assumes `import random` and `import simpy` appear at
    # the top of the full script — only numpy/scipy are imported in the
    # excerpt shown here; confirm.
    random.seed(seed)
    env = simpy.Environment()
    # ... run simulation
    # (illustrative stub: the real model would compute mean_wait_time
    # from collected statistics before returning it)
    return mean_wait_time
# Thirty independent replications, one seed per run (42..71).
n_reps = 30
results = [run_replication(seed) for seed in range(42, 42 + n_reps)]
# Summarise with a confidence interval
ci = confidence_interval(results)
print(f"Mean wait time: {ci['mean']:.2f} ± {ci['margin']:.2f}")
Choosing Sample Size
How many replications for a target half-width?
def required_sample_size(pilot_data, target_half_width, confidence=0.95):
    """Estimate replications needed to hit a target CI half-width.

    Uses the pilot study's sample standard deviation: a normal
    approximation gives an initial guess, then one refinement pass uses
    the t distribution at the implied degrees of freedom (which widens
    the requirement for small n).

    Args:
        pilot_data: Results from a pilot study (at least 2 values).
        target_half_width: Desired CI half-width (> 0), in the metric's units.
        confidence: Confidence level in (0, 1); default 0.95.

    Returns:
        int: Estimated number of replications required (rounded up).

    Raises:
        ValueError: on fewer than two pilot observations or a
            non-positive target half-width.
    """
    n_pilot = len(pilot_data)
    if n_pilot < 2:
        raise ValueError("Need at least 2 pilot observations")
    if target_half_width <= 0:
        raise ValueError("target_half_width must be positive")
    s = np.std(pilot_data, ddof=1)  # sample standard deviation
    # Initial estimate using the normal approximation
    z = stats.norm.ppf((1 + confidence) / 2)
    n_estimate = (z * s / target_half_width) ** 2
    # Refine using the t-distribution at the implied degrees of freedom
    t = stats.t.ppf((1 + confidence) / 2, df=max(1, int(n_estimate) - 1))
    n_required = (t * s / target_half_width) ** 2
    return int(np.ceil(n_required))
# Pilot study: ten quick replications to gauge variability.
pilot = [run_replication(seed) for seed in range(42, 52)]
needed = required_sample_size(pilot, target_half_width=0.5)
print(f"Need {needed} replications for half-width of 0.5")
Relative Precision
Sometimes you want a percentage precision:
def required_for_relative_precision(pilot_data, target_relative_error, confidence=0.95):
    """Sample size for a target relative half-width (e.g., 5% of mean).

    Args:
        pilot_data: Pilot results (at least 2 values, non-zero mean —
            the coefficient of variation is undefined for a zero mean).
        target_relative_error: Desired half-width as a fraction of the
            mean (> 0), e.g. 0.05 for 5%.
        confidence: Confidence level in (0, 1); default 0.95.

    Returns:
        int: Estimated number of replications required (rounded up).

    Raises:
        ValueError: on fewer than two pilot observations, a non-positive
            target, or a zero pilot mean.
    """
    if len(pilot_data) < 2:
        raise ValueError("Need at least 2 pilot observations")
    if target_relative_error <= 0:
        raise ValueError("target_relative_error must be positive")
    mean = np.mean(pilot_data)
    if mean == 0:
        raise ValueError("Relative precision is undefined for a zero mean")
    cv = np.std(pilot_data, ddof=1) / mean  # Coefficient of variation
    # Normal approximation is adequate here: the implied n is usually large.
    z = stats.norm.ppf((1 + confidence) / 2)
    n_required = (z * cv / target_relative_error) ** 2
    return int(np.ceil(n_required))
# Target: CI half-width no more than 5% of the mean.
needed = required_for_relative_precision(pilot, target_relative_error=0.05)
Comparing Two Scenarios
Paired comparison with confidence interval:
def compare_scenarios(results_a, results_b, confidence=0.95):
    """
    Paired comparison of two scenarios.
    Returns CI for the difference (A - B).

    The inputs must come from replications paired by seed (common random
    numbers), in the same order, so per-replication differences are
    meaningful.

    Args:
        results_a, results_b: Equal-length sequences of paired results.
        confidence: Confidence level for the interval on the difference.

    Returns:
        dict with 'mean_diff', 'ci_lower', 'ci_upper' and 'significant'
        (True when the CI excludes zero).

    Raises:
        ValueError: if the two result sequences differ in length.
    """
    if len(results_a) != len(results_b):
        raise ValueError("Paired comparison requires equally many results per scenario")
    differences = np.array(results_a) - np.array(results_b)
    ci = confidence_interval(differences, confidence)
    # Significant when the CI lies entirely on one side of zero.
    return {
        'mean_diff': ci['mean'],
        'ci_lower': ci['lower'],
        'ci_upper': ci['upper'],
        'significant': (ci['lower'] > 0) or (ci['upper'] < 0),
    }
# Pair the scenarios by reusing the same seeds (common random numbers).
seeds = range(30)
results_2_servers = [run_replication(42 + s, servers=2) for s in seeds]
results_3_servers = [run_replication(42 + s, servers=3) for s in seeds]
comparison = compare_scenarios(results_2_servers, results_3_servers)
print(f"Difference: {comparison['mean_diff']:.2f}")
print(f"95% CI: [{comparison['ci_lower']:.2f}, {comparison['ci_upper']:.2f}]")
print(f"Significant: {comparison['significant']}")
Multiple Metrics
class SimulationResults:
    """Accumulates per-replication metric dicts and summarises them with CIs."""

    def __init__(self):
        # One dict of metrics per replication, in insertion order.
        self.replications = []

    def add_replication(self, metrics):
        """Record one replication's metrics.

        metrics = {'wait': 4.2, 'util': 0.85, 'throughput': 12.5}
        Every replication is expected to report the same metric keys.
        """
        self.replications.append(metrics)

    def confidence_intervals(self, confidence=0.95):
        """Calculate CIs for all metrics.

        Returns a dict mapping metric name -> confidence_interval()
        result. Returns {} when no replications have been recorded
        (previously this raised IndexError on replications[0]).
        """
        if not self.replications:
            return {}
        results = {}
        # The first replication's keys define the canonical metric set.
        metrics = self.replications[0].keys()
        for metric in metrics:
            values = [r[metric] for r in self.replications]
            results[metric] = confidence_interval(values, confidence)
        return results
# Usage: thirty replications, each reporting several metrics.
results = SimulationResults()
for rep in range(30):
    results.add_replication(run_replication_full(42 + rep))
cis = results.confidence_intervals()
for metric, ci in cis.items():
    print(f"{metric}: {ci['mean']:.2f} [{ci['lower']:.2f}, {ci['upper']:.2f}]")
Visualising Confidence Intervals
import matplotlib.pyplot as plt
def plot_ci_comparison(scenarios):
    """Bar chart of scenario means with asymmetric CI error bars.

    scenarios = {
        'Baseline': {'mean': 5.2, 'lower': 4.8, 'upper': 5.6},
        'Improved': {'mean': 3.1, 'lower': 2.9, 'upper': 3.3}
    }
    """
    names = []
    means = []
    below = []  # distance from mean down to the lower bound
    above = []  # distance from mean up to the upper bound
    for name, summary in scenarios.items():
        names.append(name)
        means.append(summary['mean'])
        below.append(summary['mean'] - summary['lower'])
        above.append(summary['upper'] - summary['mean'])
    # matplotlib expects yerr as a (2, n) array: row 0 below, row 1 above.
    errors = np.array([below, above])
    plt.figure(figsize=(8, 5))
    plt.bar(names, means, yerr=errors, capsize=10,
            color='steelblue', alpha=0.7)
    plt.ylabel('Mean Wait Time')
    plt.title('Scenario Comparison with 95% Confidence Intervals')
    plt.tight_layout()
    plt.show()
Bootstrap Confidence Intervals
When normality assumptions don't hold:
def bootstrap_ci(data, n_bootstrap=10000, confidence=0.95, seed=None):
    """Percentile bootstrap confidence interval for the mean.

    Useful when the normality assumption behind the t interval is
    suspect (skewed metrics, small samples).

    Args:
        data: Sequence of observations (non-empty).
        n_bootstrap: Number of resamples; more gives a smoother estimate.
        confidence: Confidence level in (0, 1); default 0.95.
        seed: Optional seed for reproducible resampling. None (the
            default) uses NumPy's global RNG state, as before.

    Returns:
        dict with 'mean', 'lower', 'upper' and 'n' (sample size) — the
        'n' key keeps the result compatible with reporting helpers that
        expect confidence_interval()-shaped dicts.

    Raises:
        ValueError: if data is empty.
    """
    arr = np.asarray(data)
    if arr.size == 0:
        raise ValueError("Need at least one observation to bootstrap")
    rng = np.random.default_rng(seed) if seed is not None else np.random
    # Draw all resamples at once: each row is one bootstrap sample.
    samples = rng.choice(arr, size=(n_bootstrap, arr.size), replace=True)
    bootstrapped_means = samples.mean(axis=1)
    alpha = (1 - confidence) / 2
    lower = np.percentile(bootstrapped_means, alpha * 100)
    upper = np.percentile(bootstrapped_means, (1 - alpha) * 100)
    return {
        'mean': arr.mean(),
        'lower': lower,
        'upper': upper,
        'n': arr.size,
    }
Reporting Results
Good reporting states the point estimate, the interval bounds, the confidence level, and the sample size — a helper makes this consistent:
def format_result(ci, metric_name, decimals=2):
    """Render a confidence-interval dict as a one-line report string."""
    # Format each number once, to the requested number of decimals.
    mean_txt = format(ci['mean'], f".{decimals}f")
    lower_txt = format(ci['lower'], f".{decimals}f")
    upper_txt = format(ci['upper'], f".{decimals}f")
    return f"{metric_name}: {mean_txt} (95% CI: {lower_txt} - {upper_txt}, n={ci['n']})"
# Example output:
# Mean wait time: 4.52 (95% CI: 4.21 - 4.83, n=30)
Common Mistakes
Too few replications:
# Bad: CI is meaninglessly wide
# (with n=3 the t critical value is already ~4.3, so even modest
# variance produces an interval too wide to act on)
results = [run_replication(i) for i in range(3)]
# Good: CI has reasonable precision
# (30 replications is a common rule-of-thumb starting point; use a
# pilot study to size it properly)
results = [run_replication(i) for i in range(30)]
Ignoring warm-up:
# Bad: includes initialisation bias
# (observations collected while the system fills from empty drag the
# estimate away from steady-state behaviour)
mean_wait = simulation_run()
# Good: excludes warm-up period
# (statistics collection starts only after the warm-up; here 100 —
# presumably model time units, confirm against the model)
mean_wait = simulation_run_with_warmup(warmup=100)
Reporting without CI:
# Bad: false precision
# (three decimal places with no interval implies certainty the
# simulation does not have)
print(f"Mean wait: 4.523")
# Good: honest uncertainty
# (fewer decimals, plus the interval that justifies them)
print(f"Mean wait: 4.5 (95% CI: 4.2-4.8)")
Summary
Confidence intervals: - Quantify uncertainty in your estimates - Require multiple replications - Enable meaningful comparisons - Should always accompany point estimates
A simulation result without a confidence interval gives no indication of how much the estimate would change under different random seeds — it cannot be trusted on its own. Always report your uncertainty.
Next Steps
Discover the Power of Simulation
Want to become a go-to expert in simulation with Python? The Complete Simulation Bootcamp will show you how simulation can transform your career and your projects.
Explore the Bootcamp