Confidence Intervals in SimPy: Quantifying Uncertainty

A point estimate without a confidence interval is a guess with false precision. Here's how to do it properly.

What's a Confidence Interval?

A 95% confidence interval means: if you ran this analysis 100 times with different random seeds, about 95 of those intervals would contain the true mean.

It's not "95% probability the true value is in this range." That's a common misconception.

Basic Calculation

import numpy as np
from scipy import stats

def confidence_interval(data, confidence=0.95):
    """Calculate a two-sided confidence interval for the mean of *data*.

    Uses the Student's t distribution, which is appropriate when the
    population standard deviation is unknown and estimated from the sample.

    Args:
        data: Sequence of numeric observations (one value per replication).
        confidence: Confidence level, strictly between 0 and 1 (default 0.95).

    Returns:
        Dict with keys 'mean', 'lower', 'upper', 'margin' (the CI
        half-width) and 'n' (the sample size).

    Raises:
        ValueError: If fewer than two observations are supplied (the
            standard error is undefined for n < 2 and the original code
            silently produced NaN bounds), or if *confidence* is not in (0, 1).
    """
    n = len(data)
    if n < 2:
        raise ValueError("need at least two observations to form a confidence interval")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    mean = np.mean(data)
    se = stats.sem(data)  # standard error of the mean: s / sqrt(n)
    # Two-sided critical value from the t distribution with n-1 degrees of freedom.
    t_value = stats.t.ppf((1 + confidence) / 2, df=n-1)
    margin = t_value * se

    return {
        'mean': mean,
        'lower': mean - margin,
        'upper': mean + margin,
        'margin': margin,
        'n': n
    }

# Usage
# Ten replication means; each entry is the summary output of one simulation run.
results = [4.2, 5.1, 3.8, 4.5, 5.2, 4.1, 4.8, 3.9, 4.6, 5.0]
ci = confidence_interval(results)
print(f"Mean: {ci['mean']:.2f}")
print(f"95% CI: [{ci['lower']:.2f}, {ci['upper']:.2f}]")

From Multiple Replications

The standard workflow:

def run_replication(seed):
    """Run one simulation replication, return mean wait time."""
    # Seed the RNG so each replication is reproducible; different seeds
    # give (approximately) independent replications.
    random.seed(seed)
    env = simpy.Environment()
    # ... run simulation
    # NOTE(review): mean_wait_time is produced by the elided simulation
    # code above — this sketch does not show how it is computed.
    return mean_wait_time

# Run multiple replications
# Seeds 42..71 keep the experiment reproducible while varying the random
# stream per run; 30 is a common rule-of-thumb replication count.
n_reps = 30
results = [run_replication(42 + i) for i in range(n_reps)]

# Calculate CI
ci = confidence_interval(results)
# Report the point estimate together with the CI half-width (± margin).
print(f"Mean wait time: {ci['mean']:.2f} ± {ci['margin']:.2f}")

Choosing Sample Size

How many replications for a target half-width?

def required_sample_size(pilot_data, target_half_width, confidence=0.95):
    """Estimate the replications needed for a target CI half-width.

    Uses the pilot sample's standard deviation, then iteratively refines
    the t critical value: the degrees of freedom depend on the answer,
    so a single refinement (as in the naive version) can still be off.

    Args:
        pilot_data: Results from a small pilot study (at least two values).
        target_half_width: Desired CI half-width; must be positive.
        confidence: Confidence level, strictly between 0 and 1.

    Returns:
        Estimated number of replications, rounded up to an integer.

    Raises:
        ValueError: If target_half_width is not positive or fewer than
            two pilot observations are supplied.
    """
    if target_half_width <= 0:
        raise ValueError("target_half_width must be positive")
    n_pilot = len(pilot_data)
    if n_pilot < 2:
        raise ValueError("need at least two pilot observations")
    s = np.std(pilot_data, ddof=1)

    # Initial estimate using the normal approximation.
    z = stats.norm.ppf((1 + confidence) / 2)
    n_estimate = (z * s / target_half_width) ** 2

    # Refine with the t distribution until the (rounded-up) estimate
    # stabilises; the iteration cap guards against oscillation.
    for _ in range(100):
        df = max(1, int(np.ceil(n_estimate)) - 1)
        t = stats.t.ppf((1 + confidence) / 2, df=df)
        new_estimate = (t * s / target_half_width) ** 2
        converged = int(np.ceil(new_estimate)) == int(np.ceil(n_estimate))
        n_estimate = new_estimate
        if converged:
            break

    return int(np.ceil(n_estimate))

# Pilot study with 10 runs
# A small pilot estimates the variance, which drives the sample-size formula.
pilot = [run_replication(42 + i) for i in range(10)]
needed = required_sample_size(pilot, target_half_width=0.5)
print(f"Need {needed} replications for half-width of 0.5")

Relative Precision

Sometimes you want a percentage precision:

def required_for_relative_precision(pilot_data, target_relative_error, confidence=0.95):
    """Sample size for a target relative half-width (e.g. 5% of the mean).

    Uses the pilot coefficient of variation with the normal
    approximation, so the result is a planning estimate, not a guarantee.

    Args:
        pilot_data: Results from a small pilot study.
        target_relative_error: Desired half-width as a fraction of the
            mean, e.g. 0.05 for ±5%; must be positive.
        confidence: Confidence level, strictly between 0 and 1.

    Returns:
        Estimated number of replications, rounded up to an integer.

    Raises:
        ValueError: If target_relative_error is not positive, or the
            pilot mean is zero (relative precision is then undefined —
            the naive version divided by zero).
    """
    if target_relative_error <= 0:
        raise ValueError("target_relative_error must be positive")
    mean = np.mean(pilot_data)
    if mean == 0:
        raise ValueError("relative precision is undefined for a zero mean")
    # Coefficient of variation; abs() so a negative-mean metric still works
    # (the sign would cancel when squared anyway, but this keeps cv >= 0).
    cv = np.std(pilot_data, ddof=1) / abs(mean)
    z = stats.norm.ppf((1 + confidence) / 2)

    n_required = (z * cv / target_relative_error) ** 2
    return int(np.ceil(n_required))

# For 5% relative precision
# i.e. the CI half-width should be at most 5% of the estimated mean.
needed = required_for_relative_precision(pilot, 0.05)

Comparing Two Scenarios

Paired comparison with confidence interval:

def compare_scenarios(results_a, results_b, confidence=0.95):
    """
    Paired comparison of two scenarios.

    The i-th entries of results_a and results_b should come from the
    same random seed (common random numbers), so their difference
    cancels much of the shared noise.

    Args:
        results_a: Per-replication results for scenario A.
        results_b: Per-replication results for scenario B, in the same
            seed order and of the same length as results_a.
        confidence: Confidence level for the CI on the mean difference.

    Returns:
        Dict with 'mean_diff', 'ci_lower', 'ci_upper' for (A - B), and
        'significant': True when the CI excludes zero.

    Raises:
        ValueError: If the two result sequences differ in length —
            a paired analysis is then impossible.
    """
    a = np.asarray(results_a, dtype=float)
    b = np.asarray(results_b, dtype=float)
    if a.shape != b.shape:
        raise ValueError("paired comparison requires equal-length result lists")
    differences = a - b
    ci = confidence_interval(differences, confidence)

    return {
        'mean_diff': ci['mean'],
        'ci_lower': ci['lower'],
        'ci_upper': ci['upper'],
        # Significant at this level iff the CI lies entirely above or
        # entirely below zero.
        'significant': (ci['lower'] > 0) or (ci['upper'] < 0)
    }

# Run both scenarios with same seeds (paired)
# Common random numbers: reusing seed 42+i in both scenarios makes the
# per-seed differences much less noisy than two independent samples.
# NOTE(review): this call passes a `servers` keyword, but the earlier
# run_replication sketch takes only `seed` — confirm the full implementation.
results_2_servers = [run_replication(42 + i, servers=2) for i in range(30)]
results_3_servers = [run_replication(42 + i, servers=3) for i in range(30)]

comparison = compare_scenarios(results_2_servers, results_3_servers)
print(f"Difference: {comparison['mean_diff']:.2f}")
print(f"95% CI: [{comparison['ci_lower']:.2f}, {comparison['ci_upper']:.2f}]")
print(f"Significant: {comparison['significant']}")

Multiple Metrics

class SimulationResults:
    """Collects per-replication metric dicts and summarises them with CIs."""

    def __init__(self):
        # One dict of metrics per completed replication.
        self.replications = []

    def add_replication(self, metrics):
        """Record one replication's metrics.

        metrics = {'wait': 4.2, 'util': 0.85, 'throughput': 12.5}
        """
        self.replications.append(metrics)

    def confidence_intervals(self, confidence=0.95):
        """Calculate CIs for all metrics.

        Returns a dict mapping metric name -> CI dict (as produced by
        confidence_interval). Returns {} when no replications have been
        recorded, instead of raising IndexError as the naive version did.
        """
        if not self.replications:
            return {}

        results = {}
        # Assume every replication reports the same metric keys as the first.
        for metric in self.replications[0]:
            values = [r[metric] for r in self.replications]
            results[metric] = confidence_interval(values, confidence)

        return results

# Usage
# NOTE(review): run_replication_full is assumed to return a metrics dict
# like {'wait': ..., 'util': ..., 'throughput': ...} — not shown here.
results = SimulationResults()
for i in range(30):
    metrics = run_replication_full(42 + i)
    results.add_replication(metrics)

cis = results.confidence_intervals()
for metric, ci in cis.items():
    print(f"{metric}: {ci['mean']:.2f} [{ci['lower']:.2f}, {ci['upper']:.2f}]")

Visualising Confidence Intervals

import matplotlib.pyplot as plt

def plot_ci_comparison(scenarios):
    """Bar chart of scenario means with asymmetric CI error bars.

    scenarios maps scenario name -> CI dict, e.g.:
        {
            'Baseline': {'mean': 5.2, 'lower': 4.8, 'upper': 5.6},
            'Improved': {'mean': 3.1, 'lower': 2.9, 'upper': 3.3}
        }
    """
    labels = []
    heights = []
    below = []   # distance from mean down to the lower bound
    above = []   # distance from mean up to the upper bound
    for label, ci in scenarios.items():
        labels.append(label)
        heights.append(ci['mean'])
        below.append(ci['mean'] - ci['lower'])
        above.append(ci['upper'] - ci['mean'])

    plt.figure(figsize=(8, 5))
    # yerr expects a 2xN array: row 0 = downward errors, row 1 = upward.
    plt.bar(labels, heights, yerr=np.array([below, above]), capsize=10,
            color='steelblue', alpha=0.7)
    plt.ylabel('Mean Wait Time')
    plt.title('Scenario Comparison with 95% Confidence Intervals')
    plt.tight_layout()
    plt.show()

Bootstrap Confidence Intervals

When normality assumptions don't hold:

def bootstrap_ci(data, n_bootstrap=10000, confidence=0.95, rng=None):
    """Percentile-bootstrap confidence interval for the mean.

    Makes no normality assumption: the sampling distribution of the mean
    is approximated by resampling *data* with replacement.

    Args:
        data: Non-empty sequence of numeric observations.
        n_bootstrap: Number of bootstrap resamples.
        confidence: Confidence level, strictly between 0 and 1.
        rng: Optional numpy.random.Generator for reproducible results;
            a fresh (unseeded) generator is created when omitted, which
            matches the previous irreproducible behaviour.

    Returns:
        Dict with 'mean' (the sample mean) plus 'lower'/'upper'
        percentile bounds.

    Raises:
        ValueError: If *data* is empty.
    """
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        raise ValueError("data must be non-empty")
    if rng is None:
        rng = np.random.default_rng()

    # Vectorised resampling: each row is one bootstrap sample. This is a
    # single C-level draw instead of a 10,000-iteration Python loop.
    samples = rng.choice(data, size=(n_bootstrap, data.size), replace=True)
    bootstrapped_means = samples.mean(axis=1)

    alpha = (1 - confidence) / 2
    lower = np.percentile(bootstrapped_means, alpha * 100)
    upper = np.percentile(bootstrapped_means, (1 - alpha) * 100)

    return {
        'mean': np.mean(data),
        'lower': lower,
        'upper': upper
    }

Reporting Results

Good reporting includes:

def format_result(ci, metric_name, decimals=2):
    """Render a CI dict as a one-line report string.

    ci must provide 'mean', 'lower', 'upper' and 'n'; decimals controls
    the rounding of the three numeric values.
    """
    number = f"{{:.{decimals}f}}"
    point = number.format(ci['mean'])
    low = number.format(ci['lower'])
    high = number.format(ci['upper'])
    return f"{metric_name}: {point} (95% CI: {low} - {high}, n={ci['n']})"

# Example output:
# Mean wait time: 4.52 (95% CI: 4.21 - 4.83, n=30)

Common Mistakes

Too few replications:

# Bad: with only 3 replications the t critical value is huge and the
# resulting CI is meaninglessly wide
results = [run_replication(i) for i in range(3)]
# Good: ~30 replications usually give the CI reasonable precision
results = [run_replication(i) for i in range(30)]

Ignoring warm-up:

# Bad: includes initialisation bias (the empty-system start-up transient)
mean_wait = simulation_run()
# Good: excludes warm-up period so only steady-state data is averaged
mean_wait = simulation_run_with_warmup(warmup=100)

Reporting without CI:

# Bad: false precision — three decimals with no statement of uncertainty
print(f"Mean wait: 4.523")
# Good: honest uncertainty
print(f"Mean wait: 4.5 (95% CI: 4.2-4.8)")

Summary

Confidence intervals:

- Quantify uncertainty in your estimates
- Require multiple replications
- Enable meaningful comparisons
- Should always accompany point estimates

A simulation result without a confidence interval is a point estimate of unknown reliability — there is no way for a reader to judge how much to trust it. Always report your uncertainty.

Next Steps


Discover the Power of Simulation

Want to become a go-to expert in simulation with Python? The Complete Simulation Bootcamp will show you how simulation can transform your career and your projects.

Explore the Bootcamp