Running Multiple Replications in SimPy: Statistical Confidence

One simulation run proves nothing. Run it again—different results. Which is right? Both. Neither. You need replications.

Why Multiple Replications?

A single simulation run is one sample from a distribution of possible outcomes. You wouldn't draw conclusions from one data point. Don't draw them from one simulation.

Multiple replications give you:

- An estimate of the mean
- An estimate of the variance
- Confidence intervals
- Statistical validity

Basic Replication Pattern

import simpy
import random
import numpy as np

def run_single_replication(seed, run_time=1000):
    """Run one simulation replication and return its mean wait time.

    Seeds the global ``random`` module so each replication is reproducible,
    then builds a fresh SimPy environment for an independent run.

    NOTE: the model itself is elided in this example. The elided code must
    set up processes, call ``env.run(until=run_time)``, and compute
    ``mean_wait_time`` — as written, this stub raises NameError.
    """
    random.seed(seed)
    env = simpy.Environment()
    # ... setup and run simulation
    return mean_wait_time

def run_replications(n_replications, base_seed=42):
    """Execute several independent replications, one distinct seed per run."""
    outcomes = []
    for rep, seed in enumerate(range(base_seed, base_seed + n_replications), start=1):
        outcome = run_single_replication(seed=seed)
        outcomes.append(outcome)
        print(f"Replication {rep}: {outcome:.2f}")

    return outcomes

# Aggregate across the 30 independent runs.
results = run_replications(30)
print(f"\nOverall mean: {np.mean(results):.2f}")
print(f"Std deviation: {np.std(results, ddof=1):.2f}")  # ddof=1: sample std

How Many Replications?

Rule of thumb: at least 10-30.

More precise: run enough replications to bring the confidence-interval half-width down to your target.

def required_replications(pilot_results, target_half_width, confidence=0.95):
    """Estimate how many replications achieve a target CI half-width.

    Uses the standard pilot-study formula n = (t * s / h)^2, where s is the
    pilot sample standard deviation and t is the two-sided Student-t critical
    value at the pilot's degrees of freedom.

    Args:
        pilot_results: Results from an initial pilot study (at least 2 values,
            otherwise the sample variance is undefined).
        target_half_width: Desired confidence-interval half-width (> 0).
        confidence: Confidence level, e.g. 0.95 for a 95% interval.

    Returns:
        Estimated number of replications, rounded up to an integer.

    Raises:
        ValueError: If fewer than two pilot results are supplied, or the
            target half-width is not positive.
    """
    from scipy import stats

    n_pilot = len(pilot_results)
    if n_pilot < 2:
        raise ValueError("Need at least two pilot results to estimate variance")
    if target_half_width <= 0:
        raise ValueError("target_half_width must be positive")

    s = np.std(pilot_results, ddof=1)
    t_value = stats.t.ppf((1 + confidence) / 2, df=n_pilot - 1)

    # Required n to achieve half-width
    n_required = (t_value * s / target_half_width) ** 2

    return int(np.ceil(n_required))

# Pilot study: a small batch of runs to estimate variability,
# then size the full experiment from it.
pilot = run_replications(10)
needed = required_replications(pilot, target_half_width=0.5)  # ±0.5 at 95%
print(f"Need approximately {needed} replications")

Confidence Intervals

from scipy import stats

def confidence_interval(data, confidence=0.95):
    """Return (mean, lower, upper) for a Student-t interval on the mean."""
    sample_size = len(data)
    centre = np.mean(data)
    std_err = stats.sem(data)
    critical_t = stats.t.ppf((1 + confidence) / 2, df=sample_size - 1)
    half_width = critical_t * std_err

    return centre, centre - half_width, centre + half_width

# Report the point estimate together with its 95% t-interval.
results = run_replications(30)
mean, lower, upper = confidence_interval(results)
print(f"Mean: {mean:.2f}")
print(f"95% CI: [{lower:.2f}, {upper:.2f}]")

Parallel Replications

Speed up with parallel processing:

from multiprocessing import Pool

def run_single_replication_wrapper(args):
    """Adapter: unpack a (seed, run_time) pair for Pool.map."""
    return run_single_replication(*args)

def run_parallel_replications(n_replications, base_seed=42, n_workers=4):
    """Fan the replications out across a pool of worker processes."""
    job_args = [(base_seed + rep, 1000) for rep in range(n_replications)]

    with Pool(n_workers) as worker_pool:
        return worker_pool.map(run_single_replication_wrapper, job_args)

# Much faster for large replication counts (one OS process per worker)
results = run_parallel_replications(100, n_workers=8)

Comparing Scenarios

Run replications for each scenario:

def run_scenario(scenario_config, n_replications=30, base_seed=42):
    """Replicate one scenario, forwarding its config to every run."""
    return [
        run_single_replication(seed=base_seed + rep, **scenario_config)
        for rep in range(n_replications)
    ]

# Compare 2 servers vs 3 servers
scenario_a = run_scenario({'num_servers': 2})
scenario_b = run_scenario({'num_servers': 3})

# Reported as: mean ± sample standard deviation across replications.
print(f"2 servers: {np.mean(scenario_a):.2f} ± {np.std(scenario_a, ddof=1):.2f}")
print(f"3 servers: {np.mean(scenario_b):.2f} ± {np.std(scenario_b, ddof=1):.2f}")

Paired Comparisons

For fair comparison, use same random numbers:

def run_paired_comparison(config_a, config_b, n_replications=30):
    """Compare two configs replication-by-replication with common random numbers."""
    paired_diffs = []

    for rep in range(n_replications):
        shared_seed = 42 + rep
        outcome_a = run_single_replication(seed=shared_seed, **config_a)
        outcome_b = run_single_replication(seed=shared_seed, **config_b)
        paired_diffs.append(outcome_a - outcome_b)

    return paired_diffs

diffs = run_paired_comparison({'servers': 2}, {'servers': 3})
mean_diff, lower, upper = confidence_interval(diffs)

# If the CI on the paired differences excludes zero, the gap is significant.
if lower > 0:
    print("Scenario A is significantly higher")
elif upper < 0:
    print("Scenario B is significantly higher")
else:
    print("No significant difference")

Complete Replication Manager

class ReplicationManager:
    """Runs, stores, summarises and compares replication experiments."""

    def __init__(self, simulation_func, base_seed=42):
        # simulation_func(seed=..., **config) -> scalar result per replication
        self.simulation_func = simulation_func
        self.base_seed = base_seed
        self.results = {}

    def run(self, scenario_name, config, n_replications=30):
        """Execute n_replications of one scenario and cache the outcomes."""
        outcomes = []
        for rep in range(n_replications):
            outcome = self.simulation_func(
                seed=self.base_seed + rep,
                **config
            )
            outcomes.append(outcome)
            print(f"{scenario_name} rep {rep+1}/{n_replications}: {outcome:.3f}")

        self.results[scenario_name] = outcomes
        return outcomes

    def summary(self, scenario_name):
        """Summary statistics (mean, spread, CI, extremes) for one scenario."""
        values = self.results[scenario_name]
        centre, ci_lo, ci_hi = confidence_interval(values)
        return {
            'n': len(values),
            'mean': centre,
            'std': np.std(values, ddof=1),
            'ci_lower': ci_lo,
            'ci_upper': ci_hi,
            'min': min(values),
            'max': max(values)
        }

    def compare(self, scenario_a, scenario_b):
        """Paired t-test between two previously-run scenarios."""
        from scipy import stats

        first = self.results[scenario_a]
        second = self.results[scenario_b]
        t_stat, p_value = stats.ttest_rel(first, second)

        return {
            'mean_a': np.mean(first),
            'mean_b': np.mean(second),
            'difference': np.mean(first) - np.mean(second),
            't_statistic': t_stat,
            'p_value': p_value,
            'significant': p_value < 0.05
        }

# Usage: 30 replications per scenario, then summaries and a paired comparison.
manager = ReplicationManager(run_single_replication)
manager.run('baseline', {'servers': 2}, n_replications=30)
manager.run('improved', {'servers': 3}, n_replications=30)

print("\nBaseline:", manager.summary('baseline'))
print("Improved:", manager.summary('improved'))
print("Comparison:", manager.compare('baseline', 'improved'))

Storing Results

Save results for later analysis:

import json
import pandas as pd

def save_results(results, filename):
    """Persist a {scenario: [values]} mapping as a long-format CSV."""
    rows = []
    for scenario, values in results.items():
        for rep_index, value in enumerate(values):
            rows.append({'scenario': scenario, 'replication': rep_index, 'value': value})
    pd.DataFrame(rows).to_csv(filename, index=False)

def load_results(filename):
    """Rebuild the {scenario: [values]} mapping from a saved CSV."""
    frame = pd.read_csv(filename)
    return {
        name: group['value'].tolist()
        for name, group in frame.groupby('scenario', sort=False)
    }

Summary

Multiple replications:

- Run at least 10-30 per scenario
- Use a different seed for each replication
- Calculate confidence intervals
- Use paired comparisons (common random numbers) when comparing scenarios
- Parallelise for speed

One run tells you nothing. Many runs tell you everything.

Next Steps


Discover the Power of Simulation

Want to become a go-to expert in simulation with Python? The Complete Simulation Bootcamp will show you how simulation can transform your career and your projects.

Explore the Bootcamp