SimPy with pandas: Powerful Data Analysis for Simulation Results

SimPy generates data. pandas makes sense of it. Together, they're unstoppable.

Why pandas?

SimPy gives you raw data—lists, dictionaries, timestamps. pandas gives you: - DataFrames for structured analysis - Aggregations, groupings, pivots - Time series analysis - Easy export to CSV, Excel, databases

Collecting Data for pandas

Structure your data from the start:

import simpy
import random
import pandas as pd

class Simulation:
    def __init__(self, env):
        self.env = env
        self.records = []  # Will become DataFrame

    def customer(self, customer_id, server):
        arrival = self.env.now

        with server.request() as req:
            yield req
            wait = self.env.now - arrival

            service = random.expovariate(1/5)
            yield self.env.timeout(service)

        self.records.append({
            'customer_id': customer_id,
            'arrival': arrival,
            'wait': wait,
            'service': service,
            'departure': self.env.now
        })

    def to_dataframe(self):
        return pd.DataFrame(self.records)

Basic Analysis

# Run simulation
sim = Simulation(env)
# ... run ...
df = sim.to_dataframe()

# Summary statistics
print(df.describe())

# Specific metrics
print(f"Mean wait: {df['wait'].mean():.2f}")
print(f"Median wait: {df['wait'].median():.2f}")
print(f"90th percentile: {df['wait'].quantile(0.9):.2f}")
print(f"Max wait: {df['wait'].max():.2f}")

Time-Based Analysis

# Add time bins
df['hour'] = (df['arrival'] / 60).astype(int)

# Hourly statistics
hourly = df.groupby('hour').agg({
    'customer_id': 'count',
    'wait': ['mean', 'max'],
    'service': 'mean'
}).round(2)

print(hourly)

# Rolling average
df['rolling_wait'] = df['wait'].rolling(window=50).mean()

Segmentation Analysis

# If you have customer types
df['customer_type'] = ['VIP' if x % 10 == 0 else 'Regular' for x in df['customer_id']]

# Compare segments
segment_stats = df.groupby('customer_type').agg({
    'wait': ['count', 'mean', 'std', 'max'],
    'service': 'mean'
})
print(segment_stats)

Time Series from Monitor

class TimeSeriesCollector:
    def __init__(self, env, resources):
        self.env = env
        self.resources = resources
        self.records = []

    def monitor(self, interval=1):
        while True:
            record = {'time': self.env.now}
            for name, resource in self.resources.items():
                record[f'{name}_queue'] = len(resource.queue)
                record[f'{name}_busy'] = resource.count
            self.records.append(record)
            yield self.env.timeout(interval)

    def to_dataframe(self):
        return pd.DataFrame(self.records).set_index('time')

# Usage
collector = TimeSeriesCollector(env, {'server': server})
env.process(collector.monitor())
# ... run ...

ts_df = collector.to_dataframe()
print(ts_df.head())

Resampling and Aggregation

# Time series resampling (assuming time in minutes)
ts_df.index = pd.to_timedelta(ts_df.index, unit='m')

# 10-minute averages
resampled = ts_df.resample('10T').mean()

# Or custom aggregation
hourly = ts_df.resample('60T').agg({
    'server_queue': ['mean', 'max'],
    'server_busy': 'mean'
})

Exporting Results

# To CSV
df.to_csv('simulation_results.csv', index=False)

# To Excel with multiple sheets
with pd.ExcelWriter('results.xlsx') as writer:
    df.to_excel(writer, sheet_name='Raw Data', index=False)
    summary.to_excel(writer, sheet_name='Summary')
    hourly.to_excel(writer, sheet_name='Hourly')

# To SQL database
from sqlalchemy import create_engine
engine = create_engine('sqlite:///simulation.db')
df.to_sql('runs', engine, if_exists='replace', index=False)

Scenario Comparison

def run_scenario(config, seed):
    random.seed(seed)
    env = simpy.Environment()
    sim = Simulation(env, config)
    env.run(until=1000)
    df = sim.to_dataframe()
    df['scenario'] = config['name']
    df['seed'] = seed
    return df

# Run multiple scenarios
scenarios = [
    {'name': '2_servers', 'servers': 2},
    {'name': '3_servers', 'servers': 3},
    {'name': '4_servers', 'servers': 4}
]

all_results = []
for config in scenarios:
    for seed in range(10):  # 10 replications
        df = run_scenario(config, seed)
        all_results.append(df)

combined = pd.concat(all_results, ignore_index=True)

# Compare scenarios
comparison = combined.groupby('scenario')['wait'].agg(['mean', 'std', 'max'])
print(comparison)

Pivot Tables

# Customer type by hour
pivot = df.pivot_table(
    values='wait',
    index='hour',
    columns='customer_type',
    aggfunc='mean'
)
print(pivot)

Statistical Tests

from scipy import stats

# Compare two scenarios
scenario_a = combined[combined['scenario'] == '2_servers']['wait']
scenario_b = combined[combined['scenario'] == '3_servers']['wait']

t_stat, p_value = stats.ttest_ind(scenario_a, scenario_b)
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

Complete pandas Integration

import simpy
import random
import pandas as pd
import numpy as np

class PandasSimulation:
    def __init__(self, env, config):
        self.env = env
        self.config = config
        self.server = simpy.Resource(env, capacity=config['servers'])

        # Data collectors
        self.entity_data = []
        self.time_series = []

    def customer(self, cid):
        arrival = self.env.now

        with self.server.request() as req:
            yield req
            wait = self.env.now - arrival
            service = random.expovariate(1/self.config['service_rate'])
            yield self.env.timeout(service)

        self.entity_data.append({
            'id': cid,
            'arrival': arrival,
            'wait': wait,
            'service': service,
            'departure': self.env.now,
            'system_time': self.env.now - arrival
        })

    def arrivals(self):
        cid = 0
        while True:
            yield self.env.timeout(random.expovariate(self.config['arrival_rate']))
            self.env.process(self.customer(cid))
            cid += 1

    def monitor(self, interval=1):
        while True:
            self.time_series.append({
                'time': self.env.now,
                'queue': len(self.server.queue),
                'busy': self.server.count,
                'utilisation': self.server.count / self.server.capacity
            })
            yield self.env.timeout(interval)

    def run(self, duration):
        self.env.process(self.arrivals())
        self.env.process(self.monitor())
        self.env.run(until=duration)

    def get_entity_df(self):
        return pd.DataFrame(self.entity_data)

    def get_timeseries_df(self):
        return pd.DataFrame(self.time_series)

    def summary(self):
        df = self.get_entity_df()
        return {
            'customers': len(df),
            'mean_wait': df['wait'].mean(),
            'median_wait': df['wait'].median(),
            'p90_wait': df['wait'].quantile(0.9),
            'max_wait': df['wait'].max(),
            'mean_system_time': df['system_time'].mean(),
            'throughput': len(df) / self.env.now
        }

# Run
random.seed(42)
env = simpy.Environment()
sim = PandasSimulation(env, {
    'servers': 2,
    'arrival_rate': 0.5,
    'service_rate': 0.3
})
sim.run(1000)

# Analyse
entity_df = sim.get_entity_df()
ts_df = sim.get_timeseries_df()
print(sim.summary())

Summary

SimPy + pandas: - Structure data collection from the start - Use DataFrames for analysis - Group, aggregate, pivot - Compare scenarios easily - Export to any format

Collect smart. Analyse smarter.

Next Steps


Discover the Power of Simulation

Want to become a go-to expert in simulation with Python? The Complete Simulation Bootcamp will show you how simulation can transform your career and your projects.

Explore the Bootcamp