SimPy with pandas: Powerful Data Analysis for Simulation Results
SimPy generates data. pandas makes sense of it. Together, they're unstoppable.
Why pandas?
SimPy gives you raw data—lists, dictionaries, timestamps. pandas gives you: - DataFrames for structured analysis - Aggregations, groupings, pivots - Time series analysis - Easy export to CSV, Excel, databases
Collecting Data for pandas
Structure your data from the start:
import simpy
import random
import pandas as pd
class Simulation:
def __init__(self, env):
self.env = env
self.records = [] # Will become DataFrame
def customer(self, customer_id, server):
arrival = self.env.now
with server.request() as req:
yield req
wait = self.env.now - arrival
service = random.expovariate(1/5)
yield self.env.timeout(service)
self.records.append({
'customer_id': customer_id,
'arrival': arrival,
'wait': wait,
'service': service,
'departure': self.env.now
})
def to_dataframe(self):
return pd.DataFrame(self.records)
Basic Analysis
# Run simulation
sim = Simulation(env)
# ... run ...
df = sim.to_dataframe()
# Summary statistics
print(df.describe())
# Specific metrics
print(f"Mean wait: {df['wait'].mean():.2f}")
print(f"Median wait: {df['wait'].median():.2f}")
print(f"90th percentile: {df['wait'].quantile(0.9):.2f}")
print(f"Max wait: {df['wait'].max():.2f}")
Time-Based Analysis
# Add time bins
df['hour'] = (df['arrival'] / 60).astype(int)
# Hourly statistics
hourly = df.groupby('hour').agg({
'customer_id': 'count',
'wait': ['mean', 'max'],
'service': 'mean'
}).round(2)
print(hourly)
# Rolling average
df['rolling_wait'] = df['wait'].rolling(window=50).mean()
Segmentation Analysis
# If you have customer types
df['customer_type'] = ['VIP' if x % 10 == 0 else 'Regular' for x in df['customer_id']]
# Compare segments
segment_stats = df.groupby('customer_type').agg({
'wait': ['count', 'mean', 'std', 'max'],
'service': 'mean'
})
print(segment_stats)
Time Series from Monitor
class TimeSeriesCollector:
def __init__(self, env, resources):
self.env = env
self.resources = resources
self.records = []
def monitor(self, interval=1):
while True:
record = {'time': self.env.now}
for name, resource in self.resources.items():
record[f'{name}_queue'] = len(resource.queue)
record[f'{name}_busy'] = resource.count
self.records.append(record)
yield self.env.timeout(interval)
def to_dataframe(self):
return pd.DataFrame(self.records).set_index('time')
# Usage
collector = TimeSeriesCollector(env, {'server': server})
env.process(collector.monitor())
# ... run ...
ts_df = collector.to_dataframe()
print(ts_df.head())
Resampling and Aggregation
# Time series resampling (assuming time in minutes)
ts_df.index = pd.to_timedelta(ts_df.index, unit='m')
# 10-minute averages
resampled = ts_df.resample('10T').mean()
# Or custom aggregation
hourly = ts_df.resample('60T').agg({
'server_queue': ['mean', 'max'],
'server_busy': 'mean'
})
Exporting Results
# To CSV
df.to_csv('simulation_results.csv', index=False)
# To Excel with multiple sheets
with pd.ExcelWriter('results.xlsx') as writer:
df.to_excel(writer, sheet_name='Raw Data', index=False)
summary.to_excel(writer, sheet_name='Summary')
hourly.to_excel(writer, sheet_name='Hourly')
# To SQL database
from sqlalchemy import create_engine
engine = create_engine('sqlite:///simulation.db')
df.to_sql('runs', engine, if_exists='replace', index=False)
Scenario Comparison
def run_scenario(config, seed):
random.seed(seed)
env = simpy.Environment()
sim = Simulation(env, config)
env.run(until=1000)
df = sim.to_dataframe()
df['scenario'] = config['name']
df['seed'] = seed
return df
# Run multiple scenarios
scenarios = [
{'name': '2_servers', 'servers': 2},
{'name': '3_servers', 'servers': 3},
{'name': '4_servers', 'servers': 4}
]
all_results = []
for config in scenarios:
for seed in range(10): # 10 replications
df = run_scenario(config, seed)
all_results.append(df)
combined = pd.concat(all_results, ignore_index=True)
# Compare scenarios
comparison = combined.groupby('scenario')['wait'].agg(['mean', 'std', 'max'])
print(comparison)
Pivot Tables
# Customer type by hour
pivot = df.pivot_table(
values='wait',
index='hour',
columns='customer_type',
aggfunc='mean'
)
print(pivot)
Statistical Tests
from scipy import stats
# Compare two scenarios
scenario_a = combined[combined['scenario'] == '2_servers']['wait']
scenario_b = combined[combined['scenario'] == '3_servers']['wait']
t_stat, p_value = stats.ttest_ind(scenario_a, scenario_b)
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
Complete pandas Integration
import simpy
import random
import pandas as pd
import numpy as np
class PandasSimulation:
def __init__(self, env, config):
self.env = env
self.config = config
self.server = simpy.Resource(env, capacity=config['servers'])
# Data collectors
self.entity_data = []
self.time_series = []
def customer(self, cid):
arrival = self.env.now
with self.server.request() as req:
yield req
wait = self.env.now - arrival
service = random.expovariate(1/self.config['service_rate'])
yield self.env.timeout(service)
self.entity_data.append({
'id': cid,
'arrival': arrival,
'wait': wait,
'service': service,
'departure': self.env.now,
'system_time': self.env.now - arrival
})
def arrivals(self):
cid = 0
while True:
yield self.env.timeout(random.expovariate(self.config['arrival_rate']))
self.env.process(self.customer(cid))
cid += 1
def monitor(self, interval=1):
while True:
self.time_series.append({
'time': self.env.now,
'queue': len(self.server.queue),
'busy': self.server.count,
'utilisation': self.server.count / self.server.capacity
})
yield self.env.timeout(interval)
def run(self, duration):
self.env.process(self.arrivals())
self.env.process(self.monitor())
self.env.run(until=duration)
def get_entity_df(self):
return pd.DataFrame(self.entity_data)
def get_timeseries_df(self):
return pd.DataFrame(self.time_series)
def summary(self):
df = self.get_entity_df()
return {
'customers': len(df),
'mean_wait': df['wait'].mean(),
'median_wait': df['wait'].median(),
'p90_wait': df['wait'].quantile(0.9),
'max_wait': df['wait'].max(),
'mean_system_time': df['system_time'].mean(),
'throughput': len(df) / self.env.now
}
# Run
random.seed(42)
env = simpy.Environment()
sim = PandasSimulation(env, {
'servers': 2,
'arrival_rate': 0.5,
'service_rate': 0.3
})
sim.run(1000)
# Analyse
entity_df = sim.get_entity_df()
ts_df = sim.get_timeseries_df()
print(sim.summary())
Summary
SimPy + pandas: - Structure data collection from the start - Use DataFrames for analysis - Group, aggregate, pivot - Compare scenarios easily - Export to any format
Collect smart. Analyse smarter.
Next Steps
Discover the Power of Simulation
Want to become a go-to expert in simulation with Python? The Complete Simulation Bootcamp will show you how simulation can transform your career and your projects.
Explore the Bootcamp