Papermill is a tool for parameterizing, executing, and analyzing Jupyter notebooks. It allows you to treat notebooks as functions that can be called with different parameters, making it easy to run the same analysis on multiple datasets.
Why Papermill?
- Parameterization: Run notebooks with different inputs
- Batch Processing: Process multiple subjects/sessions
- Automation: Integrate notebooks into pipelines
- Reproducibility: Record the exact parameters used for each run
- Simple: Works with existing notebooks
- Output Capture: Save results for each run
Basic Usage
Command Line
# Run notebook with parameters
papermill input.ipynb output.ipynb -p subject_id S01 -p session 1
# Multiple parameters
papermill analysis.ipynb results/S01_session1.ipynb \
    -p subject "S01" \
    -p session 1 \
    -p threshold 0.5
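With many parameters, a YAML parameters file is tidier: papermill's -f/--parameters_file flag reads one (and -y accepts inline YAML). A minimal sketch, assuming a params.yaml like the one shown:
# params.yaml
# subject: "S01"
# session: 1
# threshold: 0.5

# Run with a parameters file
papermill analysis.ipynb results/S01_session1.ipynb -f params.yaml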
Python API
import papermill as pm
# Execute notebook with parameters
pm.execute_notebook(
    'analysis.ipynb',
    'output/S01_analysis.ipynb',
    parameters={'subject_id': 'S01', 'threshold': 0.5}
)
Creating Parameterized Notebooks
Tag Parameters Cell
In your notebook, tag a cell as "parameters":
- Create a cell with the default parameter values
- In the classic Notebook: View → Cell Toolbar → Tags; in JupyterLab, use the Property Inspector in the right sidebar
- Add the tag "parameters" to the cell
# Cell tagged as "parameters"
subject_id = "S01"
session = 1
threshold = 0.5
When Papermill runs, it inserts a new cell with your specified parameters right after this cell.
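In the executed output notebook, the injected cell (tagged "injected-parameters") contains whatever values you passed; with illustrative values it looks like this:
# Cell injected by papermill, tagged "injected-parameters"
subject_id = "S02"
session = 3
threshold = 0.7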
Research Workflow Examples
Process Multiple Subjects
import papermill as pm
from pathlib import Path
# List of subjects
subjects = ['S01', 'S02', 'S03']
# Process each subject
for subject in subjects:
    output_path = Path(f'results/{subject}_analysis.ipynb')
    pm.execute_notebook(
        'template_analysis.ipynb',
        str(output_path),
        parameters={
            'subject_id': subject,
            'data_path': f'data/{subject}/recording.tif',
            'threshold': 0.5
        }
    )
    print(f"Completed analysis for {subject}")
Multi-Session Analysis
import papermill as pm
import pandas as pd
# Load experiment log
experiment_log = pd.read_csv('experiment_log.csv')
# Process each session
for _, row in experiment_log.iterrows():
    # Cast pandas/numpy scalars to plain Python types so the
    # values inject cleanly into the notebook
    subject = str(row['subject_id'])
    session = int(row['session'])
    date = str(row['date'])
    pm.execute_notebook(
        'session_analysis.ipynb',
        f'results/{subject}_{session}_{date}.ipynb',
        parameters={
            'subject_id': subject,
            'session': session,
            'date': date,
            'data_dir': f'data/{subject}/{session}'
        }
    )
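This assumes an experiment_log.csv with one row per session, along these lines (hypothetical values):
subject_id,session,date
S01,1,2024-01-15
S01,2,2024-01-22
S02,1,2024-01-16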
Parameter Sweep
import papermill as pm
import numpy as np
# Test different thresholds
thresholds = np.linspace(0.1, 1.0, 10)
for threshold in thresholds:
    pm.execute_notebook(
        'sensitivity_analysis.ipynb',
        f'results/threshold_{threshold:.2f}.ipynb',
        # Cast the numpy scalar to a plain float for clean injection
        parameters={'threshold': float(threshold)}
    )
Advanced Features
Kernel Selection
# Use specific kernel
pm.execute_notebook(
    'analysis.ipynb',
    'output.ipynb',
    kernel_name='python3',
    parameters={'subject': 'S01'}
)
Environment Variables and Working Directory
# The kernel papermill launches inherits the parent process's
# environment, so set variables before executing
import os
os.environ['DATA_ROOT'] = '/path/to/data'

pm.execute_notebook(
    'analysis.ipynb',
    'output.ipynb',
    parameters={'subject': 'S01'},
    cwd='/path/to/working/dir'  # run the kernel in this directory
)
# prepare_only=True would instead write the parameterized notebook
# without executing it
Error Handling
try:
    pm.execute_notebook(
        'analysis.ipynb',
        'output.ipynb',
        parameters={'subject': 'S01'}
    )
except Exception as e:
    print(f"Notebook failed: {e}")
    # Continue with other notebooks
Progress Reporting
from tqdm import tqdm
subjects = ['S01', 'S02', 'S03', 'S04', 'S05']
for subject in tqdm(subjects, desc="Processing subjects"):
    pm.execute_notebook(
        'analysis.ipynb',
        f'results/{subject}.ipynb',
        parameters={'subject_id': subject},
        progress_bar=False  # silence papermill's own per-cell bar so it doesn't clash with tqdm
    )
Reading Output Notebooks
Extract Results
# papermill's old pm.read_notebook() API was moved out into the
# scrapbook package; for simple inspection, nbformat is enough
import nbformat

# Read the executed notebook
nb = nbformat.read('results/S01_analysis.ipynb', as_version=4)

# Print outputs of cells tagged "results" (tags live in cell metadata)
for cell in nb.cells:
    if 'results' in cell.metadata.get('tags', []):
        print(cell.get('outputs', []))
Collect Results from Multiple Runs
The record()/read_notebook() API that early papermill versions provided now lives in the scrapbook package: call sb.glue('name', value) inside the notebook, then read the values back when collecting.
import pandas as pd
import scrapbook as sb

results = []
for subject in ['S01', 'S02', 'S03']:
    nb = sb.read_notebook(f'results/{subject}_analysis.ipynb')
    # Values recorded in the notebook with sb.glue(...)
    results.append({
        'subject': subject,
        'mean_activity': nb.scraps['mean_activity'].data,
        'peak_response': nb.scraps['peak_response'].data
    })

# Create summary DataFrame
summary = pd.DataFrame(results)
summary.to_csv('summary_results.csv', index=False)
Integration with Other Tools
With Snakemake
# Snakefile
rule run_analysis:
    input:
        "template.ipynb",
        "data/{subject}.csv"
    output:
        "results/{subject}_analysis.ipynb"
    shell:
        "papermill {input[0]} {output} -p subject {wildcards.subject}"
With Pydoit
# dodo.py
import papermill as pm
def task_analyze_subjects():
    """Run analysis for each subject"""
    for subject in ['S01', 'S02', 'S03']:
        yield {
            'name': subject,
            'actions': [(pm.execute_notebook, [
                'template.ipynb',
                f'results/{subject}.ipynb',
                {'subject_id': subject}
            ])],
            'file_dep': ['template.ipynb', f'data/{subject}.csv'],
            'targets': [f'results/{subject}.ipynb'],
        }
With Scheduled Jobs
# schedule_analysis.py
import papermill as pm
import schedule
import time
from datetime import datetime

def run_daily_analysis():
    date = datetime.now().strftime('%Y-%m-%d')
    pm.execute_notebook(
        'daily_report.ipynb',
        f'reports/report_{date}.ipynb',
        parameters={'date': date}
    )

# Schedule to run every day at 9 AM
schedule.every().day.at("09:00").do(run_daily_analysis)

while True:
    schedule.run_pending()
    time.sleep(60)
Best Practices
Notebook Structure
# Cell 1: Tagged as "parameters"
subject_id = "S01"
data_dir = "data"
threshold = 0.5
# Cell 2: Setup (imports, functions)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def load_data(subject, directory):
    # ... load code ...
    return data
# Cell 3: Load data
data = load_data(subject_id, data_dir)
# Cell 4: Analysis
results = analyze(data, threshold)
# Cell 5: Tagged as "results" - Save key outputs
mean_response = results['mean']
peak_response = results['peak']
# Cell 6: Visualization
plt.figure()
plt.plot(results['trace'])
plt.savefig(f'figures/{subject_id}_trace.png')
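If downstream scripts will harvest these values (see Reading Output Notebooks above), also record them with scrapbook in the "results" cell:
# In the "results" cell: persist key values for later collection
import scrapbook as sb

sb.glue('mean_response', mean_response)
sb.glue('peak_response', peak_response)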
Error Recovery
import papermill as pm
from pathlib import Path
def robust_execute(input_nb, output_nb, params):
    """Execute notebook with error handling"""
    output_path = Path(output_nb)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        pm.execute_notebook(
            input_nb,
            str(output_path),
            parameters=params
        )
        return True, None
    except Exception as e:
        # Papermill still writes the partially executed notebook;
        # rename it so failed runs are easy to spot and debug
        failed_path = output_path.parent / f"{output_path.stem}_FAILED.ipynb"
        if output_path.exists():
            output_path.rename(failed_path)
        return False, str(e)

# Use it
for subject in subjects:
    success, error = robust_execute(
        'template.ipynb',
        f'results/{subject}.ipynb',
        {'subject_id': subject}
    )
    if not success:
        print(f"Failed for {subject}: {error}")
Logging
import papermill as pm
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    filename='papermill_execution.log'
)
logger = logging.getLogger(__name__)

for subject in subjects:
    logger.info(f"Starting analysis for {subject}")
    pm.execute_notebook(
        'analysis.ipynb',
        f'results/{subject}.ipynb',
        parameters={'subject_id': subject},
        log_output=True  # stream the notebook's cell output into the log
    )
    logger.info(f"Completed {subject}")
Common Patterns
Generate Reports
# Generate HTML reports
import papermill as pm
from subprocess import run
for subject in subjects:
    # Execute notebook
    nb_path = f'reports/{subject}.ipynb'
    pm.execute_notebook(
        'report_template.ipynb',
        nb_path,
        parameters={'subject': subject}
    )
    # Convert to HTML (check=True surfaces conversion failures)
    run([
        'jupyter', 'nbconvert',
        '--to', 'html',
        nb_path,
        '--output', f'{subject}_report.html'
    ], check=True)
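For stakeholder-facing reports, adding --no-input to the nbconvert command (available in nbconvert 6.0+) hides the code cells and keeps only their outputs.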
Parallel Execution
from concurrent.futures import ProcessPoolExecutor
import papermill as pm
def process_subject(subject):
    pm.execute_notebook(
        'template.ipynb',
        f'results/{subject}.ipynb',
        parameters={'subject_id': subject}
    )
    return subject

# Process in parallel
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process_subject, s) for s in subjects]
    for future in futures:
        subject = future.result()
        print(f"Completed {subject}")
Installation
# With pixi
pixi add papermill
# With pip
pip install papermill
# With conda
conda install -c conda-forge papermill
Alternatives
- nbconvert: Convert notebooks (no parameterization)
- scrapbook: Store/retrieve data from notebooks
- nbclient: Lower-level notebook execution
- Ploomber: Full notebook pipeline tool
Resources
- Documentation: https://papermill.readthedocs.io/
- GitHub: https://github.com/nteract/papermill
- Examples: https://github.com/nteract/papermill/tree/main/papermill/tests/notebooks
Summary
Papermill is ideal for:
- Batch processing: Same analysis, many subjects
- Parameter sweeps: Test different settings
- Automated reports: Regular analysis updates
- Reproducibility: Record exact parameters used
It transforms Jupyter notebooks from interactive documents into parameterized, executable analysis templates that can be integrated into larger workflows.