Pydoit (or just “doit”) is a task management and automation tool written in Python. It allows you to define tasks and their dependencies in Python code, and automatically determines which tasks need to be executed.
Why Pydoit?
- Python-Based: Define tasks in Python, not a DSL
- Smart Execution: Only runs tasks when inputs change
- Dependency Tracking: Automatic task ordering
- Jupyter Integration: Load tasks directly in notebooks
- Simple: Easier learning curve than Snakemake/Make
- Flexible: Python code for task definitions
Basic Concepts
Tasks as Python Functions
# dodo.py
def task_download_data():
    """Download raw data files"""
    return {
        'actions': ['wget https://example.com/data.csv -O data/raw.csv'],
        'targets': ['data/raw.csv'],
    }

def task_process_data():
    """Process downloaded data"""
    return {
        'actions': ['python scripts/process.py'],
        'file_dep': ['data/raw.csv'],
        'targets': ['data/processed.csv'],
    }

def task_analyze():
    """Run analysis"""
    return {
        'actions': ['python scripts/analyze.py'],
        'file_dep': ['data/processed.csv'],
        'targets': ['results/analysis.png'],
    }
Running Tasks
# List all tasks
doit list
# Run all tasks
doit
# Run specific task
doit process_data
# Force re-run (ignore up-to-date)
doit -a
# Clean generated files
doit clean
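In the run output, a leading . marks a task that executed and -- marks one that was already up to date. Two more built-in commands are useful while developing (see doit help for the full list):

# Show a task's dependencies, targets, and status
doit info process_data
# Forget recorded state so tasks are no longer considered up to date
doit forget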
Task Structure
Basic Task
def task_example():
    return {
        'actions': ['echo "Hello World"'],
        'verbosity': 2,  # Show command output
    }
With Dependencies
def task_preprocess():
    return {
        'actions': ['python preprocess.py %(dependencies)s %(targets)s'],
        'file_dep': ['data/raw.csv'],
        'targets': ['data/clean.csv'],
    }

def task_analyze():
    return {
        'actions': ['python analyze.py'],
        'file_dep': ['data/clean.csv'],  # Produced by task_preprocess
        'targets': ['results/plot.png'],
    }
Python Actions
def process_data(input_file, output_file):
    import pandas as pd
    df = pd.read_csv(input_file)
    # ... process ...
    df.to_csv(output_file, index=False)

def task_process():
    return {
        'actions': [(process_data, ['data/raw.csv', 'data/clean.csv'])],
        'file_dep': ['data/raw.csv'],
        'targets': ['data/clean.csv'],
    }
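Python actions can also receive keyword arguments via the three-element tuple form (callable, args, kwargs); a minimal sketch reusing process_data from above:

def task_process_kwargs():
    return {
        # (callable, args, kwargs): positional-args list plus keyword dict
        'actions': [(process_data, [], {'input_file': 'data/raw.csv',
                                        'output_file': 'data/clean.csv'})],
        'file_dep': ['data/raw.csv'],
        'targets': ['data/clean.csv'],
    }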
Research Workflow Example
Complete Analysis Pipeline
# dodo.py
SUBJECTS = ['S01', 'S02', 'S03']

# Download data
def task_download():
    """Download dataset from repository"""
    return {
        'actions': ['wget https://repo.org/data.zip', 'unzip data.zip'],
        'targets': ['data/raw/dataset.csv'],
        'uptodate': [True],  # Only download once
    }

# Process each subject
def task_preprocess():
    """Preprocess individual subject data"""
    for subject in SUBJECTS:
        yield {
            'name': subject,
            'actions': [f'python scripts/preprocess.py {subject}'],
            'file_dep': [f'data/raw/{subject}.csv'],
            'targets': [f'data/processed/{subject}.csv'],
        }

# Aggregate results
def task_aggregate():
    """Combine all subject data"""
    return {
        'actions': ['python scripts/aggregate.py'],
        'file_dep': [f'data/processed/{s}.csv' for s in SUBJECTS],
        'targets': ['data/combined.csv'],
    }

# Statistical analysis
def task_statistics():
    """Run statistical tests"""
    return {
        'actions': ['python scripts/stats.py'],
        'file_dep': ['data/combined.csv'],
        'targets': ['results/statistics.csv'],
    }

# Generate figures
def task_figures():
    """Create publication figures"""
    return {
        'actions': ['python scripts/plot.py'],
        'file_dep': ['data/combined.csv', 'results/statistics.csv'],
        'targets': ['figures/figure1.png', 'figures/figure2.png'],
    }

# Generate report
def task_report():
    """Generate final report"""
    return {
        'actions': ['jupyter nbconvert --execute report.ipynb'],
        'file_dep': ['report.ipynb', 'results/statistics.csv'],
        'targets': ['report.html'],
    }
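task_preprocess yields one sub-task per subject; on the command line, sub-tasks are addressed as basename:name:

doit list --all        # list tasks including sub-tasks
doit preprocess        # run all preprocess sub-tasks
doit preprocess:S01    # run a single subject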
Jupyter Notebook Integration
Load Doit in Notebook
# In a Jupyter notebook
from doit.tools import register_doit_as_IPython_magic
register_doit_as_IPython_magic()
# Now you can use the %doit magic
Define Tasks in Cells
# Cell 1: Define task
def task_process():
    return {
        'actions': ['python process.py'],
        'file_dep': ['data.csv'],
        'targets': ['processed.csv'],
    }
# Cell 2: Run task
%doit process
Notebook as Task
def task_run_notebook():
    """Execute analysis notebook"""
    return {
        'actions': ['jupyter nbconvert --execute analysis.ipynb'],
        'file_dep': ['analysis.ipynb', 'data/input.csv'],
        'targets': ['analysis.html'],
    }
Advanced Features
Task Groups
from pathlib import Path

def task_process_all():
    """Process all data files"""
    for filename in Path('data/raw').glob('*.csv'):
        yield {
            'name': filename.stem,
            'actions': [f'python process.py {filename}'],
            'file_dep': [str(filename)],
            'targets': [f'data/processed/{filename.stem}.csv'],
        }
Cleanup
def task_analyze():
    return {
        'actions': ['python analyze.py'],
        'file_dep': ['data.csv'],
        'targets': ['results.png'],
        'clean': True,  # Include in 'doit clean'
    }

# Run cleanup
# doit clean
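Setting 'clean': True only removes the declared targets. For generated directories or other extras, clean also accepts a list of actions; a sketch (build.py and the build/ layout are illustrative):

def task_build():
    return {
        'actions': ['python build.py'],
        'targets': ['build/output.bin'],
        'clean': ['rm -rf build/'],  # Custom actions run by 'doit clean'
    }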
Custom Uptodate Check
def check_modified(task, values):
    """Up to date unless a dependency changed in the last hour"""
    from pathlib import Path
    import time
    # task.file_dep is a set, so it cannot be indexed directly
    file_path = Path(next(iter(task.file_dep)))
    if not file_path.exists():
        return False
    # Re-run if modified in the last hour
    age = time.time() - file_path.stat().st_mtime
    return age > 3600

def task_analyze():
    return {
        'actions': ['python analyze.py'],
        'file_dep': ['data.csv'],
        'uptodate': [check_modified],
    }
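For common cases you rarely need a custom check: doit.tools ships ready-made uptodate helpers such as run_once, timeout, and config_changed. A sketch of each:

import datetime
from doit.tools import config_changed, run_once, timeout

THRESHOLD = 0.5  # example configuration value

def task_fetch():
    return {
        'actions': ['wget https://example.com/data.csv'],
        'uptodate': [run_once],  # Never re-run after first success
    }

def task_refresh():
    return {
        'actions': ['python analyze.py'],
        'uptodate': [timeout(datetime.timedelta(hours=1))],  # Expire hourly
    }

def task_tuned():
    return {
        'actions': ['python analyze.py'],
        'uptodate': [config_changed(str(THRESHOLD))],  # Re-run when value changes
    }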
Parameters
# Pass parameters to tasks
def task_process():
    def process(threshold):
        print(f"Processing with threshold={threshold}")
        # ... process ...
    return {
        'actions': [(process,)],
        'params': [{'name': 'threshold',
                    'long': 'threshold',
                    'type': float,
                    'default': 0.5}],
    }

# Run with the parameter:
# doit process --threshold 0.8
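Parameters are also substituted into string (cmd) actions with Python %-style keywords, the same mechanism used for %(dependencies)s and %(targets)s above:

def task_echo():
    return {
        'actions': ['echo threshold is %(threshold)s'],
        'params': [{'name': 'threshold',
                    'long': 'threshold',
                    'default': '0.5'}],
        'verbosity': 2,
    }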
Comparison with Other Tools
Pydoit vs Make
- Pydoit: Python-based, easier for Python users
- Make: Industry standard, steeper learning curve
Pydoit vs Snakemake
- Pydoit: Simpler, good for straightforward pipelines
- Snakemake: More powerful, better for complex workflows
When to Use Pydoit
- Medium-sized projects (10-50 tasks)
- Python-focused workflows
- Need simplicity over features
- Working primarily in Jupyter
When to Use Snakemake
- Large projects (100+ tasks)
- Need cluster execution
- Complex dependency patterns
- Bioinformatics workflows
Common Patterns
Parallel Execution
# dodo.py
# dodo.py
DOIT_CONFIG = {
    'num_process': 4,  # Run 4 tasks in parallel
}

def task_process():
    for i in range(10):
        yield {
            'name': f'file_{i}',
            'actions': [f'python process.py file_{i}.csv'],
        }
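Parallelism can also be requested per invocation with the -n/--process option instead of hard-coding it in DOIT_CONFIG:

doit -n 4    # run up to 4 tasks in parallel for this run only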
Conditional Execution
from pathlib import Path

def task_download():
    """Download data only if not already present"""
    data_file = Path('data/dataset.csv')
    if data_file.exists():
        actions = []
    else:
        actions = ['wget https://example.com/data.csv']
    return {
        'actions': actions,
        'targets': ['data/dataset.csv'],
    }
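This pattern decides at load time, each time dodo.py is parsed. The run_once helper from doit.tools (see Custom Uptodate Check above) moves the decision into doit's own up-to-date machinery:

from doit.tools import run_once

def task_download():
    """Download data at most once"""
    return {
        'actions': ['wget https://example.com/data.csv'],
        'targets': ['data/dataset.csv'],
        'uptodate': [run_once],
    }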
Logging
import logging

def task_analyze():
    def analyze():
        logger = logging.getLogger('doit')
        logger.info("Starting analysis...")
        # ... analysis code ...
        logger.info("Analysis complete")
    return {
        'actions': [analyze],
        'verbosity': 2,
    }
Best Practices
- Organize in dodo.py: Keep tasks in a dodo.py file
- Use generators: For similar tasks, yield sub-tasks from a generator
- File dependencies: Always specify file_dep and targets
- Small tasks: Break work into small, reusable tasks
- Clean targets: Add 'clean': True for generated files
- Document tasks: Use docstrings (shown in doit list)
- Test tasks: Run tasks individually during development
Configuration
dodo.py Configuration
DOIT_CONFIG = {
    'default_tasks': ['all'],
    'verbosity': 2,
    'num_process': 4,
    'backend': 'json',  # or 'sqlite3', 'dbm'
}
Installation
# With pixi
pixi add doit
# With pip
pip install doit
# With conda
conda install -c conda-forge doit
Resources
- Documentation: https://pydoit.org/
- Tutorial: https://pydoit.org/tutorial-1.html
- GitHub: https://github.com/pydoit/doit
Summary
Pydoit is perfect for:
- Automation: Repeating analysis workflows
- Dependencies: Managing task order automatically
- Incremental: Only rerun what changed
- Jupyter-friendly: Works great in notebooks
For Python-based research workflows that don’t need the complexity of Snakemake, Pydoit provides an excellent middle ground between manual scripts and full workflow systems.