Parquet is a columnar storage file format designed for efficient data storage and retrieval. Unlike row-based formats like CSV, Parquet stores data by column, making it ideal for analytical workloads.
Why Parquet?
- Efficient Compression: Files are often several times smaller than the equivalent CSV
- Fast Queries: Read only columns you need
- Schema Preservation: Stores data types automatically
- Large File Support: Column and partition reads make datasets larger than memory practical
- Industry Standard: Used by Spark, Dask, DuckDB, and more
- Metadata Support: Built-in statistics and indexing
How It Works
Columnar vs Row Storage
CSV (Row-based):
Alice,25,95
Bob,30,87
Stores: Alice,25,95,Bob,30,87 (all fields together)
Parquet (Column-based):
names: [Alice, Bob]
ages: [25, 30]
scores: [95, 87]
Stores columns separately with compression
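Because each column is stored as its own chunk inside the file, you can inspect how much space every column takes without loading any data. A minimal sketch using pyarrow, assuming a file such as the data.parquet written in the pandas examples below:
import pyarrow.parquet as pq
# Per-column storage layout of the first row group
meta = pq.read_metadata('data.parquet')
rg = meta.row_group(0)
for i in range(rg.num_columns):
    col = rg.column(i)
    print(col.path_in_schema, col.compression, col.total_compressed_size, "bytes")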
Benefits for Analysis
- Column reads: SELECT score FROM data reads only the score column
- Better compression (similar values within a column compress well)
- Faster for analytical queries
Using Parquet with Pandas
Writing
import pandas as pd
# Create DataFrame
df = pd.DataFrame({
    'subject_id': ['S01', 'S02', 'S03'],
    'age': [25, 30, 28],
    'score': [95, 87, 92]
})
# Save as Parquet
df.to_parquet('data.parquet')
# With compression
df.to_parquet('data.parquet', compression='snappy') # Default
df.to_parquet('data.parquet', compression='gzip')
df.to_parquet('data.parquet', compression='brotli')
Reading
# Read entire file
df = pd.read_parquet('data.parquet')
# Read specific columns only
df = pd.read_parquet('data.parquet', columns=['subject_id', 'score'])
# Read with filters
df = pd.read_parquet(
    'data.parquet',
    filters=[('age', '>', 25)]
)
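With the default pyarrow engine, filters uses a simple notation for combining conditions: a flat list of tuples is ANDed together, while a list of lists is ORed. A small sketch, assuming the same data.parquet file:
# AND: age > 25 AND score > 90
df = pd.read_parquet(
    'data.parquet',
    filters=[('age', '>', 25), ('score', '>', 90)]
)
# OR: (age > 25) OR (score > 90)
df = pd.read_parquet(
    'data.parquet',
    filters=[[('age', '>', 25)], [('score', '>', 90)]]
)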
Using Parquet with DuckDB
Direct Queries
import duckdb
# Query Parquet file directly (no loading!)
result = duckdb.query("""
SELECT subject_id, AVG(score) as avg_score
FROM 'data.parquet'
WHERE age > 25
GROUP BY subject_id
""").df()
# Query multiple files
result = duckdb.query("""
SELECT * FROM 'data/*.parquet'
WHERE trial > 100
""").df()
Create Table from Parquet
con = duckdb.connect()
# Create table
con.execute("CREATE TABLE data AS SELECT * FROM 'data.parquet'")
# Query normally
result = con.execute("SELECT * FROM data WHERE score > 90").df()
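DuckDB can also write query results back to Parquet with COPY, which is handy for materializing a filtered or aggregated subset. A small sketch reusing the con connection from above (the output filename is just an example):
# Export a filtered subset to a new Parquet file
con.execute("""
    COPY (SELECT * FROM data WHERE score > 90)
    TO 'high_scores.parquet' (FORMAT PARQUET)
""")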
Neuroscience Examples
Storing Behavioral Data
import pandas as pd
import numpy as np
# Large behavioral dataset
behavioral_data = pd.DataFrame({
    'subject_id': ['S01'] * 10000,
    'trial': range(10000),
    'stimulus': ['left', 'right'] * 5000,
    'response': ['left', 'right', 'none'] * 3333 + ['left'],
    'reaction_time': np.random.uniform(0.2, 1.0, 10000),
    'correct': [True, False] * 5000
})
# Save as Parquet (much smaller than CSV!)
behavioral_data.to_parquet('behavioral.parquet')
# Compare file sizes
import os
behavioral_data.to_csv('behavioral.csv', index=False)
parquet_size = os.path.getsize('behavioral.parquet')
csv_size = os.path.getsize('behavioral.csv')
print(f"Parquet: {parquet_size / 1024:.1f} KB")
print(f"CSV: {csv_size / 1024:.1f} KB")
print(f"Compression ratio: {csv_size / parquet_size:.1f}x")
Spike Time Data
# Large spike dataset
spikes = pd.DataFrame({
    'unit_id': np.repeat(range(100), 1000),  # 100 neurons
    'spike_time': np.random.exponential(0.1, 100000),
    'trial': np.tile(range(1000), 100),
    'brain_region': np.random.choice(['V1', 'V2', 'MT'], 100000)
})
# Save with compression
spikes.to_parquet('spikes.parquet', compression='snappy')
# Read only specific region
v1_spikes = pd.read_parquet(
    'spikes.parquet',
    filters=[('brain_region', '==', 'V1')]
)
Calcium Imaging Traces
# Time series data for many ROIs
traces = pd.DataFrame({
    'roi_id': np.repeat(range(200), 3000),  # 200 ROIs
    'frame': np.tile(range(3000), 200),
    'dF_F': np.random.randn(600000),
    'trial': np.repeat(range(10), 60000)
})
# Partition by trial
traces.to_parquet(
    'traces.parquet',
    partition_cols=['trial']
)
# Creates: traces.parquet/trial=0/, traces.parquet/trial=1/, etc.
# Read single trial efficiently
trial_5 = pd.read_parquet('traces.parquet/trial=5')
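Note that reading a single partition directory, as above, returns a DataFrame without the trial column, because that value is encoded in the directory name rather than stored in the files. Passing a filter on the partition column keeps the column and still skips the other partitions. A small sketch:
# Equivalent read that keeps the trial column
trial_5 = pd.read_parquet(
    'traces.parquet',
    filters=[('trial', '==', 5)]
)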
Partitioning
Partition Large Datasets
# Partition by subject
df.to_parquet(
    'dataset',
    partition_cols=['subject_id']
)
# Creates: dataset/subject_id=S01/, dataset/subject_id=S02/, etc.
# Partition by multiple columns
df.to_parquet(
    'dataset',
    partition_cols=['subject_id', 'session']
)
# Creates: dataset/subject_id=S01/session=1/, etc.
# Read only specific partition
s01_data = pd.read_parquet('dataset/subject_id=S01')
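For more control over partitioned datasets, pyarrow's dataset API can discover the hive-style directory layout and push filters down before anything is loaded into pandas. A minimal sketch, assuming the dataset directory created above:
import pyarrow.dataset as ds
# Discover the hive-partitioned layout (subject_id=... directories)
dataset = ds.dataset('dataset', format='parquet', partitioning='hive')
# Read only one subject; other partitions are never touched
table = dataset.to_table(filter=ds.field('subject_id') == 'S01')
df = table.to_pandas()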
Schema and Data Types
Automatic Type Preservation
# Pandas types preserved
df = pd.DataFrame({
    'integer': [1, 2, 3],
    'float': [1.1, 2.2, 3.3],
    'string': ['a', 'b', 'c'],
    'datetime': pd.date_range('2024-01-01', periods=3),
    'category': pd.Categorical(['A', 'B', 'A'])
})
df.to_parquet('typed_data.parquet')
# Read back - types preserved!
df2 = pd.read_parquet('typed_data.parquet')
print(df2.dtypes)
View Schema
import pyarrow.parquet as pq
# Read schema without loading data
schema = pq.read_schema('data.parquet')
print(schema)
# Get metadata
meta = pq.read_metadata('data.parquet')
print(f"Rows: {meta.num_rows}")
print(f"Columns: {meta.num_columns}")
Performance Comparison
CSV vs Parquet
import pandas as pd
import numpy as np
import time
# Create large dataset
n = 1_000_000
df = pd.DataFrame({
    'id': range(n),
    'value': np.random.randn(n),
    'category': np.random.choice(['A', 'B', 'C'], n)
})
# Write benchmarks
start = time.time()
df.to_csv('test.csv', index=False)
csv_write = time.time() - start
start = time.time()
df.to_parquet('test.parquet')
parquet_write = time.time() - start
print(f"CSV write: {csv_write:.2f}s")
print(f"Parquet write: {parquet_write:.2f}s")
# Read benchmarks
start = time.time()
df = pd.read_csv('test.csv')
csv_read = time.time() - start
start = time.time()
df = pd.read_parquet('test.parquet')
parquet_read = time.time() - start
print(f"CSV read: {csv_read:.2f}s")
print(f"Parquet read: {parquet_read:.2f}s")
# Partial read (Parquet only needs to read one column!)
start = time.time()
df = pd.read_parquet('test.parquet', columns=['value'])
parquet_partial = time.time() - start
print(f"Parquet partial read: {parquet_partial:.2f}s")
Integration with Other Tools
PyArrow
import pyarrow.parquet as pq
import pyarrow as pa
# Read as Arrow Table
table = pq.read_table('data.parquet')
# Filter at read time
filtered = pq.read_table(
    'data.parquet',
    filters=[('score', '>', 90)]
)
# Convert to pandas
df = filtered.to_pandas()
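Arrow tables can also be written back to Parquet directly, so a round trip through pandas is not required. A small sketch using the filtered table from above (the output filename is just an example):
# Write an Arrow Table straight to Parquet
pq.write_table(filtered, 'filtered.parquet', compression='snappy')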
Dask (for huge datasets)
import dask.dataframe as dd
# Read Parquet with Dask (parallel)
ddf = dd.read_parquet('large_dataset.parquet')
# Compute operations lazily
result = ddf[ddf['score'] > 90].groupby('subject')['score'].mean()
# Compute result
result.compute()
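Dask can also write its results back out as a partitioned Parquet dataset (one file per partition), which keeps later reads parallel as well. A minimal sketch, assuming the ddf from above (the output directory name is just an example):
# Write the Dask DataFrame as a directory of Parquet files
ddf.to_parquet('large_dataset_out', write_index=False)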
Best Practices
When to Use Parquet
Use Parquet when:
- Dataset > 100MB
- Analytical queries (filter, aggregate)
- Need compression
- Working with columnar tools (DuckDB, Spark)
- Schema should be preserved

Use CSV when:
- Small datasets
- Need human readability
- Simple editing required
- Maximum compatibility needed
Optimization Tips
# Use appropriate compression
df.to_parquet('data.parquet', compression='snappy') # Fast
df.to_parquet('data.parquet', compression='gzip') # Smaller
df.to_parquet('data.parquet', compression='brotli') # Smallest
# Partition large datasets
df.to_parquet('data', partition_cols=['subject_id'])
# Use row groups for large files
df.to_parquet('data.parquet', row_group_size=100000)
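Codec choice is a trade-off between speed and size that depends heavily on the data, so it is worth measuring actual file sizes. A small sketch, assuming a DataFrame df like the one from the performance comparison above (filenames are just examples):
import os
# Compare on-disk size per codec
for codec in ['snappy', 'gzip', 'brotli']:
    path = f'data_{codec}.parquet'
    df.to_parquet(path, compression=codec)
    print(f"{codec}: {os.path.getsize(path) / 1024:.1f} KB")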
Common Workflows
Convert CSV to Parquet
# Convert existing CSV files
import pandas as pd
from pathlib import Path
csv_dir = Path('data/csv')
parquet_dir = Path('data/parquet')
parquet_dir.mkdir(parents=True, exist_ok=True)
for csv_file in csv_dir.glob('*.csv'):
    df = pd.read_csv(csv_file)
    parquet_file = parquet_dir / f"{csv_file.stem}.parquet"
    df.to_parquet(parquet_file)
    print(f"Converted {csv_file.name}")
Merge Multiple Files
# Read all Parquet files in a directory (pandas/pyarrow reads the directory as one dataset)
df = pd.read_parquet('data/')
# Or with DuckDB
import duckdb
df = duckdb.query("SELECT * FROM 'data/*.parquet'").df()
Installation
pixi add pyarrow
# or
pip install pyarrow
# For the fastparquet engine as an alternative
pixi add fastparquet
Summary
Parquet is essential for modern data workflows with large datasets. Key benefits:
- Files that are often several times smaller than CSV
- Faster queries (read only the columns you need)
- Schema preservation (types, metadata)
- Industry standard (supported by Spark, Dask, DuckDB, and most data tools)
For research data > 100MB or datasets requiring frequent analytical queries, Parquet is the superior choice over CSV.