fsspec (Filesystem Spec) is a Python library that provides a unified interface for working with filesystems. It abstracts away the differences between local files, cloud storage, HTTP, GitHub, and other data sources.
Why fsspec?
- Unified Interface: Same API for local, cloud, and remote files
- Many Backends: GitHub, S3, GCS, Azure, HTTP, WebDAV, and more
- Lazy Loading: Stream data without downloading entire files
- Transparent Caching: Optional local caching of remote files
- Integration: Works with pandas, xarray, and other data libraries
Basic Concepts
Filesystem Objects
import fsspec
# Local filesystem
fs_local = fsspec.filesystem('file')
# GitHub filesystem
fs_github = fsspec.filesystem('github', org='username', repo='project')
# HTTP filesystem
fs_http = fsspec.filesystem('http')
# WebDAV (for Sciebo, NextCloud)
fs_webdav = fsspec.filesystem(
'webdav',
base_url='https://uni-muenster.sciebo.de/remote.php/webdav',
auth=('username', 'password')
)
Working with GitHub
Access GitHub Repositories
from fsspec.implementations.github import GithubFileSystem
# Public repository
fs = GithubFileSystem(org='numpy', repo='numpy')
# List files
files = fs.ls('numpy/core')
print(files)
# Read file
content = fs.cat('README.md')
print(content.decode('utf-8'))
# Download file
fs.get('setup.py', 'local_setup.py')
Glob Patterns on GitHub
# Find all Python files
py_files = fs.glob('**/*.py')
# Find specific pattern
test_files = fs.glob('**/test_*.py')
# Read multiple files
for file in py_files[:5]:
content = fs.cat(file)
print(f"File: {file}, Size: {len(content)} bytes")
WebDAV for Cloud Storage
Connect to Sciebo
from fsspec.implementations.webdav import WebDAVFileSystem
# Connect to Sciebo
fs = WebDAVFileSystem(
base_url='https://uni-muenster.sciebo.de/remote.php/webdav',
auth=('username', 'password')
)
# List files
files = fs.ls('/')
# Download file
fs.get('/datasets/experiment.csv', 'local_experiment.csv')
# Upload file
fs.put('local_results.csv', '/results/analysis.csv')
Using Environment Variables
import os
from fsspec.implementations.webdav import WebDAVFileSystem
# Load credentials from environment
fs = WebDAVFileSystem(
base_url='https://uni-muenster.sciebo.de/remote.php/webdav',
auth=(os.environ['SCIEBO_USER'], os.environ['SCIEBO_PASSWORD'])
)
Integration with Pandas
Read Remote CSV
import pandas as pd
# Read CSV from GitHub
url = 'github://pandas-dev:pandas@main/pandas/tests/io/data/csv/iris.csv'
df = pd.read_csv(url)
# Read from HTTP
df = pd.read_csv('https://example.com/data.csv', storage_options={})
# Read from WebDAV
storage_options = {
'client_kwargs': {
'base_url': 'https://uni-muenster.sciebo.de/remote.php/webdav',
'auth': ('username', 'password')
}
}
df = pd.read_csv('webdav://datasets/data.csv', storage_options=storage_options)
Practical Research Examples
Access Shared Dataset
from fsspec.implementations.github import GithubFileSystem
import io
import pandas as pd
# Connect to repository with shared data
fs = GithubFileSystem(org='research-group', repo='datasets')
# List available datasets
datasets = fs.glob('data/*.csv')
print(f"Available datasets: {datasets}")
# Read dataset directly
for dataset in datasets:
content = fs.cat(dataset)
df = pd.read_csv(io.BytesIO(content))
print(f"{dataset}: {len(df)} rows")
Download Course Materials
from fsspec.implementations.github import GithubFileSystem
from pathlib import Path
def download_course_data(org, repo, remote_path, local_dir):
    """Download course materials from a GitHub repository.

    Parameters
    ----------
    org : str
        GitHub organization or user name.
    repo : str
        Repository name within the organization.
    remote_path : str
        Path inside the repository whose files should be downloaded.
    local_dir : str or Path
        Local directory to save files into; created if it does not exist.
    """
    fs = GithubFileSystem(org=org, repo=repo)
    # Create local directory (including parents) if needed
    local_path = Path(local_dir)
    local_path.mkdir(parents=True, exist_ok=True)
    # Find everything under the remote path (glob also returns directories)
    files = fs.glob(f'{remote_path}/**')
    # Download each regular file, flattening into local_dir
    for file in files:
        if fs.isfile(file):
            filename = Path(file).name
            local_file = local_path / filename
            print(f"Downloading {filename}...")
            fs.get(file, str(local_file))
# Usage
download_course_data(
'course-repo',
'materials',
'data/exercises',
'local_data'
)
Stream Large Files
import fsspec
# Open remote file without downloading
with fsspec.open('https://example.com/large_dataset.csv', 'rb') as f:
# Read in chunks
chunk_size = 1024 * 1024 # 1MB
while True:
chunk = f.read(chunk_size)
if not chunk:
break
process_chunk(chunk)
Caching Remote Files
Simple Caching
import fsspec
# Cache remote files locally
fs = fsspec.filesystem(
'simplecache',
target_protocol='http',
cache_storage='/tmp/fsspec_cache'
)
# First access downloads and caches
with fs.open('https://example.com/data.csv', 'r') as f:
data = f.read()
# Second access uses cache (faster)
with fs.open('https://example.com/data.csv', 'r') as f:
data = f.read()
With Expiration
fs = fsspec.filesystem(
'simplecache',
target_protocol='http',
cache_storage='/tmp/fsspec_cache',
expiry_time=3600 # Cache for 1 hour
)
Advanced Features
Check File Existence
import fsspec
fs = fsspec.filesystem('github', org='numpy', repo='numpy')
# Check if file exists
if fs.exists('README.md'):
print("README exists")
# Check if directory
if fs.isdir('numpy/core'):
print("Is directory")
# Check if file
if fs.isfile('setup.py'):
print("Is file")
File Information
# Get file info
info = fs.info('README.md')
print(f"Size: {info['size']} bytes")
print(f"Type: {info['type']}")
Copy Between Filesystems
# Copy from GitHub to local
fs_github = fsspec.filesystem('github', org='org', repo='repo')
fs_local = fsspec.filesystem('file')
# Copy file
with fs_github.open('data.csv', 'rb') as src:
with fs_local.open('local_data.csv', 'wb') as dst:
dst.write(src.read())
URL Strings
fsspec supports URL-style strings:
import fsspec
# Open file with URL
with fsspec.open('github://org:repo@main/data.csv', 'r') as f:
content = f.read()
# With pandas
import pandas as pd
df = pd.read_csv('github://org:repo@main/data.csv')
# HTTP
df = pd.read_csv('https://example.com/data.csv')
# S3 (if credentials configured)
df = pd.read_csv('s3://bucket/data.csv')
Available Backends
fsspec supports many backends:
- Local: file://
- HTTP/HTTPS: http://, https://
- GitHub: github://
- S3: s3://
- Google Cloud Storage: gcs://
- Azure: abfs://, az://
- FTP: ftp://
- SFTP: sftp://
- WebDAV: webdav://
- Dropbox: dropbox://
- ZIP Archives: zip://
Working with Archives
import fsspec
# Access files inside ZIP without extracting
fs = fsspec.filesystem('zip', fo='data.zip')
# List contents
files = fs.ls('/')
# Read file from ZIP
with fs.open('data/experiment.csv', 'r') as f:
content = f.read()
Error Handling
import fsspec
try:
fs = fsspec.filesystem('github', org='org', repo='repo')
content = fs.cat('nonexistent.txt')
except FileNotFoundError:
print("File not found")
except Exception as e:
print(f"Error: {e}")
Installation
pixi add fsspec
# With specific backends
pixi add "fsspec[github]"
pixi add "fsspec[http]"
# Or with pip
pip install fsspec
pip install fsspec[github]
Best Practices
- Use URL strings for simple cases (e.g. pd.read_csv('github://...'))
- Create filesystem objects for multiple operations
- Cache remote files when accessing repeatedly
- Use environment variables for credentials
- Handle connection errors gracefully
- Consider bandwidth when streaming large files
- Use glob patterns for batch operations
- Close file handles explicitly or use context managers
Integration with Other Tools
With xarray
import xarray as xr
# Open remote NetCDF file
ds = xr.open_dataset('https://example.com/data.nc')
# With fsspec caching
ds = xr.open_dataset(
'simplecache::https://example.com/data.nc',
cache_storage='/tmp/cache'
)
With Zarr
import fsspec
import zarr
# Open Zarr store from S3
store = fsspec.get_mapper('s3://bucket/data.zarr')
z = zarr.open(store, mode='r')
Summary
fsspec provides a unified way to work with files regardless of location. It’s particularly useful for:
- Accessing shared datasets on GitHub
- Working with cloud storage
- Streaming remote data
- Building portable data pipelines
The consistent API means your code works the same whether data is local or remote.