Scikit-learn is a comprehensive machine learning library for Python. It provides simple and efficient tools for data mining and data analysis, with a consistent interface across all methods.
Why scikit-learn?
- Comprehensive: Classification, regression, clustering, dimensionality reduction
- Consistent API: All estimators follow the same fit/predict/transform pattern (see the sketch after this list)
- Well-Documented: Excellent docs and examples
- Efficient: Built on NumPy, SciPy, and Cython
- Production-Ready: Used in industry and research
- Visualizations: Integration with matplotlib
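Because every estimator exposes the same fit/predict interface, swapping one model for another is a one-line change. A minimal sketch (the toy data and classifiers here are only illustrative, not part of the workflow below):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Toy data, just to demonstrate the shared API
X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
for model in [LogisticRegression(max_iter=1000), RandomForestClassifier(random_state=0)]:
    model.fit(X_demo, y_demo)  # same call for every estimator
    print(type(model).__name__, model.score(X_demo, y_demo))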
Basic Workflow
Standard Pattern
from sklearn import preprocessing, model_selection, metrics
# 1. Prepare data
X = data # Features
y = labels # Labels/targets
# 2. Split data
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)
# 3. Preprocess
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 4. Train model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)
# 5. Predict
y_pred = model.predict(X_test_scaled)
# 6. Evaluate
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
Dimensionality Reduction
PCA (Principal Component Analysis)
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# Neural activity: (n_trials, n_neurons)
neural_data = np.random.randn(1000, 100) # 1000 trials, 100 neurons
# Apply PCA
pca = PCA(n_components=10)
reduced_data = pca.fit_transform(neural_data)
print(f"Original shape: {neural_data.shape}")
print(f"Reduced shape: {reduced_data.shape}")
# Variance explained
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(pca.explained_variance_ratio_, 'o-')
plt.xlabel('Component')
plt.ylabel('Variance Explained')
plt.title('Scree Plot')
plt.subplot(1, 2, 2)
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'o-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance Explained')
plt.title('Cumulative Variance')
plt.axhline(y=0.9, color='r', linestyle='--', label='90%')
plt.legend()
plt.tight_layout()
plt.show()
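Instead of reading the component count off the cumulative-variance plot, PCA can choose it automatically: passing a float between 0 and 1 as n_components keeps just enough components to explain that fraction of the variance.
# Keep the smallest number of components explaining 90% of the variance
pca_90 = PCA(n_components=0.9)
reduced_90 = pca_90.fit_transform(neural_data)
print(f"Components needed for 90% variance: {pca_90.n_components_}")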
t-SNE for Visualization
from sklearn.manifold import TSNE
# High-dimensional neural data
neural_responses = np.random.randn(500, 100) # 500 trials, 100 features
condition_labels = np.repeat([0, 1, 2], [167, 167, 166]) # 3 conditions
# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)
embedded = tsne.fit_transform(neural_responses)
# Visualize
plt.figure(figsize=(8, 6))
for condition in [0, 1, 2]:
    mask = condition_labels == condition
    plt.scatter(
        embedded[mask, 0],
        embedded[mask, 1],
        label=f'Condition {condition}',
        alpha=0.6
    )
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.legend()
plt.title('Neural Responses in 2D')
plt.show()
Clustering
K-Means Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Neural activity patterns
neural_patterns = np.random.randn(200, 50) # 200 patterns, 50 features
# Find optimal number of clusters
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(neural_patterns)
    score = silhouette_score(neural_patterns, labels)
    silhouette_scores.append(score)
# Plot silhouette score vs. number of clusters
plt.plot(K_range, silhouette_scores, 'o-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Optimal Clusters')
plt.show()
# Use optimal k
optimal_k = K_range[np.argmax(silhouette_scores)]
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(neural_patterns)
print(f"Optimal k: {optimal_k}")
print(f"Cluster sizes: {np.bincount(cluster_labels)}")
Gaussian Mixture Models
from sklearn.mixture import GaussianMixture
# Fit GMM
gmm = GaussianMixture(n_components=3, covariance_type='full')
gmm.fit(neural_patterns)
# Get cluster assignments
labels = gmm.predict(neural_patterns)
# Get probabilities
probabilities = gmm.predict_proba(neural_patterns)
# Soft clustering - each point has probability for each cluster
print(f"Cluster 0 probability for first point: {probabilities[0, 0]:.3f}")
Classification
Support Vector Machine (SVM)
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
# Neural activity predicting stimulus type
X = neural_activity  # (n_trials, n_neurons)
y = stimulus_labels  # (n_trials,)
# Split so the held-out test set is never seen during training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train SVM
svm = SVC(kernel='rbf', C=1.0)
svm.fit(X_train, y_train)
# Cross-validation
cv_scores = cross_val_score(svm, X, y, cv=5)
print(f"CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
# Predict
y_pred = svm.predict(X_test)
# Confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
Random Forest
from sklearn.ensemble import RandomForestClassifier
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Feature importance (which neurons are most informative)
importances = rf.feature_importances_
neuron_ids = np.arange(len(importances))
# Plot top 20 neurons
top_20 = np.argsort(importances)[-20:]
plt.barh(neuron_ids[top_20], importances[top_20])
plt.xlabel('Importance')
plt.ylabel('Neuron ID')
plt.title('Most Informative Neurons')
plt.show()
Regression
Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Predict behavior from neural activity
X = neural_activity  # Predictors
y = reaction_times   # Target
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R²: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
# Visualize
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual RT')
plt.ylabel('Predicted RT')
plt.title(f'Prediction (R² = {r2:.3f})')
plt.show()
Ridge Regression (Regularized)
from sklearn.linear_model import RidgeCV
# Cross-validated Ridge regression
alphas = np.logspace(-3, 3, 50)
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)
print(f"Optimal alpha: {ridge.alpha_:.3f}")
# Coefficients show neuron contributions
coefficients = ridge.coef_
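To see which neurons drive the prediction, the coefficients can be plotted per neuron (assuming, as above, one column of X per neuron):
# One coefficient per neuron; large magnitudes indicate strong contributions
plt.stem(coefficients)
plt.xlabel('Neuron ID')
plt.ylabel('Ridge coefficient')
plt.title('Neuron Contributions to RT Prediction')
plt.show()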
Neuroscience Applications
Neural Decoding
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
# Decode stimulus from neural activity
decoder = GaussianNB()
# Cross-validated predictions
y_pred = cross_val_predict(decoder, neural_activity, stimulus_labels, cv=10)
# Accuracy
accuracy = (y_pred == stimulus_labels).mean()
print(f"Decoding accuracy: {accuracy:.1%}")
# Temporal decoding (decode at each time point)
# Here neural_activity has shape (n_trials, n_neurons, n_timebins)
accuracies = []
for t in range(n_timebins):
    activity_t = neural_activity[:, :, t]  # activity in time bin t: (n_trials, n_neurons)
    decoder.fit(activity_t, stimulus_labels)
    acc = decoder.score(activity_t, stimulus_labels)  # in-sample accuracy
    accuracies.append(acc)
plt.plot(timebins, accuracies)
plt.axhline(y=1/n_classes, color='k', linestyle='--', label='Chance')
plt.xlabel('Time (ms)')
plt.ylabel('Decoding Accuracy')
plt.legend()
plt.show()
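Note that the loop above fits and scores on the same trials, so it reports in-sample accuracy. A cross-validated variant (a sketch reusing the same variables) gives a less optimistic estimate at each time point:
from sklearn.model_selection import cross_val_score
cv_accuracies = []
for t in range(n_timebins):
    activity_t = neural_activity[:, :, t]
    fold_scores = cross_val_score(decoder, activity_t, stimulus_labels, cv=5)
    cv_accuracies.append(fold_scores.mean())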
Population Analysis
from sklearn.decomposition import PCA
# Population activity over time
activity = neural_data # (n_trials, n_neurons, n_timepoints)
# Reshape for PCA: (n_samples, n_features) = (trials × time, neurons)
n_trials, n_neurons, n_time = activity.shape
X = activity.transpose(0, 2, 1).reshape(n_trials * n_time, n_neurons)
# Apply PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
# Reshape back
X_pca = X_pca.reshape(n_trials, n_time, 3)
# Plot trajectory for first trial
trial_0 = X_pca[0]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot(trial_0[:, 0], trial_0[:, 1], trial_0[:, 2], 'o-', alpha=0.6)
ax.scatter(trial_0[0, 0], trial_0[0, 1], trial_0[0, 2],
           c='green', s=100, marker='o', label='Start')
ax.scatter(trial_0[-1, 0], trial_0[-1, 1], trial_0[-1, 2],
           c='red', s=100, marker='s', label='End')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.legend()
plt.title('Neural Trajectory in PC Space')
plt.show()
Receptive Field Mapping
from sklearn.linear_model import Ridge
# Stimulus features × Neural responses
X = stimulus_features # (n_trials, n_features)
y = neural_responses # (n_trials, n_neurons)
# Fit model for each neuron
models = []
for neuron_idx in range(n_neurons):
    model = Ridge(alpha=1.0)
    model.fit(X, y[:, neuron_idx])
    models.append(model)
# Extract receptive fields (weights)
receptive_fields = np.array([m.coef_ for m in models])
# Visualize receptive field for neuron 0
rf_neuron_0 = receptive_fields[0].reshape(image_height, image_width)
plt.imshow(rf_neuron_0, cmap='RdBu_r')
plt.colorbar()
plt.title('Receptive Field - Neuron 0')
plt.show()
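Since Ridge accepts multi-output targets, the per-neuron loop can also be collapsed into a single fit; coef_ then has shape (n_neurons, n_features), one receptive field per row:
# Fit all neurons at once; equivalent to the loop above
multi_ridge = Ridge(alpha=1.0)
multi_ridge.fit(X, y)                 # y: (n_trials, n_neurons)
receptive_fields = multi_ridge.coef_  # (n_neurons, n_features)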
Model Selection
Grid Search
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Grid search with cross-validation
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.3f}")
# Use best model
best_model = grid_search.best_estimator_
Preprocessing
Standardization
from sklearn.preprocessing import StandardScaler
# Z-score normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Mean = 0, Std = 1
print(f"Mean: {X_scaled.mean(axis=0)}")
print(f"Std: {X_scaled.std(axis=0)}")
Normalization
from sklearn.preprocessing import MinMaxScaler
# Scale to [0, 1]
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
Installation
# With pixi
pixi add scikit-learn
# With pip
pip install scikit-learn
# With conda
conda install -c conda-forge scikit-learn
Best Practices
- Always split data: Use train/test splits or cross-validation
- Scale features: Many algorithms benefit from standardization
- Cross-validate: Use CV for model selection
- Feature selection: Remove irrelevant features
- Regularization: Prevent overfitting with Ridge/Lasso
- Pipeline: Chain preprocessing and modeling (see the sketch after this list)
- Random state: Set for reproducibility
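A minimal Pipeline sketch, chaining the scaler and classifier from the workflow above so the scaler is fit only on training data inside each cross-validation fold:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42))
])
# The pipeline behaves like a single estimator
scores = cross_val_score(pipe, X, y, cv=5)
print(f"CV Accuracy: {scores.mean():.3f}")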
Resources
- Documentation: https://scikit-learn.org/stable/
- Tutorials: https://scikit-learn.org/stable/tutorial/
- Examples: https://scikit-learn.org/stable/auto_examples/
- Cheat Sheet: https://scikit-learn.org/stable/tutorial/machine_learning_map/
Summary
Scikit-learn is essential for:
- Dimensionality reduction: PCA, t-SNE for visualization
- Clustering: Group neurons or trials
- Classification: Decode neural activity
- Regression: Predict behavior from neural data
It is the de facto standard library for classical machine learning in Python and is widely used in neuroscience research.