Git & Version Control
16 topics • Click any card to expand
Git is a distributed version control system that tracks changes to files. Every data scientist needs Git to version code, collaborate with teams, reproduce experiments, and never lose work.
# Check if Git is installed
git --version
# Configure your identity (required before first commit)
git config --global user.name "Your Name"
git config --global user.email "you@example.com"
# Useful defaults
git config --global init.defaultBranch main
git config --global core.autocrlf true # Windows
git config --global pull.rebase false # merge on pull
# View all config
git config --list

# Create a new project and initialize Git
mkdir my-ds-project
cd my-ds-project
git init
# What happened? A hidden .git folder was created
ls -la .git/
# HEAD → points to current branch
# config → repo-level settings
# objects/ → stores all file content and history
# refs/ → branch and tag pointers

# Git has 3 areas:
# 1. Working Directory → your actual files
# 2. Staging Area (Index) → files ready to be committed
# 3. Repository (.git) → committed history
# See the current state
git status
# The workflow:
# edit files → git add (stage) → git commit (save snapshot)
echo "# My Project" > README.md
git status # README.md is "untracked"
git add README.md # Move to staging area
git status # README.md is "staged"
git commit -m "Initial commit: add README"
git status # "nothing to commit, working tree clean"

# Create project structure
mkdir ml-churn-prediction
cd ml-churn-prediction
git init
# Create standard DS project structure
mkdir -p data/raw data/processed notebooks src models reports
# Create .gitignore (critical for DS projects!)
cat > .gitignore << 'EOF'
# Data files (too large for Git)
data/raw/*
data/processed/*
*.csv
*.parquet
*.h5
!data/raw/.gitkeep
!data/processed/.gitkeep
# Model files
models/*.pkl
models/*.joblib
models/*.pt
# Notebooks checkpoints
.ipynb_checkpoints/
# Environment
.env
venv/
__pycache__/
# OS files
.DS_Store
Thumbs.db
EOF
# Keep empty directories with .gitkeep
touch data/raw/.gitkeep data/processed/.gitkeep models/.gitkeep
# Initial commit
git add .
git commit -m "Initial project structure with .gitignore"
git log --oneline

# TODO: Create a directory called "practice-project"
# TODO: Initialize git
# TODO: Create folders: data/, notebooks/, src/, tests/
# TODO: Create a .gitignore that excludes:
# - .csv, .parquet files
# - __pycache__/
# - .ipynb_checkpoints/
# - .env
# TODO: Create a README.md with a project title
# TODO: Stage and commit everything
# TODO: Run git log to verify

The core Git workflow: make changes, stage them, commit with a message, and review history. These 4 commands are 80% of daily Git usage.
# Stage specific files
git add file1.py file2.py
# Stage all changes in current directory
git add .
# Stage all tracked files (skip untracked)
git add -u
# Commit with a message
git commit -m "Add data loading and cleaning functions"
# Stage + commit tracked files in one step
git commit -am "Fix bug in feature extraction"
# Amend the last commit (fix message or add forgotten files)
git add forgotten_file.py
git commit --amend -m "Add data loading, cleaning, and forgotten_file"

# Basic log
git log
# Compact one-line format
git log --oneline
# Show last 5 commits
git log --oneline -5
# Show with graph (branch visualization)
git log --oneline --graph --all
# Show what changed in each commit
git log --oneline --stat
# Search commit messages
git log --grep="fix" --oneline
# Commits by a specific author
git log --author="Alice" --oneline
# Commits in a date range
git log --since="2024-01-01" --until="2024-02-01" --oneline

# See unstaged changes (working dir vs staging)
git diff
# See staged changes (staging vs last commit)
git diff --staged
# Compare two commits
git diff abc123 def456
# Compare current branch with main
git diff main
# Show changes for a specific file
git diff -- src/model.py
# Show only file names that changed
git diff --name-only main
# Show word-level diff (great for text/notebooks)
git diff --word-diff

# Unstage a file (keep changes in working dir)
git restore --staged file.py
# Discard changes in working directory (CAREFUL β irreversible!)
git restore file.py
# Undo last commit but keep changes staged
git reset --soft HEAD~1
# Undo last commit, unstage changes (keep in working dir)
git reset HEAD~1
# Create a new commit that reverses a previous one (safe!)
git revert abc123
# View what a file looked like in a past commit
git show HEAD~3:src/model.py

# Good commit messages follow this pattern:
# <type>: <short description>
#
# Types: feat, fix, refactor, docs, test, chore
# Working on a feature β multiple small commits
git add src/preprocessing.py
git commit -m "feat: add missing value imputation for numerical columns"
git add src/preprocessing.py
git commit -m "feat: add categorical encoding with target encoder"
git add tests/test_preprocessing.py
git commit -m "test: add unit tests for preprocessing pipeline"
git add src/config.py
git commit -m "refactor: extract preprocessing params to config"
# View the clean history
git log --oneline -5
# abc1234 refactor: extract preprocessing params to config
# def5678 test: add unit tests for preprocessing pipeline
# ghi9012 feat: add categorical encoding with target encoder
# jkl3456 feat: add missing value imputation for numerical columns

# TODO: Create src/load_data.py with a dummy function
# TODO: git add and commit with message "feat: add data loading module"
# TODO: Create src/clean_data.py with a dummy function
# TODO: git add and commit with message "feat: add data cleaning module"
# TODO: Modify src/load_data.py (add a new function)
# TODO: git diff to see the changes
# TODO: git add and commit with message "feat: add CSV validation to data loader"
# TODO: git log --oneline to see your 3 commits
# TODO: git log --stat to see files changed per commit

Branches let you work on features, experiments, or fixes in isolation without affecting the main codebase. Merging combines branches back together.
# List all branches (* = current)
git branch
# Create a new branch
git branch feature/add-model
# Switch to it
git switch feature/add-model
# or older syntax: git checkout feature/add-model
# Create AND switch in one command
git switch -c feature/add-model
# or: git checkout -b feature/add-model
# List all branches including remote
git branch -a
# Rename a branch
git branch -m old-name new-name
# Delete a merged branch
git branch -d feature/add-model
# Force delete an unmerged branch (CAREFUL!)
git branch -D experiment/failed-approach

# Merge feature branch into main
git switch main
git merge feature/add-model
# Three types of merge:
# 1. Fast-forward — main hasn't diverged, just moves pointer
# (clean, linear history)
git merge feature/add-model # fast-forward if possible
# 2. No fast-forward — always creates a merge commit
# (preserves branch history)
git merge --no-ff feature/add-model
# 3. Squash — combines all branch commits into one
# (clean main history)
git merge --squash feature/add-model
git commit -m "feat: add ML model training pipeline"

# When two branches modify the same lines, Git can't auto-merge
git merge feature/update-model
# CONFLICT (content): Merge conflict in src/model.py
# Open the file β Git marks conflicts like this:
# <<<<<<< HEAD
# model = RandomForestClassifier(n_estimators=100)
# =======
# model = XGBClassifier(n_estimators=200, learning_rate=0.1)
# >>>>>>> feature/update-model
# To resolve:
# 1. Edit the file β keep what you want, remove markers
# 2. Stage the resolved file
git add src/model.py
# 3. Complete the merge
git commit -m "merge: resolve model selection conflict, keep XGBoost"
# Abort a merge if things go wrong
git merge --abort

# Branch naming conventions:
# feature/    → new functionality
# fix/        → bug fixes
# experiment/ → DS experiments (may be thrown away)
# hotfix/     → urgent production fixes
# docs/       → documentation updates
# Example: Data Science experiment workflow
git switch -c experiment/lstm-vs-transformer
# ... do your work, commit results ...
git add notebooks/lstm_experiment.ipynb
git commit -m "experiment: LSTM achieves 0.87 F1"
git add notebooks/transformer_experiment.ipynb
git commit -m "experiment: Transformer achieves 0.92 F1"
# Winner! Merge back
git switch main
git merge experiment/lstm-vs-transformer
git branch -d experiment/lstm-vs-transformer

# Alice works on data preprocessing
git switch -c feature/preprocessing
# ... makes commits ...
git commit -m "feat: add outlier detection with IQR method"
git commit -m "feat: add feature scaling pipeline"
# Bob works on model training
git switch -c feature/model-training
# ... makes commits ...
git commit -m "feat: add XGBoost training with cross-validation"
git commit -m "feat: add hyperparameter search with Optuna"
# Carol works on evaluation
git switch -c feature/evaluation
# ... makes commits ...
git commit -m "feat: add classification report and confusion matrix"
# When ready, each person merges to main:
git switch main
git merge --no-ff feature/preprocessing
git merge --no-ff feature/model-training
git merge --no-ff feature/evaluation
# Clean up
git branch -d feature/preprocessing feature/model-training feature/evaluation
# View the branch history
git log --oneline --graph -10

Remote repos host your code in the cloud for backup, collaboration, and deployment. GitHub is the most popular platform for data science projects.
# Add a remote (usually called 'origin')
git remote add origin https://github.com/username/my-project.git
# View remotes
git remote -v
# Push your code to the remote
git push -u origin main # -u sets up tracking (first time only)
# Subsequent pushes
git push
# Clone an existing repo
git clone https://github.com/username/project.git
git clone https://github.com/username/project.git my-local-name
# Clone only the latest snapshot (faster for large repos)
git clone --depth 1 https://github.com/username/project.git

# Fetch → download remote changes (doesn't modify your files)
git fetch origin
# See what changed on remote
git log origin/main --oneline -5
# Pull β fetch + merge (updates your files)
git pull origin main
# Pull with rebase (cleaner history)
git pull --rebase origin main
# If pull causes conflicts, resolve them then:
git add .
git rebase --continue # if rebasing
# or
git commit # if merging

# 1. Fork a repo on GitHub (click Fork button)
# 2. Clone YOUR fork
git clone https://github.com/YOUR-USERNAME/project.git
cd project
# 3. Add the original repo as 'upstream'
git remote add upstream https://github.com/ORIGINAL-OWNER/project.git
# 4. Keep your fork updated
git fetch upstream
git switch main
git merge upstream/main
git push origin main
# 5. Create a branch for your contribution
git switch -c fix/typo-in-readme
# 6. Make changes, commit, push to YOUR fork
git commit -am "fix: correct typo in installation instructions"
git push origin fix/typo-in-readme
# 7. Open a Pull Request on GitHub (from your fork to original)

# Team lead creates the repo
mkdir team-ml-project && cd team-ml-project
git init
git switch -c main
# Create essential files
cat > README.md << 'EOF'
# Churn Prediction Model
## Setup
pip install -r requirements.txt
## Project Structure
src/ → source code
notebooks/ → exploration notebooks
tests/ → unit tests
models/ → trained model artifacts (gitignored)
data/ → datasets (gitignored)
## Workflow
1. Create a branch: git switch -c feature/your-feature
2. Make changes and commit
3. Push and open a Pull Request
4. Get review, then merge
EOF
echo "pandas>=2.0
scikit-learn>=1.3
xgboost>=2.0" > requirements.txt
git add .
git commit -m "chore: initial project setup with README and requirements"
# Push to GitHub
git remote add origin https://github.com/team/churn-prediction.git
git push -u origin main
# On GitHub: Settings → Branches → Add rule for 'main':
# ✓ Require pull request reviews
# ✓ Require status checks to pass
# ✓ No direct pushes to main

# TODO: Create a repo with a file src/config.py containing:
# MODEL_TYPE = "random_forest"
# N_ESTIMATORS = 100
# TODO: Create branch 'dev-alice', change MODEL_TYPE to "xgboost"
# TODO: Commit on dev-alice
# TODO: Switch back to main
# TODO: Create branch 'dev-bob', change MODEL_TYPE to "lightgbm"
# TODO: Commit on dev-bob
# TODO: Merge dev-alice into main (should work cleanly)
# TODO: Try to merge dev-bob into main (conflict!)
# TODO: Resolve the conflict, commit the merge
# TODO: git log --oneline --graph to see the result

Pull requests (PRs) are how teams review and discuss code changes before merging. They're essential for quality, knowledge sharing, and catching bugs early.
# Install GitHub CLI: https://cli.github.com
# Authenticate
gh auth login
# Create a PR from current branch
gh pr create --title "feat: add feature engineering pipeline" \
--body "## Changes
- Added outlier detection
- Added feature scaling
- Added polynomial features
## Testing
- Unit tests pass
- Tested on sample dataset"
# Create a draft PR (not ready for review yet)
gh pr create --draft --title "WIP: experiment with LSTM model"
# List open PRs
gh pr list
# View PR details
gh pr view 42
# Check out someone's PR locally for testing
gh pr checkout 42

# View PR diff
gh pr diff 42
# Add a review comment
gh pr review 42 --comment --body "Looks good, but please add docstrings to the new functions"
# Approve a PR
gh pr review 42 --approve --body "LGTM! Great work on the preprocessing pipeline"
# Request changes
gh pr review 42 --request-changes --body "Please add error handling for missing columns"
# Merge a PR
gh pr merge 42 --merge # regular merge commit
gh pr merge 42 --squash # squash all commits into one
gh pr merge 42 --rebase # rebase onto main

# Good PR structure for DS projects:
# 1. Small, focused PRs (not 2000-line monsters)
# ✗ "Add entire ML pipeline" (1500 lines)
# ✓ "Add data preprocessing" → "Add model training" → "Add evaluation"
# 2. Clear description with context
# - What does this PR do?
# - Why is this change needed?
# - How was it tested?
# - Any metrics/results?
# 3. Include results for model changes
# "Model accuracy: 0.85 → 0.92 (+7pp)
# F1 score: 0.82 → 0.89
# Tested on holdout set (n=5,000)"
# 4. Don't include notebooks with outputs in PR
# (outputs make diffs unreadable)
# Clear outputs before committing:
jupyter nbconvert --clear-output --inplace notebook.ipynb

# 1. Start from updated main
git switch main
git pull origin main
# 2. Create feature branch
git switch -c feature/add-cross-validation
# 3. Make changes (multiple small commits)
git add src/evaluation.py
git commit -m "feat: add k-fold cross-validation function"
git add tests/test_evaluation.py
git commit -m "test: add CV tests with synthetic data"
git add src/config.py
git commit -m "chore: add CV_FOLDS parameter to config"
# 4. Push branch to remote
git push -u origin feature/add-cross-validation
# 5. Create PR
gh pr create --title "feat: add k-fold cross-validation" \
--body "## Summary
- Added stratified k-fold CV with configurable folds
- Default: 5 folds (set in config.py)
- Returns mean ± std for each metric
## Test Plan
- [x] Unit tests pass
- [x] Tested on iris dataset (accuracy: 0.96 ± 0.02)
## Metrics
| Metric | Before (holdout) | After (5-fold CV) |
|--------|------------------|--------------------|
| Accuracy | 0.94 | 0.96 ± 0.02 |"
# 6. After review and approval
gh pr merge --squash

A well-crafted .gitignore prevents large data files, model artifacts, credentials, and OS junk from entering your repository. This is critical for DS projects.
# .gitignore for Data Science projects
# ─── Data files ─────────────────────────────
*.csv
*.tsv
*.parquet
*.feather
*.h5
*.hdf5
*.sqlite
*.db
data/raw/
data/processed/
data/external/
# ─── Model artifacts ────────────────────────
*.pkl
*.pickle
*.joblib
*.pt
*.pth
*.h5
*.onnx
*.pmml
models/
# ─── Notebooks ──────────────────────────────
.ipynb_checkpoints/
*/.ipynb_checkpoints/
# ─── Python ─────────────────────────────────
__pycache__/
*.py[cod]
*.egg-info/
dist/
build/
.eggs/
*.so
# ─── Environments ───────────────────────────
.env
.venv/
venv/
env/
*.env.local
# ─── IDE ────────────────────────────────────
.vscode/
.idea/
*.swp
*.swo
*~
# ─── OS ─────────────────────────────────────
.DS_Store
Thumbs.db
desktop.ini
# ─── Logs & temp ────────────────────────────
*.log
logs/
tmp/
.cache/
wandb/
mlruns/

# Ignore everything in a directory, but keep the directory
data/raw/*
!data/raw/.gitkeep
# Ignore all CSVs except a specific one
*.csv
!reference_data.csv
# Ignore files only in root (not subdirectories)
/config.local.py
# Ignore by directory depth
**/logs/ # ignore 'logs' at any depth
debug/ # ignore 'debug' only at root
# Negate a pattern (un-ignore)
*.h5
!models/production_model.h5
# Check what's ignored
git status --ignored
# Check if a specific file is ignored
git check-ignore -v data/train.csv

# Oops! You committed a large CSV file
# Remove from Git tracking (keep the file locally)
git rm --cached data/large_dataset.csv
echo "data/large_dataset.csv" >> .gitignore
git add .gitignore
git commit -m "chore: remove large CSV from tracking, add to gitignore"
# Remove an entire directory from tracking
git rm -r --cached __pycache__/
git commit -m "chore: remove pycache from tracking"
# Nuclear option: remove file from ALL history (if it contained secrets)
# WARNING: rewrites history, coordinate with team!
git filter-branch --force --index-filter \
'git rm --cached --ignore-unmatch secrets.env' \
--prune-empty -- --all
# Better tool for history rewriting:
# pip install git-filter-repo
# git filter-repo --invert-paths --path secrets.env

# Check for large files in history
git rev-list --objects --all | \
git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' | \
awk '/^blob/ {print $3, $4}' | \
sort -rn | head -20
# Search for potential secrets in commit history
git log --all -p | grep -i "password\|secret\|api_key\|token" | head -20
# List all file types ever committed
git log --all --diff-filter=A --name-only --pretty=format: | \
grep -o '\.[^.]*$' | sort | uniq -c | sort -rn
# Check current gitignore coverage
git status --ignored --short
# Verify no .env files in history
git log --all --full-history --oneline -- "*.env"
# Verify no large files currently tracked (size is the 4th column)
git ls-tree -r -l HEAD | sort -k4 -rn | head -10

Stash lets you save uncommitted changes temporarily so you can switch branches, pull updates, or do other work, then come back to your changes later.
# Save current changes to stash
git stash
# Save with a description
git stash push -m "WIP: feature engineering experiments"
# List all stashes
git stash list
# stash@{0}: On feature/model: WIP: feature engineering experiments
# stash@{1}: On main: quick debug session
# Apply most recent stash (keep it in stash list)
git stash apply
# Apply and remove from stash list
git stash pop
# Apply a specific stash
git stash apply stash@{1}
# Remove a specific stash
git stash drop stash@{0}
# Clear all stashes
git stash clear

# Stash including untracked files
git stash push -u -m "WIP: including new files"
# Stash only specific files
git stash push -m "stash only model.py" -- src/model.py
# View what's in a stash
git stash show stash@{0} # summary
git stash show -p stash@{0} # full diff
# Create a branch from a stash
git stash branch new-feature stash@{0}
# Common workflow: quick context switch
# You're working on feature A, need to fix urgent bug
git stash push -m "WIP: feature A halfway done"
git switch main
git switch -c hotfix/urgent-bug
# ... fix the bug, commit, merge ...
git switch feature-a
git stash pop # back to where you were!

# You're on feature/train-pipeline with uncommitted changes
git status
# Modified: src/train.py, src/metrics.py (not ready to commit)
# Urgent bug report! Stash everything
git stash push -u -m "WIP: training pipeline refactor"
# Fix the bug
git switch main
git pull
git switch -c hotfix/data-loader-crash
# ... fix the bug ...
git commit -am "fix: handle empty DataFrame in data loader"
git push -u origin hotfix/data-loader-crash
# ... create PR, get it merged ...
# Return to your work
git switch feature/train-pipeline
git stash pop
# Your changes are back exactly as you left them!
git status

Tags mark specific points in history — typically used for version releases, model checkpoints, and experiment milestones. They create permanent bookmarks in your Git history.
# Lightweight tag (just a pointer)
git tag v1.0.0
# Annotated tag (recommended β includes message, author, date)
git tag -a v1.0.0 -m "First production release"
# Tag a past commit
git tag -a v0.9.0 -m "Beta release" abc1234
# List tags
git tag
git tag -l "v1.*" # filter by pattern
# View tag details
git show v1.0.0
# Push tags to remote
git push origin v1.0.0 # push one tag
git push origin --tags # push all tags
# Delete a tag
git tag -d v0.1.0 # local
git push origin --delete v0.1.0 # remote

# Semantic Versioning: MAJOR.MINOR.PATCH
# MAJOR → breaking changes (new model architecture, API change)
# MINOR → new features (added endpoint, new feature engineering)
# PATCH → bug fixes (fixed preprocessing bug, typo)
# ML-specific versioning strategy:
# v1.0.0 β first production model
# v1.1.0 β added new features to model
# v1.1.1 β fixed data preprocessing bug
# v2.0.0 β switched from RF to XGBoost (different model)
# Tag with model metrics
git tag -a v2.1.0 -m "XGBoost v2.1
Accuracy: 0.94
F1: 0.91
AUC: 0.97
Training data: 2024-01-01 to 2024-06-30
Features: 47 (added 5 new interaction features)"
# Create a GitHub release (includes downloadable assets)
gh release create v2.1.0 --title "Model v2.1.0" \
--notes "Improved model with 5 new interaction features.
Accuracy: 0.94 (+2pp vs v2.0.0)"
# After model training and validation passes
git add src/ tests/ configs/
git commit -m "feat: XGBoost v2 with optimized hyperparameters"
# Tag the release with metrics
git tag -a model-v2.1.0 -m "Production model release
Model: XGBoost
Accuracy: 0.943 | F1: 0.912 | AUC: 0.971
Training samples: 150,000
Feature count: 47
Hyperparams: max_depth=6, lr=0.05, n_est=500"
# Push code and tag
git push origin main --tags
# Create GitHub release with model card
gh release create model-v2.1.0 \
--title "Model v2.1.0 β XGBoost Production" \
--notes-file RELEASE_NOTES.md
# Later: need to roll back to v2.0.0
git checkout model-v2.0.0 -- src/model.py configs/model_config.yaml
# This restores just the model files from the v2.0.0 tag

Rebase replays your commits on top of another branch, creating a linear history. Interactive rebase lets you edit, squash, reorder, and clean up commits before sharing.
# Instead of merging main into your branch (creates merge commit):
git switch feature/model
git merge main # creates a merge commit
# Rebase puts your commits on TOP of main (linear history):
git switch feature/model
git rebase main
# After rebase, your branch looks like:
# main: A→B→C
# feature: D→E→F (your commits replayed on top of C)
# If conflicts occur during rebase:
git add resolved_file.py
git rebase --continue
# Abort if things go wrong
git rebase --abort

# Clean up last 4 commits before creating a PR
git rebase -i HEAD~4
# Opens editor with:
# pick abc1234 WIP: start model training
# pick def5678 fix typo
# pick ghi9012 more work on training
# pick jkl3456 finish model training
# Change to:
# pick abc1234 WIP: start model training
# squash def5678 fix typo → merge into previous
# squash ghi9012 more work on training → merge into previous
# reword jkl3456 finish model training → change message
# Commands:
# pick → keep the commit as is
# squash → merge into previous commit
# fixup → like squash but discard this commit's message
# reword → keep commit, edit message
# edit → pause to amend the commit
# drop → delete the commit

# NEVER rebase commits that have been pushed and shared with others!
# Rebase rewrites history — this will cause problems for collaborators
# Safe: rebase your LOCAL commits before pushing
git switch feature/my-work
git rebase main # OK β only your local commits
git push # push clean history
# DANGEROUS: rebase after pushing
git push origin feature/my-work
git rebase main # Rewrites already-pushed commits!
git push --force # Forces overwrite β breaks collaborators!
# If you must update after push, use merge instead:
git switch feature/my-work
git merge main # Safe even after pushing

# Your messy history:
# abc1234 oops, forgot to save
# def5678 fix import
# ghi9012 WIP: add feature scaling
# jkl3456 fix bug in scaling
# mno7890 add model training
# pqr1234 fix typo in training
# stu5678 add evaluation metrics
# vwx9012 fix metric calculation
# Interactive rebase to clean up
git rebase -i HEAD~8
# Result after squashing related commits:
# pick ghi9012 feat: add feature scaling pipeline
# pick mno7890 feat: add model training with XGBoost
# pick stu5678 feat: add evaluation metrics (accuracy, F1, AUC)
# Now push the clean branch
git push origin feature/ml-pipeline
# Your PR will show 3 clean, logical commits instead of 8 messy ones

Notebooks are JSON files with embedded outputs, making them hard to diff and version. These techniques keep notebooks manageable in Git.
# Jupyter notebooks (.ipynb) are JSON β diffs are messy:
# - Cell outputs (images, tables) bloat the repo
# - Execution counts change on every run
# - Cell metadata changes randomly
# - Merge conflicts are nearly impossible to resolve
# Solution 1: Clear outputs before committing
jupyter nbconvert --clear-output --inplace notebook.ipynb
git add notebook.ipynb
git commit -m "feat: add EDA notebook (outputs cleared)"
# Solution 2: Automate with pre-commit hook
# .pre-commit-config.yaml
# repos:
# - repo: https://github.com/kynan/nbstripout
# hooks:
# - id: nbstripout
# Solution 3: Install nbstripout globally for a repo
pip install nbstripout
nbstripout --install # adds Git filter
# Now outputs are automatically stripped on commit!

# Install nbdime for human-readable notebook diffs
pip install nbdime
# Configure Git to use nbdime
nbdime config-git --enable --global
# Now 'git diff' shows notebook changes in a readable format
git diff notebook.ipynb
# Instead of raw JSON, you see:
# Cell 3 (code):
# - model = RandomForestClassifier()
# + model = XGBClassifier(n_estimators=200)
# Visual diff tool (opens in browser)
nbdime diff notebook_v1.ipynb notebook_v2.ipynb
# Merge tool for notebooks
nbdime merge base.ipynb local.ipynb remote.ipynb

# Use jupytext to sync .ipynb with .py files
pip install jupytext
# Convert notebook to Python script
jupytext --to py:percent notebook.ipynb
# Creates notebook.py with # %% cell markers
# Sync both formats (edit either one)
jupytext --set-formats ipynb,py:percent notebook.ipynb
# In .gitignore, you can then ignore .ipynb and only track .py:
# *.ipynb → ignore notebooks
# !*.py → track Python scripts
# Or track both but strip outputs from .ipynb:
# Use nbstripout for .ipynb
# Track .py as the "source of truth"

# Team setup (run once)
pip install nbstripout nbdime jupytext
# In the project repo
nbstripout --install
nbdime config-git --enable
# Team convention:
# 1. Notebooks in notebooks/ directory
# 2. Outputs always stripped on commit (nbstripout)
# 3. Each notebook has a paired .py script (jupytext)
# 4. PRs review the .py diff (much cleaner)
# 5. Final notebooks with outputs go to reports/
# Example workflow
cd notebooks/
jupytext --set-formats ipynb,py:percent eda.ipynb
# Edit the notebook, run cells, then:
git add notebooks/eda.py # clean Python diff
git add notebooks/eda.ipynb # outputs auto-stripped
git commit -m "feat: add EDA notebook with correlation analysis"
# Reviewer sees clean diff in eda.py, not messy JSON

# TODO: Create a test repo
# TODO: pip install nbstripout
# TODO: nbstripout --install (in the repo)
# TODO: Create a simple notebook with some output cells
# TODO: git add and commit the notebook
# TODO: Check that the committed version has no outputs:
# git show HEAD:notebook.ipynb | python -c "
# import json, sys
# nb = json.load(sys.stdin)
# outputs = sum(len(c.get('outputs',[])) for c in nb['cells'])
# print(f'Outputs in committed notebook: {outputs}')
# "

Git hooks are scripts that run automatically at specific points in the Git workflow — before commits, before pushes, etc. They automate code quality checks.
# Hooks live in .git/hooks/ (local, not shared by default)
# Use pre-commit framework to share hooks with team
# Install pre-commit
pip install pre-commit
# Create .pre-commit-config.yaml in repo root
cat > .pre-commit-config.yaml << 'EOF'
repos:
# Code formatting
- repo: https://github.com/psf/black
rev: 24.1.0
hooks:
- id: black
language_version: python3
# Import sorting
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
# Linting
- repo: https://github.com/pycqa/flake8
rev: 7.0.0
hooks:
- id: flake8
args: [--max-line-length=120]
# Strip notebook outputs
- repo: https://github.com/kynan/nbstripout
rev: 0.7.1
hooks:
- id: nbstripout
# Check for secrets
- repo: https://github.com/Yelp/detect-secrets
rev: v1.4.0
hooks:
- id: detect-secrets
EOF
# Install the hooks
pre-commit install
# Now these checks run automatically before every commit!
# To run manually on all files:
pre-commit run --all-files

# Create a custom hook that checks for large files
# Install a custom pre-commit hook that rejects staged files larger than 5MB.
# Reads staged paths NUL-delimited so filenames with spaces/newlines are safe
# (the original `for file in $(git diff ...)` word-split on whitespace).
cat > .git/hooks/pre-commit << 'HOOK'
#!/bin/bash
# Prevent commits with files larger than 5MB
MAX_SIZE=5242880 # 5MB in bytes
EXIT_CODE=0
# -z emits NUL-separated paths; read -d '' consumes them safely.
# Process substitution (not a pipe) keeps EXIT_CODE in the current shell.
while IFS= read -r -d '' file; do
  if [ -f "$file" ]; then
    size=$(wc -c < "$file")
    if [ "$size" -gt "$MAX_SIZE" ]; then
      echo "ERROR: $file is $((size / 1048576))MB (max 5MB)" >&2
      echo "       Use Git LFS for large files: git lfs track '$file'" >&2
      EXIT_CODE=1
    fi
  fi
done < <(git diff --cached --name-only -z)
exit $EXIT_CODE
HOOK
chmod +x .git/hooks/pre-commit

# .pre-commit-config.yaml with DS-specific hooks
cat > .pre-commit-config.yaml << 'EOF'
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-yaml
- id: check-json
- id: check-added-large-files
args: [--maxkb=1000]
- id: end-of-file-fixer
- id: trailing-whitespace
- id: no-commit-to-branch
args: [--branch, main] # prevent direct commits to main
- repo: https://github.com/psf/black
rev: 24.1.0
hooks:
- id: black
- repo: https://github.com/kynan/nbstripout
rev: 0.7.1
hooks:
- id: nbstripout
- repo: https://github.com/Yelp/detect-secrets
rev: v1.4.0
hooks:
- id: detect-secrets
args: [--baseline, .secrets.baseline]
EOF
pre-commit install
pre-commit run --all-files
# Now every commit is automatically checked!
# Team members just need to run: pre-commit install

Git LFS stores large files (datasets, model weights, images) outside the Git repo while keeping references in your history. Essential for ML projects with large artifacts.
# Install Git LFS
git lfs install
# Track specific file types
git lfs track "*.csv"
git lfs track "*.parquet"
git lfs track "*.pkl"
git lfs track "*.pt"
git lfs track "*.h5"
git lfs track "data/**"
# This creates/updates .gitattributes
cat .gitattributes
# *.csv filter=lfs diff=lfs merge=lfs -text
# *.parquet filter=lfs diff=lfs merge=lfs -text
# IMPORTANT: commit .gitattributes first!
git add .gitattributes
git commit -m "chore: configure Git LFS for data and model files"
# Now add large files normally β LFS handles them
git add data/training_set.csv
git commit -m "data: add training dataset"
git push

# List tracked LFS patterns
git lfs track
# List actual LFS files in repo
git lfs ls-files
# Check LFS storage usage
git lfs env
# Pull LFS files (after clone)
git lfs pull
# Clone without downloading LFS files (faster)
GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/user/project.git
# Download LFS files for specific patterns only
git lfs pull --include="data/train*"
git lfs pull --exclude="models/"
# Migrate existing files to LFS
git lfs migrate import --include="*.csv" --everything

# Initial setup
git lfs install
git lfs track "models/*.pkl"
git lfs track "models/*.pt"
git lfs track "models/*.onnx"
git add .gitattributes
git commit -m "chore: track model files with Git LFS"
# After training a new model
cp trained_model.pkl models/churn_model.pkl
git add models/churn_model.pkl
git commit -m "model: XGBoost churn model v2.1 (F1=0.91)"
git tag -a model-v2.1.0 -m "Churn model v2.1.0, F1=0.91"
git push origin main --tags
# To reproduce: checkout the tag, model files are pulled automatically
git checkout model-v2.1.0
ls -la models/ # model file is there via LFS
# Check LFS storage
git lfs ls-files --size
# models/churn_model.pkl (245 MB)

Power-user Git commands for specific situations: applying individual commits across branches, finding which commit introduced a bug, and recovering lost work.
# Copy a specific commit from another branch
git cherry-pick abc1234
# Cherry-pick without committing (stage changes only)
git cherry-pick --no-commit abc1234
# Cherry-pick a range of commits
git cherry-pick abc1234..def5678
# Use case: backport a bug fix from development to production
git switch production
git cherry-pick abc1234 # the fix commit from dev branch
git push
# If conflicts occur
git cherry-pick --continue # after resolving
git cherry-pick --abort # to cancel

# Binary search through history to find which commit broke something
git bisect start
# Mark current commit as bad (bug is present)
git bisect bad
# Mark a known good commit (bug was NOT present)
git bisect good v1.0.0
# Git checks out a middle commit β test it!
# If the bug is present:
git bisect bad
# If the bug is NOT present:
git bisect good
# Repeat until Git finds the exact commit
# "abc1234 is the first bad commit"
# Done β go back to normal
git bisect reset
# Automated bisect with a test script:
git bisect start HEAD v1.0.0
git bisect run python -m pytest tests/test_model.py -x
# Git automatically finds the first failing commit!

# Reflog tracks every HEAD movement — your safety net!
git reflog
# abc1234 HEAD@{0}: commit: add feature
# def5678 HEAD@{1}: checkout: moving from main to feature
# ghi9012 HEAD@{2}: commit: initial commit
# Accidentally deleted a branch? Recover it!
git branch -D important-branch # oops!
git reflog # find the last commit on that branch
git branch important-branch HEAD@{3} # recovered!
# Accidentally ran git reset --hard? Recover!
git reset --hard HEAD~5 # oops, lost 5 commits!
git reflog # find where HEAD was before
git reset --hard HEAD@{1} # back to before the reset!
# Reflog entries expire after 90 days (default)
# Check expiry:
git config gc.reflogExpire

# Create a test script that checks model accuracy
# Build a test script that git bisect can run automatically.
# bisect-run exit codes: 0 = good, 1-124/126/127 = bad, 125 = skip this commit
cat > test_accuracy.sh << 'SCRIPT'
#!/bin/bash
# Returns 0 (good) if accuracy > 0.90, 1 (bad) otherwise,
# 125 (skip) if this commit cannot be tested at all.
# Use sys.exit, not exit(): exit() comes from the site module and is
# not guaranteed in non-interactive runs (e.g. python -I).
python -c "
import sys
try:
    from src.model import train_and_evaluate
except ImportError:
    sys.exit(125)  # commit predates the module -> tell bisect to skip it
accuracy = train_and_evaluate('data/test.csv')
print(f'Accuracy: {accuracy:.4f}')
sys.exit(0 if accuracy > 0.90 else 1)
"
SCRIPT
chmod +x test_accuracy.sh
# Automated bisect
git bisect start
git bisect bad HEAD # current commit is bad
git bisect good HEAD~20 # 20 commits ago was good
git bisect run ./test_accuracy.sh # automated testing!
# Output: "abc1234 is the first bad commit"
# commit abc1234
# Author: Bob <bob@company.com>
# Date: Mon Jan 15 14:30:00 2024
# "refactor: change feature scaling to min-max"
# Found it! The scaling change caused the regression
git bisect reset
rm test_accuracy.sh
Different teams use different branching strategies. Understanding these workflows helps you adapt to any team's Git practices.
# GitHub Flow — simple and effective
# Rules:
# 1. main is always deployable
# 2. Create feature branches from main
# 3. Open PRs for review
# 4. Merge to main after approval
# 5. Deploy from main
# Workflow:
git switch main
git pull
git switch -c feature/add-prediction-endpoint
# Work, commit, push
# (-a stages only already-tracked files; run `git add` first for new files)
git commit -am "feat: add /predict endpoint"
git push -u origin feature/add-prediction-endpoint
# Create PR → Review → Merge → Delete branch
gh pr create
# After merge:
git switch main
git pull
git branch -d feature/add-prediction-endpoint
# Git Flow — more structured, good for versioned software
# Branches:
# main     → production releases only
# develop  → integration branch
# feature/ → new features (branch from develop)
# release/ → preparing a release (branch from develop)
# hotfix/  → urgent fixes (branch from main)
# Start a feature
git switch develop
git switch -c feature/new-model
# Finish feature — merge back to develop
# (--no-ff forces a merge commit so the feature's history stays grouped)
git switch develop
git merge --no-ff feature/new-model
git branch -d feature/new-model
# Prepare release (branched from develop, since we are on develop here)
git switch -c release/v2.0.0
# ... final testing, version bumps ...
git switch main
git merge --no-ff release/v2.0.0
git tag -a v2.0.0 -m "Release v2.0.0"
# Merge the release into develop too, so release-branch fixes are not lost
git switch develop
git merge --no-ff release/v2.0.0
git branch -d release/v2.0.0
# Trunk-based — everyone commits to main (or very short-lived branches)
# Rules:
# 1. Small, frequent commits to main
# 2. Feature flags instead of long-lived branches
# 3. Branches live < 1 day
# 4. CI/CD runs on every commit
# Short-lived branch (< 1 day)
git switch -c fix/null-check
git commit -am "fix: handle null values in preprocessing"
git push -u origin fix/null-check
gh pr create
# Get quick review, merge same day
# Feature flags for larger features
# In code:
#   if feature_flags.is_enabled("new_model_v2"):
#       prediction = new_model.predict(features)
#   else:
#       prediction = old_model.predict(features)
# Decision framework:
# Which branching strategy fits your team?
# Small team (2-5), continuous deployment → GitHub Flow
#   - Simple, low overhead
#   - Perfect for: web apps, APIs, dashboards
#   - DS teams building Streamlit apps or APIs
# Medium team (5-15), versioned releases → Git Flow
#   - Structured, parallel development
#   - Perfect for: ML platforms, data products
#   - DS teams shipping model versions
# Large team (15+), fast iteration → Trunk-based
#   - Requires strong CI/CD
#   - Feature flags replace branches
#   - Perfect for: mature ML platforms
# For MOST data science teams, GitHub Flow is the best choice:
echo "Recommended: GitHub Flow"
echo " ✓ Simple to learn"
echo " ✓ Works well with PRs"
echo " ✓ main is always deployable"
echo " ✓ Low overhead for small teams"
echo " ✓ GitHub/GitLab built around this workflow"
Automate testing, linting, model validation, and deployment with GitHub Actions. CI/CD ensures your code always works and your models are validated before deployment.
# .github/workflows/ci.yml
# Lint + test on every push/PR to main.
name: CI Pipeline
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest flake8
      - name: Lint
        run: flake8 src/ --max-line-length=120
      - name: Run tests
        run: pytest tests/ -v --tb=short
# .github/workflows/model-validation.yml
# Validate model metrics whenever model code or configs change in a PR.
name: Model Validation
on:
  pull_request:
    paths:
      - 'src/model/**'
      - 'configs/**'
jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Train on test data
        run: python src/model/train.py --config configs/test.yaml
      - name: Validate metrics
        # A quoted heredoc avoids shell-quoting problems: the Python code
        # can freely mix single and double quotes (a double-quoted
        # `python -c "..."` would be terminated by the inner `"` chars).
        run: |
          python - <<'PY'
          import json
          with open('results/metrics.json') as f:
              m = json.load(f)
          assert m['accuracy'] > 0.85, f"Accuracy {m['accuracy']} below threshold"
          assert m['f1'] > 0.80, f"F1 {m['f1']} below threshold"
          print('Model validation passed!')
          print(f"Accuracy: {m['accuracy']:.4f}")
          print(f"F1: {m['f1']:.4f}")
          PY
# .github/workflows/daily-data.yml
# Scheduled data collection that commits results back to the repo.
name: Daily Data Collection
on:
  schedule:
    - cron: '0 6 * * *' # Run at 6 AM UTC daily
  workflow_dispatch: # Allow manual trigger
# The default GITHUB_TOKEN is read-only in many repos; the push step
# below needs write access to repository contents.
permissions:
  contents: write
jobs:
  collect:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Collect data
        env:
          API_KEY: ${{ secrets.DATA_API_KEY }}
        run: python scripts/collect_daily_data.py
      - name: Commit new data
        run: |
          git config user.name "GitHub Actions"
          git config user.email "actions@github.com"
          git add data/daily/
          # Only commit when something was actually staged
          git diff --staged --quiet || git commit -m "data: daily collection $(date +%Y-%m-%d)"
          git push
# .github/workflows/ml-pipeline.yml
# Two-stage pipeline: code quality first, then model validation,
# with metrics posted as a PR comment.
name: ML Pipeline
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
# github-script's createComment call needs write access — TODO confirm
# your repo's default GITHUB_TOKEN permissions.
permissions:
  contents: read
  pull-requests: write
  issues: write
jobs:
  quality:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: '3.11' }
      - run: pip install -r requirements.txt
      - run: black --check src/
      - run: pytest tests/ -v
  model-check:
    needs: quality # only runs after `quality` succeeds
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: '3.11' }
      - run: pip install -r requirements.txt
      - run: python scripts/validate_model.py
      - name: Comment metrics on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const metrics = fs.readFileSync('results/metrics.txt', 'utf8');
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: '## Model Metrics\n```\n' + metrics + '\n```'
            })
A collection of Git best practices, common pitfalls, and quick reference commands for daily use.
# A typical working day with Git
# Morning: start fresh
git switch main
git pull origin main
git switch -c feature/todays-work
# During the day: small, frequent commits
git add src/module.py
git commit -m "feat: add data validation step"
# End of day: push your work
git push -u origin feature/todays-work
# Ready for review: create PR
gh pr create --title "feat: add data validation"
# After PR is merged: clean up
git switch main
git pull
git branch -d feature/todays-work
# Mistake: committed to wrong branch
git switch correct-branch
git cherry-pick abc1234 # bring commit to correct branch
git switch wrong-branch
git reset HEAD~1 # remove from wrong branch
# Mistake: typo in last commit message
# (only amend commits that have NOT been pushed yet)
git commit --amend -m "correct message here"
# Mistake: forgot to add a file to last commit
git add forgotten_file.py
git commit --amend --no-edit
# Mistake: committed sensitive data
git reset --soft HEAD~1 # undo commit, keep changes staged
# remove the sensitive data, then recommit
# NOTE: if the secret was already pushed, rotate it and rewrite history
# (e.g. git filter-repo) — a follow-up commit is not enough
# Mistake: git pull created ugly merge commits
git reset --hard HEAD~1 # undo the merge
git pull --rebase # replay your commits on top
# Mistake: need to undo a pushed commit (safely)
git revert abc123 # creates a new "undo" commit
git push
# Set these aliases via git config --global (they are saved to the [alias] section of ~/.gitconfig):
git config --global alias.st "status --short"
git config --global alias.co "checkout"
git config --global alias.sw "switch"
git config --global alias.br "branch"
git config --global alias.ci "commit"
git config --global alias.lg "log --oneline --graph --all -20"
git config --global alias.last "log -1 HEAD --stat"
git config --global alias.unstage "restore --staged"
git config --global alias.undo "reset HEAD~1"
git config --global alias.amend "commit --amend --no-edit"
git config --global alias.wip "commit -am 'WIP: work in progress'"
# Usage:
# git st        → short status
# git lg        → pretty graph log
# git last      → last commit details
# git unstage f → unstage a file
# git undo      → undo last commit (keep changes)
# git amend     → add to last commit
# git wip      → quick WIP commit
# ┌──────────────────────┬─────────────────────────────────────┐
# │ Action               │ Command                             │
# ├──────────────────────┼─────────────────────────────────────┤
# │ Init repo            │ git init                            │
# │ Clone repo           │ git clone <url>                     │
# │ Stage files          │ git add <files>                     │
# │ Commit               │ git commit -m "msg"                 │
# │ Push                 │ git push origin <branch>            │
# │ Pull                 │ git pull origin <branch>            │
# │ Status               │ git status                          │
# │ Log                  │ git log --oneline                   │
# │ Diff                 │ git diff                            │
# │ Create branch        │ git switch -c <name>                │
# │ Switch branch        │ git switch <name>                   │
# │ Merge                │ git merge <branch>                  │
# │ Rebase               │ git rebase <branch>                 │
# │ Stash                │ git stash push -m "msg"             │
# │ Tag                  │ git tag -a v1.0 -m "msg"            │
# │ Cherry-pick          │ git cherry-pick <hash>              │
# │ Bisect               │ git bisect start                    │
# │ Undo last commit     │ git reset HEAD~1                    │
# │ Revert commit        │ git revert <hash>                   │
# │ Recover lost work    │ git reflog                          │
# └──────────────────────┴─────────────────────────────────────┘
# Closing reminder for the reader.
printf '%s\n' "Bookmark this cheat sheet!"