Reproducible Research Practices: Building Trustworthy Scientific Code
Reproducibility is fundamental to scientific research. This guide covers practical patterns for building reproducible research code in Python, ensuring your work can be verified and extended by others.
Project Structure
Recommended Directory Layout
my_research_project/
├── README.md              # Project overview
├── LICENSE                # License file
├── .gitignore             # Git ignore rules
├── environment.yml        # Conda environment
├── requirements.txt       # Pip requirements
├── setup.py               # Package setup
├── data/
│   ├── raw/               # Original data (read-only)
│   ├── processed/         # Processed data
│   └── external/          # External data sources
├── notebooks/
│   ├── 01_exploration.ipynb
│   ├── 02_analysis.ipynb
│   └── 03_results.ipynb
├── src/
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   └── loader.py
│   ├── analysis/
│   │   ├── __init__.py
│   │   └── methods.py
│   └── visualization/
│       ├── __init__.py
│       └── plots.py
├── tests/
│   ├── __init__.py
│   ├── test_data.py
│   └── test_analysis.py
├── results/
│   ├── figures/
│   └── tables/
└── docs/
    ├── index.md
    └── methods.md
Environment Management
Conda Environment
# environment.yml
name: my-research
channels:
- conda-forge
- defaults
dependencies:
- python=3.9
- numpy=1.21
- pandas=1.3
- scipy=1.7
- matplotlib=3.4
- jupyter=1.0
- jupyterlab=3.0
- pytest=6.2
- pip
- pip:
- scikit-learn==0.24
- seaborn==0.11
Creating and Using Environment
import subprocess
import os
def setup_environment(env_name='my-research'):
    """Create the conda environment described by environment.yml.

    Args:
        env_name: Name to give the new environment.

    Raises:
        subprocess.CalledProcessError: if conda exits non-zero.
    """
    create_cmd = ['conda', 'env', 'create', '-f', 'environment.yml', '-n', env_name]
    subprocess.run(create_cmd, check=True)
    print(f"Environment '{env_name}' created")
    print(f"Activate with: conda activate {env_name}")
def export_environment(env_name, output_file='environment.yml'):
    """Export the spec of a conda environment to a YAML file.

    Args:
        env_name: Name of the environment to export.
        output_file: Destination YAML path.

    Raises:
        subprocess.CalledProcessError: if conda exits non-zero.
    """
    export_cmd = ['conda', 'env', 'export', '-n', env_name, '-f', output_file]
    subprocess.run(export_cmd, check=True)
    print(f"Environment exported to {output_file}")
# Usage
# setup_environment()
# export_environment('my-research')
Version Control
Git Configuration
import subprocess
from pathlib import Path
class GitManager:
    """Manage a git repository for a research project.

    Thin wrapper over the ``git`` CLI via subprocess; all commands run with
    ``check=True`` so failures raise CalledProcessError.
    """

    def __init__(self, repo_path='.'):
        # Root of the repository; all git commands run with cwd set here.
        self.repo_path = Path(repo_path)

    def initialize_repo(self):
        """Run ``git init`` in the repository directory."""
        subprocess.run(['git', 'init'], cwd=self.repo_path, check=True)
        print("Git repository initialized")

    def create_gitignore(self):
        """Write a .gitignore tailored to research projects.

        Fixed: the original triple-quoted string opened with a newline, so the
        written file began with a spurious blank line; content now starts at
        the '# Data' header. Overwrites any existing .gitignore.
        """
        gitignore_content = """\
# Data
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep
# Results
results/*
!results/.gitkeep
# Jupyter
.ipynb_checkpoints/
*.ipynb_checkpoints
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Environment
venv/
env/
ENV/
.venv
# IDE
.vscode/
.idea/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Temporary
*.tmp
*.log
"""
        gitignore_path = self.repo_path / '.gitignore'
        gitignore_path.write_text(gitignore_content)
        # Fixed: was an f-string with no placeholders.
        print("Created .gitignore")

    def commit_with_message(self, message):
        """Stage everything and commit with *message*.

        Raises:
            subprocess.CalledProcessError: e.g. when there is nothing to commit.
        """
        subprocess.run(['git', 'add', '.'], cwd=self.repo_path, check=True)
        subprocess.run(
            ['git', 'commit', '-m', message],
            cwd=self.repo_path,
            check=True
        )
        print(f"Committed: {message}")
# Usage
# git_manager = GitManager()
# git_manager.initialize_repo()
# git_manager.create_gitignore()
Data Management
Data Versioning
import hashlib
import json
from pathlib import Path
from datetime import datetime
class DataVersioning:
    """Track data file versions via SHA-256 checksums stored in a JSON manifest.

    The manifest (``<data_dir>/manifest.json``) maps file paths (as given at
    registration time) to checksum, size, timestamp, and description.
    """

    def __init__(self, data_dir='data'):
        self.data_dir = Path(data_dir)
        # Manifest lives alongside the data it describes.
        self.manifest_file = self.data_dir / 'manifest.json'

    def compute_checksum(self, file_path):
        """Return the SHA-256 hex digest of *file_path*, read in 4 KiB chunks."""
        sha256_hash = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for byte_block in iter(lambda: f.read(4096), b''):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def register_data(self, file_path, description=''):
        """Record *file_path*'s checksum, size, and timestamp in the manifest.

        Raises:
            FileNotFoundError: if *file_path* does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        checksum = self.compute_checksum(file_path)
        # Load the existing manifest, or start a fresh one on first use.
        if self.manifest_file.exists():
            manifest = json.loads(self.manifest_file.read_text())
        else:
            manifest = {}
        manifest[str(file_path)] = {
            'checksum': checksum,
            'size': file_path.stat().st_size,
            'timestamp': datetime.now().isoformat(),
            'description': description
        }
        self.manifest_file.write_text(json.dumps(manifest, indent=2))
        print(f"Registered: {file_path} (checksum: {checksum[:8]}...)")

    def verify_data(self, file_path):
        """Return True if *file_path*'s current checksum matches the manifest.

        Raises:
            FileNotFoundError: if the manifest does not exist.
            ValueError: if *file_path* was never registered.
        """
        file_path = Path(file_path)
        if not self.manifest_file.exists():
            raise FileNotFoundError("Manifest file not found")
        manifest = json.loads(self.manifest_file.read_text())
        if str(file_path) not in manifest:
            raise ValueError(f"File not in manifest: {file_path}")
        expected_checksum = manifest[str(file_path)]['checksum']
        actual_checksum = self.compute_checksum(file_path)
        if expected_checksum == actual_checksum:
            # Fixed: the original status glyphs were mojibake ("โ").
            print(f"OK   Data verified: {file_path}")
            return True
        print(f"FAIL Data corrupted: {file_path}")
        return False
# Usage
versioning = DataVersioning()
# versioning.register_data('data/raw/dataset.csv', 'Original dataset')
# versioning.verify_data('data/raw/dataset.csv')
Data Loading with Reproducibility
import pandas as pd
import numpy as np
from pathlib import Path
class ReproducibleDataLoader:
    """Load and save datasets with reproducibility aids.

    Seeds NumPy's global RNG at construction and pairs each processed CSV with
    a JSON metadata sidecar named ``<filename>.metadata.json``.
    """

    def __init__(self, data_dir='data', seed=42):
        self.data_dir = Path(data_dir)
        self.seed = seed
        # Seed the global NumPy RNG so downstream sampling is reproducible.
        np.random.seed(seed)

    @staticmethod
    def _metadata_path(directory, filename):
        """Return the metadata sidecar path for *filename* inside *directory*."""
        # Fixed: the original used the literal f"(unknown).metadata.json"
        # (a garbled template placeholder), so every dataset shared — and
        # clobbered — the same sidecar file.
        return directory / f"{filename}.metadata.json"

    def load_raw_data(self, filename):
        """Load a raw CSV from ``<data_dir>/raw``.

        Raises:
            FileNotFoundError: if the file is absent.
        """
        file_path = self.data_dir / 'raw' / filename
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")
        print(f"Loading: {file_path}")
        return pd.read_csv(file_path)

    def save_processed_data(self, df, filename, metadata=None):
        """Save *df* to ``<data_dir>/processed`` plus an optional metadata sidecar."""
        import json  # local import: this snippet does not import json at module level
        output_dir = self.data_dir / 'processed'
        output_dir.mkdir(parents=True, exist_ok=True)
        file_path = output_dir / filename
        df.to_csv(file_path, index=False)
        if metadata:
            metadata_file = self._metadata_path(output_dir, filename)
            metadata_file.write_text(json.dumps(metadata, indent=2))
        print(f"Saved: {file_path}")

    def load_processed_data(self, filename):
        """Load a processed CSV and its metadata sidecar.

        Returns:
            (DataFrame, dict | None): metadata is None when no sidecar exists.
        """
        import json
        file_path = self.data_dir / 'processed' / filename
        df = pd.read_csv(file_path)
        metadata_file = self._metadata_path(self.data_dir / 'processed', filename)
        metadata = json.loads(metadata_file.read_text()) if metadata_file.exists() else None
        return df, metadata
# Usage
loader = ReproducibleDataLoader()
# df = loader.load_raw_data('dataset.csv')
# loader.save_processed_data(df, 'processed_dataset.csv', metadata={'rows': len(df)})
Documentation
README Template
# Research Project Title
## Overview
Brief description of the research project and objectives.
## Installation
### Prerequisites
- Python 3.9+
- Conda or pip
### Setup
```bash
# Create environment
conda env create -f environment.yml
conda activate my-research
# Install package
pip install -e .
```
## Data
### Raw Data
- Source: [URL or description]
- Format: CSV
- Size: [size]
- License: [license]
### Processing
```bash
python src/data/process.py
```
## Usage
### Running Analysis
```bash
jupyter lab notebooks/01_exploration.ipynb
```
### Running Tests
```bash
pytest tests/
```
## Results
[Summary of key findings]
## Citation
If you use this code, please cite:
```
[Citation format]
```
## License
[License type]
## Authors
[Author names and affiliations]
Testing
Unit Tests for Research Code
import pytest
import numpy as np
import pandas as pd
from src.analysis.methods import calculate_statistics
class TestAnalysisMethods:
    """Unit tests for src.analysis.methods.calculate_statistics."""

    def test_calculate_statistics(self):
        """Summary statistics of 1..5 match hand-computed values."""
        sample = np.array([1, 2, 3, 4, 5])
        summary = calculate_statistics(sample)
        assert summary['mean'] == 3.0
        assert summary['std'] > 0
        assert summary['min'] == 1
        assert summary['max'] == 5

    def test_calculate_statistics_empty(self):
        """An empty array must raise ValueError rather than return NaNs."""
        with pytest.raises(ValueError):
            calculate_statistics(np.array([]))

    def test_calculate_statistics_single_value(self):
        """A single observation has zero spread and a mean equal to itself."""
        summary = calculate_statistics(np.array([5]))
        assert summary['mean'] == 5.0
        assert summary['std'] == 0.0
# Run tests
# pytest tests/test_analysis.py
Jupyter Notebooks
Best Practices
# notebook_template.ipynb
# Skeleton for a reproducible analysis notebook; each "Cell N" comment
# below marks the boundary of one notebook cell.
# Cell 1: Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.data.loader import ReproducibleDataLoader
from src.analysis.methods import analyze_data
# Set random seed for reproducibility
np.random.seed(42)
# Cell 2: Load data
loader = ReproducibleDataLoader()
df = loader.load_raw_data('dataset.csv')
print(f"Data shape: {df.shape}")
print(df.head())
# Cell 3: Exploratory analysis
print(df.describe())
# Cell 4: Analysis
# NOTE(review): analyze_data is a project helper; the plotting below assumes
# its result supports ['x'] and ['y'] lookups — confirm in src/analysis/methods.py.
results = analyze_data(df)
print(results)
# Cell 5: Visualization
plt.figure(figsize=(10, 6))
plt.plot(results['x'], results['y'])
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Analysis Results')
# Save at publication resolution before plt.show() (show clears the figure).
plt.savefig('results/figures/analysis.png', dpi=300)
plt.show()
# Cell 6: Save results
loader.save_processed_data(
    results,
    'analysis_results.csv',
    metadata={'method': 'analysis_v1'}
)
Computational Reproducibility
Recording Computational Environment
import sys
import platform
import json
from datetime import datetime
def record_environment(output_file='environment_record.json'):
    """Record the computational environment to a JSON file.

    Captures timestamp, Python version, platform, processor, and the versions
    of key scientific packages. Fixed: the original hard-imported numpy,
    pandas, scipy, sklearn, and matplotlib, so a single missing package
    crashed the recorder — exactly the situation it should document instead.

    Args:
        output_file: Path of the JSON file to write (new, backward-compatible
            parameter; defaults to the original 'environment_record.json').

    Returns:
        dict: the recorded environment information.
    """
    from importlib import metadata  # stdlib (3.8+); avoids importing the packages

    env_info = {
        'timestamp': datetime.now().isoformat(),
        'python_version': sys.version,
        'platform': platform.platform(),
        'processor': platform.processor(),
        'packages': {}
    }
    # Record package versions; None marks a package absent from this environment.
    for pkg in ('numpy', 'pandas', 'scipy', 'scikit-learn', 'matplotlib'):
        try:
            env_info['packages'][pkg] = metadata.version(pkg)
        except metadata.PackageNotFoundError:
            env_info['packages'][pkg] = None
    with open(output_file, 'w') as f:
        json.dump(env_info, f, indent=2)
    print("Environment recorded")
    return env_info
# Usage
env = record_environment()
Common Pitfalls and Best Practices
❌ Bad: Hardcoded Paths
# DON'T: Use hardcoded paths
df = pd.read_csv('/Users/john/Documents/data.csv')
✅ Good: Relative Paths
# DO: Use relative paths
from pathlib import Path
data_dir = Path(__file__).parent / 'data'
df = pd.read_csv(data_dir / 'raw' / 'data.csv')
❌ Bad: No Random Seed
# DON'T: Run without setting seed
model.fit(X_train, y_train)
✅ Good: Set Random Seed
# DO: Set seed for reproducibility
np.random.seed(42)
model.fit(X_train, y_train)
❌ Bad: Modifying Raw Data
# DON'T: Modify raw data in place
df = pd.read_csv('data/raw/data.csv')
df['new_column'] = df['old_column'] * 2
✅ Good: Create Processed Data
# DO: Create separate processed data
raw_df = pd.read_csv('data/raw/data.csv')
processed_df = raw_df.copy()
processed_df['new_column'] = processed_df['old_column'] * 2
processed_df.to_csv('data/processed/data.csv')
Summary
Reproducible research requires:
- Organized project structure for clarity
- Environment management for consistency
- Version control for tracking changes
- Data versioning for integrity
- Documentation for understanding
- Testing for validation
- Computational environment recording for reproducibility
- Best practices for code quality
These patterns ensure your research is trustworthy, verifiable, and extensible by the scientific community.
Comments