Skip to main content
โšก Calmops

Reproducible Research Practices: Building Trustworthy Scientific Code

Reproducible Research Practices: Building Trustworthy Scientific Code

Reproducibility is fundamental to scientific research. This guide covers practical patterns for building reproducible research code in Python, ensuring your work can be verified and extended by others.

Project Structure

my_research_project/
├── README.md                 # Project overview
├── LICENSE                   # License file
├── .gitignore               # Git ignore rules
├── environment.yml          # Conda environment
├── requirements.txt         # Pip requirements
├── setup.py                 # Package setup
├── data/
│   ├── raw/                 # Original data (read-only)
│   ├── processed/           # Processed data
│   └── external/            # External data sources
├── notebooks/
│   ├── 01_exploration.ipynb
│   ├── 02_analysis.ipynb
│   └── 03_results.ipynb
├── src/
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   └── loader.py
│   ├── analysis/
│   │   ├── __init__.py
│   │   └── methods.py
│   └── visualization/
│       ├── __init__.py
│       └── plots.py
├── tests/
│   ├── __init__.py
│   ├── test_data.py
│   └── test_analysis.py
├── results/
│   ├── figures/
│   └── tables/
└── docs/
    ├── index.md
    └── methods.md

Environment Management

Conda Environment

# environment.yml
name: my-research
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.9
  - numpy=1.21
  - pandas=1.3
  - scipy=1.7
  - matplotlib=3.4
  - jupyter=1.0
  - jupyterlab=3.0
  - pytest=6.2
  - pip
  - pip:
    - scikit-learn==0.24
    - seaborn==0.11

Creating and Using Environment

import subprocess
import os

def setup_environment(env_name='my-research'):
    """Create the conda environment described by environment.yml.

    Parameters
    ----------
    env_name : str
        Name to give the newly created conda environment.
    """
    create_cmd = ['conda', 'env', 'create', '-f', 'environment.yml', '-n', env_name]
    # check=True raises CalledProcessError if conda exits non-zero.
    subprocess.run(create_cmd, check=True)

    print(f"Environment '{env_name}' created")
    print(f"Activate with: conda activate {env_name}")

def export_environment(env_name, output_file='environment.yml'):
    """Export the package list of *env_name* to *output_file* via conda.

    Parameters
    ----------
    env_name : str
        Name of the conda environment to export.
    output_file : str
        Destination YAML file (default 'environment.yml').
    """
    export_cmd = ['conda', 'env', 'export', '-n', env_name, '-f', output_file]
    subprocess.run(export_cmd, check=True)

    print(f"Environment exported to {output_file}")

# Usage
# setup_environment()
# export_environment('my-research')

Version Control

Git Configuration

import subprocess
from pathlib import Path

class GitManager:
    """Thin wrapper around the git CLI for a research repository."""

    def __init__(self, repo_path='.'):
        self.repo_path = Path(repo_path)

    def _git(self, *args):
        """Run a git subcommand inside the repository directory."""
        subprocess.run(['git', *args], cwd=self.repo_path, check=True)

    def initialize_repo(self):
        """Initialize git repository."""
        self._git('init')
        print("Git repository initialized")

    def create_gitignore(self):
        """Write a .gitignore suited to a research project layout."""
        gitignore_content = """
# Data
data/raw/*
data/processed/*
!data/raw/.gitkeep
!data/processed/.gitkeep

# Results
results/*
!results/.gitkeep

# Jupyter
.ipynb_checkpoints/
*.ipynb_checkpoints

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Environment
venv/
env/
ENV/
.venv

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Temporary
*.tmp
*.log
"""
        (self.repo_path / '.gitignore').write_text(gitignore_content)
        print("Created .gitignore")

    def commit_with_message(self, message):
        """Stage everything and commit with the given message."""
        self._git('add', '.')
        self._git('commit', '-m', message)
        print(f"Committed: {message}")

# Usage
# git_manager = GitManager()
# git_manager.initialize_repo()
# git_manager.create_gitignore()

Data Management

Data Versioning

import hashlib
import json
from pathlib import Path
from datetime import datetime

class DataVersioning:
    """Track data files by SHA-256 checksum in a JSON manifest.

    The manifest lives at ``<data_dir>/manifest.json`` and maps each
    registered file path to its checksum, size, registration timestamp,
    and free-form description.
    """

    def __init__(self, data_dir='data'):
        self.data_dir = Path(data_dir)
        self.manifest_file = self.data_dir / 'manifest.json'

    def compute_checksum(self, file_path):
        """Return the SHA-256 hex digest of *file_path*, read in 4 KiB chunks."""
        sha256_hash = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for byte_block in iter(lambda: f.read(4096), b''):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def _load_manifest(self):
        """Return the manifest dict, or an empty dict if none exists yet."""
        if self.manifest_file.exists():
            return json.loads(self.manifest_file.read_text())
        return {}

    def register_data(self, file_path, description=''):
        """Register *file_path* in the manifest with checksum and metadata.

        Raises
        ------
        FileNotFoundError
            If *file_path* does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        checksum = self.compute_checksum(file_path)
        manifest = self._load_manifest()
        manifest[str(file_path)] = {
            'checksum': checksum,
            'size': file_path.stat().st_size,
            'timestamp': datetime.now().isoformat(),
            'description': description
        }

        # Robustness fix: create the data directory if it is missing,
        # otherwise writing the manifest fails on a fresh checkout.
        self.manifest_file.parent.mkdir(parents=True, exist_ok=True)
        self.manifest_file.write_text(json.dumps(manifest, indent=2))
        print(f"Registered: {file_path} (checksum: {checksum[:8]}...)")

    def verify_data(self, file_path):
        """Return True if *file_path* still matches its registered checksum.

        Raises
        ------
        FileNotFoundError
            If no manifest exists yet.
        ValueError
            If *file_path* was never registered.
        """
        file_path = Path(file_path)
        if not self.manifest_file.exists():
            raise FileNotFoundError("Manifest file not found")

        manifest = json.loads(self.manifest_file.read_text())
        if str(file_path) not in manifest:
            raise ValueError(f"File not in manifest: {file_path}")

        expected_checksum = manifest[str(file_path)]['checksum']
        actual_checksum = self.compute_checksum(file_path)

        # Encoding fix: the original output strings were mojibake
        # ("โœ“"/"โœ—") from a bad UTF-8 round-trip; restored to ✓/✗.
        if expected_checksum == actual_checksum:
            print(f"✓ Data verified: {file_path}")
            return True
        print(f"✗ Data corrupted: {file_path}")
        return False

# Usage
versioning = DataVersioning()
# versioning.register_data('data/raw/dataset.csv', 'Original dataset')
# versioning.verify_data('data/raw/dataset.csv')

Data Loading with Reproducibility

import json
from pathlib import Path

import numpy as np
import pandas as pd

class ReproducibleDataLoader:
    """Load and save project data with reproducibility conventions.

    Raw data is read from ``<data_dir>/raw``; processed data and its JSON
    metadata sidecar are written to ``<data_dir>/processed``.

    Parameters
    ----------
    data_dir : str or Path
        Root data directory containing ``raw/`` and ``processed/``.
    seed : int
        Seed applied to NumPy's global RNG for reproducible sampling.
    """

    def __init__(self, data_dir='data', seed=42):
        self.data_dir = Path(data_dir)
        self.seed = seed
        # NOTE: this seeds NumPy's *global* RNG, so it affects every
        # np.random caller in the process, not just this loader.
        np.random.seed(seed)

    def load_raw_data(self, filename):
        """Load a CSV file from the raw data directory.

        Raises
        ------
        FileNotFoundError
            If ``<data_dir>/raw/<filename>`` does not exist.
        """
        file_path = self.data_dir / 'raw' / filename

        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        print(f"Loading: {file_path}")
        return pd.read_csv(file_path)

    def save_processed_data(self, df, filename, metadata=None):
        """Save *df* as CSV under processed/, with an optional metadata sidecar."""
        output_dir = self.data_dir / 'processed'
        output_dir.mkdir(parents=True, exist_ok=True)

        file_path = output_dir / filename
        df.to_csv(file_path, index=False)

        # Bug fix: the sidecar is named after the data file it describes
        # (the original used a fixed garbled name, so metadata for every
        # dataset collided in one file).
        if metadata:
            metadata_file = output_dir / f"{filename}.metadata.json"
            metadata_file.write_text(json.dumps(metadata, indent=2))

        print(f"Saved: {file_path}")

    def load_processed_data(self, filename):
        """Load a processed CSV and its metadata (None when no sidecar exists).

        Returns
        -------
        tuple[pandas.DataFrame, dict | None]
        """
        file_path = self.data_dir / 'processed' / filename

        df = pd.read_csv(file_path)

        # Bug fix: look for the per-file sidecar written by save_processed_data.
        metadata_file = self.data_dir / 'processed' / f"{filename}.metadata.json"
        metadata = None
        if metadata_file.exists():
            metadata = json.loads(metadata_file.read_text())

        return df, metadata

# Usage
loader = ReproducibleDataLoader()
# df = loader.load_raw_data('dataset.csv')
# loader.save_processed_data(df, 'processed_dataset.csv', metadata={'rows': len(df)})

Documentation

README Template

# Research Project Title

## Overview
Brief description of the research project and objectives.

## Installation

### Prerequisites
- Python 3.9+
- Conda or pip

### Setup
```bash
# Create environment
conda env create -f environment.yml
conda activate my-research

# Install package
pip install -e .
```

## Data

### Raw Data
- Source: [URL or description]
- Format: CSV
- Size: [size]
- License: [license]

### Processing
```bash
python src/data/process.py
```

## Usage

### Running Analysis
```bash
jupyter lab notebooks/01_exploration.ipynb
```

### Running Tests
```bash
pytest tests/
```

## Results
[Summary of key findings]

## Citation
If you use this code, please cite:
```
[Citation format]
```

## License
[License type]

## Authors
[Author names and affiliations]

Testing

Unit Tests for Research Code

import pytest
import numpy as np
import pandas as pd
from src.analysis.methods import calculate_statistics

class TestAnalysisMethods:
    """Unit tests for the summary-statistics helper in src.analysis.methods."""

    def test_calculate_statistics(self):
        """Known small sample yields the expected summary values."""
        sample = np.array([1, 2, 3, 4, 5])
        result = calculate_statistics(sample)

        expected = {'mean': 3.0, 'min': 1, 'max': 5}
        for key, value in expected.items():
            assert result[key] == value
        assert result['std'] > 0

    def test_calculate_statistics_empty(self):
        """An empty array must raise ValueError."""
        with pytest.raises(ValueError):
            calculate_statistics(np.array([]))

    def test_calculate_statistics_single_value(self):
        """A single observation has zero spread."""
        result = calculate_statistics(np.array([5]))

        assert result['mean'] == 5.0
        assert result['std'] == 0.0

# Run tests
# pytest tests/test_analysis.py

Jupyter Notebooks

Best Practices

# notebook_template.ipynb
# Template for a linear, re-runnable notebook: one step per cell
# (setup -> load -> explore -> analyze -> visualize -> save), so
# "Restart & Run All" reproduces every output.

# Cell 1: Setup and imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.data.loader import ReproducibleDataLoader
from src.analysis.methods import analyze_data

# Set random seed for reproducibility
np.random.seed(42)

# Cell 2: Load data
loader = ReproducibleDataLoader()
df = loader.load_raw_data('dataset.csv')
print(f"Data shape: {df.shape}")
print(df.head())

# Cell 3: Exploratory analysis
print(df.describe())

# Cell 4: Analysis
# NOTE(review): the return of analyze_data is indexed with 'x'/'y' keys
# (Cell 5) AND passed to save_processed_data as a table (Cell 6) —
# presumably a DataFrame; confirm against src.analysis.methods.
results = analyze_data(df)
print(results)

# Cell 5: Visualization
plt.figure(figsize=(10, 6))
plt.plot(results['x'], results['y'])
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Analysis Results')
# Assumes results/figures/ already exists (see project structure above).
plt.savefig('results/figures/analysis.png', dpi=300)
plt.show()

# Cell 6: Save results
loader.save_processed_data(
    results,
    'analysis_results.csv',
    metadata={'method': 'analysis_v1'}
)

Computational Reproducibility

Recording Computational Environment

import sys
import platform
import json
from datetime import datetime

def record_environment(output_file='environment_record.json'):
    """Record the computational environment to a JSON file.

    Captures a timestamp, the Python version, platform details, and the
    installed versions of the key scientific packages.

    Parameters
    ----------
    output_file : str
        Path of the JSON file to write (default 'environment_record.json',
        matching the original behavior).

    Returns
    -------
    dict
        The recorded environment information.
    """
    from importlib import metadata  # stdlib since Python 3.8

    env_info = {
        'timestamp': datetime.now().isoformat(),
        'python_version': sys.version,
        'platform': platform.platform(),
        'processor': platform.processor(),
        'packages': {}
    }

    # Robustness fix: query installed versions from package metadata
    # instead of importing each (heavy) package — a missing package is
    # recorded as such rather than crashing with ImportError.
    for package in ('numpy', 'pandas', 'scipy', 'scikit-learn', 'matplotlib'):
        try:
            env_info['packages'][package] = metadata.version(package)
        except metadata.PackageNotFoundError:
            env_info['packages'][package] = 'not installed'

    # Save to file
    with open(output_file, 'w') as f:
        json.dump(env_info, f, indent=2)

    print("Environment recorded")
    return env_info

# Usage
env = record_environment()

Common Pitfalls and Best Practices

โŒ Bad: Hardcoded Paths

# DON'T: Use hardcoded paths
df = pd.read_csv('/Users/john/Documents/data.csv')

✅ Good: Relative Paths

# DO: Use relative paths
from pathlib import Path
data_dir = Path(__file__).parent / 'data'
df = pd.read_csv(data_dir / 'raw' / 'data.csv')

โŒ Bad: No Random Seed

# DON'T: Run without setting seed
model.fit(X_train, y_train)

✅ Good: Set Random Seed

# DO: Set seed for reproducibility
np.random.seed(42)
model.fit(X_train, y_train)

โŒ Bad: Modifying Raw Data

# DON'T: Modify raw data in place
df = pd.read_csv('data/raw/data.csv')
df['new_column'] = df['old_column'] * 2

✅ Good: Create Processed Data

# DO: Create separate processed data
raw_df = pd.read_csv('data/raw/data.csv')
processed_df = raw_df.copy()
processed_df['new_column'] = processed_df['old_column'] * 2
processed_df.to_csv('data/processed/data.csv')

Summary

Reproducible research requires:

  1. Organized project structure for clarity
  2. Environment management for consistency
  3. Version control for tracking changes
  4. Data versioning for integrity
  5. Documentation for understanding
  6. Testing for validation
  7. Computational environment recording for reproducibility
  8. Best practices for code quality

These patterns ensure your research is trustworthy, verifiable, and extensible by the scientific community.

Comments