Ethical AI & Bias Detection: Building Fair and Responsible AI Systems
As AI systems increasingly impact critical decisions such as hiring, lending, and criminal justice, ensuring fairness and detecting bias becomes essential. This guide covers practical techniques for building ethical AI systems.
Understanding Bias in AI
Types of Bias
"""
Common types of bias in AI systems:
1. Data Bias: Training data doesn't represent population
2. Algorithmic Bias: Model architecture favors certain groups
3. Measurement Bias: Metrics don't capture true outcomes
4. Aggregation Bias: Treating diverse groups as homogeneous
5. Evaluation Bias: Test set doesn't reflect real-world distribution
"""
# Example: Detecting data bias
import pandas as pd
import numpy as np
def analyze_data_distribution(df, protected_attribute, target):
"""Analyze data distribution across protected attributes."""
distribution = df.groupby(protected_attribute)[target].agg([
'count', 'mean', 'std'
])
print(f"Distribution of {target} by {protected_attribute}:")
print(distribution)
# Calculate representation ratio
total = len(df)
for group in df[protected_attribute].unique():
count = len(df[df[protected_attribute] == group])
percentage = (count / total) * 100
print(f"{group}: {percentage:.1f}%")
return distribution
# Usage
df = pd.DataFrame({
'gender': ['M', 'F', 'M', 'F', 'M', 'F'] * 100,
'hired': [1, 0, 1, 0, 1, 1] * 100
})
analyze_data_distribution(df, 'gender', 'hired')
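Raw percentages show representation, but it can also help to test whether the outcome is statistically associated with the protected attribute. A minimal sketch using SciPy's chi-squared test of independence (the 0.05 cutoff below is a conventional choice, not a rule):

from scipy.stats import chi2_contingency

def test_outcome_association(df, protected_attribute, target):
    """Chi-squared test of independence between a protected attribute
    and the outcome; a small p-value suggests they are associated."""
    contingency = pd.crosstab(df[protected_attribute], df[target])
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    print(f"Chi-squared: {chi2:.4f}, p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Outcome is statistically associated with the protected attribute")
    return p_value

# Usage (df from the example above)
test_outcome_association(df, 'gender', 'hired')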
Fairness Metrics
Demographic Parity
def demographic_parity(y_pred, protected_attr):
    """
    Check whether the positive prediction rate is equal across groups.
    Demographic parity: P(Y_pred=1|A=0) = P(Y_pred=1|A=1)
    """
    groups = np.unique(protected_attr)
    positive_rates = {}
    for group in groups:
        mask = protected_attr == group
        positive_rates[group] = np.mean(y_pred[mask] == 1)
    print("Positive prediction rates by group:")
    for group, rate in positive_rates.items():
        print(f"Group {group}: {rate:.4f}")
    # Disparate impact ratio (min/max); values below 0.8 fail the
    # common "four-fifths rule" heuristic
    rates = list(positive_rates.values())
    disparity = min(rates) / max(rates) if max(rates) > 0 else 0.0
    print(f"Disparity ratio: {disparity:.4f}")
    return positive_rates
# Usage
y_pred = np.array([1, 0, 1, 1, 0, 1] * 100)
protected = np.array([0, 1, 0, 1, 0, 1] * 100)
demographic_parity(y_pred, protected)
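With small groups, an observed gap can be plain noise. As a rough significance check, a two-proportion z-test can be run on the positive rates of two groups; this sketch assumes statsmodels is available (it is not used elsewhere in this guide):

from statsmodels.stats.proportion import proportions_ztest

def test_rate_difference(y_pred, protected_attr, group_a=0, group_b=1):
    """Two-proportion z-test on positive prediction rates for two groups."""
    mask_a = protected_attr == group_a
    mask_b = protected_attr == group_b
    counts = np.array([np.sum(y_pred[mask_a] == 1), np.sum(y_pred[mask_b] == 1)])
    nobs = np.array([mask_a.sum(), mask_b.sum()])
    stat, p_value = proportions_ztest(counts, nobs)
    print(f"z = {stat:.4f}, p-value = {p_value:.4f}")
    return p_value

# Usage
test_rate_difference(y_pred, protected)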
Equalized Odds
from sklearn.metrics import confusion_matrix
def equalized_odds(y_true, y_pred, protected_attr):
"""
Check if true positive rate and false positive rate are equal across groups.
Equalized odds: TPR(A=0) = TPR(A=1) and FPR(A=0) = FPR(A=1)
"""
groups = np.unique(protected_attr)
metrics = {}
for group in groups:
mask = protected_attr == group
y_true_group = y_true[mask]
y_pred_group = y_pred[mask]
tn, fp, fn, tp = confusion_matrix(y_true_group, y_pred_group, labels=[0, 1]).ravel()  # fixed label order guards against single-class groups
tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
metrics[group] = {'TPR': tpr, 'FPR': fpr}
print("Equalized Odds Metrics:")
for group, values in metrics.items():
print(f"Group {group}: TPR={values['TPR']:.4f}, FPR={values['FPR']:.4f}")
return metrics
# Usage
y_true = np.array([1, 0, 1, 1, 0, 1] * 100)
y_pred = np.array([1, 0, 1, 0, 0, 1] * 100)
protected = np.array([0, 1, 0, 1, 0, 1] * 100)
equalized_odds(y_true, y_pred, protected)
Calibration
def calibration_analysis(y_true, y_pred_proba, protected_attr, n_bins=10):
"""
Check if predicted probabilities match actual outcomes across groups.
"""
groups = np.unique(protected_attr)
for group in groups:
mask = protected_attr == group
y_true_group = y_true[mask]
y_pred_group = y_pred_proba[mask]
# Bin predictions
bins = np.linspace(0, 1, n_bins + 1)
bin_indices = np.clip(np.digitize(y_pred_group, bins), 1, n_bins)  # keep prob == 1.0 in the top bin
print(f"\nCalibration for Group {group}:")
print("Predicted Prob | Actual Positive Rate")
for bin_idx in range(1, n_bins + 1):
mask_bin = bin_indices == bin_idx
if mask_bin.sum() > 0:
pred_prob = bins[bin_idx - 1]
actual_rate = y_true_group[mask_bin].mean()
print(f"{pred_prob:.2f}-{bins[bin_idx]:.2f} | {actual_rate:.4f}")
# Usage
y_true = np.array([1, 0, 1, 1, 0, 1] * 100)
y_pred_proba = np.random.rand(600)
protected = np.array([0, 1, 0, 1, 0, 1] * 100)
calibration_analysis(y_true, y_pred_proba, protected)
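scikit-learn's calibration_curve does this binning for you; a per-group sketch using the same arrays:

from sklearn.calibration import calibration_curve

def calibration_by_group(y_true, y_pred_proba, protected_attr, n_bins=10):
    """Per-group reliability curves via sklearn's calibration_curve."""
    for group in np.unique(protected_attr):
        mask = protected_attr == group
        prob_true, prob_pred = calibration_curve(
            y_true[mask], y_pred_proba[mask], n_bins=n_bins
        )
        print(f"\nGroup {group} (mean predicted -> actual positive rate):")
        for pred, actual in zip(prob_pred, prob_true):
            print(f"{pred:.2f} -> {actual:.4f}")

# Usage
calibration_by_group(y_true, y_pred_proba, protected)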
Bias Detection Tools
Using Fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, false_positive_rate
from sklearn.metrics import accuracy_score
def analyze_fairness_with_fairlearn(y_true, y_pred, protected_attr):
"""Comprehensive fairness analysis using Fairlearn."""
# Create metric frame
metrics = {
'accuracy': accuracy_score,
'selection_rate': selection_rate,
'false_positive_rate': false_positive_rate
}
metric_frame = MetricFrame(
metrics=metrics,
y_true=y_true,
y_pred=y_pred,
sensitive_features=protected_attr
)
print("Overall metrics:")
print(metric_frame.overall)
print("\nMetrics by group:")
print(metric_frame.by_group)
print("\nDifference (max - min):")
print(metric_frame.difference())
print("\nRatio (min / max):")
print(metric_frame.ratio())
return metric_frame
# Usage
y_true = np.array([1, 0, 1, 1, 0, 1] * 100)
y_pred = np.array([1, 0, 1, 0, 0, 1] * 100)
protected = np.array([0, 1, 0, 1, 0, 1] * 100)
metric_frame = analyze_fairness_with_fairlearn(y_true, y_pred, protected)
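Fairlearn also ships single-number convenience metrics, which are handy for automated checks and dashboards; a short sketch on the same arrays:

from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

# Scalar summaries: 0.0 means the metric is equal across groups
dpd = demographic_parity_difference(y_true, y_pred, sensitive_features=protected)
eod = equalized_odds_difference(y_true, y_pred, sensitive_features=protected)
print(f"Demographic parity difference: {dpd:.4f}")
print(f"Equalized odds difference: {eod:.4f}")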
Using AI Fairness 360
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
def analyze_with_aif360(df, protected_attr, target, privileged_value=1):
    """Analyze fairness using IBM's AI Fairness 360."""
    # Create dataset
    dataset = BinaryLabelDataset(
        df=df,
        label_names=[target],
        protected_attribute_names=[protected_attr]
    )
    # Dataset-level metrics require explicit group definitions
    metric = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=[{protected_attr: 1 - privileged_value}],
        privileged_groups=[{protected_attr: privileged_value}]
    )
    print(f"Disparate Impact Ratio: {metric.disparate_impact():.4f}")
    print(f"Statistical Parity Difference: {metric.statistical_parity_difference():.4f}")
    # Equal opportunity difference compares TPRs, so it needs model
    # predictions and lives on aif360's ClassificationMetric instead
    return metric
# Usage
df = pd.DataFrame({
'gender': [0, 1, 0, 1, 0, 1] * 100,
'hired': [1, 0, 1, 0, 1, 1] * 100
})
metric = analyze_with_aif360(df, 'gender', 'hired')
Bias Mitigation Techniques
Pre-processing: Resampling
from sklearn.utils import resample
def balance_dataset(X, y, protected_attr):
"""Balance dataset to reduce representation bias."""
df = pd.DataFrame(X)
df['y'] = y
df['protected'] = protected_attr
# Separate by protected attribute
groups = df['protected'].unique()
balanced_dfs = []
# Find minimum group size
min_size = min(len(df[df['protected'] == g]) for g in groups)
for group in groups:
group_df = df[df['protected'] == group]
# Resample to minimum size
resampled = resample(
group_df,
n_samples=min_size,
random_state=42
)
balanced_dfs.append(resampled)
balanced_df = pd.concat(balanced_dfs)
print(f"Original distribution:")
print(df['protected'].value_counts())
print(f"\nBalanced distribution:")
print(balanced_df['protected'].value_counts())
return balanced_df
# Usage
X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, 1000)
protected = np.array([0, 1, 0, 1, 0, 1] * 167)[:1000]
balanced_df = balance_dataset(X, y, protected)
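Downsampling discards data. A common alternative is reweighing, which keeps every row but assigns instance weights that break the association between the protected attribute and the label. A sketch using AIF360's Reweighing, reusing the BinaryLabelDataset setup from the AIF360 example above:

from aif360.algorithms.preprocessing import Reweighing

def reweigh_dataset(df, protected_attr, target, privileged_value=1):
    """Compute instance weights that decouple the protected attribute
    from the label; pass them as sample_weight to an sklearn fit()."""
    dataset = BinaryLabelDataset(
        df=df,
        label_names=[target],
        protected_attribute_names=[protected_attr]
    )
    rw = Reweighing(
        unprivileged_groups=[{protected_attr: 1 - privileged_value}],
        privileged_groups=[{protected_attr: privileged_value}]
    )
    transformed = rw.fit_transform(dataset)
    return transformed.instance_weights

# Usage (numeric df from the AIF360 example)
weights = reweigh_dataset(df, 'gender', 'hired')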
In-processing: Fairness Constraints
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
def train_fair_model(X, y, protected_attr):
    """
    Train a baseline model and measure its accuracy gap across groups.
    This is a simplified diagnostic; see the Fairlearn sketch below
    for actual in-processing constraints.
    """
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train base model
model = LogisticRegression(random_state=42)
model.fit(X_scaled, y)
# Evaluate fairness
y_pred = model.predict(X_scaled)
# Measure the accuracy gap across groups
groups = np.unique(protected_attr)
group_accuracies = []
for group in groups:
mask = protected_attr == group
acc = (y_pred[mask] == y[mask]).mean()
group_accuracies.append(acc)
fairness_gap = max(group_accuracies) - min(group_accuracies)
print(f"Fairness gap: {fairness_gap:.4f}")
return model, scaler
# Usage
X = np.random.rand(1000, 10)
y = np.random.randint(0, 2, 1000)
protected = np.array([0, 1, 0, 1, 0, 1] * 167)[:1000]
model, scaler = train_fair_model(X, y, protected)
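For real in-processing constraints, Fairlearn's reductions API wraps any sklearn-style estimator and enforces a fairness constraint during training. A minimal sketch with a demographic parity constraint, using the X, y, and protected arrays from above:

from fairlearn.reductions import ExponentiatedGradient, DemographicParity

# Wrap a base estimator with a demographic parity constraint
mitigator = ExponentiatedGradient(
    estimator=LogisticRegression(random_state=42),
    constraints=DemographicParity()
)
mitigator.fit(X, y, sensitive_features=protected)
y_pred_fair = mitigator.predict(X)
print(f"Overall positive rate after mitigation: {y_pred_fair.mean():.4f}")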
Post-processing: Threshold Optimization
def optimize_thresholds_for_fairness(y_true, y_pred_proba, protected_attr):
    """
    Pick a per-group decision threshold so that every group's true
    positive rate lands close to a shared target (equal opportunity).
    """
    groups = np.unique(protected_attr)
    # Target: the overall TPR at the default 0.5 threshold
    overall_pred = (y_pred_proba >= 0.5).astype(int)
    target_tpr = np.sum((overall_pred == 1) & (y_true == 1)) / np.sum(y_true == 1)
    thresholds = {}
    for group in groups:
        mask = protected_attr == group
        y_true_group = y_true[mask]
        y_pred_group = y_pred_proba[mask]
        n_positives = np.sum(y_true_group == 1)
        best_threshold = 0.5
        best_gap = float('inf')
        for threshold in np.arange(0.05, 0.95, 0.05):
            y_pred_binary = (y_pred_group >= threshold).astype(int)
            tpr = (np.sum((y_pred_binary == 1) & (y_true_group == 1)) / n_positives
                   if n_positives > 0 else 0.0)
            # Keep the threshold whose group TPR best matches the target
            if abs(tpr - target_tpr) < best_gap:
                best_gap = abs(tpr - target_tpr)
                best_threshold = threshold
        thresholds[group] = best_threshold
    print("Optimized thresholds by group:")
    for group, threshold in thresholds.items():
        print(f"Group {group}: {threshold:.2f}")
    return thresholds
# Usage
y_true = np.array([1, 0, 1, 1, 0, 1] * 100)
y_pred_proba = np.random.rand(600)
protected = np.array([0, 1, 0, 1, 0, 1] * 100)
thresholds = optimize_thresholds_for_fairness(y_true, y_pred_proba, protected)
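Fairlearn's ThresholdOptimizer implements this kind of per-group thresholding rigorously (the Hardt et al. post-processing method). A sketch that wraps the already-fitted model and scaler from the in-processing example; the sensitive array is regenerated here so its length matches X's 1000 rows:

from fairlearn.postprocessing import ThresholdOptimizer

sensitive = np.array([0, 1, 0, 1, 0, 1] * 167)[:1000]  # matches X's length
X_scaled = scaler.transform(X)  # same preprocessing the model was trained on
postprocessor = ThresholdOptimizer(
    estimator=model,
    constraints="equalized_odds",
    prefit=True,
    predict_method="predict_proba"
)
postprocessor.fit(X_scaled, y, sensitive_features=sensitive)
y_pred_fair = postprocessor.predict(X_scaled, sensitive_features=sensitive, random_state=42)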
Model Interpretability
Feature Importance Analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
def analyze_feature_importance(X, y, feature_names):
"""Analyze which features drive model predictions."""
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
# Built-in feature importance
importances = model.feature_importances_
# Permutation importance
perm_importance = permutation_importance(model, X, y, random_state=42)
print("Feature Importance:")
for name, importance in zip(feature_names, importances):
print(f"{name}: {importance:.4f}")
print("\nPermutation Importance:")
for name, importance in zip(feature_names, perm_importance.importances_mean):
print(f"{name}: {importance:.4f}")
return importances
# Usage
X = np.random.rand(1000, 5)
y = np.random.randint(0, 2, 1000)
feature_names = ['age', 'income', 'credit_score', 'employment', 'education']
importances = analyze_feature_importance(X, y, feature_names)
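Feature importance alone won't surface proxy variables: features that look innocuous individually but jointly encode the protected attribute. A quick heuristic is to train a classifier to predict the protected attribute from the features; accuracy well above the majority-class base rate suggests proxies exist. A sketch (the protected array here is synthetic, for illustration):

from sklearn.model_selection import cross_val_score

def detect_proxies(X, protected_attr):
    """If features predict the protected attribute better than chance,
    they likely encode it indirectly."""
    clf = RandomForestClassifier(random_state=42)
    scores = cross_val_score(clf, X, protected_attr, cv=5)
    base_rate = max(np.mean(protected_attr == g) for g in np.unique(protected_attr))
    print(f"Protected-attribute prediction accuracy: {scores.mean():.4f}")
    print(f"Majority-class base rate: {base_rate:.4f}")
    return scores.mean()

# Usage
protected = np.random.randint(0, 2, 1000)
detect_proxies(X, protected)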
SHAP Values for Explainability
import shap
def explain_predictions_with_shap(model, X, feature_names):
"""Use SHAP for model-agnostic explanations."""
# Create explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Summary plot
print("SHAP Summary (feature importance):")
print(f"Mean absolute SHAP values:")
if isinstance(shap_values, list):
shap_values = shap_values[1] # For binary classification
mean_abs_shap = np.abs(shap_values).mean(axis=0)
for name, value in zip(feature_names, mean_abs_shap):
print(f"{name}: {value:.4f}")
return shap_values
# Usage
from sklearn.ensemble import RandomForestClassifier
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
feature_names = ['age', 'income', 'credit_score', 'employment', 'education']
shap_values = explain_predictions_with_shap(model, X, feature_names)
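SHAP values can also be split by protected group to check whether the model relies on different features for different people. A sketch (the protected array is synthetic and sized to match the 100-row X above):

def compare_shap_by_group(shap_values, protected_attr, feature_names):
    """Compare mean absolute SHAP value per feature across groups."""
    for group in np.unique(protected_attr):
        mask = protected_attr == group
        group_importance = np.abs(shap_values[mask]).mean(axis=0)
        print(f"\nGroup {group}:")
        for name, value in zip(feature_names, group_importance):
            print(f"{name}: {value:.4f}")

# Usage
protected = np.random.randint(0, 2, 100)
compare_shap_by_group(shap_values, protected, feature_names)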
Common Pitfalls and Best Practices
❌ Bad: Ignoring Bias
# DON'T: Deploy model without fairness analysis
model.fit(X_train, y_train)
model.predict(X_test)
✅ Good: Comprehensive Fairness Analysis
# DO: Analyze fairness before deployment
metric_frame = analyze_fairness_with_fairlearn(y_true, y_pred, protected_attr)
if metric_frame.ratio().min() < 0.8:
print("Warning: Fairness issues detected")
❌ Bad: Single Fairness Metric
# DON'T: Rely on one metric
if demographic_parity_ratio > 0.8:
print("Model is fair")
✅ Good: Multiple Fairness Metrics
# DO: Use multiple metrics
metrics = {
'demographic_parity': demographic_parity(y_pred, protected),
'equalized_odds': equalized_odds(y_true, y_pred, protected),
'calibration': calibration_analysis(y_true, y_pred_proba, protected)
}
Production Checklist
def fairness_audit_checklist(model, X, y, protected_attr, feature_names):
    """Complete fairness audit before deployment."""
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    # Assemble a frame with the columns the distribution check expects
    audit_df = pd.DataFrame({'protected': protected_attr, 'target': y})
    checks = {
        'data_representation': analyze_data_distribution(audit_df, 'protected', 'target'),
        'demographic_parity': demographic_parity(y_pred, protected_attr),
        'equalized_odds': equalized_odds(y, y_pred, protected_attr),
        'calibration': calibration_analysis(y, y_pred_proba, protected_attr),
        'feature_importance': analyze_feature_importance(X, y, feature_names)
    }
    print("✅ Fairness audit complete")
    return checks
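A usage sketch, reusing the fitted random forest, arrays, and feature names from the SHAP example above (all synthetic):

# Usage
protected = np.random.randint(0, 2, 100)  # synthetic, matches X's 100 rows
checks = fairness_audit_checklist(model, X, y, protected, feature_names)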
Summary
Building ethical AI systems requires:
- Understanding bias types and their sources
- Measuring fairness with appropriate metrics
- Detecting bias using tools like Fairlearn and AI Fairness 360
- Mitigating bias through pre-, in-, and post-processing
- Explaining predictions with SHAP and feature importance
- Continuous monitoring in production
- Stakeholder engagement in fairness decisions
Ethical AI isn’t a one-time check; it’s an ongoing commitment to building fair, transparent, and responsible systems.