Introduction
AI in finance spans two primary domains: fraud detection (identifying malicious transactions in real time) and algorithmic trading (executing market strategies automatically). Both require processing high-velocity data streams with low latency, but their ML approaches differ sharply. Fraud detection pairs supervised models trained on known fraud patterns with unsupervised anomaly detection, which catches novel attack patterns without labeled examples. Algorithmic trading uses historical backtesting with supervised or reinforcement learning to optimize execution strategies.
This guide covers an Isolation Forest + autoencoder pipeline for real-time fraud detection with SHAP explainability, a Backtrader-based algorithmic trading strategy with moving average crossover backtesting, and feature engineering techniques for transaction monitoring.
Fraud Detection Pipeline
Modern fraud detection systems combine supervised models (trained on known fraud patterns) with unsupervised anomaly detection (catching novel fraud types). The pipeline below processes transactions in real time and assigns each one a fraud score from 0 to 100:
flowchart LR
T[Transaction Event<br/>Kafka / API] --> FE[Feature Engineering]
FE --> M1[Supervised Model<br/>XGBoost / CatBoost]
FE --> M2[Unsupervised Model<br/>Isolation Forest]
FE --> M3[Autoencoder<br/>Reconstruction Error]
M1 --> F[Fusion Layer<br/>Weighted ensemble]
M2 --> F
M3 --> F
F --> Score[Fraud Score 0-100]
Score -->|> 90| Block[Block Transaction]
Score -->|60-90| Review[Manual Review Queue]
Score -->|< 60| Allow[Approve]
Isolation Forest for Anomaly Detection
Isolation Forest isolates anomalies by randomly partitioning data. Anomalies require fewer splits to isolate than normal points, producing shorter path lengths. This makes it effective for high-dimensional transaction data:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
# Feature engineering for transaction data
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create model features from raw transaction data.

    Features capture velocity, deviation from user norms,
    and time-based patterns — all common fraud indicators.

    Args:
        df: Transactions with ``user_id``, ``amount`` and a datetime
            ``timestamp`` column. Rows may arrive in any order and with any
            index; ``df`` itself is not modified.

    Returns:
        A copy of ``df`` (same rows, row order and index) with these columns
        added:

        * ``tx_1h_count`` - the user's transactions in the trailing hour,
          counting the current one.
        * ``amount_zscore`` - deviation of the amount from the user's mean,
          in units of the user's standard deviation; 0 when the user has a
          single transaction.
        * ``time_since_last`` - seconds since the user's previous
          transaction, or ``-1.0`` when there is none.
        * ``is_weekend`` / ``is_night`` - 0/1 flags for Saturday/Sunday and
          for hours 0-5.
    """
    features = df.copy()
    row_positions = pd.RangeIndex(len(df))

    # History features need each user's rows in chronological order. Work on
    # a positionally indexed, time-sorted view so the results can be mapped
    # back to the caller's row order even if df's index has duplicates.
    work = df[['user_id', 'amount', 'timestamp']].reset_index(drop=True)
    work = work.sort_values('timestamp', kind='stable')
    by_user = work.groupby('user_id', sort=False)

    def _trailing_hour_count(stamps: pd.Series) -> pd.Series:
        # Offset windows such as '1h' only work on a datetime-like index, so
        # roll over a series of ones indexed by the timestamps themselves.
        ones = pd.Series(1.0, index=pd.DatetimeIndex(stamps))
        return pd.Series(ones.rolling('1h').count().to_numpy(), index=stamps.index)

    def _in_row_order(values: pd.Series):
        # Undo the chronological sort: position i is row i of the input.
        return values.reindex(row_positions).to_numpy()

    # Transaction velocity: count in last hour per user
    features['tx_1h_count'] = _in_row_order(
        by_user['timestamp'].transform(_trailing_hour_count)
    )

    # Deviation from user's average amount. The mean/std cover all of the
    # user's rows in df. std is NaN for a user with one transaction; treat it
    # as zero spread so the z-score is 0 instead of NaN.
    user_avg = by_user['amount'].transform('mean')
    user_std = by_user['amount'].transform('std').fillna(0.0)
    features['amount_zscore'] = _in_row_order(
        (work['amount'] - user_avg) / (user_std + 1e-8)
    )

    # Time since last transaction (seconds). A user's first transaction has
    # no predecessor; -1.0 marks that case because real gaps are never
    # negative and downstream scalers/models should not receive NaN.
    gap_seconds = by_user['timestamp'].transform(
        lambda x: x.diff().dt.total_seconds()
    )
    features['time_since_last'] = _in_row_order(gap_seconds.fillna(-1.0))

    # Weekend transaction flag
    features['is_weekend'] = df['timestamp'].dt.dayofweek.isin([5, 6]).astype(int)

    # Night time flag (midnight - 6am)
    features['is_night'] = df['timestamp'].dt.hour.isin(range(0, 6)).astype(int)

    return features
# Train Isolation Forest
# NOTE(review): `df` is not defined in this snippet; it presumably already
# holds the engineered feature columns listed below — confirm before running.
feature_cols = [
    'amount',
    'tx_1h_count',
    'amount_zscore',
    'time_since_last',
    'is_weekend',
    'is_night',
]

# Standardize the selected columns, then fit the forest on the scaled matrix.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[feature_cols])

# contamination=0.01 encodes an expected fraud rate of about 1%.
model = IsolationForest(n_estimators=200, contamination=0.01, random_state=42, n_jobs=-1)
model.fit(X_scaled)
# Score new transactions
def score_transaction(transaction: dict) -> dict:
    """Score a single transaction for fraud risk.

    The transaction is wrapped in a one-row DataFrame, run through
    ``engineer_features``, scaled with the fitted ``scaler`` and scored by
    the fitted Isolation Forest ``model``. Because the frame holds only this
    one row, the per-user history features are computed from that row alone.

    Args:
        transaction: Raw transaction fields as accepted by
            ``engineer_features``.

    Returns:
        A dict with:

        * ``fraud_score`` - float in [0, 1]; higher means more anomalous.
        * ``is_suspicious`` - True when the forest labels the row an outlier.
        * ``threshold_exceeded`` - True when ``fraud_score`` is above 0.8.
    """
    features = pd.DataFrame([transaction])
    features_eng = engineer_features(features)
    X = scaler.transform(features_eng[feature_cols])

    # Isolation Forest returns -1 for anomalies; decision_function is lower
    # (more negative) the more abnormal the sample is.
    anomaly_score = model.decision_function(X)[0]
    is_fraud = model.predict(X)[0] == -1

    # 1 - (anomaly_score + 0.5) rises as the sample looks more anomalous, but
    # decision_function is not confined to [-0.5, 0.5], so clamp the result
    # to keep the documented 0-1 range. Compute it once and reuse it.
    fraud_score = float(min(1.0, max(0.0, 1 - (anomaly_score + 0.5))))

    return {
        'fraud_score': fraud_score,  # Normalized to 0-1
        'is_suspicious': bool(is_fraud),
        'threshold_exceeded': fraud_score > 0.8
    }
Autoencoder for Reconstruction-Based Anomaly Detection
Autoencoders learn to reconstruct normal transactions. Fraudulent transactions produce high reconstruction error because they don’t match learned patterns:
import torch
import torch.nn as nn
class FraudAutoencoder(nn.Module):
    """Symmetric fully connected autoencoder for transaction anomaly detection.

    Meant to be fitted on normal (non-fraudulent) transactions only, so that
    inputs unlike the training data reconstruct poorly and yield a high
    reconstruction error.
    """

    def __init__(self, input_dim: int, encoding_dim: int = 16):
        """Build the input_dim -> 64 -> 32 -> encoding_dim -> 32 -> 64 -> input_dim stack."""
        super().__init__()

        # Compress the input down to the bottleneck representation.
        encoder_layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, encoding_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Mirror of the encoder: expand the bottleneck back to input size.
        decoder_layers = [
            nn.Linear(encoding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        latent = self.encoder(x)
        return self.decoder(latent)

    @torch.no_grad()
    def reconstruction_error(self, x: torch.Tensor) -> float:
        """Return the mean squared error between ``x`` and its reconstruction.

        The mean is taken over every element of ``x``, and no gradients are
        tracked while computing it.
        """
        rebuilt = self.forward(x)
        return nn.functional.mse_loss(rebuilt, x).item()
# Training: train on known-good transactions only
# The instance is bound to `autoencoder`, not `model`, so it does not overwrite
# the module-level Isolation Forest `model` that score_transaction() reads.
autoencoder = FraudAutoencoder(input_dim=len(feature_cols))
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)
criterion = nn.MSELoss()

autoencoder.train()
for epoch in range(50):
    for batch in normal_transactions_loader:
        optimizer.zero_grad()
        output = autoencoder(batch)
        # The target is the input itself: the network learns to reproduce it.
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()

# Inference: high reconstruction error = likely fraud
autoencoder.eval()
error = autoencoder.reconstruction_error(transaction_tensor)
# If error > threshold (e.g., 95th percentile of training errors), flag as fraud
Explainability with SHAP
import shap
# Explain why a transaction was flagged
# NOTE(review): `xgb_model` and `transaction_features` are not defined in this
# snippet — presumably the trained supervised model and the feature row(s) to
# explain; confirm against the surrounding pipeline.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(transaction_features)

# Visualize top contributing features for the first entry of shap_values
shap.initjs()
baseline = explainer.expected_value
first_contributions = shap_values[0]
shap.force_plot(baseline, first_contributions, transaction_features, matplotlib=True)
# Output: "Transaction flagged due to: amount_zscore=3.2, is_night=1, tx_1h_count=15"
Algorithmic Trading with Backtrader
import backtrader as bt
class MovingAverageCrossover(bt.Strategy):
    """Simple moving average crossover strategy.

    Buy when fast MA crosses above slow MA (golden cross).
    Exit when fast MA crosses below slow MA (death cross).

    Params:
        fast: Period of the fast simple moving average (default 20).
        slow: Period of the slow simple moving average (default 50).
    """

    params = (('fast', 20), ('slow', 50))

    def __init__(self):
        """Create both moving averages and the crossover signal on the close."""
        self.fast_ma = bt.indicators.SMA(self.data.close, period=self.params.fast)
        self.slow_ma = bt.indicators.SMA(self.data.close, period=self.params.slow)
        # Positive when fast_ma crosses above slow_ma, negative when it
        # crosses below, zero otherwise.
        self.crossover = bt.indicators.CrossOver(self.fast_ma, self.slow_ma)

    def next(self):
        """Enter on a golden cross when flat; exit on a death cross when invested."""
        if not self.position:
            if self.crossover > 0:  # Golden cross
                self.buy()
        elif self.crossover < 0:  # Death cross
            # close() flattens the whole open position; sell() would only
            # match the entry size when the sizer happens to return the same
            # stake it did on entry.
            self.close()
# Run backtest
# NOTE(review): `historical_prices` is not defined in this snippet; it is
# passed to PandasData as `dataname` — confirm it is the price DataFrame.
cerebro = bt.Cerebro()
cerebro.addstrategy(MovingAverageCrossover)

data = bt.feeds.PandasData(dataname=historical_prices)
cerebro.adddata(data)

# Broker setup: starting cash and commission.
cerebro.broker.setcash(100000.0)
cerebro.broker.setcommission(commission=0.001)

# Report the portfolio value before and after the run, then chart the result.
starting_value = cerebro.broker.getvalue()
print(f'Starting Portfolio Value: {starting_value:.2f}')
cerebro.run()
final_value = cerebro.broker.getvalue()
print(f'Final Portfolio Value: {final_value:.2f}')
cerebro.plot()
Resources
- Isolation Forest (sklearn) — Anomaly detection reference
- PyTorch Autoencoder Tutorial — Reconstruction-based fraud detection
- SHAP Model Explainability — Feature importance for fraud decisions
- Backtrader Documentation — Algorithmic trading backtesting
- IBM AI Fraud Detection — Industry practices and deployment patterns
Comments