Introduction
Feature stores bridge the gap between data engineering and ML training/serving. This guide compares three solutions: Feast (open-source), Tecton (managed), and Redis (as feature store).
Understanding Feature Stores
Feature stores provide:
- Offline Store: Historical features for training
- Online Store: Low-latency features for inference
- Feature Registry: Centralized feature definitions
- Point-in-Time Correctness: No data leakage
# The four capabilities a feature store provides, keyed by component name.
feature_store = dict(
    offline="Historical features (training)",
    online="Real-time features (serving)",
    registry="Feature definitions",
    compute="Feature transformations",
)
Feast: Open-Source Feature Store
Feast provides an open-source feature store with a unified API.
Feast Setup
# Install Feast
pip install feast

# Initialize a Feast project (scaffolds feature_store.yaml,
# example feature definitions, and sample parquet data)
feast init my_feature_repo
cd my_feature_repo
Feast Feature Definition
# features/features.py
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource
from feast.types import Float32, Int64

# Batch data source backing the features (parquet file with an event
# timestamp column for point-in-time joins).
driver_stats_source = FileSource(
    name="driver_stats_source",
    path="data/driver_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Entities are declared as objects (not bare strings) in current Feast.
driver = Entity(name="driver_id", join_keys=["driver_id"])

# Feature view: a named group of features tied to an entity. The TTL bounds
# how far back online reads will look for a value.
driver_stats_fv = FeatureView(
    name="driver_hourly_stats",
    entities=[driver],
    ttl=timedelta(days=1),
    schema=[
        Field(name="total_rides", dtype=Float32),
        Field(name="total_distance", dtype=Float32),
        Field(name="average_rating", dtype=Float32),
        Field(name="rating_count", dtype=Int64),
    ],
    online=True,
    source=driver_stats_source,
)

# Register the definitions: run `feast apply` from the repo root, or
# programmatically via FeatureStore.apply — the `feast` module itself has
# no top-level apply() function:
#   fs = FeatureStore(repo_path=".")
#   fs.apply([driver, driver_stats_fv])
Feast Offline Features (Training)
import feast

# Point the SDK at the feature repo created by `feast init`.
fs = feast.FeatureStore(repo_path="./my_feature_repo")

# Point-in-time join: for each (driver_id, event_timestamp) row in the
# entity dataframe, fetch the feature values as of that timestamp — no
# leakage of future data into training.
# Fix: the keyword is `features` in current Feast; `feature_refs` is the
# long-deprecated name.
training_df = fs.get_historical_features(
    entity_df=training_data,  # must contain driver_id and event_timestamp
    features=[
        "driver_hourly_stats:total_rides",
        "driver_hourly_stats:total_distance",
        "driver_hourly_stats:average_rating",
    ],
).to_df()

print(training_df.head())
Feast Online Features (Serving)
# Get online features for inference (reads the low-latency online store).
# Fix: the keyword is `features` in current Feast; `feature_refs` is the
# long-deprecated name.
online_features = fs.get_online_features(
    features=[
        "driver_hourly_stats:total_rides",
        "driver_hourly_stats:average_rating",
    ],
    entity_rows=[
        {"driver_id": 1001},
        {"driver_id": 1002},
    ],
).to_dict()

print(online_features)
# {'driver_id': [1001, 1002],
#  'total_rides': [245, 189],
#  'average_rating': [4.8, 4.6]}
Feast with Spark
# Spark computation for features
# NOTE(review): SparkSource ships in Feast's contrib package
# (feast.infra.offline_stores.contrib.spark_offline_store.spark_source);
# the bare `from feast import SparkSource` may fail depending on the
# installed Feast version — verify.
from feast import SparkSource

driver_stats_source = SparkSource(
    name="driver_stats_spark",
    table="driver_stats",                          # Spark table to read
    timestamp_field="event_timestamp",             # event-time column
    created_timestamp_column="created_timestamp"   # row-creation column
)

# Feast handles Spark execution
# Features computed on-demand during training
Feast Streaming
# Kafka streaming source
# NOTE(review): in current Feast the constructor is spelled
# `kafka_bootstrap_servers=` and also requires a `message_format=`
# (e.g. AvroFormat/JsonFormat) — confirm against the pinned version.
from feast import KafkaSource

stream_source = KafkaSource(
    name="driver_trip_stream",
    bootstrap_servers="localhost:9092",
    topic="driver_trips",
    timestamp_field="event_timestamp",
    batch_source=driver_stats_source   # batch source used for backfills
)
Tecton: Managed Feature Platform
Tecton provides a fully managed feature platform with real-time capabilities.
Tecton Setup
# Install Tecton SDK
pip install tecton

# Configure Tecton
import tecton
from tecton import batch_feature_view, Aggregation, SlidingWindow

# Connect to a Tecton workspace (cluster URL and credentials are
# configured separately, e.g. via `tecton login`)
workspace = tecton.get_workspace("prod")
Tecton Feature Views
from tecton import batch_feature_view, Aggregation, MaterializationConfig
from datetime import datetime, timedelta

# Daily batch feature view, materialized to both the online and offline
# stores and backfilled from feature_start_time.
# NOTE(review): `transactions_source` and `user_entity` must be defined
# elsewhere in the feature repo — not shown here.
@batch_feature_view(
    sources=[transactions_source],
    entities=[user_entity],
    mode="spark_sql",          # the decorated function returns a Spark SQL string
    online=True,               # materialize for low-latency serving
    offline=True,              # materialize for training exports
    feature_start_time=datetime(2024, 1, 1),
    batch_schedule=timedelta(days=1),   # recompute once per day
    ttl=timedelta(days=30)
)
def user_transaction_stats(transactions_source):
    """Expose raw per-transaction fields as features via Spark SQL."""
    return f"""
SELECT
user_id,
transaction_amount,
transaction_timestamp
FROM {transactions_source}
"""
# Pre-computed windowed aggregations over transaction_amount; Tecton
# maintains these incrementally so serving reads are O(1).
# NOTE(review): this uses SlidingWindow (imported in the setup snippet
# above, not in this one); newer Tecton SDKs spell the window as
# `time_window=timedelta(...)` on Aggregation — verify against the
# installed version.
@batch_feature_view(
    sources=[transactions_source],
    entities=[user_entity],
    mode="spark_sql",
    online=True,
    aggregations=[
        Aggregation(column="transaction_amount",
                    function="sum",
                    window=SlidingWindow(days=7)),    # 7-day total spend
        Aggregation(column="transaction_amount",
                    function="count",
                    window=SlidingWindow(days=1)),    # daily transaction count
        Aggregation(column="transaction_amount",
                    function="avg",
                    window=SlidingWindow(days=30))    # 30-day average spend
    ]
)
def user_transaction_aggregates(transactions_source):
    """Pre-computed aggregations."""
    # Body intentionally empty: with `aggregations`, Tecton derives the
    # feature values from the source rows itself.
    pass
Tecton Real-Time Features
from tecton import stream_feature_view, Aggregation, TumblingWindow

# Streaming feature view: aggregates clickstream events continuously and
# refreshes online values every aggregation_interval.
# NOTE(review): `clickstream_source` and `user_entity` are defined
# elsewhere in the repo; newer Tecton SDKs spell windows as
# `time_window=timedelta(...)` — verify against the installed version.
@stream_feature_view(
    source=clickstream_source,
    entities=[user_entity],
    mode="spark_structured_streaming",
    online=True,
    aggregation_interval=timedelta(minutes=1),   # online refresh cadence
    features=[
        Aggregation(column="page_views",
                    function="count",
                    window=TumblingWindow(minutes=5)),    # views, last 5 min
        Aggregation(column="session_duration",
                    function="sum",
                    window=TumblingWindow(minutes=10))    # time on site, 10 min
    ]
)
def user_realtime_stats(clickstream_source):
    """Real-time user activity aggregates computed from the clickstream."""
    pass
Tecton Inference
# Get features for online inference
from tecton import FeatureService

# A FeatureService bundles feature views into one retrieval endpoint so
# training and serving read the exact same feature set.
fraud_detection_service = FeatureService(
    name="fraud_detection",
    features=[
        user_transaction_stats,
        user_transaction_aggregates,
        user_realtime_stats
    ]
)

# Query features for one entity row.
# NOTE(review): current SDKs fetch via the service object, e.g.
# `fraud_detection_service.get_online_features(...)` — confirm that a
# top-level `tecton.get_online_features` exists in the pinned version.
features = tecton.get_online_features(
    feature_service=fraud_detection_service,
    entity_rows=[
        {"user_id": "user_12345"}
    ]
)

# Feed the retrieved feature vector to the model
prediction = model.predict(features.to_pandas())
Redis as Feature Store
Redis provides a fast online feature store, often used alongside other solutions.
Redis Setup
import redis
import json
from datetime import timedelta

# Connect to Redis. decode_responses=True makes GET return str instead of
# bytes, which the JSON (de)serialization helpers below expect.
redis_client = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
    decode_responses=True
)
# Feature (de)serialization: feature dicts travel as JSON strings in Redis.
def serialize_features(features: dict) -> str:
    """Render a feature mapping as a JSON string."""
    return json.dumps(features)


def deserialize_features(data: str) -> dict:
    """Parse a JSON string back into a feature mapping."""
    return json.loads(data)
Redis Feature Storage
# Write one entity's features with an expiry (seconds); SETEX makes the
# TTL and the value atomic.
def store_features(entity_id: str, features: dict, ttl: int = 3600):
    """Cache a feature dict for `entity_id`, expiring after `ttl` seconds."""
    redis_client.setex(f"features:{entity_id}", ttl, serialize_features(features))


# Read one entity's features; None when the key is absent or expired.
def get_features(entity_id: str) -> dict:
    """Return the cached feature dict for `entity_id`, or None if missing."""
    payload = redis_client.get(f"features:{entity_id}")
    return deserialize_features(payload) if payload else None
# Store: cache one driver's features for an hour (ttl in seconds)
store_features(
    "driver_1001",
    {
        "total_rides": 245,
        "total_distance": 1523.5,
        "average_rating": 4.8,
        "rating_count": 230
    },
    ttl=3600
)

# Retrieve: returns the dict above until the TTL expires, then None
features = get_features("driver_1001")
print(features)
Redis with RedisAI
# Using RedisAI for model serving next to the features
import redisai as ri

client = ri.Client(host='localhost', port=6379)

# Store a model in Redis under the key 'fraud_model'.
# NOTE(review): modelset/modelrun are the legacy command names — RedisAI
# >= 1.2 renames them modelstore/modelexecute; and `torch_model` must be
# the TorchScript model's serialized bytes, not a live nn.Module. Confirm
# against the installed redisai client version.
client.modelset(
    'fraud_model',
    'torch',   # backend
    'cpu',     # device
    torch_model
)

# Run inference: reads tensors stored under the input keys, writes the
# result under the output key
response = client.modelrun(
    'fraud_model',
    ['input_tensor'],
    ['output_tensor']
)

# Fetch the output tensor produced by the run
result = client.tensorget('output_tensor')
Redis Time-Series Features
# Time-series features with RedisTimeSeries.
# Fix: redis-py exposes the TimeSeries commands through Redis(...).ts(),
# not a hand-constructed TSCommands instance.
from redis import Redis

ts = Redis().ts()

# Append samples; "*" asks the server to assign the timestamp.
ts.add("user:123:transactions", "*", 50.0)
ts.add("user:123:transactions", "*", 75.0)

# Raw range query over [from_time, to_time] (ms since epoch)
data = ts.range("user:123:transactions",
                from_time=0,
                to_time=1000)

# Server-side aggregation. Fix: redis-py spells the kwargs
# `aggregation_type` (not `aggregation`) and bucketing requires a
# bucket size in milliseconds.
avg_value = ts.range("user:123:transactions",
                     from_time=0,
                     to_time=1000,
                     aggregation_type="avg",
                     bucket_size_msec=100)
Redis Bloom Filters
# Feature discovery with Bloom filters (requires the RedisBloom module
# to be loaded on the server)
bf = redis_client.bf()

# Add known entities; a Bloom filter can report false positives but
# never false negatives
bf.add("known_users", "user_123")
bf.add("known_users", "user_456")

# Membership check: truthy means "possibly present", falsy means
# "definitely not present"
is_known = bf.exists("known_users", "user_789")
Feature Store Architecture Patterns
Pattern 1: Feast + Redis
# Use Feast for the offline store, Redis for online serving.
# feature_store.yaml — Feast's config file name (a complete file also
# needs top-level `project`, `registry`, and `provider` keys).
offline_store:
  type: redshift
  cluster_id: my-cluster   # store options sit directly under the store key (no nested `config:`)
online_store:
  type: redis
  connection_string: "localhost:6379"   # Feast's Redis store takes host:port as one string
Pattern 2: Tecton Full Platform
# Use Tecton for everything
# Tecton handles:
# - Materialization (batch + streaming)
# - Online serving
# - Offline exports
# - Feature computation
Pattern 3: Custom with Redis
# Custom compute + Redis storage
def compute_features(user_id):
    """Compute a user's features on demand, cache them in Redis, return them."""
    # Assemble the feature vector from the individual compute helpers.
    computed = {
        name: helper(user_id)
        for name, helper in (
            ("account_age", get_account_age),
            ("total_spend", get_total_spend),
            ("last_login_hours", get_hours_since_login),
        )
    }
    # Cache for subsequent low-latency reads.
    store_features(user_id, computed)
    return computed
Comparison
| Feature | Feast | Tecton | Redis |
|---|---|---|---|
| Type | Open-source | Managed | Data Store |
| Offline | Yes | Yes | Via custom |
| Online | Yes | Yes | Yes (fast) |
| Real-time | Limited | Excellent | Via streams |
| Cost | Free | $$$ | $ |
| Managed | Self-hosted | Full | Partial |
When to Use Each
Feast
- Open-source preference
- Self-hosted requirement
- Basic feature store needs
Tecton
- Enterprise features
- Real-time requirements
- Managed infrastructure
Redis
- Fast online serving
- Custom architectures
- Low-latency requirements
Bad Practices
Bad Practice 1: Feature Drift
# Bad: Training-serving skew
# Training: all features computed from batch
# Serving: some from real-time, some from batch
# Good: Use same feature definitions
# Both training and serving use feature store
Bad Practice 2: No Versioning
# Bad: Features change without tracking
# Can't reproduce old models
# Good: Version features
feature.version = "2.1"
Bad Practice 3: Missing TTL
# Bad: Features never expire
# Stale features hurt predictions
# Good: Set appropriate TTL
feature.ttl = timedelta(days=1)
Good Practices
Feature Naming
# Canonical feature names, grouped by entity; each value encodes the
# unit or window in its suffix (_days, _30d, _ts).
FEATURE_NAMES = {
    "user": dict(
        age="user_age_days",
        spend="user_total_spend_30d",
        activity="user_last_active_ts",
    ),
}
Monitoring
# Track feature freshness: alert on anything older than its TTL.
from datetime import datetime, timezone

def monitor_freshness(feature_views=None, alert_fn=None, now=None):
    """Emit an alert for every feature whose last update is older than its TTL.

    Args:
        feature_views: iterable of objects with ``.name``, ``.last_updated``
            (an aware datetime) and ``.ttl`` (a timedelta). Defaults to the
            module-level ``features`` collection.
        alert_fn: callable invoked with the alert message. Defaults to the
            module-level ``alert`` hook.
        now: current-time override (useful for tests); defaults to UTC now.
    """
    # Bug fix: the stdlib `time` module has no `now()` — freshness must be
    # computed with datetime arithmetic against an aware "current" time.
    views = features if feature_views is None else feature_views
    notify = alert if alert_fn is None else alert_fn
    current = datetime.now(timezone.utc) if now is None else now
    for feature in views:
        age = current - feature.last_updated
        if age > feature.ttl:
            notify(f"Feature {feature.name} is stale!")
External Resources
- Feast Documentation
- Tecton Documentation
- Redis Documentation
- Feast GitHub
- Feature Store Architecture
- Tecton Real-time Features
- Redis for ML Features
Comments