Skip to main content
โšก Calmops

Feature Store: Feast vs Tecton vs Redis

Introduction

Feature stores bridge the gap between data engineering and ML training/serving. This guide compares three solutions: Feast (open-source), Tecton (managed), and Redis (as feature store).

Understanding Feature Stores

Feature stores provide:

  • Offline Store: Historical features for training
  • Online Store: Low-latency features for inference
  • Feature Registry: Centralized feature definitions
  • Point-in-Time Correctness: No data leakage
# Feature store concepts: the four capabilities a feature store provides.
feature_store = dict(
    offline="Historical features (training)",
    online="Real-time features (serving)",
    registry="Feature definitions",
    compute="Feature transformations",
)

Feast: Open-Source Feature Store

Feast provides an open-source feature store with a unified API.

Feast Setup

# Install Feast (open-source feature store SDK and CLI)
pip install feast

# Initialize Feast project
# `feast init` scaffolds feature_store.yaml plus example feature definitions.
feast init my_feature_repo
cd my_feature_repo

Feast Feature Definition

# features/features.py
"""Feast feature definitions for hourly driver statistics."""
from datetime import timedelta  # was used below but never imported

from feast import Feature, FeatureStore, FeatureView, FileSource
from feast.types import Float32, Int64  # dtypes were used but never imported

# Define data source: a parquet file of hourly driver stats.
# timestamp_field drives point-in-time joins; created_timestamp_column
# breaks ties between rows sharing the same event timestamp.
driver_stats_source = FileSource(
    name="driver_stats_source",
    path="data/driver_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created"
)

# Define feature view: groups features sharing one entity and one source.
# NOTE(review): newer Feast releases expect Entity objects plus a
# `schema=[Field(...)]` argument instead of string entity names and
# `features=[Feature(...)]` — confirm against the installed version.
driver_stats_fv = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=timedelta(days=1),  # online rows expire after one day
    features=[
        Feature(name="total_rides", dtype=Float32),
        Feature(name="total_distance", dtype=Float32),
        Feature(name="average_rating", dtype=Float32),
        Feature(name="rating_count", dtype=Int64)
    ],
    online=True,
    batch_source=driver_stats_source
)

# Register features. There is no module-level `feast.apply`; registration
# goes through a FeatureStore instance (or the `feast apply` CLI).
store = FeatureStore(repo_path=".")
store.apply([driver_stats_fv])

Feast Offline Features (Training)

import feast

# Initialize feature store from the repo's feature_store.yaml.
fs = feast.FeatureStore(repo_path="./my_feature_repo")

# Build the training dataframe with point-in-time-correct joins.
# For each entity row, Feast returns the latest feature value as of that
# row's event_timestamp, so no future data leaks into training.
# NOTE: the parameter is `features=` in current Feast; `feature_refs=`
# was the pre-0.10 name and has been removed.
training_df = fs.get_historical_features(
    entity_df=training_data,  # Contains driver_id and event_timestamp
    features=[
        "driver_hourly_stats:total_rides",
        "driver_hourly_stats:total_distance",
        "driver_hourly_stats:average_rating"
    ]
).to_df()

print(training_df.head())

Feast Online Features (Serving)

# Get online features for inference (low-latency read from the online store).
# NOTE: current Feast takes `features=`; `feature_refs=` is the removed
# pre-0.10 parameter name.
online_features = fs.get_online_features(
    features=[
        "driver_hourly_stats:total_rides",
        "driver_hourly_stats:average_rating"
    ],
    entity_rows=[
        {"driver_id": 1001},
        {"driver_id": 1002}
    ]
).to_dict()

print(online_features)
# {'driver_id': [1001, 1002],
#  'total_rides': [245, 189],
#  'average_rating': [4.8, 4.6]}

Feast with Spark

# Spark computation for features.
# NOTE(review): in current Feast, SparkSource lives in the contrib package
# (feast.infra.offline_stores.contrib.spark_offline_store.spark_source),
# not the top-level `feast` namespace — confirm against installed version.
from feast import SparkSource

# Source backed by a Spark table; the timestamp columns drive
# point-in-time joins exactly as with FileSource above.
driver_stats_source = SparkSource(
    name="driver_stats_spark",
    table="driver_stats",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp"
)

# Feast handles Spark execution
# Features computed on-demand during training

Feast Streaming

# Kafka streaming source
# NOTE(review): Feast's KafkaSource also requires a `message_format`
# (e.g. AvroFormat/JsonFormat) in recent releases — confirm before use.
from feast import KafkaSource

stream_source = KafkaSource(
    name="driver_trip_stream",
    bootstrap_servers="localhost:9092",
    topic="driver_trips",
    timestamp_field="event_timestamp",
    # Streaming views fall back to this batch source for backfills.
    batch_source=driver_stats_source
)

Tecton: Managed Feature Platform

Tecton provides a fully managed feature platform with real-time capabilities.

Tecton Setup

# Install Tecton SDK
pip install tecton

# Configure Tecton
import tecton
from tecton import batch_feature_view, Aggregation, SlidingWindow

# Connect to Tecton
# A workspace isolates feature definitions per environment (here: prod).
workspace = tecton.get_workspace("prod")

Tecton Feature Views

from tecton import batch_feature_view, Aggregation, MaterializationConfig
from datetime import datetime, timedelta

# Row-level batch feature view: a daily Spark SQL job over the
# transactions source, materialized to both online and offline stores.
@batch_feature_view(
    sources=[transactions_source],
    entities=[user_entity],
    mode="spark_sql",  # the function body returns a Spark SQL string
    online=True,       # materialize to the online store
    offline=True,      # and to the offline store
    feature_start_time=datetime(2024, 1, 1),  # earliest backfill point
    batch_schedule=timedelta(days=1),         # run once per day
    ttl=timedelta(days=30)
)
def user_transaction_stats(transactions_source):
    """Select raw per-transaction columns keyed by user_id.

    Tecton substitutes the registered source name into the returned
    SQL string via the f-string placeholder.
    """
    return f"""
        SELECT 
            user_id,
            transaction_amount,
            transaction_timestamp
        FROM {transactions_source}
    """

# Aggregate feature view: Tecton pre-computes rolling window aggregations
# instead of running a hand-written query per request.
# NOTE(review): SlidingWindow relies on the earlier
# `from tecton import ... SlidingWindow` import in this article.
@batch_feature_view(
    sources=[transactions_source],
    entities=[user_entity],
    mode="spark_sql",
    online=True,
    aggregations=[
        Aggregation(column="transaction_amount", 
                   function="sum", 
                   window=SlidingWindow(days=7)),   # 7-day total spend
        Aggregation(column="transaction_amount",
                   function="count",
                   window=SlidingWindow(days=1)),   # 1-day transaction count
        Aggregation(column="transaction_amount",
                   function="avg",
                   window=SlidingWindow(days=30))   # 30-day average amount
    ]
)
def user_transaction_aggregates(transactions_source):
    """Pre-computed aggregations."""
    pass

Tecton Real-Time Features

from tecton import stream_feature_view, Aggregation, TumblingWindow

# Streaming feature view: near-real-time aggregates over clickstream
# events, refreshed every minute via Spark Structured Streaming.
# NOTE(review): current Tecton SDKs take these via an `aggregations=`
# parameter rather than `features=` — confirm against the installed version.
@stream_feature_view(
    source=clickstream_source,
    entities=[user_entity],
    mode="spark_structured_streaming",
    online=True,
    aggregation_interval=timedelta(minutes=1),  # how often windows advance
    features=[
        Aggregation(column="page_views", 
                   function="count", 
                   window=TumblingWindow(minutes=5)),
        Aggregation(column="session_duration", 
                   function="sum", 
                   window=TumblingWindow(minutes=10))
    ]
)
def user_realtime_stats(clickstream_source):
    """Placeholder body: the features are fully described by the decorator."""
    pass

Tecton Inference

# Get features for online inference
from tecton import FeatureService

# Define feature service: bundles several feature views into a single
# retrieval endpoint so a model fetches all its inputs in one call.
fraud_detection_service = FeatureService(
    name="fraud_detection",
    features=[
        user_transaction_stats,
        user_transaction_aggregates,
        user_realtime_stats
    ]
)

# Query features
# Low-latency lookup of the given entity keys from the online store.
features = tecton.get_online_features(
    feature_service=fraud_detection_service,
    entity_rows=[
        {"user_id": "user_12345"}
    ]
)

# Use in model
# `model` is assumed to be a fitted estimator defined elsewhere — confirm.
prediction = model.predict(features.to_pandas())

Redis as Feature Store

Redis provides a fast online feature store, often used alongside other solutions.

Redis Setup

import redis
import json
from datetime import timedelta

# Connect to Redis
# decode_responses=True makes the client return str instead of bytes,
# so the JSON round-tripping below needs no explicit decoding.
redis_client = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
    decode_responses=True
)

# Feature serialization
def serialize_features(features: dict) -> str:
    """Encode a feature mapping as a JSON string for storage in Redis."""
    encoded = json.dumps(features)
    return encoded

def deserialize_features(data: str) -> dict:
    """Decode a JSON string previously produced by serialize_features."""
    decoded = json.loads(data)
    return decoded

Redis Feature Storage

# Store features
def store_features(entity_id: str, features: dict, ttl: int = 3600):
    """Write one entity's feature dict to Redis, expiring after `ttl` seconds."""
    payload = serialize_features(features)
    # SETEX = SET with EXpiry; stale feature rows vanish automatically.
    redis_client.setex(f"features:{entity_id}", ttl, payload)

# Get features
def get_features(entity_id: str) -> "dict | None":
    """Read one entity's features back from Redis.

    Returns the decoded feature dict, or None when the key is missing or
    has expired. (The original `-> dict` annotation was wrong: the miss
    path returns None, so callers must handle it.)
    """
    key = f"features:{entity_id}"
    data = redis_client.get(key)
    return deserialize_features(data) if data else None

# Store
# Persist one driver's features with a 1-hour expiry.
store_features(
    "driver_1001",
    {
        "total_rides": 245,
        "total_distance": 1523.5,
        "average_rating": 4.8,
        "rating_count": 230
    },
    ttl=3600
)

# Retrieve
# Returns the dict stored above until the TTL lapses, then None.
features = get_features("driver_1001")
print(features)

Redis with RedisAI

# Using RedisAI for model serving
import redisai as ri

client = ri.Client(host='localhost', port=6379)

# Store model in Redis
# NOTE(review): modelset/modelrun are the legacy command names; newer
# redisai-py releases renamed them modelstore/modelexecute — confirm
# against the installed client version.
client.modelset(
    'fraud_model', 
    'torch', 
    'cpu', 
    torch_model
)

# Run inference
# Reads 'input_tensor' from Redis and writes the result to 'output_tensor'.
response = client.modelrun(
    'fraud_model',
    ['input_tensor'],
    ['output_tensor']
)

# Get result
result = client.tensorget('output_tensor')

Redis Time-Series Features

# Time-series features with Redis (requires the RedisTimeSeries module).
from redis import Redis

# redis-py exposes time-series commands via Redis().ts(); there is no
# importable TSCommands helper class as the original snippet assumed.
ts = Redis().ts()

# Add time-series samples; "*" lets the server assign the timestamp.
ts.add("user:123:transactions", "*", 50.0)
ts.add("user:123:transactions", "*", 75.0)

# Get raw samples in a timestamp range (milliseconds since epoch).
data = ts.range("user:123:transactions",
                from_time=0,
                to_time=1000)

# Server-side aggregation: redis-py names these parameters
# aggregation_type / bucket_size_msec (not aggregation / count).
avg_value = ts.range("user:123:transactions",
                     from_time=0,
                     to_time=1000,
                     aggregation_type="avg",
                     bucket_size_msec=100)

Redis Bloom Filters

# Feature discovery with Bloom filters (requires the RedisBloom module).
# A Bloom filter answers "possibly present" / "definitely absent" using
# bounded memory: false positives are possible, false negatives are not.
bf = redis_client.bf()

# Add known entities
bf.add("known_users", "user_123")
bf.add("known_users", "user_456")

# Check membership
# May return True for an entity never added (false positive); never
# returns False for one that was added.
is_known = bf.exists("known_users", "user_789")

Feature Store Architecture Patterns

Pattern 1: Feast + Redis

# Use Feast for offline, Redis for online
# feast_config.yml
# NOTE(review): Feast's feature_store.yaml normally places store options
# directly under offline_store/online_store rather than under a nested
# `config:` key — confirm against the Feast version in use.
offline_store:
  type: redshift
  config:
    cluster_id: my-cluster

online_store:
  type: redis
  config:
    host: localhost
    port: 6379

Pattern 2: Tecton Full Platform

# Use Tecton for everything
# Tecton handles:
# - Materialization (batch + streaming)
# - Online serving
# - Offline exports
# - Feature computation

Pattern 3: Custom with Redis

# Custom compute + Redis storage
def compute_features(user_id):
    """Compute a user's features on demand and cache them in Redis."""
    feature_values = {}
    feature_values["account_age"] = get_account_age(user_id)
    feature_values["total_spend"] = get_total_spend(user_id)
    feature_values["last_login_hours"] = get_hours_since_login(user_id)

    # Cache so subsequent reads hit Redis instead of recomputing.
    store_features(user_id, feature_values)

    return feature_values

Comparison

| Feature   | Feast       | Tecton    | Redis       |
|-----------|-------------|-----------|-------------|
| Type      | Open-source | Managed   | Data store  |
| Offline   | Yes         | Yes       | Via custom  |
| Online    | Yes         | Yes       | Yes (fast)  |
| Real-time | Limited     | Excellent | Via streams |
| Cost      | Free        | $$$       | $           |
| Managed   | Self-hosted | Full      | Partial     |

When to Use Each

Feast

  • Open-source preference
  • Self-hosted requirement
  • Basic feature store needs

Tecton

  • Enterprise features
  • Real-time requirements
  • Managed infrastructure

Redis

  • Fast online serving
  • Custom architectures
  • Low-latency requirements

Bad Practices

Bad Practice 1: Feature Drift

# Bad: Training-serving skew
# Training: all features computed from batch
# Serving: some from real-time, some from batch

# Good: Use same feature definitions
# Both training and serving use feature store

Bad Practice 2: No Versioning

# Bad: Features change without tracking
# Can't reproduce old models

# Good: Version features
# Tag the definition so training runs can pin an exact feature version.
feature.version = "2.1"

Bad Practice 3: Missing TTL

# Bad: Features never expire
# Stale features hurt predictions

# Good: Set appropriate TTL
# Expire online values after a day so serving never reads stale data.
feature.ttl = timedelta(days=1)

Good Practices

Feature Naming

# Consistent naming: map short aliases to fully-qualified feature names
# (entity prefix + measure + window/unit suffix).
FEATURE_NAMES = {
    "user": dict(
        age="user_age_days",
        spend="user_total_spend_30d",
        activity="user_last_active_ts",
    ),
}

Monitoring

# Track feature freshness
def monitor_freshness(feature_list=None, alert_fn=None):
    """Alert on every feature whose last update is older than its TTL.

    Args:
        feature_list: iterable of objects exposing .name, .last_updated
            (assumed epoch seconds — TODO confirm) and .ttl (seconds).
            Defaults to the module-level `features` for backward
            compatibility with the original zero-argument call.
        alert_fn: callable taking the alert message string. Defaults to
            the module-level `alert`.

    Fixes the original `time.now()` call: the `time` module has no
    `now()`; `time.time()` is the epoch-seconds clock.
    """
    monitored = features if feature_list is None else feature_list
    notify = alert if alert_fn is None else alert_fn
    now = time.time()  # hoisted: one clock read per scan
    for feature in monitored:
        age = now - feature.last_updated
        if age > feature.ttl:
            notify(f"Feature {feature.name} is stale!")

External Resources

Comments