A/B Testing Examples¶
Version management, experiments, and metrics tracking workflows.
Basic Version Management¶
Creating Versions¶
from dataknobs_llm.prompts import VersionManager, VersionStatus
# Create version manager
vm = VersionManager()
# Create initial version
v1 = await vm.create_version(
    name="email_subject",
    prompt_type="user",
    template="Generate email subject for: {{topic}}",
    version="1.0.0",
    metadata={
        "author": "alice",
        "description": "Initial version",
        "created_for": "email-campaign-q1"
    }
)
print(f"Created version {v1.version} with ID {v1.version_id}")
# Create improved version
v2 = await vm.create_version(
    name="email_subject",
    prompt_type="user",
    template="Generate engaging and click-worthy email subject for: {{topic}}",
    # version="1.0.1" auto-assigned
    metadata={
        "author": "bob",
        "description": "Added 'engaging and click-worthy' for better results",
        "jira_ticket": "PROMPT-123"
    }
)
print(f"Created version {v2.version} (auto-incremented)")
print(f"Parent version: {v2.parent_version}")
Version History¶
# List all versions
versions = await vm.list_versions("email_subject", "user")
print("Version History:")
for v in versions:
    print(f" {v.version}: {v.metadata.get('description')}")
    print(f" Created: {v.created_at}")
    print(f" Author: {v.metadata.get('author')}")
    print(f" Status: {v.status.value}")
    print()
# Get specific version
version = await vm.get_version("email_subject", "user", "1.0.0")
# Get latest version
latest = await vm.get_version("email_subject", "user")
print(f"Latest version: {latest.version}")
Version Status Lifecycle¶
# Create as draft
draft = await vm.create_version(
    name="new_feature",
    prompt_type="system",
    template="...",
    status=VersionStatus.DRAFT,
    metadata={"note": "Still testing"}
)
# Test the draft...
print(f"Testing version {draft.version}...")
# Promote to active
await vm.update_status(draft.version_id, VersionStatus.ACTIVE)
print("Promoted to ACTIVE")
# Tag for production
await vm.tag_version(draft.version_id, "production")
print("Tagged as production")
# Later, deprecate old version
old_version = versions[-2]
await vm.update_status(old_version.version_id, VersionStatus.DEPRECATED)
print(f"Deprecated version {old_version.version}")
A/B Testing¶
Simple A/B Test¶
from dataknobs_llm.prompts import ABTestManager, PromptVariant, MetricsCollector
# Setup
vm = VersionManager()
ab = ABTestManager()
mc = MetricsCollector()
# Create two versions
v1 = await vm.create_version(
    name="greeting",
    prompt_type="system",
    template="Hello {{name}}!",
    version="1.0.0"
)
v2 = await vm.create_version(
    name="greeting",
    prompt_type="system",
    template="Hi {{name}}, welcome to our service!"
)
# Create A/B test (50/50 split)
exp = await ab.create_experiment(
    name="greeting",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Control"),
        PromptVariant("1.0.1", 0.5, "Treatment")
    ],
    metadata={
        "hypothesis": "Friendly greeting increases engagement",
        "start_date": "2024-01-15",
        "owner": "product-team"
    }
)
print(f"Created experiment {exp.experiment_id}")
print(f"Variants: {[(v.version, v.weight) for v in exp.variants]}")
Running the Experiment¶
from dataknobs_llm import create_llm_provider, LLMConfig
from dataknobs_llm.prompts import AsyncPromptBuilder, FileSystemPromptLibrary
from pathlib import Path
# Setup
config = LLMConfig(provider="openai", api_key="your-key")
llm = create_llm_provider(config)
library = FileSystemPromptLibrary(prompt_dir=Path("prompts/"))
builder = AsyncPromptBuilder(library=library)
# Simulate user sessions
users = ["user1", "user2", "user3", "user4", "user5"]
import random

for user_id in users:
    # Get variant for user (sticky: the same user always gets the same variant)
    variant_version = await ab.get_variant_for_user(exp.experiment_id, user_id)

    # Get the version
    version = await vm.get_version("greeting", "system", variant_version)
    print(f"\n{user_id} gets version {variant_version}")

    # Use the version
    # (In a real app, you'd render the template with the builder and send it to the LLM)
    template_str = version.template.replace("{{name}}", user_id)

    # Track the outcome (simulated ~70% success rate)
    success = random.random() > 0.3
    await mc.record_event(
        version_id=version.version_id,
        success=success,
        response_time=random.uniform(0.1, 1.0),
        user_rating=random.uniform(3.0, 5.0)
    )
    print(f" Result: {'Success' if success else 'Failure'}")
Analyzing Results¶
# Get metrics for each variant
v1_metrics = await mc.get_metrics(v1.version_id)
v2_metrics = await mc.get_metrics(v2.version_id)
print("\nVariant A (Control):")
print(f" Success rate: {v1_metrics.success_rate:.2%}")
print(f" Avg response time: {v1_metrics.avg_response_time:.2f}s")
print(f" Avg rating: {v1_metrics.avg_rating:.1f}/5.0")
print(f" Total events: {v1_metrics.total_events}")
print("\nVariant B (Treatment):")
print(f" Success rate: {v2_metrics.success_rate:.2%}")
print(f" Avg response time: {v2_metrics.avg_response_time:.2f}s")
print(f" Avg rating: {v2_metrics.avg_rating:.1f}/5.0")
print(f" Total events: {v2_metrics.total_events}")
# Compare variants
comparison = await mc.compare_variants([v1.version_id, v2.version_id])
print("\nComparison:")
for version_id, metrics in comparison.items():
    version_num = "A" if version_id == v1.version_id else "B"
    print(f"Variant {version_num}: {metrics.success_rate:.2%} success rate")
Deploy Winner¶
# Determine winner
if v2_metrics.success_rate > v1_metrics.success_rate:
    winner = v2
    print(f"\nWinner: Variant B (v{v2.version})")
    print(f"Improvement: {(v2_metrics.success_rate - v1_metrics.success_rate):.2%}")

    # Deploy winner
    await vm.tag_version(v2.version_id, "production")
    await vm.update_status(v2.version_id, VersionStatus.PRODUCTION)

    # Mark experiment complete
    await ab.update_experiment_status(exp.experiment_id, "completed")

    # Deprecate old version
    await vm.update_status(v1.version_id, VersionStatus.DEPRECATED)
    print("Winner deployed to production!")
else:
    print("\nControl wins - keeping current version")
    await vm.tag_version(v1.version_id, "production")
Multi-Variant Testing (A/B/C)¶
Three-Way Split¶
# Create three versions
v1 = await vm.create_version(
    name="ad_copy",
    prompt_type="user",
    template="Buy {{product}} now!",
    version="1.0.0"
)
v2 = await vm.create_version(
    name="ad_copy",
    prompt_type="user",
    template="Get {{product}} today with free shipping!"
)
v3 = await vm.create_version(
    name="ad_copy",
    prompt_type="user",
    template="Limited offer: {{product}} at 20% off!"
)
# Create A/B/C test (equal split)
exp = await ab.create_experiment(
    name="ad_copy",
    prompt_type="user",
    variants=[
        PromptVariant("1.0.0", 0.33, "Direct"),
        PromptVariant("1.0.1", 0.33, "Value"),
        PromptVariant("1.0.2", 0.34, "Urgency")
    ]
)
# Run experiment and analyze
# ... (similar to A/B test)
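# A minimal sketch of that loop, mirroring the simple A/B example above
# (illustrative only: reuses the `users` list and `random` import from that
#  example, and simulates outcomes instead of calling an LLM)
for user_id in users:
    variant_version = await ab.get_variant_for_user(exp.experiment_id, user_id)
    version = await vm.get_version("ad_copy", "user", variant_version)
    await mc.record_event(
        version_id=version.version_id,
        success=random.random() > 0.3,
        response_time=random.uniform(0.1, 1.0),
        user_rating=random.uniform(3.0, 5.0)
    )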
# Find top performer
top_versions = await mc.get_top_versions(
    version_ids=[v1.version_id, v2.version_id, v3.version_id],
    metric="success_rate",
    limit=1
)
winner_id, winner_metrics = top_versions[0]
print(f"Winner: {winner_id} with {winner_metrics.success_rate:.2%} success rate")
Advanced Patterns¶
Gradual Rollout¶
# Phase 1: Test with 10% of traffic
exp_phase1 = await ab.create_experiment(
    name="new_feature",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.9, "Current"),
        PromptVariant("2.0.0", 0.1, "New")  # Only 10% get new version
    ],
    metadata={"phase": "1", "rollout_strategy": "gradual"}
)
import asyncio

# Monitor for 7 days...
await asyncio.sleep(7 * 24 * 3600)  # In reality, this would be monitoring, not a sleep

# Check metrics (v_new is the "2.0.0" version object for the new prompt)
new_metrics = await mc.get_metrics(v_new.version_id)
if new_metrics.success_rate >= 0.95:  # Good performance
    # Phase 2: Increase to 50%
    exp_phase2 = await ab.create_experiment(
        name="new_feature",
        prompt_type="system",
        variants=[
            PromptVariant("1.0.0", 0.5, "Current"),
            PromptVariant("2.0.0", 0.5, "New")
        ],
        metadata={"phase": "2"}
    )
    # Monitor again...

    # Phase 3: Full rollout
    await vm.tag_version(v_new.version_id, "production")
else:
    # Rollback
    print("New version underperforming, stopping rollout")
    await ab.update_experiment_status(exp_phase1.experiment_id, "stopped")
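With more than two phases, the same pattern can be driven by a list of traffic shares. A rough sketch under the same assumptions (the gradual_rollout helper, its phase durations, and its threshold are illustrative placeholders, not library features):
ROLLOUT_PHASES = [0.1, 0.25, 0.5, 1.0]  # share of traffic for the new version

async def gradual_rollout(ab, mc, vm, v_old, v_new, threshold=0.95):
    for i, share in enumerate(ROLLOUT_PHASES, start=1):
        exp = await ab.create_experiment(
            name="new_feature",
            prompt_type="system",
            variants=[
                PromptVariant(v_old.version, 1.0 - share, "Current"),
                PromptVariant(v_new.version, share, "New")
            ],
            metadata={"phase": str(i)}
        )
        await asyncio.sleep(24 * 3600)  # placeholder for a real monitoring window
        metrics = await mc.get_metrics(v_new.version_id)
        if metrics.success_rate < threshold:
            await ab.update_experiment_status(exp.experiment_id, "stopped")
            return False  # stop the rollout
    await vm.tag_version(v_new.version_id, "production")
    return True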
Segment-Based Testing¶
# Test different variants for different user segments
class SegmentedABTest:
    def __init__(self, ab_manager, vm):
        self.ab = ab_manager
        self.vm = vm

    async def get_variant_for_segment(self, experiment_id, user_id, segment):
        # Different experiments for different segments
        segment_exp_id = f"{experiment_id}_{segment}"
        variant = await self.ab.get_variant_for_user(segment_exp_id, user_id)
        return variant
# Create segment-specific experiments
premium_exp = await ab.create_experiment(
    name="greeting_premium",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Formal"),
        PromptVariant("1.1.0", 0.5, "VIP")
    ]
)
free_exp = await ab.create_experiment(
    name="greeting_free",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Standard"),
        PromptVariant("1.2.0", 0.5, "Encouraging")
    ]
)
# Use based on user segment
def get_segment(user_id):
    # Determine user segment
    return "premium" if user_id.startswith("premium") else "free"

segment = get_segment(user_id)
exp_id = f"greeting_{segment}"
variant = await ab.get_variant_for_user(exp_id, user_id)
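The SegmentedABTest helper defined above wraps the same lookup; a brief usage sketch, assuming (as the snippet above does) that segment experiments are addressable by their greeting_<segment> names:
segmented = SegmentedABTest(ab, vm)

for user_id in ["premium_user_1", "free_user_7"]:
    segment = get_segment(user_id)
    variant = await segmented.get_variant_for_segment("greeting", user_id, segment)
    print(f"{user_id} ({segment}) -> variant {variant}")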
Real-Time Metrics Tracking¶
import asyncio
from datetime import datetime, timedelta
class RealTimeMetricsMonitor:
    def __init__(self, metrics_collector):
        self.mc = metrics_collector
        self.alerts = []

    async def monitor_experiment(
        self,
        version_ids,
        duration_hours=24,
        check_interval_minutes=30,
        alert_threshold=0.5  # Alert if success rate < 50%
    ):
        end_time = datetime.utcnow() + timedelta(hours=duration_hours)

        while datetime.utcnow() < end_time:
            print(f"\n=== Metrics Check at {datetime.utcnow()} ===")

            for version_id in version_ids:
                metrics = await self.mc.get_metrics(version_id)

                print(f"\nVersion {version_id}:")
                print(f" Events: {metrics.total_events}")
                print(f" Success Rate: {metrics.success_rate:.2%}")
                print(f" Avg Response Time: {metrics.avg_response_time:.2f}s")

                # Check for alerts
                if metrics.success_rate < alert_threshold:
                    alert = f"⚠️ Version {version_id} below threshold: {metrics.success_rate:.2%}"
                    print(alert)
                    self.alerts.append({
                        "time": datetime.utcnow(),
                        "version_id": version_id,
                        "message": alert
                    })

            # Wait before next check
            await asyncio.sleep(check_interval_minutes * 60)

        print("\n=== Monitoring Complete ===")
        return self.alerts
# Use the monitor
monitor = RealTimeMetricsMonitor(mc)
alerts = await monitor.monitor_experiment(
    version_ids=[v1.version_id, v2.version_id],
    duration_hours=24,
    check_interval_minutes=30
)

if alerts:
    print(f"\n{len(alerts)} alerts triggered during experiment")
Statistical Significance¶
from scipy import stats
async def check_statistical_significance(mc, v1_id, v2_id, alpha=0.05):
    """Check if the difference between variants is statistically significant."""
    # Get metrics
    m1 = await mc.get_metrics(v1_id)
    m2 = await mc.get_metrics(v2_id)

    # This is simplified - a proper test needs the raw event data;
    # in production, use a full A/B test analysis
    n1 = m1.total_events
    n2 = m2.total_events
    p1 = m1.success_rate
    p2 = m2.success_rate

    # Two-sided z-test for proportions
    p_pooled = (n1 * p1 + n2 * p2) / (n1 + n2)
    se = (p_pooled * (1 - p_pooled) * (1 / n1 + 1 / n2)) ** 0.5
    z = (p2 - p1) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))

    print("\nStatistical Analysis:")
    print(f" Variant A: {p1:.2%} ({n1} events)")
    print(f" Variant B: {p2:.2%} ({n2} events)")
    print(f" Difference: {abs(p2 - p1):.2%}")
    print(f" P-value: {p_value:.4f}")
    print(f" Significant at α={alpha}: {p_value < alpha}")

    return {
        "significant": p_value < alpha,
        "p_value": p_value,
        "winner": "B" if p2 > p1 else "A"
    }
# Check significance before deploying
result = await check_statistical_significance(mc, v1.version_id, v2.version_id)

if result["significant"]:
    print(f"\n✓ Variant {result['winner']} is significantly better")
    # Deploy winner
else:
    print("\n✗ No significant difference - need more data")
    # Continue experiment
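"Need more data" can be made concrete with a sample-size estimate. The helper below is a back-of-the-envelope sketch using the standard normal-approximation formula for comparing two proportions; the baseline rate and minimum detectable effect are assumptions you should set for your own experiment:
from scipy import stats

def required_sample_size(p_baseline, min_detectable_effect, alpha=0.05, power=0.8):
    """Approximate events needed per variant to detect the given absolute lift."""
    p_alt = p_baseline + min_detectable_effect
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)
    variance = p_baseline * (1 - p_baseline) + p_alt * (1 - p_alt)
    n = variance * (z_alpha + z_beta) ** 2 / min_detectable_effect ** 2
    return int(n) + 1

# e.g. baseline 70% success rate, detecting a 5-point lift needs ~1,250 events per variant
print(required_sample_size(0.70, 0.05))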
Rollback Workflow¶
Safe Rollback¶
async def safe_rollback(vm, mc, current_version_id, experiment_id):
    """Roll back to the previous version if the current version is failing."""
    # Check current version metrics
    current_metrics = await mc.get_metrics(current_version_id)

    print("Current version metrics:")
    print(f" Success rate: {current_metrics.success_rate:.2%}")
    print(f" Events: {current_metrics.total_events}")

    # Rollback criteria: clearly underperforming, with enough data to be confident
    if current_metrics.success_rate < 0.8 and current_metrics.total_events > 100:
        print("\n⚠️ Current version underperforming - initiating rollback")

        # Get current version
        current_version = await vm.get_version_by_id(current_version_id)

        # Get parent version (previous)
        if current_version.parent_version:
            parent_version = await vm.get_version_by_id(current_version.parent_version)
            print(f"Rolling back to version {parent_version.version}")

            # Update tags and status
            await vm.untag_version(current_version_id, "production")
            await vm.tag_version(parent_version.version_id, "production")
            await vm.update_status(current_version_id, VersionStatus.DEPRECATED)
            await vm.update_status(parent_version.version_id, VersionStatus.PRODUCTION)

            # Stop experiment
            await ab.update_experiment_status(experiment_id, "stopped")

            print("✓ Rollback complete")
            return parent_version
        else:
            print("✗ No parent version to roll back to")
            return None
    else:
        print("✓ Current version performing well")
        return None
# Monitor and roll back if needed
rolled_back = await safe_rollback(
    vm, mc, v_new.version_id, exp.experiment_id
)
See Also¶
- Versioning & A/B Testing Guide - Detailed guide
- Versioning API - Complete API reference
- Performance Guide - Optimization tips
- Basic Usage Examples - Getting started