Versioning & A/B Testing API

Prompt version management, A/B testing, and metrics tracking.

📖 Also see: Auto-generated API Reference - Complete documentation from source code docstrings


Overview

The versioning API provides comprehensive tools for tracking prompt versions, running A/B tests, and measuring performance.

Version Management

VersionManager

dataknobs_llm.prompts.VersionManager

VersionManager(storage: Any | None = None)

Manages prompt versions with semantic versioning.

Handles version creation, retrieval, and lifecycle management. Supports semantic versioning (major.minor.patch) and version tagging.

Example
manager = VersionManager(storage_backend)

# Create a version
v1 = await manager.create_version(
    name="greeting",
    prompt_type="system",
    template="Hello {{name}}!",
    version="1.0.0"
)

# Get latest version
latest = await manager.get_version(
    name="greeting",
    prompt_type="system"
)

# Tag a version
await manager.tag_version(v1.version_id, "production")

Initialize version manager.

Parameters:

- storage (Any | None, default None): Backend storage (dict for in-memory, database for persistence). If None, uses an in-memory dictionary.

Methods:

- create_version: Create a new prompt version.
- get_version: Retrieve a prompt version.
- list_versions: List all versions of a prompt.
- update_status: Update version status.
- tag_version: Add a tag to a version.
- untag_version: Remove a tag from a version.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
def __init__(self, storage: Any | None = None):
    """Initialize version manager.

    Args:
        storage: Backend storage (dict for in-memory, database for persistence)
                If None, uses in-memory dictionary
    """
    self.storage = storage if storage is not None else {}
    self._versions: Dict[str, PromptVersion] = {}  # version_id -> PromptVersion
    self._version_index: Dict[str, List[str]] = {}  # "{name}:{type}" -> [version_ids]

Functions

create_version async
create_version(
    name: str,
    prompt_type: str,
    template: str,
    version: str | None = None,
    defaults: Dict[str, Any] | None = None,
    validation: Dict[str, Any] | None = None,
    metadata: Dict[str, Any] | None = None,
    created_by: str | None = None,
    parent_version: str | None = None,
    tags: List[str] | None = None,
    status: VersionStatus = VersionStatus.ACTIVE,
) -> PromptVersion

Create a new prompt version.

Parameters:

- name (str, required): Prompt name
- prompt_type (str, required): Prompt type ("system", "user", "message")
- template (str, required): Template content
- version (str | None, default None): Semantic version (e.g., "1.2.3"). If None, auto-increments from latest
- defaults (Dict[str, Any] | None, default None): Default parameter values
- validation (Dict[str, Any] | None, default None): Validation configuration
- metadata (Dict[str, Any] | None, default None): Additional metadata
- created_by (str | None, default None): Creator username/ID
- parent_version (str | None, default None): Previous version ID for history tracking
- tags (List[str] | None, default None): List of tags
- status (VersionStatus, default ACTIVE): Initial version status

Returns:

- PromptVersion: Created PromptVersion

Raises:

- VersioningError: If version format is invalid or version already exists

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def create_version(
    self,
    name: str,
    prompt_type: str,
    template: str,
    version: str | None = None,
    defaults: Dict[str, Any] | None = None,
    validation: Dict[str, Any] | None = None,
    metadata: Dict[str, Any] | None = None,
    created_by: str | None = None,
    parent_version: str | None = None,
    tags: List[str] | None = None,
    status: VersionStatus = VersionStatus.ACTIVE,
) -> PromptVersion:
    """Create a new prompt version.

    Args:
        name: Prompt name
        prompt_type: Prompt type ("system", "user", "message")
        template: Template content
        version: Semantic version (e.g., "1.2.3"). If None, auto-increments from latest
        defaults: Default parameter values
        validation: Validation configuration
        metadata: Additional metadata
        created_by: Creator username/ID
        parent_version: Previous version ID for history tracking
        tags: List of tags
        status: Initial version status

    Returns:
        Created PromptVersion

    Raises:
        VersioningError: If version format is invalid or version already exists
    """
    # Auto-increment version if not provided
    if version is None:
        version = await self._auto_increment_version(name, prompt_type)
        # If no parent_version specified, use the latest version
        if parent_version is None:
            latest = await self.get_version(name, prompt_type)
            if latest:
                parent_version = latest.version_id
    else:
        # Validate version format
        if not self.VERSION_PATTERN.match(version):
            raise VersioningError(
                f"Invalid version format: {version}. "
                f"Expected semantic version (e.g., '1.0.0')"
            )

    # Check if version already exists
    key = self._make_key(name, prompt_type)
    existing_versions = await self.list_versions(name, prompt_type)
    if any(v.version == version for v in existing_versions):
        raise VersioningError(
            f"Version {version} already exists for {name} ({prompt_type})"
        )

    # Generate unique version ID
    version_id = str(uuid.uuid4())

    # Create version object
    prompt_version = PromptVersion(
        version_id=version_id,
        name=name,
        prompt_type=prompt_type,
        version=version,
        template=template,
        defaults=defaults or {},
        validation=validation,
        metadata=metadata or {},
        created_at=datetime.utcnow(),
        created_by=created_by,
        parent_version=parent_version,
        tags=tags or [],
        status=status,
    )

    # Store version
    self._versions[version_id] = prompt_version

    # Update index
    if key not in self._version_index:
        self._version_index[key] = []
    self._version_index[key].append(version_id)

    # Persist to backend if available
    if hasattr(self.storage, "set"):
        await self._persist_version(prompt_version)

    return prompt_version
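create_version relies on two helpers that are not shown here: VERSION_PATTERN and _auto_increment_version. The sketch below illustrates plausible behavior, consistent with the semantic-versioning rules described above (the library's actual implementation may differ; the starting version for a brand-new prompt is an assumption):

import re
from typing import Tuple

# Assumed format: three dot-separated integers, e.g. "1.2.3".
VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")

def parse_version(version: str) -> Tuple[int, int, int]:
    """Split a "major.minor.patch" string into a sortable tuple of ints."""
    major, minor, patch = version.split(".")
    return int(major), int(minor), int(patch)

def next_patch_version(latest: str | None) -> str:
    """Auto-increment: bump the patch of the latest version ("1.0.0" -> "1.0.1")."""
    if latest is None:
        return "1.0.0"  # assumed starting point when no versions exist yet
    major, minor, patch = parse_version(latest)
    return f"{major}.{minor}.{patch + 1}"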
get_version async
get_version(
    name: str,
    prompt_type: str,
    version: str = "latest",
    version_id: str | None = None,
) -> PromptVersion | None

Retrieve a prompt version.

Parameters:

- name (str, required): Prompt name
- prompt_type (str, required): Prompt type
- version (str, default "latest"): Version string or "latest" for most recent
- version_id (str | None, default None): Specific version ID (takes precedence over version)

Returns:

- PromptVersion | None: PromptVersion if found, None otherwise

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def get_version(
    self,
    name: str,
    prompt_type: str,
    version: str = "latest",
    version_id: str | None = None,
) -> PromptVersion | None:
    """Retrieve a prompt version.

    Args:
        name: Prompt name
        prompt_type: Prompt type
        version: Version string or "latest" for most recent
        version_id: Specific version ID (takes precedence over version)

    Returns:
        PromptVersion if found, None otherwise
    """
    # Direct lookup by version_id
    if version_id:
        return self._versions.get(version_id)

    # Get all versions for this prompt
    versions = await self.list_versions(name, prompt_type)
    if not versions:
        return None

    # Return latest version
    if version == "latest":
        return self._get_latest_version(versions)

    # Find specific version
    for v in versions:
        if v.version == version:
            return v

    return None
list_versions async
list_versions(
    name: str,
    prompt_type: str,
    tags: List[str] | None = None,
    status: VersionStatus | None = None,
) -> List[PromptVersion]

List all versions of a prompt.

Parameters:

- name (str, required): Prompt name
- prompt_type (str, required): Prompt type
- tags (List[str] | None, default None): Filter by tags (returns versions with ANY of these tags)
- status (VersionStatus | None, default None): Filter by status

Returns:

- List[PromptVersion]: List of PromptVersion objects, sorted by version (newest first)

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def list_versions(
    self,
    name: str,
    prompt_type: str,
    tags: List[str] | None = None,
    status: VersionStatus | None = None,
) -> List[PromptVersion]:
    """List all versions of a prompt.

    Args:
        name: Prompt name
        prompt_type: Prompt type
        tags: Filter by tags (returns versions with ANY of these tags)
        status: Filter by status

    Returns:
        List of PromptVersion objects, sorted by version (newest first)
    """
    key = self._make_key(name, prompt_type)
    version_ids = self._version_index.get(key, [])

    versions = [self._versions[vid] for vid in version_ids]

    # Apply filters
    if tags:
        versions = [v for v in versions if any(t in v.tags for t in tags)]

    if status:
        versions = [v for v in versions if v.status == status]

    # Sort by version (newest first)
    return sorted(versions, key=lambda v: self._parse_version(v.version), reverse=True)
update_status async
update_status(version_id: str, status: VersionStatus) -> PromptVersion

Update version status.

Parameters:

- version_id (str, required): Version ID
- status (VersionStatus, required): New status

Returns:

- PromptVersion: Updated PromptVersion

Raises:

- VersioningError: If version not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def update_status(
    self,
    version_id: str,
    status: VersionStatus,
) -> PromptVersion:
    """Update version status.

    Args:
        version_id: Version ID
        status: New status

    Returns:
        Updated PromptVersion

    Raises:
        VersioningError: If version not found
    """
    version = self._versions.get(version_id)
    if not version:
        raise VersioningError(f"Version not found: {version_id}")

    version.status = status

    # Persist if backend available
    if hasattr(self.storage, "set"):
        await self._persist_version(version)

    return version
tag_version async
tag_version(version_id: str, tag: str) -> PromptVersion

Add a tag to a version.

Parameters:

- version_id (str, required): Version ID to tag
- tag (str, required): Tag to add (e.g., "production", "deprecated")

Returns:

- PromptVersion: Updated PromptVersion

Raises:

- VersioningError: If version not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def tag_version(
    self,
    version_id: str,
    tag: str,
) -> PromptVersion:
    """Add a tag to a version.

    Args:
        version_id: Version ID to tag
        tag: Tag to add (e.g., "production", "deprecated")

    Returns:
        Updated PromptVersion

    Raises:
        VersioningError: If version not found
    """
    version = self._versions.get(version_id)
    if not version:
        raise VersioningError(f"Version not found: {version_id}")

    if tag not in version.tags:
        version.tags.append(tag)

        # Persist if backend available
        if hasattr(self.storage, "set"):
            await self._persist_version(version)

    return version
untag_version async
untag_version(version_id: str, tag: str) -> PromptVersion

Remove a tag from a version.

Parameters:

- version_id (str, required): Version ID
- tag (str, required): Tag to remove

Returns:

- PromptVersion: Updated PromptVersion

Raises:

- VersioningError: If version not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/version_manager.py
async def untag_version(
    self,
    version_id: str,
    tag: str,
) -> PromptVersion:
    """Remove a tag from a version.

    Args:
        version_id: Version ID
        tag: Tag to remove

    Returns:
        Updated PromptVersion

    Raises:
        VersioningError: If version not found
    """
    version = self._versions.get(version_id)
    if not version:
        raise VersioningError(f"Version not found: {version_id}")

    if tag in version.tags:
        version.tags.remove(tag)

        # Persist if backend available
        if hasattr(self.storage, "set"):
            await self._persist_version(version)

    return version

PromptVersion

dataknobs_llm.prompts.PromptVersion dataclass

PromptVersion(
    version_id: str,
    name: str,
    prompt_type: str,
    version: str,
    template: str,
    defaults: Dict[str, Any] = dict(),
    validation: Dict[str, Any] | None = None,
    metadata: Dict[str, Any] = dict(),
    created_at: datetime = datetime.utcnow(),
    created_by: str | None = None,
    parent_version: str | None = None,
    tags: List[str] = list(),
    status: VersionStatus = VersionStatus.ACTIVE,
)

Represents a versioned prompt.

Attributes:

- version_id (str): Unique identifier for this version (auto-generated)
- name (str): Name of the prompt
- prompt_type (str): Type of prompt ("system", "user", "message")
- version (str): Semantic version string (e.g., "1.2.3")
- template (str): The prompt template content
- defaults (Dict[str, Any]): Default parameter values
- validation (Dict[str, Any] | None): Validation configuration
- metadata (Dict[str, Any]): Additional metadata (author, description, etc.)
- created_at (datetime): Timestamp when version was created
- created_by (str | None): Username/ID of creator
- parent_version (str | None): Previous version ID (for history tracking)
- tags (List[str]): List of tags (e.g., ["production", "experiment-A"])
- status (VersionStatus): Current status of this version

Methods:

- from_dict: Create from dictionary.
- to_dict: Convert to dictionary for storage.

Functions

from_dict classmethod
from_dict(data: Dict[str, Any]) -> PromptVersion

Create from dictionary.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
    """Create from dictionary."""
    data = data.copy()
    # Parse datetime
    if isinstance(data.get("created_at"), str):
        data["created_at"] = datetime.fromisoformat(data["created_at"])
    # Parse status enum
    if isinstance(data.get("status"), str):
        data["status"] = VersionStatus(data["status"])
    return cls(**data)
to_dict
to_dict() -> Dict[str, Any]

Convert to dictionary for storage.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary for storage."""
    return {
        "version_id": self.version_id,
        "name": self.name,
        "prompt_type": self.prompt_type,
        "version": self.version,
        "template": self.template,
        "defaults": self.defaults,
        "validation": self.validation,
        "metadata": self.metadata,
        "created_at": self.created_at.isoformat(),
        "created_by": self.created_by,
        "parent_version": self.parent_version,
        "tags": self.tags,
        "status": self.status.value,
    }

VersionStatus

dataknobs_llm.prompts.VersionStatus

Bases: Enum

Status of a prompt version.

Attributes:

- DRAFT: Version is in development
- ACTIVE: Version is active and can be used
- PRODUCTION: Version is deployed in production
- DEPRECATED: Version is deprecated but still available
- ARCHIVED: Version is archived and should not be used
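Statuses combine with the list_versions and update_status methods documented above to drive lifecycle workflows, for example:

from dataknobs_llm.prompts import VersionManager, VersionStatus

vm = VersionManager()

# Only versions currently marked ACTIVE
active = await vm.list_versions("greeting", "system", status=VersionStatus.ACTIVE)

# Archive anything still sitting in DRAFT
drafts = await vm.list_versions("greeting", "system", status=VersionStatus.DRAFT)
for v in drafts:
    await vm.update_status(v.version_id, VersionStatus.ARCHIVED)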

A/B Testing

ABTestManager

dataknobs_llm.prompts.ABTestManager

ABTestManager(storage: Any | None = None)

Manages A/B test experiments for prompts.

Supports multiple selection strategies:

- Random: each request gets a random variant based on the traffic split
- User-sticky: the same user always gets the same variant (consistent experience)

Example
manager = ABTestManager(storage_backend)

# Create experiment
experiment = await manager.create_experiment(
    name="greeting",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Control"),
        PromptVariant("1.1.0", 0.5, "Treatment")
    ]
)

# Get variant for user (sticky assignment)
variant_version = await manager.get_variant_for_user(
    experiment.experiment_id,
    user_id="user123"
)

# Get random variant
variant_version = await manager.get_random_variant(
    experiment.experiment_id
)

Initialize A/B test manager.

Parameters:

- storage (Any | None, default None): Backend storage (dict for in-memory, database for persistence). If None, uses an in-memory dictionary.

Methods:

- create_experiment: Create a new A/B test experiment.
- get_experiment: Retrieve an experiment by ID.
- list_experiments: List experiments with optional filters.
- get_variant_for_user: Get variant for a specific user (sticky assignment).
- get_random_variant: Get a random variant based on traffic split.
- update_experiment_status: Update experiment status.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
def __init__(self, storage: Any | None = None):
    """Initialize A/B test manager.

    Args:
        storage: Backend storage (dict for in-memory, database for persistence)
                If None, uses in-memory dictionary
    """
    self.storage = storage if storage is not None else {}
    self._experiments: Dict[str, PromptExperiment] = {}  # experiment_id -> PromptExperiment
    self._user_assignments: Dict[str, Dict[str, str]] = {}  # experiment_id -> {user_id -> version}

Functions

create_experiment async
create_experiment(
    name: str,
    prompt_type: str,
    variants: List[PromptVariant],
    traffic_split: Dict[str, float] | None = None,
    metadata: Dict[str, Any] | None = None,
) -> PromptExperiment

Create a new A/B test experiment.

Parameters:

- name (str, required): Prompt name
- prompt_type (str, required): Prompt type
- variants (List[PromptVariant], required): List of variants to test
- traffic_split (Dict[str, float] | None, default None): Optional custom traffic split (if None, derives from variant weights)
- metadata (Dict[str, Any] | None, default None): Additional metadata

Returns:

- PromptExperiment: Created PromptExperiment

Raises:

- VersioningError: If variants are invalid or traffic split doesn't sum to 1.0

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def create_experiment(
    self,
    name: str,
    prompt_type: str,
    variants: List[PromptVariant],
    traffic_split: Dict[str, float] | None = None,
    metadata: Dict[str, Any] | None = None,
) -> PromptExperiment:
    """Create a new A/B test experiment.

    Args:
        name: Prompt name
        prompt_type: Prompt type
        variants: List of variants to test
        traffic_split: Optional custom traffic split (if None, derives from variant weights)
        metadata: Additional metadata

    Returns:
        Created PromptExperiment

    Raises:
        VersioningError: If variants are invalid or traffic split doesn't sum to 1.0
    """
    if len(variants) < 2:
        raise VersioningError("Experiment must have at least 2 variants")

    # Generate experiment ID
    experiment_id = str(uuid.uuid4())

    # Derive traffic split from variant weights if not provided
    if traffic_split is None:
        # Normalize weights to ensure they sum to 1.0
        total_weight = sum(v.weight for v in variants)
        traffic_split = {
            v.version: v.weight / total_weight
            for v in variants
        }

    # Create experiment
    experiment = PromptExperiment(
        experiment_id=experiment_id,
        name=name,
        prompt_type=prompt_type,
        variants=variants,
        traffic_split=traffic_split,
        start_date=datetime.utcnow(),
        status="running",
        metadata=metadata or {},
    )

    # Store experiment
    self._experiments[experiment_id] = experiment

    # Initialize user assignments
    self._user_assignments[experiment_id] = {}

    # Persist to backend if available
    if hasattr(self.storage, "set"):
        await self._persist_experiment(experiment)

    return experiment
get_experiment async
get_experiment(experiment_id: str) -> PromptExperiment | None

Retrieve an experiment by ID.

Parameters:

- experiment_id (str, required): Experiment ID

Returns:

- PromptExperiment | None: PromptExperiment if found, None otherwise

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def get_experiment(
    self,
    experiment_id: str,
) -> PromptExperiment | None:
    """Retrieve an experiment by ID.

    Args:
        experiment_id: Experiment ID

    Returns:
        PromptExperiment if found, None otherwise
    """
    return self._experiments.get(experiment_id)
list_experiments async
list_experiments(
    name: str | None = None,
    prompt_type: str | None = None,
    status: str | None = None,
) -> List[PromptExperiment]

List experiments with optional filters.

Parameters:

- name (str | None, default None): Filter by prompt name
- prompt_type (str | None, default None): Filter by prompt type
- status (str | None, default None): Filter by status ("running", "paused", "completed")

Returns:

- List[PromptExperiment]: List of matching experiments

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def list_experiments(
    self,
    name: str | None = None,
    prompt_type: str | None = None,
    status: str | None = None,
) -> List[PromptExperiment]:
    """List experiments with optional filters.

    Args:
        name: Filter by prompt name
        prompt_type: Filter by prompt type
        status: Filter by status ("running", "paused", "completed")

    Returns:
        List of matching experiments
    """
    experiments = list(self._experiments.values())

    # Apply filters
    if name:
        experiments = [e for e in experiments if e.name == name]

    if prompt_type:
        experiments = [e for e in experiments if e.prompt_type == prompt_type]

    if status:
        experiments = [e for e in experiments if e.status == status]

    return experiments
get_variant_for_user async
get_variant_for_user(experiment_id: str, user_id: str) -> str

Get variant for a specific user (sticky assignment).

The same user always gets the same variant for consistent experience. Uses hash-based assignment to ensure deterministic selection.

Parameters:

- experiment_id (str, required): Experiment ID
- user_id (str, required): User identifier

Returns:

- str: Version string of assigned variant

Raises:

- VersioningError: If experiment not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def get_variant_for_user(
    self,
    experiment_id: str,
    user_id: str,
) -> str:
    """Get variant for a specific user (sticky assignment).

    The same user always gets the same variant for consistent experience.
    Uses hash-based assignment to ensure deterministic selection.

    Args:
        experiment_id: Experiment ID
        user_id: User identifier

    Returns:
        Version string of assigned variant

    Raises:
        VersioningError: If experiment not found
    """
    experiment = self._experiments.get(experiment_id)
    if not experiment:
        raise VersioningError(f"Experiment not found: {experiment_id}")

    if experiment.status != "running":
        raise VersioningError(
            f"Experiment {experiment_id} is not running (status: {experiment.status})"
        )

    # Check if user already has assignment
    if experiment_id in self._user_assignments:
        existing = self._user_assignments[experiment_id].get(user_id)
        if existing:
            return existing

    # Assign user to variant using hash-based selection
    assigned_version = self._hash_based_assignment(
        user_id,
        experiment.traffic_split
    )

    # Store assignment
    if experiment_id not in self._user_assignments:
        self._user_assignments[experiment_id] = {}
    self._user_assignments[experiment_id][user_id] = assigned_version

    # Persist assignment if backend available
    if hasattr(self.storage, "set"):
        key = f"assignment:{experiment_id}:{user_id}"
        await self.storage.set(key, assigned_version)

    return assigned_version
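The _hash_based_assignment helper is not shown above. Deterministic bucketing of a user ID over the traffic split is presumably along these lines (an assumed sketch, not the library's actual code):

import hashlib
from typing import Dict

def hash_based_assignment(user_id: str, traffic_split: Dict[str, float]) -> str:
    """Deterministically map a user ID to a variant according to the split."""
    # Hash the user ID into a stable value in [0.0, 1.0].
    digest = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
    bucket = int(digest[:8], 16) / 0xFFFFFFFF
    # Walk the cumulative distribution and return the matching variant.
    cumulative = 0.0
    for version, share in traffic_split.items():
        cumulative += share
        if bucket <= cumulative:
            return version
    return version  # guard against floating point drift: fall back to last variant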
get_random_variant async
get_random_variant(experiment_id: str) -> str

Get a random variant based on traffic split.

Each call returns a potentially different variant.

Parameters:

- experiment_id (str, required): Experiment ID

Returns:

- str: Version string of selected variant

Raises:

- VersioningError: If experiment not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def get_random_variant(
    self,
    experiment_id: str,
) -> str:
    """Get a random variant based on traffic split.

    Each call returns a potentially different variant.

    Args:
        experiment_id: Experiment ID

    Returns:
        Version string of selected variant

    Raises:
        VersioningError: If experiment not found
    """
    experiment = self._experiments.get(experiment_id)
    if not experiment:
        raise VersioningError(f"Experiment not found: {experiment_id}")

    if experiment.status != "running":
        raise VersioningError(
            f"Experiment {experiment_id} is not running (status: {experiment.status})"
        )

    # Weighted random selection
    versions = list(experiment.traffic_split.keys())
    weights = list(experiment.traffic_split.values())

    return random.choices(versions, weights=weights)[0]
update_experiment_status async
update_experiment_status(
    experiment_id: str, status: str, end_date: datetime | None = None
) -> PromptExperiment

Update experiment status.

Parameters:

- experiment_id (str, required): Experiment ID
- status (str, required): New status ("running", "paused", "completed")
- end_date (datetime | None, default None): Optional end date (auto-set to now if status is "completed")

Returns:

- PromptExperiment: Updated experiment

Raises:

- VersioningError: If experiment not found

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/ab_testing.py
async def update_experiment_status(
    self,
    experiment_id: str,
    status: str,
    end_date: datetime | None = None,
) -> PromptExperiment:
    """Update experiment status.

    Args:
        experiment_id: Experiment ID
        status: New status ("running", "paused", "completed")
        end_date: Optional end date (auto-set to now if status is "completed")

    Returns:
        Updated experiment

    Raises:
        VersioningError: If experiment not found
    """
    experiment = self._experiments.get(experiment_id)
    if not experiment:
        raise VersioningError(f"Experiment not found: {experiment_id}")

    experiment.status = status

    if status == "completed" and end_date is None:
        experiment.end_date = datetime.utcnow()
    elif end_date:
        experiment.end_date = end_date

    # Persist if backend available
    if hasattr(self.storage, "set"):
        await self._persist_experiment(experiment)

    return experiment

PromptExperiment

dataknobs_llm.prompts.PromptExperiment dataclass

PromptExperiment(
    experiment_id: str,
    name: str,
    prompt_type: str,
    variants: List[PromptVariant],
    traffic_split: Dict[str, float],
    start_date: datetime = datetime.utcnow(),
    end_date: datetime | None = None,
    status: str = "running",
    metrics: Dict[str, Any] = dict(),
    metadata: Dict[str, Any] = dict(),
)

Configuration for an A/B test experiment.

Attributes:

- experiment_id (str): Unique identifier for this experiment
- name (str): Name of the prompt being tested
- prompt_type (str): Type of prompt ("system", "user", "message")
- variants (List[PromptVariant]): List of variants in this experiment
- traffic_split (Dict[str, float]): Mapping of version to traffic percentage
- start_date (datetime): When experiment started
- end_date (datetime | None): When experiment ended (None if still running)
- status (str): Current status ("running", "paused", "completed")
- metrics (Dict[str, Any]): Aggregated metrics for the experiment
- metadata (Dict[str, Any]): Additional experiment metadata

Methods:

- __post_init__: Validate traffic split sums to 1.0.
- from_dict: Create from dictionary.
- to_dict: Convert to dictionary for storage.

Functions

__post_init__
__post_init__()

Validate traffic split sums to 1.0.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def __post_init__(self):
    """Validate traffic split sums to 1.0."""
    total = sum(self.traffic_split.values())
    if not (0.99 <= total <= 1.01):  # Allow small floating point error
        raise ValueError(
            f"Traffic split must sum to 1.0, got {total}. "
            f"Split: {self.traffic_split}"
        )
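ABTestManager.create_experiment normalizes variant weights before constructing the experiment, so this check normally passes. Constructing a PromptExperiment directly with a split that does not sum to 1.0 fails, for example:

# Raises ValueError: traffic split sums to 0.9, not 1.0
experiment = PromptExperiment(
    experiment_id="exp-demo",
    name="greeting",
    prompt_type="system",
    variants=[PromptVariant("1.0.0", 0.5), PromptVariant("1.1.0", 0.4)],
    traffic_split={"1.0.0": 0.5, "1.1.0": 0.4},
)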
from_dict classmethod
from_dict(data: Dict[str, Any]) -> PromptExperiment

Create from dictionary.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PromptExperiment":
    """Create from dictionary."""
    data = data.copy()
    # Parse datetimes
    if isinstance(data.get("start_date"), str):
        data["start_date"] = datetime.fromisoformat(data["start_date"])
    if isinstance(data.get("end_date"), str):
        data["end_date"] = datetime.fromisoformat(data["end_date"])
    # Parse variants
    if data.get("variants"):
        data["variants"] = [
            PromptVariant.from_dict(v) if isinstance(v, dict) else v
            for v in data["variants"]
        ]
    return cls(**data)
to_dict
to_dict() -> Dict[str, Any]

Convert to dictionary for storage.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary for storage."""
    return {
        "experiment_id": self.experiment_id,
        "name": self.name,
        "prompt_type": self.prompt_type,
        "variants": [v.to_dict() for v in self.variants],
        "traffic_split": self.traffic_split,
        "start_date": self.start_date.isoformat(),
        "end_date": self.end_date.isoformat() if self.end_date else None,
        "status": self.status,
        "metrics": self.metrics,
        "metadata": self.metadata,
    }

PromptVariant

dataknobs_llm.prompts.PromptVariant dataclass

PromptVariant(
    version: str,
    weight: float,
    description: str = "",
    metadata: Dict[str, Any] = dict(),
)

A variant in an A/B test experiment.

Attributes:

- version (str): Version string of this variant
- weight (float): Traffic allocation weight (relative weight, must be > 0.0). Weights are normalized to sum to 1.0 when creating the experiment.
- description (str): Human-readable description
- metadata (Dict[str, Any]): Additional variant metadata

Methods:

- __post_init__: Validate weight is positive.
- from_dict: Create from dictionary.
- to_dict: Convert to dictionary for storage.

Functions

__post_init__
__post_init__()

Validate weight is positive.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def __post_init__(self):
    """Validate weight is positive."""
    if self.weight <= 0.0:
        raise ValueError(f"Variant weight must be positive, got {self.weight}")
from_dict classmethod
from_dict(data: Dict[str, Any]) -> PromptVariant

Create from dictionary.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PromptVariant":
    """Create from dictionary."""
    return cls(**data)
to_dict
to_dict() -> Dict[str, Any]

Convert to dictionary for storage.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary for storage."""
    return {
        "version": self.version,
        "weight": self.weight,
        "description": self.description,
        "metadata": self.metadata,
    }

Metrics Tracking

MetricsCollector

dataknobs_llm.prompts.MetricsCollector

MetricsCollector(storage: Any | None = None)

Collects and aggregates metrics for prompt versions.

Tracks usage, performance, and user feedback for each version. Supports both real-time event recording and aggregated metrics retrieval.

Example
collector = MetricsCollector(storage_backend)

# Record a usage event
await collector.record_event(
    version_id="v1",
    success=True,
    response_time=0.5,
    tokens=150,
    user_rating=4.5
)

# Get aggregated metrics
metrics = await collector.get_metrics("v1")
print(f"Success rate: {metrics.success_rate:.2%}")
print(f"Avg response time: {metrics.avg_response_time:.2f}s")

# Compare metrics across versions
comparison = await collector.compare_variants(
    version_ids=["v1", "v2"]
)

Initialize metrics collector.

Parameters:

- storage (Any | None, default None): Backend storage (dict for in-memory, database for persistence). If None, uses an in-memory dictionary.

Methods:

- record_event: Record a single usage event.
- get_metrics: Get aggregated metrics for a version.
- compare_variants: Compare metrics across multiple versions.
- get_top_versions: Get top performing versions by a specific metric.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/metrics.py
def __init__(self, storage: Any | None = None):
    """Initialize metrics collector.

    Args:
        storage: Backend storage (dict for in-memory, database for persistence)
                If None, uses in-memory dictionary
    """
    self.storage = storage if storage is not None else {}
    self._metrics: Dict[str, PromptMetrics] = {}  # version_id -> PromptMetrics
    self._events: Dict[str, List[MetricEvent]] = {}  # version_id -> [events]

Functions

record_event async
record_event(
    version_id: str,
    success: bool = True,
    response_time: float | None = None,
    tokens: int | None = None,
    user_rating: float | None = None,
    metadata: Dict[str, Any] | None = None,
) -> MetricEvent

Record a single usage event.

Parameters:

- version_id (str, required): Version ID this event belongs to
- success (bool, default True): Whether the use was successful
- response_time (float | None, default None): Response time in seconds (None if not applicable)
- tokens (int | None, default None): Number of tokens used (None if not applicable)
- user_rating (float | None, default None): User rating 1-5 (None if not provided)
- metadata (Dict[str, Any] | None, default None): Additional event metadata

Returns:

- MetricEvent: Created MetricEvent

Raises:

- ValueError: If user_rating is not in valid range

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/metrics.py
async def record_event(
    self,
    version_id: str,
    success: bool = True,
    response_time: float | None = None,
    tokens: int | None = None,
    user_rating: float | None = None,
    metadata: Dict[str, Any] | None = None,
) -> MetricEvent:
    """Record a single usage event.

    Args:
        version_id: Version ID this event belongs to
        success: Whether the use was successful
        response_time: Response time in seconds (None if not applicable)
        tokens: Number of tokens used (None if not applicable)
        user_rating: User rating 1-5 (None if not provided)
        metadata: Additional event metadata

    Returns:
        Created MetricEvent

    Raises:
        ValueError: If user_rating is not in valid range
    """
    if user_rating is not None and not (1.0 <= user_rating <= 5.0):
        raise ValueError(f"User rating must be between 1.0 and 5.0, got {user_rating}")

    # Create event
    event = MetricEvent(
        version_id=version_id,
        timestamp=datetime.utcnow(),
        success=success,
        response_time=response_time,
        tokens=tokens,
        user_rating=user_rating,
        metadata=metadata or {},
    )

    # Store event
    if version_id not in self._events:
        self._events[version_id] = []
    self._events[version_id].append(event)

    # Update aggregated metrics
    await self._update_metrics(version_id, event)

    # Persist event if backend available
    if hasattr(self.storage, "append"):
        await self._persist_event(event)

    return event
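The _update_metrics helper is not shown. Given the PromptMetrics fields documented below, the per-event aggregation presumably amounts to something like this sketch (assumed, not the library's code):

def update_metrics(metrics: "PromptMetrics", event: "MetricEvent") -> None:
    """Fold one event into the running aggregates (illustrative sketch)."""
    metrics.total_uses += 1
    if event.success:
        metrics.success_count += 1
    else:
        metrics.error_count += 1
    if event.response_time is not None:
        metrics.total_response_time += event.response_time
    if event.tokens is not None:
        metrics.total_tokens += event.tokens
    if event.user_rating is not None:
        metrics.user_ratings.append(event.user_rating)
    metrics.last_used = event.timestamp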
get_metrics async
get_metrics(version_id: str) -> PromptMetrics

Get aggregated metrics for a version.

If no events have been recorded, returns empty metrics.

Parameters:

- version_id (str, required): Version ID

Returns:

- PromptMetrics: PromptMetrics with aggregated statistics

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/metrics.py
async def get_metrics(
    self,
    version_id: str,
) -> PromptMetrics:
    """Get aggregated metrics for a version.

    If no events have been recorded, returns empty metrics.

    Args:
        version_id: Version ID

    Returns:
        PromptMetrics with aggregated statistics
    """
    if version_id not in self._metrics:
        # Return empty metrics
        return PromptMetrics(version_id=version_id)

    return self._metrics[version_id]
compare_variants async
compare_variants(version_ids: List[str]) -> Dict[str, PromptMetrics]

Compare metrics across multiple versions.

Parameters:

- version_ids (List[str], required): List of version IDs to compare

Returns:

- Dict[str, PromptMetrics]: Dictionary mapping version_id to PromptMetrics

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/metrics.py
async def compare_variants(
    self,
    version_ids: List[str],
) -> Dict[str, PromptMetrics]:
    """Compare metrics across multiple versions.

    Args:
        version_ids: List of version IDs to compare

    Returns:
        Dictionary mapping version_id to PromptMetrics
    """
    comparison = {}
    for version_id in version_ids:
        comparison[version_id] = await self.get_metrics(version_id)
    return comparison
get_top_versions async
get_top_versions(
    version_ids: List[str], metric: str = "success_rate", limit: int = 5
) -> List[tuple[str, float]]

Get top performing versions by a specific metric.

Parameters:

- version_ids (List[str], required): List of version IDs to rank
- metric (str, default "success_rate"): Metric to rank by ("success_rate", "avg_rating", "avg_response_time", "avg_tokens")
- limit (int, default 5): Number of top versions to return

Returns:

- List[tuple[str, float]]: List of (version_id, metric_value) tuples, sorted by metric

Raises:

- ValueError: If metric name is invalid

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/metrics.py
async def get_top_versions(
    self,
    version_ids: List[str],
    metric: str = "success_rate",
    limit: int = 5,
) -> List[tuple[str, float]]:
    """Get top performing versions by a specific metric.

    Args:
        version_ids: List of version IDs to rank
        metric: Metric to rank by ("success_rate", "avg_rating", "avg_response_time")
        limit: Number of top versions to return

    Returns:
        List of (version_id, metric_value) tuples, sorted by metric

    Raises:
        ValueError: If metric name is invalid
    """
    valid_metrics = ["success_rate", "avg_rating", "avg_response_time", "avg_tokens"]
    if metric not in valid_metrics:
        raise ValueError(
            f"Invalid metric: {metric}. Valid metrics: {', '.join(valid_metrics)}"
        )

    # Get metrics for all versions
    all_metrics = await self.compare_variants(version_ids)

    # Extract metric values
    metric_values = [
        (vid, getattr(metrics, metric))
        for vid, metrics in all_metrics.items()
        if metrics.total_uses > 0  # Only include versions with data
    ]

    # Sort by metric value
    # For response_time, lower is better (reverse=False)
    # For success_rate, rating, higher is better (reverse=True)
    reverse = metric != "avg_response_time"
    sorted_versions = sorted(metric_values, key=lambda x: x[1], reverse=reverse)

    return sorted_versions[:limit]

PromptMetrics

dataknobs_llm.prompts.PromptMetrics dataclass

PromptMetrics(
    version_id: str,
    total_uses: int = 0,
    success_count: int = 0,
    error_count: int = 0,
    total_response_time: float = 0.0,
    total_tokens: int = 0,
    user_ratings: List[float] = list(),
    last_used: datetime | None = None,
    metadata: Dict[str, Any] = dict(),
)

Performance metrics for a prompt version.

Attributes:

- version_id (str): Version ID these metrics belong to
- total_uses (int): Total number of times this version was used
- success_count (int): Number of successful uses
- error_count (int): Number of errors/failures
- total_response_time (float): Total response time across all uses (seconds)
- total_tokens (int): Total tokens used across all uses
- user_ratings (List[float]): List of user ratings (1-5 scale)
- last_used (datetime | None): Timestamp of last use
- metadata (Dict[str, Any]): Additional custom metrics

Methods:

- from_dict: Create from dictionary.
- to_dict: Convert to dictionary for storage.

Properties:

- avg_rating (float): Calculate average user rating.
- avg_response_time (float): Calculate average response time.
- avg_tokens (float): Calculate average tokens per use.
- success_rate (float): Calculate success rate.
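These computed properties follow from the raw counters; the derivations are presumably as in this sketch (an assumption, guarding against division by zero):

def success_rate(m: "PromptMetrics") -> float:
    """Assumed derivation: successful uses over total uses (0.0 when unused)."""
    return m.success_count / m.total_uses if m.total_uses else 0.0

def avg_rating(m: "PromptMetrics") -> float:
    """Assumed derivation: mean of recorded ratings (0.0 when none)."""
    return sum(m.user_ratings) / len(m.user_ratings) if m.user_ratings else 0.0

# avg_response_time and avg_tokens presumably follow the same pattern,
# dividing total_response_time / total_tokens by total_uses.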

Functions

from_dict classmethod
from_dict(data: Dict[str, Any]) -> PromptMetrics

Create from dictionary.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PromptMetrics":
    """Create from dictionary."""
    data = data.copy()
    # Parse datetime
    if isinstance(data.get("last_used"), str):
        data["last_used"] = datetime.fromisoformat(data["last_used"])
    # Remove computed properties (they're recalculated)
    for key in ["success_rate", "avg_response_time", "avg_tokens", "avg_rating"]:
        data.pop(key, None)
    return cls(**data)
to_dict
to_dict() -> Dict[str, Any]

Convert to dictionary for storage.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary for storage."""
    return {
        "version_id": self.version_id,
        "total_uses": self.total_uses,
        "success_count": self.success_count,
        "error_count": self.error_count,
        "total_response_time": self.total_response_time,
        "total_tokens": self.total_tokens,
        "user_ratings": self.user_ratings,
        "last_used": self.last_used.isoformat() if self.last_used else None,
        "metadata": self.metadata,
        # Include computed properties
        "success_rate": self.success_rate,
        "avg_response_time": self.avg_response_time,
        "avg_tokens": self.avg_tokens,
        "avg_rating": self.avg_rating,
    }

MetricEvent

dataknobs_llm.prompts.MetricEvent dataclass

MetricEvent(
    version_id: str,
    timestamp: datetime = datetime.utcnow(),
    success: bool = True,
    response_time: float | None = None,
    tokens: int | None = None,
    user_rating: float | None = None,
    metadata: Dict[str, Any] = dict(),
)

Single event for metrics tracking.

Attributes:

- version_id (str): Version ID this event belongs to
- timestamp (datetime): When the event occurred
- success (bool): Whether the use was successful
- response_time (float | None): Response time in seconds (None if not applicable)
- tokens (int | None): Number of tokens used (None if not applicable)
- user_rating (float | None): User rating 1-5 (None if not provided)
- metadata (Dict[str, Any]): Additional event metadata

Methods:

- from_dict: Create from dictionary.
- to_dict: Convert to dictionary for storage.

Functions

from_dict classmethod
from_dict(data: Dict[str, Any]) -> MetricEvent

Create from dictionary.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "MetricEvent":
    """Create from dictionary."""
    data = data.copy()
    if isinstance(data.get("timestamp"), str):
        data["timestamp"] = datetime.fromisoformat(data["timestamp"])
    return cls(**data)
to_dict
to_dict() -> Dict[str, Any]

Convert to dictionary for storage.

Source code in packages/llm/src/dataknobs_llm/prompts/versioning/types.py
def to_dict(self) -> Dict[str, Any]:
    """Convert to dictionary for storage."""
    return {
        "version_id": self.version_id,
        "timestamp": self.timestamp.isoformat(),
        "success": self.success,
        "response_time": self.response_time,
        "tokens": self.tokens,
        "user_rating": self.user_rating,
        "metadata": self.metadata,
    }

Usage Examples

Version Management

from dataknobs_llm.prompts import VersionManager, VersionStatus

# Create version manager
vm = VersionManager()

# Create first version
v1 = await vm.create_version(
    name="greeting",
    prompt_type="system",
    template="Hello {{name}}!",
    version="1.0.0",
    metadata={"author": "alice", "description": "Initial version"}
)

# Auto-increment version
v2 = await vm.create_version(
    name="greeting",
    prompt_type="system",
    template="Hi {{name}}, welcome!",
    # version="1.0.1" automatically assigned
    metadata={"author": "bob", "changes": "More friendly"}
)

# Get specific version
version = await vm.get_version("greeting", "system", "1.0.0")

# Get latest version (no version specified)
latest = await vm.get_version("greeting", "system")

# List all versions
versions = await vm.list_versions("greeting", "system")
for v in versions:
    print(f"{v.version}: {v.metadata.get('description')}")

Version Status Lifecycle

# Create as draft
draft = await vm.create_version(
    name="new_feature",
    prompt_type="user",
    template="...",
    status=VersionStatus.DRAFT
)

# Promote to active
await vm.update_status(draft.version_id, VersionStatus.ACTIVE)

# Tag as production
await vm.tag_version(draft.version_id, "production")

# Later, deprecate old version
await vm.update_status(old_version.version_id, VersionStatus.DEPRECATED)

A/B Testing

from dataknobs_llm.prompts import ABTestManager, PromptVariant

# Create A/B test manager
ab = ABTestManager()

# Create experiment
exp = await ab.create_experiment(
    name="greeting",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Control"),
        PromptVariant("1.0.1", 0.5, "Treatment")
    ],
    metadata={
        "hypothesis": "Friendly greeting increases engagement",
        "owner": "product-team"
    }
)

# Get variant for user (sticky - same user always gets same variant)
variant = await ab.get_variant_for_user(exp.experiment_id, "user123")
print(f"User 123 gets version: {variant}")

# Same user always gets same variant
variant2 = await ab.get_variant_for_user(exp.experiment_id, "user123")
assert variant == variant2

# Random variant (for batch processing)
random_variant = await ab.get_random_variant(exp.experiment_id)

Multi-Variant Testing (A/B/C)

# Create A/B/C test
exp = await ab.create_experiment(
    name="tone",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.34, "Formal"),
        PromptVariant("1.1.0", 0.33, "Casual"),
        PromptVariant("1.2.0", 0.33, "Enthusiastic")
    ]
)

# Traffic automatically normalized to sum to 1.0

Unequal Traffic Splits

# Safer rollout: 90% control, 10% experimental
exp = await ab.create_experiment(
    name="new_feature",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.9, "Stable"),
        PromptVariant("2.0.0", 0.1, "Experimental")
    ]
)

Metrics Tracking

from dataknobs_llm.prompts import MetricsCollector

# Create metrics collector
mc = MetricsCollector()

# Record comprehensive event
await mc.record_event(
    version_id=version.version_id,
    success=True,
    response_time=0.5,  # seconds
    tokens=150,
    user_rating=4.5,  # 1-5 scale
    metadata={"user_segment": "premium", "feature": "code_review"}
)

# Record minimal event
await mc.record_event(
    version_id=version.version_id,
    success=True
)

# Record failure
await mc.record_event(
    version_id=version.version_id,
    success=False,
    metadata={"error": "timeout"}
)

Analyzing Results

# Get metrics for a version
metrics = await mc.get_metrics(version.version_id)
print(f"Success rate: {metrics.success_rate:.2%}")
print(f"Total events: {metrics.total_events}")
print(f"Avg response time: {metrics.avg_response_time:.2f}s")
print(f"Avg tokens: {metrics.avg_tokens:.0f}")
print(f"Avg rating: {metrics.avg_rating:.1f}/5.0")

# Compare variants
comparison = await mc.compare_variants([v1.version_id, v2.version_id])

for version_id, metrics in comparison.items():
    print(f"\nVersion {version_id}:")
    print(f"  Success rate: {metrics.success_rate:.2%}")
    print(f"  Avg response time: {metrics.avg_response_time:.2f}s")
    print(f"  Avg rating: {metrics.avg_rating:.1f}/5.0")
    print(f"  Total events: {metrics.total_events}")

# Find top performers
top_versions = await mc.get_top_versions(
    version_ids=[v1.version_id, v2.version_id, v3.version_id],
    metric="success_rate",
    limit=3
)

print("Top 3 versions by success rate:")
for version_id, success_rate in top_versions:
    print(f"  {version_id}: {success_rate:.2%}")

Full A/B Test Workflow

from dataknobs_llm.prompts import (
    VersionManager,
    ABTestManager,
    MetricsCollector,
    PromptVariant
)

# 1. Create versions
vm = VersionManager()
v1 = await vm.create_version(
    name="email_subject",
    prompt_type="user",
    template="Generate subject for: {{topic}}",
    version="1.0.0"
)

v2 = await vm.create_version(
    name="email_subject",
    prompt_type="user",
    template="Generate engaging subject line for: {{topic}}",
    version="1.1.0"
)

# 2. Create experiment
ab = ABTestManager()
exp = await ab.create_experiment(
    name="email_subject",
    prompt_type="user",
    variants=[
        PromptVariant("1.0.0", 0.5, "Basic"),
        PromptVariant("1.1.0", 0.5, "Engaging")
    ]
)

# 3. Run experiment
mc = MetricsCollector()

for user_id in users:
    # Get variant for user
    variant_version = await ab.get_variant_for_user(exp.experiment_id, user_id)
    version = await vm.get_version("email_subject", "user", variant_version)

    # Use version in your application
    success = execute_with_version(version, user_id)

    # Track metrics
    await mc.record_event(
        version_id=version.version_id,
        success=success,
        response_time=response_time,
        user_rating=user_rating
    )

# 4. Analyze and deploy winner
comparison = await mc.compare_variants([v1.version_id, v2.version_id])

if comparison[v2.version_id].success_rate > comparison[v1.version_id].success_rate:
    # Deploy winner
    await vm.tag_version(v2.version_id, "production")
    await ab.update_experiment_status(exp.experiment_id, "completed")
else:
    # Keep control
    await vm.tag_version(v1.version_id, "production")

Rollback Scenario

# Deploy new version
new_version = await vm.create_version(
    name="greeting",
    prompt_type="system",
    template="New experimental greeting",
    version="2.0.0"
)
await vm.tag_version(new_version.version_id, "production")

# Monitor metrics...
metrics = await mc.get_metrics(new_version.version_id)

if metrics.success_rate < 0.9:
    # Rollback!
    versions = await vm.list_versions("greeting", "system")
    previous = versions[1]  # Second newest

    await vm.untag_version(new_version.version_id, "production")
    await vm.tag_version(previous.version_id, "production")
    await vm.update_status(new_version.version_id, VersionStatus.DEPRECATED)

    print(f"Rolled back to version {previous.version}")

Versioned Prompt Library

from dataknobs_llm.prompts import VersionedPromptLibrary

# Create versioned library
library = VersionedPromptLibrary()

# Create versions
await library.create_version(
    name="greeting",
    prompt_type="system",
    template="Hello {{name}}!",
    version="1.0.0"
)

# Create A/B test
await library.create_experiment(
    name="greeting",
    prompt_type="system",
    variants=[
        PromptVariant("1.0.0", 0.5, "Control"),
        PromptVariant("1.0.1", 0.5, "Treatment")
    ]
)

# Use with prompt builder (automatically selects variant)
from dataknobs_llm.prompts import AsyncPromptBuilder

builder = AsyncPromptBuilder(
    library=library,
    user_id="user123"  # For sticky assignment
)

result = await builder.render_system_prompt("greeting", {"name": "Alice"})
# Uses versioned prompt based on A/B test

Best Practices

1. Semantic Versioning

# Major: Breaking changes
await vm.create_version(..., version="2.0.0")

# Minor: New features, backward compatible
await vm.create_version(..., version="1.1.0")

# Patch: Bug fixes, minor improvements
await vm.create_version(..., version="1.0.1")

2. Comprehensive Metadata

await vm.create_version(
    ...,
    metadata={
        "author": "alice",
        "description": "Fixed hallucination issue",
        "jira_ticket": "PROMPT-123",
        "tested": True,
        "review_approved_by": "bob"
    }
)

3. User-Sticky for Consistency

# ✅ User-facing apps
variant = await ab.get_variant_for_user(exp_id, user_id)

# ❌ Don't use random for user-facing (inconsistent UX)
variant = await ab.get_random_variant(exp_id)

4. Track All Relevant Metrics

await mc.record_event(
    version_id=v_id,
    success=success,
    response_time=time,
    tokens=tokens,
    user_rating=rating,
    metadata={
        "context": "code_review",
        "language": "python",
        "user_segment": "premium"
    }
)

See Also