Best Practices¶

Guidelines and recommendations for using Dataknobs in production.

Installation and Deployment¶

Version Pinning¶

Always pin exact versions in production:

# pyproject.toml
[project]
dependencies = [
    "dataknobs-structures==1.0.0",
    "dataknobs-utils==1.0.0",
    "dataknobs-xization==1.0.0",
]

Virtual Environments¶

Always use virtual environments:

# Using uv
uv venv
source .venv/bin/activate
uv pip install dataknobs-structures

# Using standard venv
python -m venv venv
source venv/bin/activate
pip install dataknobs-structures

Code Organization¶

Import Organization¶

# Good: Group imports by package
from dataknobs_structures import Tree, Text, TextMetaData
from dataknobs_utils import json_utils, file_utils
from dataknobs_xization import basic_normalization_fn

# Avoid: Scattered imports
from dataknobs_structures import Tree
from dataknobs_utils import json_utils
from dataknobs_structures import Text  # Scattered

Module Structure¶

# project/
#   ├── core/
#   │   ├── __init__.py
#   │   ├── models.py      # Data models using dataknobs_structures
#   │   └── processors.py  # Processing logic using dataknobs_xization
#   ├── utils/
#   │   ├── __init__.py
#   │   └── helpers.py     # Utilities using dataknobs_utils
#   └── main.py

Error Handling¶

Graceful Degradation¶

from dataknobs_structures import Tree
import logging

logger = logging.getLogger(__name__)

def safe_tree_operation(tree_string):
    """Parse ``tree_string`` into a tree, degrading gracefully on bad input.

    Args:
        tree_string: Text handed to ``build_tree_from_string``.

    Returns:
        The parsed tree, or a fresh ``Tree()`` when parsing raises
        ``ValueError``.

    Raises:
        Exception: Any non-``ValueError`` failure is logged and re-raised.
    """
    try:
        parsed = build_tree_from_string(tree_string)
    except ValueError as exc:
        # Malformed input is an expected condition: warn and fall back.
        logger.warning(f"Invalid tree format: {exc}")
        return Tree()  # Return empty tree
    except Exception as exc:
        # Anything else is a real problem; record it and let it propagate.
        logger.error(f"Unexpected error: {exc}")
        raise  # Re-raise unexpected errors
    else:
        return parsed

Input Validation¶

def process_document(text, metadata=None):
    """Validate ``text`` and ``metadata`` and wrap them in a ``Text`` object.

    Args:
        text: Document content; must be a ``str`` that is not blank.
        metadata: Optional ``dict`` of keyword arguments for ``TextMetaData``.
            ``None`` or an empty dict produces a default ``TextMetaData()``.

    Returns:
        ``Text(text, meta)`` built from the validated inputs.

    Raises:
        TypeError: If ``text`` is not a ``str``, or ``metadata`` is neither
            ``None`` nor a ``dict``.
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    # Validate inputs
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text)}")

    if not text.strip():
        raise ValueError("Text cannot be empty")

    # Compare against None explicitly: a truthiness test would let falsy
    # non-dict values ([], "", 0) bypass the type check and be ignored.
    if metadata is not None and not isinstance(metadata, dict):
        raise TypeError(f"Metadata must be dict, got {type(metadata)}")

    # Process
    from dataknobs_structures import Text, TextMetaData
    meta = TextMetaData(**metadata) if metadata else TextMetaData()
    return Text(text, meta)

Performance¶

Lazy Loading¶

class DocumentProcessor:
    """Holds a tokenizer and a normalizer that are created on first access."""

    def __init__(self):
        # Both slots start empty; the properties below fill them lazily.
        self._tokenizer = None
        self._normalizer = None

    @property
    def tokenizer(self):
        """Return the cached tokenizer, building it on first use."""
        if self._tokenizer is not None:
            return self._tokenizer

        # Deferred import: dataknobs_xization is only loaded when needed.
        from dataknobs_xization import masking_tokenizer
        self._tokenizer = masking_tokenizer.MaskingTokenizer()
        return self._tokenizer

    @property
    def normalizer(self):
        """Return the cached normalization function, importing it on first use."""
        if self._normalizer is not None:
            return self._normalizer

        from dataknobs_xization import basic_normalization_fn
        self._normalizer = basic_normalization_fn
        return self._normalizer

Batch Processing¶

def process_documents_batch(documents, batch_size=100):
    """Normalize ``documents`` in fixed-size batches.

    Args:
        documents: Sequence of inputs for ``basic_normalization_fn``; it must
            support ``len()`` and slicing.
        batch_size: Number of documents handled per batch; must be positive.

    Returns:
        A list holding one normalized result per input document, in input
        order.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # Fail fast: a negative step would make range() empty and silently drop
    # every document, and a zero step makes range() raise anyway.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    from dataknobs_xization import basic_normalization_fn

    results = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]

        # Process batch
        results.extend(basic_normalization_fn(doc) for doc in batch)

        # To stream instead of accumulating, turn this function into a
        # generator and yield each batch's results at this point.

    return results

Memory Management¶

Resource Cleanup¶

from contextlib import contextmanager

@contextmanager
def large_tree_processor(tree_data):
    """Build a tree from ``tree_data``, yield it, and clear it on exit.

    Args:
        tree_data: Input passed unchanged to ``Tree.from_data``.

    Yields:
        The object returned by ``Tree.from_data(tree_data)``.

    Cleanup runs in ``finally``, so the tree is cleared whether the ``with``
    body finishes normally or raises.
    """
    from dataknobs_structures import Tree

    tree = None
    try:
        tree = Tree.from_data(tree_data)
        yield tree
    finally:
        # Cleanup. Use an identity check rather than truthiness so a tree
        # that evaluates as falsy is still cleared; ``tree`` stays None only
        # when construction itself failed.
        if tree is not None:
            tree.clear()

# Usage
with large_tree_processor(data) as tree:
    process_tree(tree)
# Tree is automatically cleaned up

Generator Patterns¶

def read_large_dataset(filepath):
    """Lazily yield processed documents from a JSON Lines file.

    The file is read one line at a time, so it is never loaded into memory
    as a whole. Blank lines are skipped.

    Args:
        filepath: Path of a UTF-8 text file with one JSON value per line.

    Yields:
        The result of ``process_document`` for each parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Local import keeps the generator self-contained.
    import json

    # Explicit encoding: otherwise the platform default is used, which is
    # not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank separator / trailing lines.
            if not line.strip():
                continue

            # Process line by line instead of loading all
            data = json.loads(line)
            # NOTE(review): process_document is defined outside this block;
            # confirm it accepts the parsed JSON value as its input.
            yield process_document(data)

# Memory-efficient processing
for processed in read_large_dataset("large_file.jsonl"):
    handle_document(processed)

Testing¶

Unit Testing¶

import pytest
from dataknobs_structures import Tree

class TestTreeOperations:
    """Unit tests for creating and traversing trees."""

    def test_tree_creation(self):
        """A new tree has no root until ``add_root`` is called."""
        empty_tree = Tree()
        assert empty_tree.root is None

        empty_tree.add_root("root")
        root_node = empty_tree.root
        assert root_node.value == "root"

    def test_tree_traversal(self):
        """Traversal visits all three nodes, starting with the root."""
        visited = list(build_tree_from_string("root -> a, b").traverse())
        assert len(visited) == 3
        assert visited[0].value == "root"

    @pytest.mark.parametrize("input_str,expected_nodes", [
        ("root", 1),
        ("root -> a", 2),
        ("root -> a, b\na -> c", 4),
    ])
    def test_tree_sizes(self, input_str, expected_nodes):
        """The number of traversed nodes matches the tree description."""
        parsed = build_tree_from_string(input_str)
        node_count = sum(1 for _ in parsed.traverse())
        assert node_count == expected_nodes

Integration Testing¶

def test_full_pipeline():
    """End-to-end check: build a ``Text``, normalize it, read values back."""
    from dataknobs_structures import Text, TextMetaData
    from dataknobs_utils import json_utils
    from dataknobs_xization import basic_normalization_fn

    # Create document
    document = Text("  TEST Document  ", TextMetaData(source="test"))

    # Process
    cleaned = basic_normalization_fn(document.content)

    # Store result
    record = {
        "original": document.content,
        "normalized": cleaned,
        "metadata": {"source": document.metadata.source},
    }

    # Verify, including a dotted path lookup into the nested metadata
    expectations = (
        ("normalized", "test document"),
        ("metadata.source", "test"),
    )
    for path, expected in expectations:
        assert json_utils.get_value(record, path) == expected

Logging¶

Structured Logging¶

import logging
import json

class StructuredLogger:
    """Emit log records whose message is a JSON-encoded object."""

    def __init__(self, name):
        """Wrap the standard-library logger registered under ``name``."""
        self.logger = logging.getLogger(name)

    def log_operation(self, operation, data, level=logging.INFO):
        """Log one operation as a JSON object.

        Args:
            operation: Value stored under the ``"operation"`` key.
            data: Value stored under the ``"data"`` key; it must be
                serializable by ``json.dumps``.
            level: Logging level for the record (default ``logging.INFO``).

        Raises:
            TypeError: If ``data`` cannot be serialized by ``json.dumps``.
        """
        # Local import so the method brings its own dependency into scope;
        # without it the timestamp expression below raises NameError.
        from datetime import datetime

        log_entry = {
            "operation": operation,
            "data": data,
            "timestamp": datetime.now().isoformat()
        }
        self.logger.log(level, json.dumps(log_entry))

# Usage
logger = StructuredLogger(__name__)
logger.log_operation(
    "tree_processed",
    {"nodes": 100, "depth": 5, "time_ms": 45}
)

Security¶

Input Sanitization¶

from dataknobs_xization import basic_normalization_fn
import re

def sanitize_input(text, max_length=10000):
    """Strip markup/quote characters, normalize, and cap the length of ``text``.

    Args:
        text: Input string to clean.
        max_length: Maximum number of characters kept after normalization
            (default 10000).

    Returns:
        The cleaned string, truncated to at most ``max_length`` characters.

    Note:
        This only removes the characters ``<``, ``>``, ``"`` and ``'``; it is
        not a replacement for context-specific escaping (SQL, HTML, shell).
    """
    # Remove potential injection patterns
    text = re.sub(r'[<>"\']', '', text)

    # Normalize
    text = basic_normalization_fn(text)

    # Length limit: slicing leaves shorter strings unchanged.
    return text[:max_length]

Sensitive Data Handling¶

from dataknobs_xization import masking_tokenizer

class SecureProcessor:
    """Redacts tokens the tokenizer labels as SSN or credit-card numbers."""

    def __init__(self):
        """Create a masking tokenizer and register the sensitive patterns."""
        self.tokenizer = masking_tokenizer.MaskingTokenizer()
        sensitive_patterns = (
            (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN'),
            (r'\b\d{16}\b', 'CREDIT_CARD'),
        )
        for pattern, label in sensitive_patterns:
            self.tokenizer.add_pattern(pattern, label)

    def process_sensitive(self, text):
        """Return ``text`` re-joined with spaces, sensitive tokens redacted.

        Args:
            text: Input passed to the tokenizer's ``tokenize`` method.

        Returns:
            The token values joined by single spaces, with every token whose
            ``type`` is ``'SSN'`` or ``'CREDIT_CARD'`` replaced by
            ``'[REDACTED]'``.
        """
        # Mask sensitive data
        pieces = []
        for token in self.tokenizer.tokenize(text):
            if token.type in ('SSN', 'CREDIT_CARD'):
                pieces.append('[REDACTED]')
            else:
                pieces.append(token.value)
        return ' '.join(pieces)

Monitoring¶

Performance Metrics¶

import time
from contextlib import contextmanager

@contextmanager
def measure_time(operation_name):
    """Time the enclosed ``with`` body and report its duration.

    Args:
        operation_name: Label used in the printed message and in the metric
            name ``dataknobs.<operation_name>.duration``.

    The duration is reported from ``finally``, so it is emitted whether the
    body completes normally or raises.
    """
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # system clock, which can be adjusted (e.g. by NTP) mid-measurement and
    # yield wrong or even negative durations.
    start = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start
        print(f"{operation_name} took {duration:.3f}s")

        # Send to monitoring system
        send_metric(f"dataknobs.{operation_name}.duration", duration)

# Usage
with measure_time("tree_processing"):
    process_large_tree(tree)

Migration from Legacy¶

If migrating from the legacy dataknobs package:

# Old code
try:
    from dataknobs.structures import Tree  # Legacy
except ImportError:
    from dataknobs_structures import Tree  # New

# Better: Use only new packages
from dataknobs_structures import Tree

Common Pitfalls¶

Avoid These Patterns¶

# Bad: Modifying trees during traversal
for node in tree.traverse():
    if node.value == "remove":
        tree.remove_node(node)  # Don't do this!

# Good: Collect then modify
nodes_to_remove = [n for n in tree.traverse() if n.value == "remove"]
for node in nodes_to_remove:
    tree.remove_node(node)

# Bad: Not handling empty results
result = elasticsearch_utils.search(query)
first = result[0]  # May fail!

# Good: Check first
result = elasticsearch_utils.search(query)
if result:
    first = result[0]
else:
    handle_empty_result()

Recommended Tools¶

Linting: ruff for fast Python linting
Type Checking: mypy for static type analysis
Testing: pytest with coverage reporting
Documentation: mkdocs with Material theme
Package Management: uv for fast, reliable dependency management

Best Practices¶

Installation and Deployment¶

Version Pinning¶

Virtual Environments¶

Code Organization¶

Import Organization¶

Module Structure¶

Error Handling¶

Graceful Degradation¶

Input Validation¶

Performance¶

Lazy Loading¶

Batch Processing¶

Memory Management¶

Resource Cleanup¶

Generator Patterns¶

Testing¶

Unit Testing¶

Integration Testing¶

Logging¶

Structured Logging¶

Security¶

Input Sanitization¶

Sensitive Data Handling¶

Monitoring¶

Performance Metrics¶

Migration from Legacy¶

Common Pitfalls¶

Avoid These Patterns¶

Recommended Tools¶

Support and Resources¶