Best Practices¶
Guidelines and recommendations for using Dataknobs in production.
Installation and Deployment¶
Version Pinning¶
Always pin exact versions in production:
# pyproject.toml
[project]
dependencies = [
    "dataknobs-structures==1.0.0",
    "dataknobs-utils==1.0.0",
    "dataknobs-xization==1.0.0",
]
Virtual Environments¶
Always use virtual environments:
# Using uv
uv venv
source .venv/bin/activate
uv pip install dataknobs-structures
# Using standard venv
python -m venv venv
source venv/bin/activate
pip install dataknobs-structures
Code Organization¶
Import Organization¶
# Good: Group imports by package
from dataknobs_structures import Tree, Text, TextMetaData
from dataknobs_utils import json_utils, file_utils
from dataknobs_xization import basic_normalization_fn
# Avoid: Scattered imports
from dataknobs_structures import Tree
from dataknobs_utils import json_utils
from dataknobs_structures import Text # Scattered
Module Structure¶
# project/
# ├── core/
# │   ├── __init__.py
# │   ├── models.py        # Data models using dataknobs_structures
# │   └── processors.py    # Processing logic using dataknobs_xization
# ├── utils/
# │   ├── __init__.py
# │   └── helpers.py       # Utilities using dataknobs_utils
# └── main.py
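A main.py for this layout might wire the pieces together roughly as follows. This is a sketch only: Document, normalize_doc, and load_json are hypothetical names standing in for whatever core.models, core.processors, and utils.helpers actually expose.
# main.py -- sketch; module contents are hypothetical stand-ins
from core.models import Document          # core/models.py wraps dataknobs_structures
from core.processors import normalize_doc # core/processors.py wraps dataknobs_xization
from utils.helpers import load_json       # utils/helpers.py wraps dataknobs_utils

def main():
    record = load_json("input.json")
    doc = Document(record["text"])
    print(normalize_doc(doc))

if __name__ == "__main__":
    main()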
Error Handling¶
Graceful Degradation¶
from dataknobs_structures import Tree
import logging

logger = logging.getLogger(__name__)

def safe_tree_operation(tree_string):
    # build_tree_from_string is the tree-parsing helper used throughout these examples
    try:
        tree = build_tree_from_string(tree_string)
        return tree
    except ValueError as e:
        logger.warning(f"Invalid tree format: {e}")
        return Tree()  # Return empty tree
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise  # Re-raise unexpected errors
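For example, assuming the parser rejects a malformed specification with a ValueError, the caller sees an empty tree instead of an exception:
# Invalid input: the ValueError is logged and an empty tree comes back
tree = safe_tree_operation("not -> a -> valid -> spec ->")
if tree.root is None:
    logger.info("Falling back to empty tree")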
Input Validation¶
def process_document(text, metadata=None):
    # Validate inputs
    if not isinstance(text, str):
        raise TypeError(f"Expected str, got {type(text)}")
    if not text.strip():
        raise ValueError("Text cannot be empty")
    if metadata and not isinstance(metadata, dict):
        raise TypeError(f"Metadata must be dict, got {type(metadata)}")

    # Process
    from dataknobs_structures import Text, TextMetaData
    meta = TextMetaData(**metadata) if metadata else TextMetaData()
    return Text(text, meta)
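Usage, assuming source is a valid TextMetaData keyword (as in the integration test later on this page):
# Valid input returns a Text object; invalid input fails fast with a clear error
doc = process_document("Hello world", metadata={"source": "user_upload"})

process_document("")      # raises ValueError: Text cannot be empty
process_document(123)     # raises TypeError: Expected str, got <class 'int'>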
Performance¶
Lazy Loading¶
class DocumentProcessor:
    def __init__(self):
        self._tokenizer = None
        self._normalizer = None

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            from dataknobs_xization import masking_tokenizer
            self._tokenizer = masking_tokenizer.MaskingTokenizer()
        return self._tokenizer

    @property
    def normalizer(self):
        if self._normalizer is None:
            from dataknobs_xization import basic_normalization_fn
            self._normalizer = basic_normalization_fn
        return self._normalizer
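Nothing is imported or constructed until first access, so creating the processor stays cheap. The tokenize() call below mirrors the MaskingTokenizer usage shown in the security section:
processor = DocumentProcessor()   # cheap: no dataknobs_xization import yet

# First access builds and caches the tokenizer; later accesses reuse it
tokens = processor.tokenizer.tokenize("lazy loading example")
normalized = processor.normalizer("Lazy LOADING Example")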
Batch Processing¶
def process_documents_batch(documents, batch_size=100):
    from dataknobs_xization import basic_normalization_fn

    results = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        # Process batch
        batch_results = [basic_normalization_fn(doc) for doc in batch]
        results.extend(batch_results)
        # Optional: yield for streaming
        # yield batch_results
    return results
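If the full result list does not need to live in memory, the commented-out yield can become a streaming variant. A sketch (store() is a placeholder for your own sink, not a library function):
def process_documents_streaming(documents, batch_size=100):
    from dataknobs_xization import basic_normalization_fn

    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        # Each batch is handed to the caller as soon as it is ready
        yield [basic_normalization_fn(doc) for doc in batch]

# Usage
for batch_results in process_documents_streaming(documents):
    store(batch_results)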
Memory Management¶
Resource Cleanup¶
from contextlib import contextmanager

@contextmanager
def large_tree_processor(tree_data):
    from dataknobs_structures import Tree
    tree = None
    try:
        tree = Tree.from_data(tree_data)
        yield tree
    finally:
        # Cleanup (check against None so an empty-but-valid tree is still cleared)
        if tree is not None:
            tree.clear()
            del tree

# Usage
with large_tree_processor(data) as tree:
    process_tree(tree)
# Tree is automatically cleaned up
Generator Patterns¶
import json

def read_large_dataset(filepath):
    with open(filepath, 'r') as f:
        for line in f:
            # Process line by line instead of loading the whole file
            data = json.loads(line)
            # Assumes each JSONL record has a "text" field and optional "metadata"
            yield process_document(data["text"], data.get("metadata"))

# Memory-efficient processing (handle_document is your downstream consumer)
for processed in read_large_dataset("large_file.jsonl"):
    handle_document(processed)
Testing¶
Unit Testing¶
import pytest
from dataknobs_structures import Tree

class TestTreeOperations:
    def test_tree_creation(self):
        tree = Tree()
        assert tree.root is None
        tree.add_root("root")
        assert tree.root.value == "root"

    def test_tree_traversal(self):
        tree = build_tree_from_string("root -> a, b")
        nodes = list(tree.traverse())
        assert len(nodes) == 3
        assert nodes[0].value == "root"

    @pytest.mark.parametrize("input_str,expected_nodes", [
        ("root", 1),
        ("root -> a", 2),
        ("root -> a, b\na -> c", 4),
    ])
    def test_tree_sizes(self, input_str, expected_nodes):
        tree = build_tree_from_string(input_str)
        assert len(list(tree.traverse())) == expected_nodes
Integration Testing¶
def test_full_pipeline():
    from dataknobs_structures import Text, TextMetaData
    from dataknobs_xization import basic_normalization_fn
    from dataknobs_utils import json_utils

    # Create document
    metadata = TextMetaData(source="test")
    doc = Text(" TEST Document ", metadata)

    # Process
    normalized = basic_normalization_fn(doc.content)

    # Store result
    result = {
        "original": doc.content,
        "normalized": normalized,
        "metadata": {"source": doc.metadata.source}
    }

    # Verify
    assert json_utils.get_value(result, "normalized") == "test document"
    assert json_utils.get_value(result, "metadata.source") == "test"
Logging¶
Structured Logging¶
import json
import logging
from datetime import datetime

class StructuredLogger:
    def __init__(self, name):
        self.logger = logging.getLogger(name)

    def log_operation(self, operation, data, level=logging.INFO):
        log_entry = {
            "operation": operation,
            "data": data,
            "timestamp": datetime.now().isoformat()
        }
        self.logger.log(level, json.dumps(log_entry))

# Usage
logger = StructuredLogger(__name__)
logger.log_operation(
    "tree_processed",
    {"nodes": 100, "depth": 5, "time_ms": 45}
)
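To make these entries useful, attach a handler so the JSON lines land somewhere a log pipeline can pick them up; the file name below is illustrative:
import logging

handler = logging.FileHandler("operations.jsonl")        # illustrative destination
handler.setFormatter(logging.Formatter("%(message)s"))   # keep entries as raw JSON
logging.getLogger(__name__).addHandler(handler)
logging.getLogger(__name__).setLevel(logging.INFO)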
Security¶
Input Sanitization¶
from dataknobs_xization import basic_normalization_fn
import re

def sanitize_input(text):
    # Remove potential injection patterns
    text = re.sub(r'[<>"\']', '', text)
    # Normalize
    text = basic_normalization_fn(text)
    # Length limit
    max_length = 10000
    if len(text) > max_length:
        text = text[:max_length]
    return text
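Usage; given the normalization behavior shown in the integration test (lowercasing and trimming), the markup-like characters are stripped before normalization and the result is capped at 10,000 characters:
# Angle brackets and quotes removed, then normalized and length-limited
clean = sanitize_input('  <b>Hello   "WORLD"</b>  ')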
Sensitive Data Handling¶
from dataknobs_xization import masking_tokenizer

class SecureProcessor:
    def __init__(self):
        self.tokenizer = masking_tokenizer.MaskingTokenizer()
        self.tokenizer.add_pattern(r'\b\d{3}-\d{2}-\d{4}\b', 'SSN')
        self.tokenizer.add_pattern(r'\b\d{16}\b', 'CREDIT_CARD')

    def process_sensitive(self, text):
        # Mask sensitive data
        tokens = self.tokenizer.tokenize(text)
        masked = ' '.join([
            '[REDACTED]' if t.type in ['SSN', 'CREDIT_CARD'] else t.value
            for t in tokens
        ])
        return masked
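Usage; the patterns registered in __init__ decide what gets replaced:
processor = SecureProcessor()
safe_text = processor.process_sensitive("Customer SSN 123-45-6789, card 4111111111111111")
# Both the SSN and the 16-digit card number come back as [REDACTED]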
Monitoring¶
Performance Metrics¶
import time
from contextlib import contextmanager

@contextmanager
def measure_time(operation_name):
    start = time.time()
    try:
        yield
    finally:
        duration = time.time() - start
        print(f"{operation_name} took {duration:.3f}s")
        # Send to monitoring system (send_metric is a placeholder for your metrics client)
        send_metric(f"dataknobs.{operation_name}.duration", duration)

# Usage
with measure_time("tree_processing"):
    process_large_tree(tree)
Migration from Legacy¶
If migrating from the legacy dataknobs package:
# Old code
try:
    from dataknobs.structures import Tree  # Legacy
except ImportError:
    from dataknobs_structures import Tree  # New

# Better: Use only new packages
from dataknobs_structures import Tree
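If a gradual migration is unavoidable, centralize the fallback in one compatibility module instead of repeating the try/except at every call site. A sketch (compat.py is a name of your choosing):
# compat.py -- single place for the legacy fallback
try:
    from dataknobs_structures import Tree    # new package, preferred
except ImportError:
    from dataknobs.structures import Tree    # legacy package

__all__ = ["Tree"]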
Common Pitfalls¶
Avoid These Patterns¶
# Bad: Modifying trees during traversal
for node in tree.traverse():
    if node.value == "remove":
        tree.remove_node(node)  # Don't do this!

# Good: Collect then modify
nodes_to_remove = [n for n in tree.traverse() if n.value == "remove"]
for node in nodes_to_remove:
    tree.remove_node(node)

# Bad: Not handling empty results
result = elasticsearch_utils.search(query)
first = result[0]  # May fail!

# Good: Check first
result = elasticsearch_utils.search(query)
if result:
    first = result[0]
else:
    handle_empty_result()
Recommended Tools¶
- Linting: ruff for fast Python linting
- Type Checking: mypy for static type analysis
- Testing: pytest with coverage reporting
- Documentation: mkdocs with Material theme
- Package Management: uv for fast, reliable dependency management