Text Normalization Examples¶
This guide demonstrates text normalization and processing using the dataknobs-xization package.
Basic Normalization¶
Simple Text Cleaning¶
from dataknobs_xization import normalize
# Basic normalization
text = " Hello World! "
normalized = normalize.normalize_whitespace_fn(text)
print(f"Original: '{text}'")
print(f"Normalized: '{normalized}'")
# Output: 'Hello World!'
Expanding CamelCase¶
from dataknobs_xization import normalize
# Expand camelCase identifiers into space-separated words.
camel_text = "getUserNameAndEmail"
expanded = normalize.expand_camelcase_fn(camel_text)
print(f"Original: {camel_text}")
print(f"Expanded: {expanded}")
# Output: 'get User Name And Email'
# Runs of capitals (acronyms) are kept together as a single word,
# per the expected output below.
acronym_text = "XMLHttpRequest"
expanded = normalize.expand_camelcase_fn(acronym_text)
print(f"Expanded: {expanded}")
# Output: 'XML Http Request'
Expanding Ampersands¶
from dataknobs_xization import normalize
# Replace '&' with the word 'and' (spacing per the expected output).
text = "Research & Development"
expanded = normalize.expand_ampersand_fn(text)
print(f"Original: {text}")
print(f"Expanded: {expanded}")
# Output: 'Research and Development'
# Every ampersand occurrence is expanded, not just the first.
text = "A & B & C"
expanded = normalize.expand_ampersand_fn(text)
print(f"Expanded: {expanded}")
# Output: 'A and B and C'
Combined Normalizations¶
Full Text Normalization Pipeline¶
from dataknobs_xization import normalize

def full_normalization(text):
    """Apply all normalization steps.

    Pipeline: expand camelCase -> expand ampersands -> collapse
    whitespace -> lowercase.  Returns the normalized string.
    """
    pipeline = (
        normalize.expand_camelcase_fn,   # Step 1: expand camelCase
        normalize.expand_ampersand_fn,   # Step 2: expand ampersands
        normalize.normalize_whitespace_fn,  # Step 3: normalize whitespace
        str.lower,                       # Step 4: lowercase (optional)
    )
    for step in pipeline:
        text = step(text)
    return text

# Example usage
code_text = "getUserData&ProcessInput"
normalized = full_normalization(code_text)
print(f"Original: {code_text}")
print(f"Normalized: {normalized}")
# Output: 'get user data and process input'
Custom Normalization Function¶
from dataknobs_xization import normalize
import re

def custom_normalize(text):
    """Custom normalization with additional rules.

    Extends the package's basic normalization with: underscores to
    spaces, special characters to spaces, and digit removal.
    """
    # Start from the package's standard normalization.
    text = normalize.basic_normalization_fn(text)
    # Underscores become spaces, then any non-word/non-space character
    # becomes a space, then digit runs are dropped entirely.
    text = text.replace('_', ' ')
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Collapse the whitespace introduced by the substitutions above.
    return normalize.normalize_whitespace_fn(text)

# Example
text = "user_data_123 & special@chars!"
normalized = custom_normalize(text)
print(f"Original: {text}")
print(f"Normalized: {normalized}")
Tokenization Examples¶
Basic Tokenization¶
from dataknobs_xization.masking_tokenizer import TextFeatures

# Tokenize a sentence; camelCase splitting is enabled but has no
# effect on plain prose like this.
text = "Hello World! How are you?"
features = TextFeatures(text, split_camelcase=True)
tokens = features.get_tokens()
print("Tokens:")
for token in tokens:
    print(f"  '{token.token_text}' at position {token.start_pos}-{token.end_pos}")
CamelCase-Aware Tokenization¶
from dataknobs_xization.masking_tokenizer import TextFeatures

code_text = "getUserNameById"

# With camelCase splitting: each hump becomes its own token.
features = TextFeatures(code_text, split_camelcase=True)
tokens = features.get_tokens()
print("CamelCase tokens:")
for token in tokens:
    print(f"  '{token.token_text}'")
# Output: 'get', 'User', 'Name', 'By', 'Id'

# Without splitting: the identifier stays a single token.
features_no_split = TextFeatures(code_text, split_camelcase=False)
tokens_no_split = features_no_split.get_tokens()
print("\nWithout splitting:")
for token in tokens_no_split:
    print(f"  '{token.token_text}'")
# Output: 'getUserNameById'
Tokenization with Normalization¶
from dataknobs_xization.masking_tokenizer import TextFeatures
from dataknobs_xization import normalize

def tokenize_and_normalize(text):
    """Tokenize *text* (camelCase-aware) and lowercase each token.

    Returns the list of normalized token strings.
    """
    features = TextFeatures(text, split_camelcase=True)
    # str.lower serves directly as the per-token normalization hook.
    tokens = features.get_tokens(normalize_fn=str.lower)
    return [token.norm_text for token in tokens]

# Example
text = "GetUserName AND ProcessData"
normalized_tokens = tokenize_and_normalize(text)
print(f"Original: {text}")
print(f"Normalized tokens: {normalized_tokens}")
Pattern-Based Normalization¶
Email Address Normalization¶
import re
from dataknobs_xization import normalize
def normalize_email(text):
    """Lowercase every email address found in *text*.

    Text outside the matched addresses is left untouched.
    """
    # Pattern for email addresses.  NOTE: the original TLD class was
    # [A-Z|a-z], which wrongly admitted a literal '|' character; it is
    # corrected here to [A-Za-z].
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    def normalize_single_email(match):
        # Normalize the matched address to lowercase.
        return match.group(0).lower()

    # Replace all emails with their normalized version.
    return re.sub(email_pattern, normalize_single_email, text)

# Example
text = "Contact John.Doe@EXAMPLE.COM or Jane.Smith@Company.ORG"
normalized = normalize_email(text)
print(f"Original: {text}")
print(f"Normalized: {normalized}")
URL Normalization¶
import re
from urllib.parse import urlparse, urlunparse
def normalize_urls(text):
    """Normalize URLs in *text*: lowercase them and strip a leading
    'www.' from the host.

    Non-URL text is left untouched.
    """
    # Matched case-insensitively: the example input uses 'HTTPS://',
    # which the original case-sensitive pattern silently skipped.
    url_pattern = r'https?://[^\s]+'

    def normalize_single_url(match):
        url = match.group(0)
        parsed = urlparse(url.lower())
        # Remove the www. prefix if present.
        netloc = parsed.netloc
        if netloc.startswith('www.'):
            netloc = netloc[4:]
        # Rebuild the URL from its normalized components.
        return urlunparse((
            parsed.scheme,
            netloc,
            parsed.path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        ))

    return re.sub(url_pattern, normalize_single_url, text, flags=re.IGNORECASE)

# Example
text = "Visit HTTPS://WWW.EXAMPLE.COM/Page or http://Another-Site.org"
normalized = normalize_urls(text)
print(f"Original: {text}")
print(f"Normalized: {normalized}")
Language-Specific Normalization¶
Code Normalization¶
from dataknobs_xization import normalize
import re

def normalize_code(code):
    """Normalize programming code for analysis.

    Strips comments, expands camelCase, spells out common boolean and
    comparison operators, and collapses whitespace.
    """
    # Remove comments: C++-style single-line, C-style block, Python.
    comment_rules = (
        (r'//.*$', re.MULTILINE),
        (r'/\*.*?\*/', re.DOTALL),
        (r'#.*$', re.MULTILINE),
    )
    for pattern, flags in comment_rules:
        code = re.sub(pattern, '', code, flags=flags)
    # Split camelCase identifiers into words.
    code = normalize.expand_camelcase_fn(code)
    # Spell out operators as words.
    for op, words in (('&&', ' and '), ('||', ' or '),
                      ('!=', ' not equal '), ('==', ' equal ')):
        code = code.replace(op, words)
    # Collapse whitespace left behind by the removals above.
    return normalize.normalize_whitespace_fn(code)

# Example
code_snippet = """
// Get user data
function getUserData() {
if (userName != null && userAge >= 18) {
return true; // Valid user
}
}
"""
normalized = normalize_code(code_snippet)
print("Normalized code:")
print(normalized)
Natural Language Processing¶
from dataknobs_xization import normalize
import re

def normalize_for_nlp(text):
    """Normalize text for NLP processing.

    Lowercases, expands common contractions, strips punctuation, and
    collapses whitespace.
    """
    text = text.lower()
    # Expand contractions.  Order matters: whole words ("won't",
    # "can't") must be handled before the generic "n't" suffix rule.
    replacements = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    )
    for contraction, expansion in replacements:
        text = text.replace(contraction, expansion)
    # Punctuation becomes spaces, then whitespace is collapsed.
    text = re.sub(r'[^\w\s]', ' ', text)
    return normalize.normalize_whitespace_fn(text)

# Example
text = "I won't go there! She'll be happy, won't she?"
normalized = normalize_for_nlp(text)
print(f"Original: {text}")
print(f"Normalized: {normalized}")
Batch Processing¶
Normalizing Multiple Documents¶
from dataknobs_xization import normalize
from dataknobs_structures import Document

def batch_normalize(documents):
    """Normalize a batch of documents.

    Returns new Document objects with normalized text; each copy of
    the source metadata is tagged with "normalized": True.
    """
    return [
        Document(
            normalize.basic_normalization_fn(doc.text),
            metadata={**doc.metadata, "normalized": True},
        )
        for doc in documents
    ]

# Example
documents = [
    Document("getUserName", metadata={"id": 1}),
    Document("processData&SaveResults", metadata={"id": 2}),
    Document("XMLHttpRequest", metadata={"id": 3}),
]
normalized = batch_normalize(documents)
for doc in normalized:
    print(f"Doc {doc.metadata['id']}: {doc.text}")
Parallel Normalization¶
from dataknobs_xization import normalize
from concurrent.futures import ThreadPoolExecutor
import time

def normalize_large_text(text):
    """Normalize large text with all steps (camelCase, ampersand,
    whitespace)."""
    for step in (normalize.expand_camelcase_fn,
                 normalize.expand_ampersand_fn,
                 normalize.normalize_whitespace_fn):
        text = step(text)
    return text

def parallel_normalize(texts, max_workers=4):
    """Normalize multiple texts concurrently using a thread pool."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(normalize_large_text, texts))

# Example
texts = [
    "getUserData&ProcessInput" * 100,  # Large text
    "XMLHttpRequest&AjaxCall" * 100,
    "validateUserInput&SaveData" * 100,
]
start = time.time()
normalized = parallel_normalize(texts)
print(f"Normalized {len(texts)} texts in {time.time() - start:.2f} seconds")
Custom Token Processing¶
Token-Level Normalization¶
from dataknobs_xization.masking_tokenizer import TextFeatures

class CustomTokenProcessor:
    """Custom token processor with specific rules.

    Known abbreviations are expanded to their full form; every other
    token is lowercased.
    """

    def __init__(self):
        # Abbreviation -> expansion table consulted per token.
        self.abbreviations = {
            "Dr": "Doctor",
            "Mr": "Mister",
            "Mrs": "Missus",
            "Inc": "Incorporated",
        }

    def process(self, text):
        """Tokenize *text*, apply the custom rules, and rebuild the
        string (token delimiters preserved)."""
        features = TextFeatures(text, split_camelcase=True)

        def custom_normalize(token_text):
            # Expand known abbreviations; lowercase everything else.
            return self.abbreviations.get(token_text, token_text.lower())

        pieces = []
        for token in features.get_tokens(normalize_fn=custom_normalize):
            pieces.append(token.norm_text)
            if token.post_delims:
                pieces.append(token.post_delims)
        return ''.join(pieces)

# Example
processor = CustomTokenProcessor()
text = "Dr Smith from TechCorp Inc"
normalized = processor.process(text)
print(f"Original: {text}")
print(f"Processed: {normalized}")
Best Practices¶
- Choose appropriate normalization level: Don't over-normalize if you need to preserve information
- Consider context: Different domains require different normalization rules
- Preserve original: Always keep the original text for reference
- Test edge cases: Include special characters, unicode, and edge cases in testing
- Performance optimization: Use batch processing for large datasets
- Validation: Validate normalized output to ensure quality